From 2460cb7a85ee21da0529bfe43d164abb2270da44 Mon Sep 17 00:00:00 2001
From: YoelShoshan
Date: Fri, 19 Jul 2024 17:42:33 -0400
Subject: [PATCH 1/5] adding injector tokenizer to support scalars and vectors
 injection

---
 .../injectortokenizer/injector_tokenizer.py        | 192 ++++++++++++++++++
 fusedrug/data/tokenizer/ops/__init__.py            |   1 +
 .../tokenizer/ops/injector_tokenizer_ops.py        | 144 +++++++++++++
 3 files changed, 337 insertions(+)
 create mode 100644 fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py
 create mode 100644 fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py

diff --git a/fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py b/fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py
new file mode 100644
index 00000000..accf5b45
--- /dev/null
+++ b/fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py
@@ -0,0 +1,192 @@
+from fusedrug.data.tokenizer.modulartokenizer.modular_tokenizer import ModularTokenizer
+from typing import Dict
+from typing import Optional, List, Union, Tuple, Any
+from tokenizers import Encoding
+import omegaconf
+import torch
+from collections.abc import Iterable
+import re
+
+
+class InjectorTokenizer(ModularTokenizer):
+    """
+    InjectorTokenizer builds on top of ModularTokenizer.
+
+    Its purpose is to extend model inputs beyond "standard" integer tokens.
+    Instead, it provides control over the *vectors* that are used as input to a model.
+
+    Example use cases:
+    1. Providing scalars (floating point) as inputs
+    2. Providing vectors of embeddings - for example of a protein embedding
+
+    Each input "token" becomes a tensor of a defined size, and is built of:
+    1. Header
+        made of 4 floats
+        [
+            0.0 or 1.0  # is this a sentinel/mask or not
+            0.0 or 1.0  # is this a standard vocabulary token
+            0.0 or 1.0  # is this a scalar
+            0.0 or 1.0  # is this a full injected vector (e.g. an embedding)
+        ]
+    2. Content
+        the rest of each input vector is made of input_dim-4 float elements.
+
+    Note - for "standard vocabulary tokens" we support providing an external embedding layer
+    (like in vanilla T5), as it is part of the trained weights.
+
+    """
+
+    def __init__(
+        self,
+        input_dim: int,
+        embedding_layer: torch.nn.Module,
+        tokenizers_info: Union[List, omegaconf.listconfig.ListConfig],
+        load_adjusted_jsons: Optional[bool] = False,
+        special_tokens_dict: Optional[Dict] = None,
+        additional_tokens_list: Optional[List] = None,
+        max_possible_token_id: Optional[int] = None,
+        max_special_token_id: Optional[int] = None,
+        **kwargs: Any,
+    ) -> None:
+        """
+        input_dim: the size of the vector of each input.
The total output will be [sequence length, input_dim] + + + """ + self._input_dim = input_dim + self._embedding_layer = embedding_layer + self._modular_tokenizer = ModularTokenizer( + tokenizers_info=tokenizers_info, + load_adjusted_jsons=load_adjusted_jsons, + special_tokens_dict=special_tokens_dict, + additional_tokens_list=additional_tokens_list, + max_possible_token_id=max_possible_token_id, + max_special_token_id=max_special_token_id, + **kwargs, + ) + + print("") + + def encode_list( + self, + typed_input_list: List, + max_len: Optional[int] = None, + padding_token_id: Optional[int] = None, + padding_token: Optional[str] = "", + pad_type_id: Optional[int] = None, + return_overflow_info: Optional[bool] = False, + on_unknown: Optional[str] = "warn", + verbose: int = 1, + ) -> Union[Encoding, Tuple[Encoding, str]]: + """_summary_ + + Args: + typed_input_list (List): list of collections.namedtuple("input_type", ["input_string", "max_len"]), with + input type: the name of input type, + input_string: the string to be encoded + max_len: maximal length of the encoding (in tokens). Only relevant for truncation, as we do not need to + pad individual sub-tokenizer encodings - we only pad the final encoding of the ModularTokenizer. + The smallest value between config-defined and tuple-defined is used. If None, the max_len + that was defined for the sub-tokenizer in the config is used. + max_len (Optional[int], optional): _description_. Defaults to None. + padding_token_id (Optional[str], optional): _description_. Defaults to 0. TODO: default to None and infer it + padding_token (Optional[str], optional): _description_. Defaults to "". + pad_type_id (Optional[int], optional): _description_. Defaults to 0. (TODO: raise exception) + return_overflow_info (Optional[bool], optional): If True return an additional string with overflow information. Defaults to False. + on_unknown: (Optional[str], optional): What happens if unknown tokens (i.e. ones mapped to ) are encountered: 'raise' or 'warn' + verbose (Optional[int], optional): verbosity level. 0: no notification, 1: warning notification, 2: warning with partial data, 3: warning + with full data. Defaults to 1. + Returns: + Encoding: _description_ + """ + + raise NotImplementedError + + def encode( + self, + sequence: str, + max_len: Optional[int] = None, + padding_token_id: Optional[int] = 0, + padding_token: Optional[str] = "", + pad_type_id: Optional[int] = 0, + return_overflow_info: Optional[bool] = False, + on_unknown: Optional[str] = "warn", + verbose: Optional[int] = 1, + ) -> Encoding: + # (self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True) + """Receives a user-supplied string that contains, in addition to the text that is to be tokenized, special delimiters signifying the type + of input within each span of text (e.g. <@TOKENIZER-TYPE=AA> sequence, <@TOKENIZER-TYPE=SMILES>, etc.). These determine the type of tokenizer to use on each span, + and are not encoded. + Optionaly, you may also describe maximum length per section, for example: + "<@TOKENIZER-TYPE=AA>QKPGQAPRLLIYG<@TOKENIZER-TYPE=AA@MAX-LEN=122>SGSDFSDFSFD" + would not have a local limitation of the first AA section, but will have a local maximum length of 122 on the second section. + local in this context means that the maximum length will be imposed on the individual section prior to applying any global "entire sequence" maximum size limitations (if any). + + Args: + input_string (str): _description_ + max_len (Optional[int], optional): _description_. 
Defaults to None. + padding_token_id (Optional[str], optional): _description_. Defaults to 0. + padding_token (Optional[str], optional): _description_. Defaults to "". + pad_type_id (Optional[int], optional): _description_. Defaults to 0. + return_overflow_info (Optional[bool], optional): _description_. If True return an additional string with overflow information. Defaults to False. + on_unknown: (Optional[str], optional): What happens if unknown tokens (i.e. ones mapped to ) are encountered: 'raise' or 'warn' + verbose (int, optional): verbosity level. 0: no notification, 1: warning notification, 2: warning with partial data, 3: warning + with full data. Defaults to 1. + Returns: + Encoding: _description_ + str: _description_ information on overflow, if return_overflow_info=True + """ + + raise NotImplementedError + + def decode(self, ids: Iterable, skip_special_tokens: Optional[bool] = False) -> str: + """Receives a list of IDs and returns a string of tokens + TODO: possibly output also the type of token (AA, SMILES, etc) + Args: + ids (Iterable): _description_ + skip_special_tokens (Optional[bool], optional): _description_. Defaults to False. + + Returns: + str: _description_ + """ + + raise NotImplementedError + + @staticmethod + def build_placeholder_meta_tokenization(sequence: str) -> Tuple[str, List[str]]: + """ + In order to avoid modifying and rewriting the logic in modular tokenizer, especially regarding padding, limitation of max length of certain sub-parts, + we put placeholders to make sure that the total size is known/fixed and respects the meta instructions to the modular tokenizer + """ + hints_and_subseq = re.split("<@TOKENIZER-TYPE=([^>]*)>", sequence)[ + 1: + ] # the first element is blank - removing it + assert ( + len(hints_and_subseq) > 0 and len(hints_and_subseq) % 2 == 0 + ), f"Error: expecting leading modular tokenizer hints followed by a sequence to tokenize, got {sequence}" + + with_placeholders = [] + + for tokenizer_type, subseq in zip( + hints_and_subseq[::2], hints_and_subseq[1::2] + ): + if tokenizer_type == "FLOAT": + with_placeholders.append( + "<@TOKENIZER-TYPE=AA>" + ) # won't use AA tokens, just an arbitrary one to be able to use a token like <1> + values = subseq.split(",") + seq = "<1>" * len(values) + with_placeholders.append(seq) + elif tokenizer_type == "VECTOR": + with_placeholders.append( + "<@TOKENIZER-TYPE=AA>" + ) # won't use AA tokens, just an arbitrary one to be able to use a token like <1> + values = subseq.split("@") + seq = "<1>" * len(values) + with_placeholders.append(seq) + else: + with_placeholders.append(tokenizer_type) + with_placeholders.append(subseq) + + return "".join(with_placeholders), with_placeholders diff --git a/fusedrug/data/tokenizer/ops/__init__.py b/fusedrug/data/tokenizer/ops/__init__.py index d7f4adfd..3bd6c89f 100644 --- a/fusedrug/data/tokenizer/ops/__init__.py +++ b/fusedrug/data/tokenizer/ops/__init__.py @@ -1,5 +1,6 @@ from .fast_tokenizer_ops import FastTokenizer from .modular_tokenizer_ops import FastModularTokenizer +from .injector_tokenizer_ops import InjectorTokenizerOp try: from .pytoda_tokenizer import Op_pytoda_SMILESTokenizer, Op_pytoda_ProteinTokenizer diff --git a/fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py b/fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py new file mode 100644 index 00000000..c5222609 --- /dev/null +++ b/fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py @@ -0,0 +1,144 @@ +from fuse.utils import NDict + +# from fuse.data import OpBase, get_sample_id +from 
fusedrug.data.tokenizer.injectortokenizer.injector_tokenizer import (
+    InjectorTokenizer,
+)
+
+# from fusedrug.data.tokenizer.modulartokenizer.modular_tokenizer import ModularTokenizer
+from fusedrug.data.tokenizer.ops import FastModularTokenizer
+
+# from warnings import warn
+# from collections import defaultdict
+from typing import Optional, Union, Any
+
+# import os
+# import re
+# import torch
+
+
+class InjectorTokenizerOp(FastModularTokenizer):
+    """
+    Applies an injector tokenizer.
+
+    The injector tokenizer builds on top of the modular tokenizer.
+    Its purpose is to build inputs_emb for the model (instead of input_ids).
+    This allows supporting more advanced inputs beyond token ids, like:
+    * scalar inputs
+    * embedding vectors within a single input
+    """
+
+    def __init__(
+        self,
+        input_dim: int,
+        tokenizer_path: str,
+        max_size: Union[int, None] = None,
+        pad_token: Union[str, None] = None,
+        pad_type_id: Union[int, None] = None,
+        validate_ends_with_eos: Optional[bool] = True,
+        eos: Optional[str] = "<EOS>",
+        verbose: Optional[bool] = False,
+        **kwargs: Any,
+    ) -> None:
+        """
+
+        Args:
+            tokenizer_path: full path to a directory that the tokenizer will be loaded from
+            max_size: sequences below this size will be padded, and above this size will be truncated
+            pad_token: a string of the pad token
+            pad_type_id: see tokenizers.Tokenizer.enable_padding() docstring
+            validate_ends_with_eos: during an encode request (a __call__ to the op) make sure that it ends with the provided eos token, and raise an exception otherwise.
+                having an eos (end of sentence) token at the end is useful in multiple scenarios, for example in a generative transformer (like a T5 encoder-decoder)
+            verbose:
+        """
+        if verbose:
+            print(
+                f"DEBUG:InjectorTokenizerOp __init__ called for path {tokenizer_path}"
+            )
+
+        super().__init__(
+            tokenizer_path=tokenizer_path,
+            max_size=max_size,
+            pad_token=pad_token,
+            pad_type_id=pad_type_id,
+            validate_ends_with_eos=validate_ends_with_eos,
+            eos=eos,
+            verbose=verbose,
+            **kwargs,
+        )
+
+        self._input_dim = input_dim
+
+    def __call__(
+        self,
+        sample_dict: NDict,
+        embedding_layer_key_in: str,  # should point to a torch.nn.Module of an embedding layer
+        key_in: str,
+        key_out_tokenized_object: Optional[str] = None,
+        key_out_tokens_ids: Optional[str] = None,
+        key_out_attention_mask: Optional[str] = None,
+        convert_attention_mask_to_bool: Optional[bool] = True,
+        max_seq_len: Optional[int] = None,
+        on_unknown: Optional[str] = "warn",
+        verbose: Optional[int] = 1,
+        validate_ends_with_eos: Optional[bool] = None,
+    ) -> NDict:
+        """_summary_
+
+        Args:
+            sample_dict (NDict): _description_
+            key_in (str): key to either a:
+                (1) string that contains, in addition to the text that is to be tokenized, special delimiters signifying the type
+                    of input within each span of text (e.g. <@TOKENIZER-TYPE=AA> sequence, <@TOKENIZER-TYPE=SMILES>, etc.).
+                (2) list of modular_tokenizer.TypedInput specifying the tokenizer type and the subsequence to tokenize
+            key_out_tokenized_object (Optional[str], optional): _description_. Defaults to None.
+            key_out_tokens_ids (Optional[str], optional): _description_. Defaults to None.
+            key_out_attention_mask (Optional[str], optional): _description_. Defaults to None.
+            convert_attention_mask_to_bool (Optional[bool], optional): _description_. Defaults to True.
+            max_seq_len (Optional[int], optional): set maximum sequence len dynamically, used for both padding and truncation. Defaults to None.
+ on_unknown (Optional[str], optional): What happens if unknown tokens (i.e. ones mapped to ) are encountered: 'raise' or 'warn'. Defaults to "warn". + verbose (Optional[int], optional): verbosity level. 0: no notification, 1: warning notification, 2: warning with partial data, 3: warning + with full data. Defaults to 1. + validate_ends_with_eos (Optional[bool], optional): if not None, overrides self._validate_ends_with_eos + + Raises: + Exception: _description_ + Exception: _description_ + + Returns: + NDict: _description_ + """ + + print("FOR DEBUGGING! REMOVE !!!!!!") + # orig: '<@TOKENIZER-TYPE=AA> + # <@TOKENIZER-TYPE=AA>KSSCKRIPLYVDFSDVGWNDWIVAPPGYIAMYCIGECPFPLADILNSTNIAIVQTLVNSVNSKIPKACCVPTELSAISMLMLDENEKVVLKNYQDMVVEGCGCR + # <@TOKENIZER-TYPE=AA>WLITGTEASCENEGEVLIIPNITDNPCISCVCLNQKAECKQEKCAPLAEDCALVVKQTGACCEKCKG + # <@TOKENIZER-TYPE=AA>WAITGTEASCENEGEVLAIPNITDNPCISCVCLNQKAECKQEKCAPLAEDCALVVKQTGACCEKCKG' + sample_dict[key_in] = ( + "<@TOKENIZER-TYPE=AA>" + + "<@TOKENIZER-TYPE=FLOAT>12.7,3.2,14.8,99,9" + + "<@TOKENIZER-TYPE=AA>KSSCKRIPLYVDFSDVGWNDWIVAPPGYIAMYCIGECPFPLADILNSTNIAIVQTLVNSVNSKIPKACCVPTELSAISMLMLDENEKVVLKNYQDMVVEGCGCR" + + "<@TOKENIZER-TYPE=AA>WLITGTEASCENEGEVLIIPNITDNPCISCVCLNQKAECKQEKCAPLAEDCALVVKQTGACCEKCKG" + + "<@TOKENIZER-TYPE=AA>WAITGTEASCENEGEVLAIPNITDNPCISCVCLNQKAECKQEKCAPLAEDCALVVKQTGACCEKCKG" + ) + + ( + with_placeholders_str, + with_placeholders_per_meta, + ) = InjectorTokenizer.build_placeholder_meta_tokenization(sample_dict[key_in]) + sample_dict[key_in + "@with_placeholders"] = with_placeholders_str + + super().__call__( + sample_dict=sample_dict, + key_in=key_in + "@with_placeholders", + key_out_tokenized_object=key_out_tokenized_object, + key_out_tokens_ids=key_out_tokens_ids, + key_out_attention_mask=key_out_attention_mask, + convert_attention_mask_to_bool=convert_attention_mask_to_bool, + max_seq_len=max_seq_len, + on_unknown=on_unknown, + verbose=verbose, + validate_ends_with_eos=validate_ends_with_eos, + ) + + print("") From 3df904c0894f497109413dedf509eff5febce6bc Mon Sep 17 00:00:00 2001 From: YoelShoshan Date: Sat, 20 Jul 2024 16:09:22 -0400 Subject: [PATCH 2/5] FLOAT and VECTOR meta tokenizers support --- .../injectortokenizer/injector_tokenizer.py | 2 +- .../modulartokenizer/modular_tokenizer.py | 22 ++++++++++++++++--- .../tokenizer/ops/injector_tokenizer_ops.py | 15 +++++++++++-- .../tokenizer/ops/modular_tokenizer_ops.py | 14 ++++++++++-- 4 files changed, 45 insertions(+), 8 deletions(-) diff --git a/fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py b/fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py index accf5b45..df781cca 100644 --- a/fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py +++ b/fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py @@ -186,7 +186,7 @@ def build_placeholder_meta_tokenization(sequence: str) -> Tuple[str, List[str]]: seq = "<1>" * len(values) with_placeholders.append(seq) else: - with_placeholders.append(tokenizer_type) + with_placeholders.append("<@TOKENIZER-TYPE=" + tokenizer_type + ">") with_placeholders.append(subseq) return "".join(with_placeholders), with_placeholders diff --git a/fusedrug/data/tokenizer/modulartokenizer/modular_tokenizer.py b/fusedrug/data/tokenizer/modulartokenizer/modular_tokenizer.py index 6a929d8e..01d40b97 100644 --- a/fusedrug/data/tokenizer/modulartokenizer/modular_tokenizer.py +++ b/fusedrug/data/tokenizer/modulartokenizer/modular_tokenizer.py @@ -1006,7 +1006,13 @@ def encode_list( return_overflow_info: Optional[bool] = False, 
on_unknown: Optional[str] = "warn", verbose: int = 1, - ) -> Union[Encoding, Tuple[Encoding, str]]: + also_return_split: bool = False, + ) -> Union[ + Encoding, + Tuple[Encoding, str], + Tuple[Encoding, List[Encoding]], + Tuple[Encoding, str, List[Encoding]], + ]: """_summary_ Args: @@ -1025,6 +1031,7 @@ def encode_list( on_unknown: (Optional[str], optional): What happens if unknown tokens (i.e. ones mapped to ) are encountered: 'raise' or 'warn' verbose (Optional[int], optional): verbosity level. 0: no notification, 1: warning notification, 2: warning with partial data, 3: warning with full data. Defaults to 1. + also_return_split: defaults to False. If set to True, the return value will also contain a list that contains per meta-tokenizer-instruction element of Encoding Returns: Encoding: _description_ """ @@ -1150,9 +1157,15 @@ def encode_list( f"Unexpected on_unknown value {on_unknown}. Should be 'warn' or 'raise'" ) + if (not return_overflow_info) and (not also_return_split): + return merged_encoding + ans = [merged_encoding] if return_overflow_info: - return merged_encoding, overflow_info - return merged_encoding + ans += [overflow_info] + if also_return_split: + ans += [encoded_list] + + return tuple(ans) def decode(self, ids: Iterable, skip_special_tokens: Optional[bool] = False) -> str: """Receives a list of IDs and returns a string of tokens @@ -1190,6 +1203,7 @@ def encode( return_overflow_info: Optional[bool] = False, on_unknown: Optional[str] = "warn", verbose: Optional[int] = 1, + also_return_split: bool = False, ) -> Encoding: # (self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True) """Receives a user-supplied string that contains, in addition to the text that is to be tokenized, special delimiters signifying the type @@ -1210,6 +1224,7 @@ def encode( on_unknown: (Optional[str], optional): What happens if unknown tokens (i.e. ones mapped to ) are encountered: 'raise' or 'warn' verbose (int, optional): verbosity level. 0: no notification, 1: warning notification, 2: warning with partial data, 3: warning with full data. Defaults to 1. 
+ also_return_split: also return the per-meta-instruction encoded parts as a list of Encoding elements Returns: Encoding: _description_ str: _description_ information on overflow, if return_overflow_info=True @@ -1251,6 +1266,7 @@ def encode( return_overflow_info=return_overflow_info, on_unknown=on_unknown, verbose=verbose, + also_return_split=also_return_split, ) def get_tokenizer_types(self) -> List: diff --git a/fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py b/fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py index c5222609..7dea654e 100644 --- a/fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py +++ b/fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py @@ -126,11 +126,11 @@ def __call__( with_placeholders_str, with_placeholders_per_meta, ) = InjectorTokenizer.build_placeholder_meta_tokenization(sample_dict[key_in]) - sample_dict[key_in + "@with_placeholders"] = with_placeholders_str + sample_dict[key_in + ".with_placeholders"] = with_placeholders_str super().__call__( sample_dict=sample_dict, - key_in=key_in + "@with_placeholders", + key_in=key_in + ".with_placeholders", key_out_tokenized_object=key_out_tokenized_object, key_out_tokens_ids=key_out_tokens_ids, key_out_attention_mask=key_out_attention_mask, @@ -139,6 +139,17 @@ def __call__( on_unknown=on_unknown, verbose=verbose, validate_ends_with_eos=validate_ends_with_eos, + key_out_encoding_per_meta=key_in + + ".per_meta_part_encoding", # using the key_in as base for the name because key_out_* are optional ) + # TODO 1: call embedding layer on all tokens to get a [sequence_length, model_dim] matrix. Make sure that gradients are allowed to flow to it when needed + # what is the best way to provide the model embedding layer here? the data-pipeline seems to be created BEFORE the model is constructed + # if we want to call the model with the entire minibatch, then we can go with option 1: + # option 1 - only prepare data towards that, and actually run the last part of the logic inside the *_step in pl_module + # option 2 - call the model embedding layer per individual sample, and also somehow load the model BEFORE the data pipeline (less likely we'll go with this option...) + # TODO 2: override per injecting meta tokenizer type (FLOAT and VECTOR) the + print("") + + return sample_dict diff --git a/fusedrug/data/tokenizer/ops/modular_tokenizer_ops.py b/fusedrug/data/tokenizer/ops/modular_tokenizer_ops.py index ffdf4b0f..ca2134dc 100644 --- a/fusedrug/data/tokenizer/ops/modular_tokenizer_ops.py +++ b/fusedrug/data/tokenizer/ops/modular_tokenizer_ops.py @@ -193,6 +193,7 @@ def __call__( on_unknown: Optional[str] = "warn", verbose: Optional[int] = 1, validate_ends_with_eos: Optional[bool] = None, + key_out_encoding_per_meta: Optional[str] = None, ) -> NDict: """_summary_ @@ -211,6 +212,7 @@ def __call__( verbose (Optional[int], optional): verbosity level. 0: no notification, 1: warning notification, 2: warning with partial data, 3: warning with full data. Defaults to 1. validate_ends_with_eos (Optional[bool], optional): if not None, overrides self._validate_ends_with_eos + key_out_encoding_per_meta: optional key out. 
If set to a string will put in it the per-meta-instruction encoded parts as a list of Encoding elements Raises: Exception: _description_ @@ -240,22 +242,30 @@ def __call__( ) if isinstance(data, str): - encoded, overflow_info = self._tokenizer.encode( + _ans = self._tokenizer.encode( data, max_len=max_seq_len, return_overflow_info=True, on_unknown=on_unknown, verbose=verbose, + also_return_split=key_out_encoding_per_meta is not None, ) else: - encoded, overflow_info = self._tokenizer.encode_list( + _ans = self._tokenizer.encode_list( data, max_len=max_seq_len, return_overflow_info=True, on_unknown=on_unknown, verbose=verbose, + also_return_split=key_out_encoding_per_meta is not None, ) + if key_out_encoding_per_meta is not None: + encoded, overflow_info, per_meta_encoded = _ans + sample_dict[key_out_encoding_per_meta] = per_meta_encoded + else: + encoded, overflow_info = _ans + expected_max_len = self.get_max_len(override_max_len=max_seq_len) if ( expected_max_len is not None From 44419b308c8754e770b8729629b1f4018fc6974f Mon Sep 17 00:00:00 2001 From: YoelShoshan Date: Sun, 21 Jul 2024 15:47:10 -0400 Subject: [PATCH 3/5] scalars support --- .../injectortokenizer/injector_tokenizer.py | 331 +++++++++++------- .../tokenizer/ops/injector_tokenizer_ops.py | 32 +- 2 files changed, 237 insertions(+), 126 deletions(-) diff --git a/fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py b/fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py index df781cca..821363e6 100644 --- a/fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py +++ b/fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py @@ -1,11 +1,9 @@ from fusedrug.data.tokenizer.modulartokenizer.modular_tokenizer import ModularTokenizer -from typing import Dict -from typing import Optional, List, Union, Tuple, Any +from typing import Optional, List, Tuple, Dict from tokenizers import Encoding -import omegaconf import torch -from collections.abc import Iterable import re +from fuse.utils import NDict class InjectorTokenizer(ModularTokenizer): @@ -37,127 +35,136 @@ class InjectorTokenizer(ModularTokenizer): """ - def __init__( - self, - input_dim: int, - embedding_layer: torch.nn.Module, - tokenizers_info: Union[List, omegaconf.listconfig.ListConfig], - load_adjusted_jsons: Optional[bool] = False, - special_tokens_dict: Optional[Dict] = None, - additional_tokens_list: Optional[List] = None, - max_possible_token_id: Optional[int] = None, - max_special_token_id: Optional[int] = None, - **kwargs: Any, - ) -> None: - """ - input_dim: the size of a vector of each input. The total output will be [sequence length, input_dim] + # def __init__( + # self, + # input_dim: int, + # embedding_layer: torch.nn.Module, + # tokenizers_info: Union[List, omegaconf.listconfig.ListConfig], + # load_adjusted_jsons: Optional[bool] = False, + # special_tokens_dict: Optional[Dict] = None, + # additional_tokens_list: Optional[List] = None, + # max_possible_token_id: Optional[int] = None, + # max_special_token_id: Optional[int] = None, + # **kwargs: Any, + # ) -> None: + # """ + # input_dim: the size of a vector of each input. 
The total output will be [sequence length, input_dim] + # """ + # self._input_dim = input_dim + # self._embedding_layer = embedding_layer + # self._modular_tokenizer = ModularTokenizer( + # tokenizers_info=tokenizers_info, + # load_adjusted_jsons=load_adjusted_jsons, + # special_tokens_dict=special_tokens_dict, + # additional_tokens_list=additional_tokens_list, + # max_possible_token_id=max_possible_token_id, + # max_special_token_id=max_special_token_id, + # **kwargs, + # ) - """ - self._input_dim = input_dim - self._embedding_layer = embedding_layer - self._modular_tokenizer = ModularTokenizer( - tokenizers_info=tokenizers_info, - load_adjusted_jsons=load_adjusted_jsons, - special_tokens_dict=special_tokens_dict, - additional_tokens_list=additional_tokens_list, - max_possible_token_id=max_possible_token_id, - max_special_token_id=max_special_token_id, - **kwargs, - ) - - print("") - - def encode_list( - self, - typed_input_list: List, - max_len: Optional[int] = None, - padding_token_id: Optional[int] = None, - padding_token: Optional[str] = "", - pad_type_id: Optional[int] = None, - return_overflow_info: Optional[bool] = False, - on_unknown: Optional[str] = "warn", - verbose: int = 1, - ) -> Union[Encoding, Tuple[Encoding, str]]: - """_summary_ + # print("") - Args: - typed_input_list (List): list of collections.namedtuple("input_type", ["input_string", "max_len"]), with - input type: the name of input type, - input_string: the string to be encoded - max_len: maximal length of the encoding (in tokens). Only relevant for truncation, as we do not need to - pad individual sub-tokenizer encodings - we only pad the final encoding of the ModularTokenizer. - The smallest value between config-defined and tuple-defined is used. If None, the max_len - that was defined for the sub-tokenizer in the config is used. - max_len (Optional[int], optional): _description_. Defaults to None. - padding_token_id (Optional[str], optional): _description_. Defaults to 0. TODO: default to None and infer it - padding_token (Optional[str], optional): _description_. Defaults to "". - pad_type_id (Optional[int], optional): _description_. Defaults to 0. (TODO: raise exception) - return_overflow_info (Optional[bool], optional): If True return an additional string with overflow information. Defaults to False. - on_unknown: (Optional[str], optional): What happens if unknown tokens (i.e. ones mapped to ) are encountered: 'raise' or 'warn' - verbose (Optional[int], optional): verbosity level. 0: no notification, 1: warning notification, 2: warning with partial data, 3: warning - with full data. Defaults to 1. - Returns: - Encoding: _description_ - """ + # def encode_list( + # self, + # typed_input_list: List, + # max_len: Optional[int] = None, + # padding_token_id: Optional[int] = None, + # padding_token: Optional[str] = "", + # pad_type_id: Optional[int] = None, + # return_overflow_info: Optional[bool] = False, + # on_unknown: Optional[str] = "warn", + # verbose: int = 1, + # ) -> Union[Encoding, Tuple[Encoding, str]]: + # """_summary_ - raise NotImplementedError + # Args: + # typed_input_list (List): list of collections.namedtuple("input_type", ["input_string", "max_len"]), with + # input type: the name of input type, + # input_string: the string to be encoded + # max_len: maximal length of the encoding (in tokens). Only relevant for truncation, as we do not need to + # pad individual sub-tokenizer encodings - we only pad the final encoding of the ModularTokenizer. 
+ # The smallest value between config-defined and tuple-defined is used. If None, the max_len + # that was defined for the sub-tokenizer in the config is used. + # max_len (Optional[int], optional): _description_. Defaults to None. + # padding_token_id (Optional[str], optional): _description_. Defaults to 0. TODO: default to None and infer it + # padding_token (Optional[str], optional): _description_. Defaults to "". + # pad_type_id (Optional[int], optional): _description_. Defaults to 0. (TODO: raise exception) + # return_overflow_info (Optional[bool], optional): If True return an additional string with overflow information. Defaults to False. + # on_unknown: (Optional[str], optional): What happens if unknown tokens (i.e. ones mapped to ) are encountered: 'raise' or 'warn' + # verbose (Optional[int], optional): verbosity level. 0: no notification, 1: warning notification, 2: warning with partial data, 3: warning + # with full data. Defaults to 1. + # Returns: + # Encoding: _description_ + # """ - def encode( - self, - sequence: str, - max_len: Optional[int] = None, - padding_token_id: Optional[int] = 0, - padding_token: Optional[str] = "", - pad_type_id: Optional[int] = 0, - return_overflow_info: Optional[bool] = False, - on_unknown: Optional[str] = "warn", - verbose: Optional[int] = 1, - ) -> Encoding: - # (self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True) - """Receives a user-supplied string that contains, in addition to the text that is to be tokenized, special delimiters signifying the type - of input within each span of text (e.g. <@TOKENIZER-TYPE=AA> sequence, <@TOKENIZER-TYPE=SMILES>, etc.). These determine the type of tokenizer to use on each span, - and are not encoded. - Optionaly, you may also describe maximum length per section, for example: - "<@TOKENIZER-TYPE=AA>QKPGQAPRLLIYG<@TOKENIZER-TYPE=AA@MAX-LEN=122>SGSDFSDFSFD" - would not have a local limitation of the first AA section, but will have a local maximum length of 122 on the second section. - local in this context means that the maximum length will be imposed on the individual section prior to applying any global "entire sequence" maximum size limitations (if any). + # raise NotImplementedError - Args: - input_string (str): _description_ - max_len (Optional[int], optional): _description_. Defaults to None. - padding_token_id (Optional[str], optional): _description_. Defaults to 0. - padding_token (Optional[str], optional): _description_. Defaults to "". - pad_type_id (Optional[int], optional): _description_. Defaults to 0. - return_overflow_info (Optional[bool], optional): _description_. If True return an additional string with overflow information. Defaults to False. - on_unknown: (Optional[str], optional): What happens if unknown tokens (i.e. ones mapped to ) are encountered: 'raise' or 'warn' - verbose (int, optional): verbosity level. 0: no notification, 1: warning notification, 2: warning with partial data, 3: warning - with full data. Defaults to 1. 
- Returns: - Encoding: _description_ - str: _description_ information on overflow, if return_overflow_info=True - """ + # def encode( + # self, + # sequence: str, + # max_len: Optional[int] = None, + # padding_token_id: Optional[int] = 0, + # padding_token: Optional[str] = "", + # pad_type_id: Optional[int] = 0, + # return_overflow_info: Optional[bool] = False, + # on_unknown: Optional[str] = "warn", + # verbose: Optional[int] = 1, + # ) -> Encoding: + # # (self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True) + # """Receives a user-supplied string that contains, in addition to the text that is to be tokenized, special delimiters signifying the type + # of input within each span of text (e.g. <@TOKENIZER-TYPE=AA> sequence, <@TOKENIZER-TYPE=SMILES>, etc.). These determine the type of tokenizer to use on each span, + # and are not encoded. + # Optionaly, you may also describe maximum length per section, for example: + # "<@TOKENIZER-TYPE=AA>QKPGQAPRLLIYG<@TOKENIZER-TYPE=AA@MAX-LEN=122>SGSDFSDFSFD" + # would not have a local limitation of the first AA section, but will have a local maximum length of 122 on the second section. + # local in this context means that the maximum length will be imposed on the individual section prior to applying any global "entire sequence" maximum size limitations (if any). - raise NotImplementedError + # Args: + # input_string (str): _description_ + # max_len (Optional[int], optional): _description_. Defaults to None. + # padding_token_id (Optional[str], optional): _description_. Defaults to 0. + # padding_token (Optional[str], optional): _description_. Defaults to "". + # pad_type_id (Optional[int], optional): _description_. Defaults to 0. + # return_overflow_info (Optional[bool], optional): _description_. If True return an additional string with overflow information. Defaults to False. + # on_unknown: (Optional[str], optional): What happens if unknown tokens (i.e. ones mapped to ) are encountered: 'raise' or 'warn' + # verbose (int, optional): verbosity level. 0: no notification, 1: warning notification, 2: warning with partial data, 3: warning + # with full data. Defaults to 1. + # Returns: + # Encoding: _description_ + # str: _description_ information on overflow, if return_overflow_info=True + # """ - def decode(self, ids: Iterable, skip_special_tokens: Optional[bool] = False) -> str: - """Receives a list of IDs and returns a string of tokens - TODO: possibly output also the type of token (AA, SMILES, etc) - Args: - ids (Iterable): _description_ - skip_special_tokens (Optional[bool], optional): _description_. Defaults to False. + # raise NotImplementedError - Returns: - str: _description_ - """ + # def decode(self, ids: Iterable, skip_special_tokens: Optional[bool] = False) -> str: + # """Receives a list of IDs and returns a string of tokens + # TODO: possibly output also the type of token (AA, SMILES, etc) + # Args: + # ids (Iterable): _description_ + # skip_special_tokens (Optional[bool], optional): _description_. Defaults to False. 
+ + # Returns: + # str: _description_ + # """ - raise NotImplementedError + # raise NotImplementedError @staticmethod - def build_placeholder_meta_tokenization(sequence: str) -> Tuple[str, List[str]]: + def build_placeholder_meta_tokenization( + *, + sequence: str, + sample_dict: Optional[NDict] = None, + ) -> Tuple[str, List[str]]: """ In order to avoid modifying and rewriting the logic in modular tokenizer, especially regarding padding, limitation of max length of certain sub-parts, we put placeholders to make sure that the total size is known/fixed and respects the meta instructions to the modular tokenizer + + Returns: a tuple with 2 elements + ( + a single string with the full query containing placeholder tokens for FLOAT and VECTOR meta tokenizer parts, + a list of [meta-tokenizer name, data, meta-tokenizer name, data, meta-tokenizer name, data, ...] + ) """ hints_and_subseq = re.split("<@TOKENIZER-TYPE=([^>]*)>", sequence)[ 1: @@ -171,22 +178,104 @@ def build_placeholder_meta_tokenization(sequence: str) -> Tuple[str, List[str]]: for tokenizer_type, subseq in zip( hints_and_subseq[::2], hints_and_subseq[1::2] ): - if tokenizer_type == "FLOAT": - with_placeholders.append( - "<@TOKENIZER-TYPE=AA>" - ) # won't use AA tokens, just an arbitrary one to be able to use a token like <1> - values = subseq.split(",") - seq = "<1>" * len(values) - with_placeholders.append(seq) - elif tokenizer_type == "VECTOR": + if tokenizer_type.startswith("SCALARS_"): with_placeholders.append( "<@TOKENIZER-TYPE=AA>" ) # won't use AA tokens, just an arbitrary one to be able to use a token like <1> - values = subseq.split("@") - seq = "<1>" * len(values) + + if tokenizer_type == "SCALARS_LITERALS": + values = subseq.split(",") + elif tokenizer_type == "SCALARS_FROM_DICT": + if sample_dict is None: + raise Exception( + "SCALARS_FROM_DICT used but the provided sample_dict is None" + ) + values = sample_dict[subseq] + assert len(values.shape) == 1 + seq = "<1>" * len(values) # TODO: put a token instead with_placeholders.append(seq) + elif tokenizer_type.startswith("VECTORS_"): + raise Exception("VECTOR_* are not supported yet") else: with_placeholders.append("<@TOKENIZER-TYPE=" + tokenizer_type + ">") with_placeholders.append(subseq) - return "".join(with_placeholders), with_placeholders + return "".join(with_placeholders), hints_and_subseq + + @staticmethod + def prepare_info_for_model_step( + *, + per_meta_tokenizer_data: List[str], + per_meta_encoding_including_placeholders: List[Encoding], + sample_dict: Optional[NDict] = None, + ) -> Dict: + """ + since we: + 1. Need to use the model embedding layer (allowing gradients flow if needed) + 2. We prefer not to use the model during the data pipeline + + In this function we prepare everything so that during the train/val/test_step we'll be able to do what's needed before doing the forward pass + + Args: + per_meta_tokenizer_data: a list of [meta-tokenizer name, data, meta-tokenizer name, data, meta-tokenizer name, data, ...] + per_meta_encoding_including_placeholders: a list of Encoding elements. This is used to extract per tokenizer final tokens num (after all of the padding and cropping logic was already done) + sample_dict: a fuse sample_dict - optional. 
+ needed only if the meta tokenizer instruction uses a syntax of lookup from the dictionary + + + """ + scalars_indices = None + scalars_values = None + prev_index_end = -1 + + for tokenizer_name, curr_str_data, curr_placeholder_encoding in zip( + per_meta_tokenizer_data[::2], + per_meta_tokenizer_data[1::2], + per_meta_encoding_including_placeholders, + ): + if tokenizer_name.startswith("SCALARS_"): + if "SCALARS_LITERALS" == tokenizer_name: + curr_str_data = curr_str_data.strip().split(",") + if len(curr_str_data) != len(curr_placeholder_encoding.ids): + raise Exception( + f"should match expected length. Found length {len(curr_str_data)} but placeholders length was {len(curr_placeholder_encoding.ids)}" + ) + curr_data = [float(_) for _ in curr_str_data] + curr_data = torch.tensor(curr_data, dtype=torch.float32) + assert len(curr_data.shape) == 1 + elif "SCALARS_FROM_DICT" == tokenizer_name: + if sample_dict is None: + raise Exception( + "SCALARS_FROM_DICT used but the provided sample_dict is None" + ) + curr_data = sample_dict[curr_str_data] + assert len(curr_data.shape) == 1 + else: + raise Exception( + "Only supported SCALARS_* tokenizers are SCALARS_LITERALS and SCALARS_FROM_DICT" + ) + + curr_indices = torch.arange( + prev_index_end + 1, prev_index_end + 1 + curr_data.shape[0] + ) + scalars_indices = ( + curr_indices + if scalars_indices is None + else torch.concat([scalars_indices, curr_indices]) + ) + scalars_values = ( + curr_data + if scalars_values is None + else torch.concat([scalars_values, curr_data]) + ) + + prev_index_end += curr_data.shape[0] + if tokenizer_name.startswith("VECTORS_"): + raise NotImplementedError + else: + prev_index_end += len(curr_placeholder_encoding.ids) + + return { + "scalars_indices": scalars_indices, + "scalars_values": scalars_values, + } diff --git a/fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py b/fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py index 7dea654e..a6cdbb84 100644 --- a/fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py +++ b/fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py @@ -15,6 +15,7 @@ # import os # import re # import torch +import torch class InjectorTokenizerOp(FastModularTokenizer): @@ -72,7 +73,6 @@ def __init__( def __call__( self, sample_dict: NDict, - embedding_layer_key_in: str, # should point to a torch.nn.Module of an embedding layer key_in: str, key_out_tokenized_object: Optional[str] = None, key_out_tokens_ids: Optional[str] = None, @@ -82,6 +82,8 @@ def __call__( on_unknown: Optional[str] = "warn", verbose: Optional[int] = 1, validate_ends_with_eos: Optional[bool] = None, + key_out_scalars_indices: Optional[str] = None, + key_out_scalars_values: Optional[str] = None, ) -> NDict: """_summary_ @@ -100,6 +102,11 @@ def __call__( verbose (Optional[int], optional): verbosity level. 0: no notification, 1: warning notification, 2: warning with partial data, 3: warning with full data. Defaults to 1. validate_ends_with_eos (Optional[bool], optional): if not None, overrides self._validate_ends_with_eos + key_out_scalars_indices:str optional + if provided, will write to sample_dict in this key a 1D torch tensor with indices of all scalar elements. + key_out_scalars_values:str optional + if provided, will write to sample_dict in this key a 1D torch tensor with indices of all scalar values. 
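+
+            Example (a hedged sketch - the key names and values below are made up for illustration):
+                sample_dict["data.query"] = (
+                    "<@TOKENIZER-TYPE=SCALARS_LITERALS>12.7,3.2"
+                    "<@TOKENIZER-TYPE=AA>KSSCKRIPL"
+                )
+                calling the op with key_in="data.query",
+                key_out_scalars_indices="data.scalars_indices" and
+                key_out_scalars_values="data.scalars_values"
+                writes a 1D int64 tensor with the positions of the two scalar placeholders
+                and a matching 1D float32 tensor with their values (12.7 and 3.2) into sample_dict.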
+ Raises: Exception: _description_ @@ -116,16 +123,22 @@ def __call__( # <@TOKENIZER-TYPE=AA>WAITGTEASCENEGEVLAIPNITDNPCISCVCLNQKAECKQEKCAPLAEDCALVVKQTGACCEKCKG' sample_dict[key_in] = ( "<@TOKENIZER-TYPE=AA>" - + "<@TOKENIZER-TYPE=FLOAT>12.7,3.2,14.8,99,9" + + "<@TOKENIZER-TYPE=SCALARS_LITERALS>12.7,3.2,14.8,99,9" + "<@TOKENIZER-TYPE=AA>KSSCKRIPLYVDFSDVGWNDWIVAPPGYIAMYCIGECPFPLADILNSTNIAIVQTLVNSVNSKIPKACCVPTELSAISMLMLDENEKVVLKNYQDMVVEGCGCR" + + "<@TOKENIZER-TYPE=SCALARS_FROM_DICT>blah.model.banana" + "<@TOKENIZER-TYPE=AA>WLITGTEASCENEGEVLIIPNITDNPCISCVCLNQKAECKQEKCAPLAEDCALVVKQTGACCEKCKG" + "<@TOKENIZER-TYPE=AA>WAITGTEASCENEGEVLAIPNITDNPCISCVCLNQKAECKQEKCAPLAEDCALVVKQTGACCEKCKG" ) + sample_dict["blah.model.banana"] = torch.tensor( + [100.0, 200.0, 300.0], dtype=torch.float32 + ) ( with_placeholders_str, - with_placeholders_per_meta, - ) = InjectorTokenizer.build_placeholder_meta_tokenization(sample_dict[key_in]) + per_meta_orig, + ) = InjectorTokenizer.build_placeholder_meta_tokenization( + sequence=sample_dict[key_in], sample_dict=sample_dict + ) sample_dict[key_in + ".with_placeholders"] = with_placeholders_str super().__call__( @@ -150,6 +163,15 @@ def __call__( # option 2 - call the model embedding layer per individual sample, and also somehow load the model BEFORE the data pipeline (less likely we'll go with this option...) # TODO 2: override per injecting meta tokenizer type (FLOAT and VECTOR) the - print("") + prepared_data = InjectorTokenizer.prepare_info_for_model_step( + per_meta_tokenizer_data=per_meta_orig, + per_meta_encoding_including_placeholders=sample_dict[ + key_in + ".per_meta_part_encoding" + ], + sample_dict=sample_dict, + ) + + sample_dict[key_out_scalars_indices] = prepared_data["scalars_indices"] + sample_dict[key_out_scalars_values] = prepared_data["scalars_values"] return sample_dict From f642a078f94de822bb2605743b8ad4015dbad8a2 Mon Sep 17 00:00:00 2001 From: YoelShoshan Date: Wed, 24 Jul 2024 03:30:44 -0400 Subject: [PATCH 4/5] merge --- .../injectortokenizer/injector_tokenizer.py | 204 ++++++------------ ...with_aug_4272372_samples_balanced_1_1.json | 40 ++++ .../cell_attributes_tokenizer.json | 40 ++++ .../gene_tokenizer.json | 40 ++++ .../t5_tokenizer_AA_special.json | 40 ++++ ...with_aug_4272372_samples_balanced_1_1.json | 40 ++++ .../cell_attributes_tokenizer.json | 40 ++++ .../t5_tokenizer_AA_special.json | 40 ++++ ...with_aug_4272372_samples_balanced_1_1.json | 40 ++++ .../cell_attributes_tokenizer.json | 40 ++++ .../t5_tokenizer_AA_special.json | 40 ++++ .../modulartokenizer/special_tokens.py | 4 + .../tokenizer/ops/injector_tokenizer_ops.py | 62 +++--- 13 files changed, 500 insertions(+), 170 deletions(-) diff --git a/fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py b/fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py index 821363e6..f0b720a9 100644 --- a/fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py +++ b/fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py @@ -35,121 +35,6 @@ class InjectorTokenizer(ModularTokenizer): """ - # def __init__( - # self, - # input_dim: int, - # embedding_layer: torch.nn.Module, - # tokenizers_info: Union[List, omegaconf.listconfig.ListConfig], - # load_adjusted_jsons: Optional[bool] = False, - # special_tokens_dict: Optional[Dict] = None, - # additional_tokens_list: Optional[List] = None, - # max_possible_token_id: Optional[int] = None, - # max_special_token_id: Optional[int] = None, - # **kwargs: Any, - # ) -> None: - # """ - # input_dim: the size of a vector of each 
input. The total output will be [sequence length, input_dim] - - # """ - # self._input_dim = input_dim - # self._embedding_layer = embedding_layer - # self._modular_tokenizer = ModularTokenizer( - # tokenizers_info=tokenizers_info, - # load_adjusted_jsons=load_adjusted_jsons, - # special_tokens_dict=special_tokens_dict, - # additional_tokens_list=additional_tokens_list, - # max_possible_token_id=max_possible_token_id, - # max_special_token_id=max_special_token_id, - # **kwargs, - # ) - - # print("") - - # def encode_list( - # self, - # typed_input_list: List, - # max_len: Optional[int] = None, - # padding_token_id: Optional[int] = None, - # padding_token: Optional[str] = "", - # pad_type_id: Optional[int] = None, - # return_overflow_info: Optional[bool] = False, - # on_unknown: Optional[str] = "warn", - # verbose: int = 1, - # ) -> Union[Encoding, Tuple[Encoding, str]]: - # """_summary_ - - # Args: - # typed_input_list (List): list of collections.namedtuple("input_type", ["input_string", "max_len"]), with - # input type: the name of input type, - # input_string: the string to be encoded - # max_len: maximal length of the encoding (in tokens). Only relevant for truncation, as we do not need to - # pad individual sub-tokenizer encodings - we only pad the final encoding of the ModularTokenizer. - # The smallest value between config-defined and tuple-defined is used. If None, the max_len - # that was defined for the sub-tokenizer in the config is used. - # max_len (Optional[int], optional): _description_. Defaults to None. - # padding_token_id (Optional[str], optional): _description_. Defaults to 0. TODO: default to None and infer it - # padding_token (Optional[str], optional): _description_. Defaults to "". - # pad_type_id (Optional[int], optional): _description_. Defaults to 0. (TODO: raise exception) - # return_overflow_info (Optional[bool], optional): If True return an additional string with overflow information. Defaults to False. - # on_unknown: (Optional[str], optional): What happens if unknown tokens (i.e. ones mapped to ) are encountered: 'raise' or 'warn' - # verbose (Optional[int], optional): verbosity level. 0: no notification, 1: warning notification, 2: warning with partial data, 3: warning - # with full data. Defaults to 1. - # Returns: - # Encoding: _description_ - # """ - - # raise NotImplementedError - - # def encode( - # self, - # sequence: str, - # max_len: Optional[int] = None, - # padding_token_id: Optional[int] = 0, - # padding_token: Optional[str] = "", - # pad_type_id: Optional[int] = 0, - # return_overflow_info: Optional[bool] = False, - # on_unknown: Optional[str] = "warn", - # verbose: Optional[int] = 1, - # ) -> Encoding: - # # (self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True) - # """Receives a user-supplied string that contains, in addition to the text that is to be tokenized, special delimiters signifying the type - # of input within each span of text (e.g. <@TOKENIZER-TYPE=AA> sequence, <@TOKENIZER-TYPE=SMILES>, etc.). These determine the type of tokenizer to use on each span, - # and are not encoded. - # Optionaly, you may also describe maximum length per section, for example: - # "<@TOKENIZER-TYPE=AA>QKPGQAPRLLIYG<@TOKENIZER-TYPE=AA@MAX-LEN=122>SGSDFSDFSFD" - # would not have a local limitation of the first AA section, but will have a local maximum length of 122 on the second section. 
-    #     local in this context means that the maximum length will be imposed on the individual section prior to applying any global "entire sequence" maximum size limitations (if any).
-
-    #     Args:
-    #         input_string (str): _description_
-    #         max_len (Optional[int], optional): _description_. Defaults to None.
-    #         padding_token_id (Optional[str], optional): _description_. Defaults to 0.
-    #         padding_token (Optional[str], optional): _description_. Defaults to "<PAD>".
-    #         pad_type_id (Optional[int], optional): _description_. Defaults to 0.
-    #         return_overflow_info (Optional[bool], optional): _description_. If True return an additional string with overflow information. Defaults to False.
-    #         on_unknown: (Optional[str], optional): What happens if unknown tokens (i.e. ones mapped to <UNK>) are encountered: 'raise' or 'warn'
-    #         verbose (int, optional): verbosity level. 0: no notification, 1: warning notification, 2: warning with partial data, 3: warning
-    #             with full data. Defaults to 1.
-    #     Returns:
-    #         Encoding: _description_
-    #         str: _description_ information on overflow, if return_overflow_info=True
-    #     """
-
-    #     raise NotImplementedError
-
-    # def decode(self, ids: Iterable, skip_special_tokens: Optional[bool] = False) -> str:
-    #     """Receives a list of IDs and returns a string of tokens
-    #     TODO: possibly output also the type of token (AA, SMILES, etc)
-    #     Args:
-    #         ids (Iterable): _description_
-    #         skip_special_tokens (Optional[bool], optional): _description_. Defaults to False.
-
-    #     Returns:
-    #         str: _description_
-    #     """
-
-    #     raise NotImplementedError
-
     @staticmethod
     def build_placeholder_meta_tokenization(
         *,
             if tokenizer_type.startswith("SCALARS_"):
                 with_placeholders.append(
                     "<@TOKENIZER-TYPE=AA>"
-                )  # won't use AA tokens, just an arbitrary one to be able to use a token like <1>
+                )  # won't use AA tokens, just an arbitrary one to be able to use a token like <SCALAR>

-                if tokenizer_type == "SCALARS_LITERALS":
+                if (
+                    tokenizer_type == "SCALARS_LITERALS"
+                ):  # note: masking is only supported in literals (not in "from dict")
                     values = subseq.split(",")
+                    # seq = "<SCALAR>" * len(values)
+                    seq = "".join(
+                        [
+                            "<MASKED_SCALAR>" if x == "<MASK>" else "<SCALAR>"
+                            for x in values
+                        ]
+                    )
                 elif tokenizer_type == "SCALARS_FROM_DICT":
                     if sample_dict is None:
                         raise Exception(
                             "SCALARS_FROM_DICT used but the provided sample_dict is None"
                         )
                     values = sample_dict[subseq]
                     assert len(values.shape) == 1
-                    seq = "<1>" * len(values)  # TODO: put a <SCALAR> token instead
+                    seq = "<SCALAR>" * len(values)
+                else:
+                    raise Exception(f"tokenizer_type={tokenizer_type} is not supported")

+                # elif tokenizer_type == "SCALARS_MASKED":
+                #     values = subseq.split(",")
+                #     assert all([x == '<MASK>' for x in values])  # only <MASK> is currently supported
+                #     seq = "<MASKED_SCALAR>" * len(values)

                 with_placeholders.append(seq)

             elif tokenizer_type.startswith("VECTORS_"):
                 raise Exception("VECTOR_* are not supported yet")
             else:
                 with_placeholders.append("<@TOKENIZER-TYPE=" + tokenizer_type + ">")
                 with_placeholders.append(subseq)

         return "".join(with_placeholders), hints_and_subseq

     @staticmethod
     def prepare_info_for_model_step(
         """
-        scalars_indices = None
-        scalars_values = None
+        scalars_indices = []
+        scalars_values = []
+        scalars_masked_indices = []
         prev_index_end = -1

         for tokenizer_name, curr_str_data, curr_placeholder_encoding in zip(
             per_meta_tokenizer_data[::2],
             per_meta_tokenizer_data[1::2],
             per_meta_encoding_including_placeholders,
         ):
             if tokenizer_name.startswith("SCALARS_"):
                 if "SCALARS_LITERALS" == tokenizer_name:
                     curr_str_data = curr_str_data.strip().split(",")
                     if len(curr_str_data) != len(curr_placeholder_encoding.ids):
                         raise Exception(
                             f"should match expected length. Found length {len(curr_str_data)} but placeholders length was {len(curr_placeholder_encoding.ids)}"
                         )
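+                    # index bookkeeping for the loop below: prev_index_end tracks the last
+                    # absolute position already consumed in the final tokenized sequence, so
+                    # (i + prev_index_end + 1) is the absolute position of the i-th literal.
+                    # masked literals (<MASK>) contribute their position to
+                    # scalars_masked_indices but no value to scalars_values.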
-                    curr_data = [float(_) for _ in curr_str_data]
-                    curr_data = torch.tensor(curr_data, dtype=torch.float32)
-                    assert len(curr_data.shape) == 1
+
+                    curr_indices = []
+                    curr_data = []
+
+                    for i, val in enumerate(curr_str_data):
+                        if val != "<MASK>":
+                            curr_indices.append(i + prev_index_end + 1)
+                            curr_data.append(float(val))
+                        else:
+                            scalars_masked_indices.append(i + prev_index_end + 1)
+
+                    if len(curr_indices) > 0:
+                        curr_indices = torch.tensor(curr_indices, dtype=torch.int64)
+                        curr_data = torch.tensor(curr_data, dtype=torch.float32)
+
+                        scalars_indices.append(curr_indices)
+                        scalars_values.append(curr_data)
+
+                        assert len(curr_data.shape) == 1
+
+                    prev_index_end += len(curr_str_data)
                 elif "SCALARS_FROM_DICT" == tokenizer_name:
                     if sample_dict is None:
                         raise Exception(
                             "SCALARS_FROM_DICT used but the provided sample_dict is None"
                         )
                     curr_data = sample_dict[curr_str_data]
                     assert len(curr_data.shape) == 1
+                    curr_indices = torch.arange(
+                        prev_index_end + 1, prev_index_end + 1 + curr_data.shape[0]
+                    )
+
+                    scalars_indices.append(curr_indices)
+                    scalars_values.append(curr_data)
+
+                    prev_index_end += curr_data.shape[0]
+
                 else:
                     raise Exception(
                         "Only supported SCALARS_* tokenizers are SCALARS_LITERALS and SCALARS_FROM_DICT"
                     )
-                curr_indices = torch.arange(
-                    prev_index_end + 1, prev_index_end + 1 + curr_data.shape[0]
-                )
-                scalars_indices = (
-                    curr_indices
-                    if scalars_indices is None
-                    else torch.concat([scalars_indices, curr_indices])
-                )
-                scalars_values = (
-                    curr_data
-                    if scalars_values is None
-                    else torch.concat([scalars_values, curr_data])
-                )
-
-                prev_index_end += curr_data.shape[0]
-            if tokenizer_name.startswith("VECTORS_"):
+            elif tokenizer_name.startswith("VECTORS_"):
                 raise NotImplementedError
             else:
                 prev_index_end += len(curr_placeholder_encoding.ids)

+        if len(scalars_indices) > 0:
+            scalars_indices = torch.concat(scalars_indices)
+            scalars_values = torch.concat(scalars_values)
+
+        if len(scalars_masked_indices) > 0:
+            scalars_masked_indices = torch.tensor(
+                scalars_masked_indices, dtype=torch.int64
+            )
+        else:
+            scalars_masked_indices = None
+
         return {
             "scalars_indices": scalars_indices,
             "scalars_values": scalars_values,
+            "scalars_masked_indices": scalars_masked_indices,
         }
diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json
index 55ce4bd4..4932902a 100644
--- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json
+++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json
@@ -2747,6 +2747,42 @@
       "rstrip": false,
       "normalized": false,
       "special": true
+    },
+    {
+      "id": 305,
+      "content": "",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 306,
+      "content": "",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 307,
+      "content": "",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 308,
+
"content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true } ], "normalizer": null, @@ -3067,6 +3103,10 @@ "": 302, "": 303, "": 304, + "": 305, + "": 306, + "": 307, + "": 308, "#": 527, "%": 528, "(": 529, diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/cell_attributes_tokenizer.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/cell_attributes_tokenizer.json index e5d2ec64..b1f00797 100644 --- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/cell_attributes_tokenizer.json +++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/cell_attributes_tokenizer.json @@ -2747,6 +2747,42 @@ "rstrip": false, "normalized": false, "special": true + }, + { + "id": 305, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 306, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 307, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 308, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true } ], "normalizer": null, @@ -3073,6 +3109,10 @@ "": 302, "": 303, "": 304, + "": 305, + "": 306, + "": 307, + "": 308, "[CL:0000499]": 3522, "[CL:2000060]": 3523, "[CL:0000235]": 3524, diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/gene_tokenizer.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/gene_tokenizer.json index 6a0ed97b..34977c75 100644 --- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/gene_tokenizer.json +++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/gene_tokenizer.json @@ -2747,6 +2747,42 @@ "rstrip": false, "normalized": false, "special": true + }, + { + "id": 305, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 306, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 307, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 308, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true } ], "normalizer": null, @@ -3073,6 +3109,10 @@ "": 302, "": 303, "": 304, + "": 305, + "": 306, + "": 307, + "": 308, "[100130093]": 5000, "[100133445]": 5001, "[100286793]": 5002, diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/t5_tokenizer_AA_special.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/t5_tokenizer_AA_special.json index ced94e24..9e796ab9 100644 --- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/t5_tokenizer_AA_special.json +++ 
@@ -2747,6 +2747,42 @@
       "rstrip": false,
       "normalized": false,
       "special": true
+    },
+    {
+      "id": 305,
+      "content": "<SCALAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 306,
+      "content": "<VECTOR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 307,
+      "content": "<MASKED_SCALAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 308,
+      "content": "<MASKED_VECTOR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
     }
   ],
   "normalizer": null,
@@ -3073,6 +3109,10 @@
     "<GENERAL_CHAIN>": 302,
     "<CDR3_REGION>": 303,
     "<MUTATED>": 304,
+    "<SCALAR>": 305,
+    "<VECTOR>": 306,
+    "<MASKED_SCALAR>": 307,
+    "<MASKED_VECTOR>": 308,
     "A": 501,
     "B": 502,
     "C": 503,
diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json
index 55ce4bd4..4932902a 100644
--- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json
+++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json
@@ -2747,6 +2747,42 @@
       "rstrip": false,
       "normalized": false,
       "special": true
+    },
+    {
+      "id": 305,
+      "content": "<SCALAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 306,
+      "content": "<VECTOR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 307,
+      "content": "<MASKED_SCALAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 308,
+      "content": "<MASKED_VECTOR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
     }
   ],
   "normalizer": null,
@@ -3067,6 +3103,10 @@
     "<GENERAL_CHAIN>": 302,
     "<CDR3_REGION>": 303,
     "<MUTATED>": 304,
+    "<SCALAR>": 305,
+    "<VECTOR>": 306,
+    "<MASKED_SCALAR>": 307,
+    "<MASKED_VECTOR>": 308,
     "#": 527,
     "%": 528,
     "(": 529,
diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/cell_attributes_tokenizer.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/cell_attributes_tokenizer.json
index e5d2ec64..b1f00797 100644
--- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/cell_attributes_tokenizer.json
+++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/cell_attributes_tokenizer.json
@@ -2747,6 +2747,42 @@
       "rstrip": false,
       "normalized": false,
       "special": true
+    },
+    {
+      "id": 305,
+      "content": "<SCALAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 306,
+      "content": "<VECTOR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 307,
+      "content": "<MASKED_SCALAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 308,
+      "content": "<MASKED_VECTOR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
     }
   ],
   "normalizer": null,
@@ -3073,6 +3109,10 @@
     "<GENERAL_CHAIN>": 302,
     "<CDR3_REGION>": 303,
     "<MUTATED>": 304,
+    "<SCALAR>": 305,
+    "<VECTOR>": 306,
+    "<MASKED_SCALAR>": 307,
+    "<MASKED_VECTOR>": 308,
     "[CL:0000499]": 3522,
     "[CL:2000060]": 3523,
     "[CL:0000235]": 3524,
"rstrip": false, + "normalized": false, + "special": true } ], "normalizer": null, @@ -3073,6 +3109,10 @@ "": 302, "": 303, "": 304, + "": 305, + "": 306, + "": 307, + "": 308, "[CL:0000499]": 3522, "[CL:2000060]": 3523, "[CL:0000235]": 3524, diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/t5_tokenizer_AA_special.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/t5_tokenizer_AA_special.json index ced94e24..9e796ab9 100644 --- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/t5_tokenizer_AA_special.json +++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/t5_tokenizer_AA_special.json @@ -2747,6 +2747,42 @@ "rstrip": false, "normalized": false, "special": true + }, + { + "id": 305, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 306, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 307, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 308, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true } ], "normalizer": null, @@ -3073,6 +3109,10 @@ "": 302, "": 303, "": 304, + "": 305, + "": 306, + "": 307, + "": 308, "A": 501, "B": 502, "C": 503, diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json index 55ce4bd4..4932902a 100644 --- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json +++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json @@ -2747,6 +2747,42 @@ "rstrip": false, "normalized": false, "special": true + }, + { + "id": 305, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 306, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 307, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 308, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true } ], "normalizer": null, @@ -3067,6 +3103,10 @@ "": 302, "": 303, "": 304, + "": 305, + "": 306, + "": 307, + "": 308, "#": 527, "%": 528, "(": 529, diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/cell_attributes_tokenizer.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/cell_attributes_tokenizer.json index e5d2ec64..b1f00797 100644 --- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/cell_attributes_tokenizer.json +++ 
diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/cell_attributes_tokenizer.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/cell_attributes_tokenizer.json
index e5d2ec64..b1f00797 100644
--- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/cell_attributes_tokenizer.json
+++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/cell_attributes_tokenizer.json
@@ -2747,6 +2747,42 @@
       "rstrip": false,
       "normalized": false,
       "special": true
+    },
+    {
+      "id": 305,
+      "content": "<SCALAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 306,
+      "content": "<VECTOR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 307,
+      "content": "<MASKED_SCALAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 308,
+      "content": "<MASKED_VECTOR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
     }
   ],
   "normalizer": null,
@@ -3073,6 +3109,10 @@
     "<GENERAL_CHAIN>": 302,
     "<CDR3_REGION>": 303,
     "<MUTATED>": 304,
+    "<SCALAR>": 305,
+    "<VECTOR>": 306,
+    "<MASKED_SCALAR>": 307,
+    "<MASKED_VECTOR>": 308,
     "[CL:0000499]": 3522,
     "[CL:2000060]": 3523,
     "[CL:0000235]": 3524,
diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/t5_tokenizer_AA_special.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/t5_tokenizer_AA_special.json
index ced94e24..9e796ab9 100644
--- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/t5_tokenizer_AA_special.json
+++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/t5_tokenizer_AA_special.json
@@ -2747,6 +2747,42 @@
       "rstrip": false,
       "normalized": false,
       "special": true
+    },
+    {
+      "id": 305,
+      "content": "<SCALAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 306,
+      "content": "<VECTOR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 307,
+      "content": "<MASKED_SCALAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 308,
+      "content": "<MASKED_VECTOR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
     }
   ],
   "normalizer": null,
@@ -3073,6 +3109,10 @@
     "<GENERAL_CHAIN>": 302,
     "<CDR3_REGION>": 303,
     "<MUTATED>": 304,
+    "<SCALAR>": 305,
+    "<VECTOR>": 306,
+    "<MASKED_SCALAR>": 307,
+    "<MASKED_VECTOR>": 308,
     "A": 501,
     "B": 502,
     "C": 503,
diff --git a/fusedrug/data/tokenizer/modulartokenizer/special_tokens.py b/fusedrug/data/tokenizer/modulartokenizer/special_tokens.py
index 5ef9159a..61bd9b94 100644
--- a/fusedrug/data/tokenizer/modulartokenizer/special_tokens.py
+++ b/fusedrug/data/tokenizer/modulartokenizer/special_tokens.py
@@ -326,6 +326,10 @@
     "GENERAL_CHAIN",
     "CDR3_REGION",
     "MUTATED",
+    "SCALAR",
+    "VECTOR",
+    "MASKED_SCALAR",
+    "MASKED_VECTOR",
 ]
 
 AA_tokens = [
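The four names added above are materialized as angle-bracketed special tokens in the tokenizer JSON files patched earlier. A quick sanity-check sketch, assuming (as the vocab hunks suggest) that they were assigned ids 305-308 in order:

from tokenizers import Tokenizer

# Load one of the patched files; any of the ten updated JSONs should behave the same.
t = Tokenizer.from_file(
    "fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/"
    "bmfm_modular_tokenizer/t5_tokenizer_AA_special.json"
)
for tok in ("<SCALAR>", "<VECTOR>", "<MASKED_SCALAR>", "<MASKED_VECTOR>"):
    print(tok, t.token_to_id(tok))  # expected ids: 305, 306, 307, 308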
diff --git a/fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py b/fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py
index a6cdbb84..22b4df2f 100644
--- a/fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py
+++ b/fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py
@@ -15,7 +15,6 @@
 # import os
 # import re
 # import torch
-import torch
 
 
 class InjectorTokenizerOp(FastModularTokenizer):
@@ -84,6 +83,7 @@ def __call__(
         validate_ends_with_eos: Optional[bool] = None,
         key_out_scalars_indices: Optional[str] = None,
         key_out_scalars_values: Optional[str] = None,
+        key_out_masked_scalars_indices: Optional[str] = None,
     ) -> NDict:
         """_summary_
 
@@ -102,10 +102,11 @@ def __call__(
             verbose (Optional[int], optional): verbosity level.
                 0: no notification, 1: warning notification, 2: warning with partial data, 3: warning with full data. Defaults to 1.
             validate_ends_with_eos (Optional[bool], optional): if not None, overrides self._validate_ends_with_eos
-            key_out_scalars_indices:str optional
-                if provided, will write to sample_dict in this key a 1D torch tensor with indices of all scalar elements.
-            key_out_scalars_values:str optional
-                if provided, will write to sample_dict in this key a 1D torch tensor with indices of all scalar values.
+            key_out_scalars_indices: str, optional
+                if provided, will write to sample_dict under this key a 1D torch tensor with the indices of all input scalar elements.
+            key_out_scalars_values: str, optional
+                if provided, will write to sample_dict under this key a 1D torch tensor with the values of all input scalar elements.
+            key_out_masked_scalars_indices: str, optional
+                if provided, will write to sample_dict under this key a 1D torch tensor with the indices of all masked scalar elements.
 
         Raises:
@@ -116,23 +117,6 @@ def __call__(
             NDict: _description_
         """
 
-        print("FOR DEBUGGING! REMOVE !!!!!!")
-        # orig: '<@TOKENIZER-TYPE=AA>
-        # <@TOKENIZER-TYPE=AA>KSSCKRIPLYVDFSDVGWNDWIVAPPGYIAMYCIGECPFPLADILNSTNIAIVQTLVNSVNSKIPKACCVPTELSAISMLMLDENEKVVLKNYQDMVVEGCGCR
-        # <@TOKENIZER-TYPE=AA>WLITGTEASCENEGEVLIIPNITDNPCISCVCLNQKAECKQEKCAPLAEDCALVVKQTGACCEKCKG
-        # <@TOKENIZER-TYPE=AA>WAITGTEASCENEGEVLAIPNITDNPCISCVCLNQKAECKQEKCAPLAEDCALVVKQTGACCEKCKG'
-        sample_dict[key_in] = (
-            "<@TOKENIZER-TYPE=AA>"
-            + "<@TOKENIZER-TYPE=SCALARS_LITERALS>12.7,3.2,14.8,99,9"
-            + "<@TOKENIZER-TYPE=AA>KSSCKRIPLYVDFSDVGWNDWIVAPPGYIAMYCIGECPFPLADILNSTNIAIVQTLVNSVNSKIPKACCVPTELSAISMLMLDENEKVVLKNYQDMVVEGCGCR"
-            + "<@TOKENIZER-TYPE=SCALARS_FROM_DICT>blah.model.banana"
-            + "<@TOKENIZER-TYPE=AA>WLITGTEASCENEGEVLIIPNITDNPCISCVCLNQKAECKQEKCAPLAEDCALVVKQTGACCEKCKG"
-            + "<@TOKENIZER-TYPE=AA>WAITGTEASCENEGEVLAIPNITDNPCISCVCLNQKAECKQEKCAPLAEDCALVVKQTGACCEKCKG"
-        )
-        sample_dict["blah.model.banana"] = torch.tensor(
-            [100.0, 200.0, 300.0], dtype=torch.float32
-        )
-
         (
             with_placeholders_str,
             per_meta_orig,
@@ -153,13 +137,6 @@ def __call__(
             key_out_encoding=key_in
             + ".per_meta_part_encoding",  # using the key_in as base for the name because key_out_* are optional
         )
 
-        # TODO 1: call embedding layer on all tokens to get a [sequence_length, model_dim] matrix. Make sure that gradients are allowed to flow to it when needed
-        #    what is the best way to provide the model embedding layer here? the data-pipeline seems to be created BEFORE the model is constructed
-        #    if we want to call the model with the entire minibatch, then we can go with option 1:
-        #    option 1 - only prepare data towards that, and actually run the last part of the logic inside the *_step in pl_module
-        #    option 2 - call the model embedding layer per individual sample, and also somehow load the model BEFORE the data pipeline (less likely we'll go with this option...)
-
-        # TODO 2: override per injecting meta tokenizer type (FLOAT and VECTOR) the
-
         prepared_data = InjectorTokenizer.prepare_info_for_model_step(
             per_meta_tokenizer_data=per_meta_orig,
             per_meta_encoding_including_placeholders=sample_dict[
                 key_in + ".per_meta_part_encoding"
             ],
             sample_dict=sample_dict,
         )
 
-        sample_dict[key_out_scalars_indices] = prepared_data["scalars_indices"]
-        sample_dict[key_out_scalars_values] = prepared_data["scalars_values"]
+        if key_out_scalars_indices is not None:
+            sample_dict[key_out_scalars_indices] = prepared_data["scalars_indices"]
+        else:
+            if prepared_data["scalars_indices"] is not None:
+                raise Exception(
+                    "non-None scalars_indices found but no key_out_scalars_indices was provided"
+                )
+
+        if key_out_scalars_values is not None:
+            sample_dict[key_out_scalars_values] = prepared_data["scalars_values"]
+        else:
+            if prepared_data["scalars_values"] is not None:
+                raise Exception(
+                    "non-None scalars_values found but no key_out_scalars_values was provided"
+                )
+
+        if key_out_masked_scalars_indices is not None:
+            sample_dict[key_out_masked_scalars_indices] = prepared_data[
+                "scalars_masked_indices"
+            ]
+        else:
+            if prepared_data["scalars_masked_indices"] is not None:
+                raise Exception(
+                    "non-None scalars_masked_indices found but no key_out_masked_scalars_indices was provided"
+                )
 
         return sample_dict
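To make the data flow concrete: downstream, a model step can consume the three optional key_out_* outputs to build input embeddings, as the removed TODO intended. A minimal sketch with hypothetical shapes; overwriting one embedding channel stands in for whatever learned projection the model actually uses:

import torch

# Hypothetical per-sample outputs written by the op (see key_out_* above).
scalars_indices = torch.tensor([2, 5], dtype=torch.int64)        # positions of provided scalars
scalars_values = torch.tensor([0.3, 12.4], dtype=torch.float32)  # their values
scalars_masked_indices = torch.tensor([6], dtype=torch.int64)    # positions of masked scalars

seq_len, d_model = 8, 16
inputs_emb = torch.zeros(seq_len, d_model)  # stand-in for the embedding layer's output

# Inject the provided scalar values at their positions; masked positions are left
# untouched so the model has to predict them.
inputs_emb[scalars_indices, 0] = scalars_values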
From ab1672271538b21e2d770c88d7c00192c5c9804e Mon Sep 17 00:00:00 2001
From: YoelShoshan
Date: Fri, 26 Jul 2024 02:20:28 -0400
Subject: [PATCH 5/5] PR comments

---
 .../injectortokenizer/injector_tokenizer.py   | 56 +++++++++----------
 .../tokenizer/ops/injector_tokenizer_ops.py   | 40 ++++++-------
 2 files changed, 48 insertions(+), 48 deletions(-)

diff --git a/fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py b/fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py
index f0b720a9..6088b15f 100644
--- a/fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py
+++ b/fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py
@@ -1,4 +1,3 @@
-from fusedrug.data.tokenizer.modulartokenizer.modular_tokenizer import ModularTokenizer
 from typing import Optional, List, Tuple, Dict
 from tokenizers import Encoding
 import torch
@@ -6,32 +5,38 @@
 from fuse.utils import NDict
 
 
-class InjectorTokenizer(ModularTokenizer):
+class InjectorTokenizerHelpers:
     """
-    InjectorTokenizer builds on top of ModularTokenizer.
+    !!!!
+    Note - this file contains only a few utility (static) functions for InjectorTokenizerOp;
+    as a user, you are not expected to use it directly. Instead, use
+    fusedrug.data.tokenizer.ops.injector_tokenizer_ops.InjectorTokenizerOp
+    !!!!
 
-    Its purpose is to extend beyond "standard" input tokens as integers as input for a model.
-    Instead, it provides control on *vectors* that are to be used as input for a model.
+    applies an injector tokenizer
 
-    Example use cases:
-        1. Providing scalars (floating point) as inputs
-        2. Providing vectors of embeddings - for example of a protein embedding
+    an injector tokenizer builds on top of a modular tokenizer.
+    its purpose is to build inputs_emb for the model (instead of input_ids),
+    which allows supporting more advanced inputs beyond token ids, like:
+    * scalar inputs
+    * embedding vectors within a single input
 
-    Each input "token" becomes a tensor of a defined size, and is built of:
-    1. Header
-        made of 4 floats
-        [
-            0.0 or 1.0 #is this a sentinel/mask or not
-            0.0 or 1.0 #is this a standard vocabulary token
-            0.0 or 1.0 #is this a scalar
-            0.0 or 1.0 #is this a full injected vector (e.g. an embedding)
-        ]
-    2. Content
-        the rest of each input vector is made of input_dim-4 float elements.
+    supported syntax/format:
 
-    Note - in the "standard vocabulary token" - we support providing an external embeding layer (like in vanilla T5),
-    as it's part of the trained weights.
+    the text following <@TOKENIZER-TYPE=SCALARS_LITERALS> is expected to use the following format:
+        ',' separated float values and/or <MASK> tokens -
+        for example: "2.7,3.99,-12.9" or "<MASK>" or "2.19,<MASK>,3.19,<MASK>"
+
+    the text following <@TOKENIZER-TYPE=SCALARS_FROM_DICT> is expected to be a key into the sample NDict
+        for example: "blah.boo.banana" or "data.input.encoder_input"
+        note: with SCALARS_FROM_DICT you can't describe masked scalars (outputs); you can only describe inputs
+
+    example usage:
+
+    encoder_input:
+    <@TOKENIZER-TYPE=AA><@TOKENIZER-TYPE=SCALARS_LITERALS>0.3<@TOKENIZER-TYPE=AA><@TOKENIZER-TYPE=SCALARS_LITERALS><MASK><@TOKENIZER-TYPE=AA>ISGGDAIYSSTGRCSLGFNVRSGSTYYFLTAGICTDGATTWWANSARTTVLGTTSGSSFPNNDYGIVRYTNTTIPKDGTVGGQDITSAANATVGMAVTRRGSTTGTISGSVTALNATVNYGGGDVVYGMIRTNVCAEPGDSGGPLYSGTRAIGLTSGGSGNCSSGGTTFFQPVTEALVAYGVSVY
+    labels:
+    <@TOKENIZER-TYPE=AA><@TOKENIZER-TYPE=SCALARS_LITERALS>0.3<@TOKENIZER-TYPE=AA><@TOKENIZER-TYPE=SCALARS_LITERALS>12.4<@TOKENIZER-TYPE=AA>ISGGDAIYSSTGRCSLGFNVRSGSTYYFLTAGICTDGATTWWANSARTTVLGTTSGSSFPNNDYGIVRYTNTTIPKDGTVGGQDITSAANATVGMAVTRRGSTTGTISGSVTALNATVNYGGGDVVYGMIRTNVCAEPGDSGGPLYSGTRAIGLTSGGSGNCSSGGTTFFQPVTEALVAYGVSVY
     """
 
@@ -90,11 +95,6 @@ def build_placeholder_meta_tokenization(
             else:
                 raise Exception(f"tokenizer_type={tokenizer_type} is not supported")
 
-            # elif tokenizer_type == "SCALARS_MASKED":
-            #     values = subseq.split(",")
-            #     assert all([x == "<MASK>" for x in values])  # only <MASK> is currently supported
-            #     seq = "<MASK>" * len(values)
-
             with_placeholders.append(seq)
 
         elif tokenizer_type.startswith("VECTORS_"):
@@ -201,7 +201,7 @@ def prepare_info_for_model_step(
             scalars_masked_indices = None
 
         return {
-            "scalars_indices": scalars_indices,
-            "scalars_values": scalars_values,
-            "scalars_masked_indices": scalars_masked_indices,
+            "scalars_indices": scalars_indices,  # 1d - its length equals the number of actual (provided) scalars found
+            "scalars_values": scalars_values,  # 1d - the values of the provided scalars
+            "scalars_masked_indices": scalars_masked_indices,  # 1d - the indices of masked scalars
         }
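Following the documented syntax, a sample for this pipeline can be assembled as below; a minimal sketch in which the NDict keys, the short AA fragments, and the tensor contents are all hypothetical:

import torch
from fuse.utils import NDict

# A query mixing AA text, literal scalars (one masked), and dict-sourced scalars.
sample = NDict()
sample["data.query"] = (
    "<@TOKENIZER-TYPE=SCALARS_LITERALS>0.3,<MASK>"
    "<@TOKENIZER-TYPE=AA>KSSCKRIPLY"
    "<@TOKENIZER-TYPE=SCALARS_FROM_DICT>data.precomputed_scalars"
    "<@TOKENIZER-TYPE=AA>WLITGTEASC"
)
# SCALARS_FROM_DICT points at this 1D float tensor (inputs only - no masking allowed here).
sample["data.precomputed_scalars"] = torch.tensor([100.0, 200.0], dtype=torch.float32)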
diff --git a/fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py b/fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py
index 22b4df2f..aa28cef1 100644
--- a/fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py
+++ b/fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py
@@ -1,21 +1,13 @@
 from fuse.utils import NDict
 
-# from fuse.data import OpBase, get_sample_id
 from fusedrug.data.tokenizer.injectortokenizer.injector_tokenizer import (
-    InjectorTokenizer,
+    InjectorTokenizerHelpers,
 )
 
-# from fusedrug.data.tokenizer.modulartokenizer.modular_tokenizer import ModularTokenizer
 from fusedrug.data.tokenizer.ops import FastModularTokenizer
 
-# from warnings import warn
-# from collections import defaultdict
 from typing import Optional, Union, Any
 
-# import os
-# import re
-# import torch
 
 class InjectorTokenizerOp(FastModularTokenizer):
     """
@@ -26,16 +18,31 @@ class InjectorTokenizerOp(FastModularTokenizer):
     applies an injector tokenizer
 
     an injector tokenizer builds on top of a modular tokenizer.
     its purpose is to build inputs_emb for the model (instead of input_ids),
     which allows supporting more advanced inputs beyond token ids, like:
     * scalar inputs
     * embedding vectors within a single input
+
+    supported syntax/format:
+
+    the text following <@TOKENIZER-TYPE=SCALARS_LITERALS> is expected to use the following format:
+        ',' separated float values and/or <MASK> tokens -
+        for example: "2.7,3.99,-12.9" or "<MASK>" or "2.19,<MASK>,3.19,<MASK>"
+
+    the text following <@TOKENIZER-TYPE=SCALARS_FROM_DICT> is expected to be a key into the sample NDict
+        for example: "blah.boo.banana" or "data.input.encoder_input"
+        note: with SCALARS_FROM_DICT you can't describe masked scalars (outputs); you can only describe inputs
+
+    example usage:
+
+    encoder_input:
+    <@TOKENIZER-TYPE=AA><@TOKENIZER-TYPE=SCALARS_LITERALS>0.3<@TOKENIZER-TYPE=AA><@TOKENIZER-TYPE=SCALARS_LITERALS><MASK><@TOKENIZER-TYPE=AA>ISGGDAIYSSTGRCSLGFNVRSGSTYYFLTAGICTDGATTWWANSARTTVLGTTSGSSFPNNDYGIVRYTNTTIPKDGTVGGQDITSAANATVGMAVTRRGSTTGTISGSVTALNATVNYGGGDVVYGMIRTNVCAEPGDSGGPLYSGTRAIGLTSGGSGNCSSGGTTFFQPVTEALVAYGVSVY
+    labels:
+    <@TOKENIZER-TYPE=AA><@TOKENIZER-TYPE=SCALARS_LITERALS>0.3<@TOKENIZER-TYPE=AA><@TOKENIZER-TYPE=SCALARS_LITERALS>12.4<@TOKENIZER-TYPE=AA>ISGGDAIYSSTGRCSLGFNVRSGSTYYFLTAGICTDGATTWWANSARTTVLGTTSGSSFPNNDYGIVRYTNTTIPKDGTVGGQDITSAANATVGMAVTRRGSTTGTISGSVTALNATVNYGGGDVVYGMIRTNVCAEPGDSGGPLYSGTRAIGLTSGGSGNCSSGGTTFFQPVTEALVAYGVSVY
     """
 
     def __init__(
         self,
-        input_dim: int,
         tokenizer_path: str,
         max_size: Union[int, None] = None,
         pad_token: Union[str, None] = None,
@@ -75,8 +82,6 @@ def __init__(
             **kwargs,
         )
 
-        self._input_dim = input_dim
-
     def __call__(
         self,
         sample_dict: NDict,
@@ -107,12 +112,6 @@ def __call__(
             key_out_scalars_values: str, optional
                 if provided, will write to sample_dict under this key a 1D torch tensor with the values of all input scalar elements.
             key_out_masked_scalars_indices: str, optional
                 if provided, will write to sample_dict under this key a 1D torch tensor with the indices of all masked scalar elements.
 
-
-        Raises:
-            Exception: _description_
-            Exception: _description_
-
         Returns:
             NDict: _description_
         """
 
@@ -120,7 +119,7 @@ def __call__(
         (
             with_placeholders_str,
             per_meta_orig,
-        ) = InjectorTokenizer.build_placeholder_meta_tokenization(
+        ) = InjectorTokenizerHelpers.build_placeholder_meta_tokenization(
             sequence=sample_dict[key_in], sample_dict=sample_dict
         )
         sample_dict[key_in + ".with_placeholders"] = with_placeholders_str
@@ -140,7 +139,7 @@ def __call__(
             + ".per_meta_part_encoding",  # using the key_in as base for the name because key_out_* are optional
         )
 
-        prepared_data = InjectorTokenizer.prepare_info_for_model_step(
+        prepared_data = InjectorTokenizerHelpers.prepare_info_for_model_step(
             per_meta_tokenizer_data=per_meta_orig,
             per_meta_encoding_including_placeholders=sample_dict[
                 key_in + ".per_meta_part_encoding"