Scalars support #132
@@ -0,0 +1,207 @@
from typing import Optional, List, Tuple, Dict
from tokenizers import Encoding
import torch
import re
from fuse.utils import NDict

class InjectorTokenizerHelpers:
    """
    InjectorTokenizer builds on top of ModularTokenizer.

    !!!!
    Note - this file contains only a few utility (static) functions for InjectorTokenizerOp.
    As a user, you are not expected to use InjectorTokenizer directly; instead, use
    fusedrug.data.tokenizer.ops.injector_tokenizer_ops.InjectorTokenizerOp
    !!!!

    Applies an injector tokenizer.

    An injector tokenizer builds on top of a modular tokenizer.
    Its purpose is to build inputs_emb for the model (instead of input_ids).
    This allows supporting more advanced inputs beyond token ids, such as:
    * scalar inputs
    * embedding vectors within a single input

    Supported syntax/format:

    Text following <@TOKENIZER-TYPE=SCALARS_LITERALS> supports the following format:
        ',' separated float values and/or <MASK> tokens -
        for example: "2.7,3.99,-12.9" or "<MASK><MASK>" or "2.19,<MASK>,3.19,<MASK>"

    Text following <@TOKENIZER-TYPE=SCALARS_FROM_DICT> is expected to be a key into the sample NDict,
        for example: "blah.boo.banana" or "data.input.encoder_input"
        Note: with SCALARS_FROM_DICT you can't describe masked scalars (outputs); you can only describe inputs.

    Example usage:

    encoder_input:
    <@TOKENIZER-TYPE=AA><MOLECULAR_WEIGHT_IN_SOME_UNIT><@TOKENIZER-TYPE=SCALARS_LITERALS>0.3<@TOKENIZER-TYPE=AA><BINDING_AFFINITY_NANOMOLAR><@TOKENIZER-TYPE=SCALARS_LITERALS><MASK><@TOKENIZER-TYPE=AA><SEQUENCE_NATURAL_START>ISGGDAIYSSTGRCSLGFNVRSGSTYYFLTAGICTDGATTWWANSARTTVLGTTSGSSFPNNDYGIVRYTNTTIPKDGTVGGQDITSAANATVGMAVTRRGSTTGTISGSVTALNATVNYGGGDVVYGMIRTNVCAEPGDSGGPLYSGTRAIGLTSGGSGNCSSGGTTFFQPVTEALVAYGVSVY<SEQUENCE_NATURAL_END>

    labels:
    <@TOKENIZER-TYPE=AA><MOLECULAR_WEIGHT_IN_SOME_UNIT><@TOKENIZER-TYPE=SCALARS_LITERALS>0.3<@TOKENIZER-TYPE=AA><BINDING_AFFINITY_NANOMOLAR><@TOKENIZER-TYPE=SCALARS_LITERALS>12.4<@TOKENIZER-TYPE=AA><SEQUENCE_NATURAL_START>ISGGDAIYSSTGRCSLGFNVRSGSTYYFLTAGICTDGATTWWANSARTTVLGTTSGSSFPNNDYGIVRYTNTTIPKDGTVGGQDITSAANATVGMAVTRRGSTTGTISGSVTALNATVNYGGGDVVYGMIRTNVCAEPGDSGGPLYSGTRAIGLTSGGSGNCSSGGTTFFQPVTEALVAYGVSVY<SEQUENCE_NATURAL_END>
    """

    @staticmethod
    def build_placeholder_meta_tokenization(
        *,
        sequence: str,
        sample_dict: Optional[NDict] = None,
    ) -> Tuple[str, List[str]]:
        """
        In order to avoid modifying and rewriting the logic in the modular tokenizer (especially regarding padding and
        the max-length limitation of certain sub-parts), we put placeholders to make sure that the total size is
        known/fixed and respects the meta instructions given to the modular tokenizer.

        Returns: a tuple with 2 elements
        (
            a single string with the full query containing placeholder tokens for FLOAT and VECTOR meta tokenizer parts,
            a list of [meta-tokenizer name, data, meta-tokenizer name, data, meta-tokenizer name, data, ...]
        )
        """
        hints_and_subseq = re.split("<@TOKENIZER-TYPE=([^>]*)>", sequence)[
            1:
        ]  # the first element is blank - removing it
        assert (
            len(hints_and_subseq) > 0 and len(hints_and_subseq) % 2 == 0
        ), f"Error: expecting leading modular tokenizer hints followed by a sequence to tokenize, got {sequence}"
        with_placeholders = []

        for tokenizer_type, subseq in zip(
            hints_and_subseq[::2], hints_and_subseq[1::2]
        ):
            if tokenizer_type.startswith("SCALARS_"):
                with_placeholders.append(
                    "<@TOKENIZER-TYPE=AA>"
                )  # won't use AA tokens, just an arbitrary tokenizer that allows using a token like <SCALAR>

                if (
                    tokenizer_type == "SCALARS_LITERALS"
                ):  # note: masking is only supported in literals (not in "from dict")
                    values = subseq.split(",")
                    seq = "".join(
                        [
                            "<MASKED_SCALAR>" if x == "<MASK>" else "<SCALAR>"
                            for x in values
                        ]
                    )
                elif tokenizer_type == "SCALARS_FROM_DICT":
                    if sample_dict is None:
                        raise Exception(
                            "SCALARS_FROM_DICT used but the provided sample_dict is None"
                        )
                    values = sample_dict[subseq]
                    assert len(values.shape) == 1
                    seq = "<SCALAR>" * len(values)
                else:
                    raise Exception(f"tokenizer_type={tokenizer_type} is not supported")

                with_placeholders.append(seq)

            elif tokenizer_type.startswith("VECTORS_"):
                raise Exception("VECTOR_* are not supported yet")
            else:
                with_placeholders.append("<@TOKENIZER-TYPE=" + tokenizer_type + ">")
[Review comment] You might mistakenly drop here the max length per element.

[Author reply] I don't think so:
tell me if you still think I'm missing something here.
                with_placeholders.append(subseq)

        return "".join(with_placeholders), hints_and_subseq

    @staticmethod
    def prepare_info_for_model_step(
        *,
        per_meta_tokenizer_data: List[str],
        per_meta_encoding_including_placeholders: List[Encoding],
        sample_dict: Optional[NDict] = None,
    ) -> Dict:
        """
        Since we:
        1. need to use the model embedding layer (allowing gradients to flow if needed), and
        2. prefer not to use the model during the data pipeline,

        this function prepares everything so that during train/val/test_step we'll be able to do what's needed before the forward pass.

        Args:
            per_meta_tokenizer_data: a list of [meta-tokenizer name, data, meta-tokenizer name, data, meta-tokenizer name, data, ...]
            per_meta_encoding_including_placeholders: a list of Encoding elements. This is used to extract the final number of tokens per tokenizer (after all of the padding and cropping logic was already applied)
            sample_dict: a fuse sample_dict - optional.
                Needed only if the meta tokenizer instruction uses the lookup-from-dictionary syntax.
        """
        scalars_indices = []
        scalars_values = []
        scalars_masked_indices = []
        prev_index_end = -1

        for tokenizer_name, curr_str_data, curr_placeholder_encoding in zip(
            per_meta_tokenizer_data[::2],
            per_meta_tokenizer_data[1::2],
            per_meta_encoding_including_placeholders,
        ):
            if tokenizer_name.startswith("SCALARS_"):
                if "SCALARS_LITERALS" == tokenizer_name:
                    curr_str_data = curr_str_data.strip().split(",")
                    if len(curr_str_data) != len(curr_placeholder_encoding.ids):
                        raise Exception(
                            f"should match expected length. Found length {len(curr_str_data)} but placeholders length was {len(curr_placeholder_encoding.ids)}"
                        )

                    curr_indices = []
                    curr_data = []

                    for i, val in enumerate(curr_str_data):
                        if val != "<MASK>":
                            curr_indices.append(i + prev_index_end + 1)
                            curr_data.append(float(val))
                        else:
                            scalars_masked_indices.append(i + prev_index_end + 1)
[Review comment] So this is a running index of the scalars, and an index that aligns it to the encoder_input?

[Author reply] Yes, this collects all of the indices (at the level of final tokens) of masked scalars, across the entire sequence.
                    if len(curr_indices) > 0:
                        # indices as int64, values as float32
                        curr_indices = torch.tensor(curr_indices, dtype=torch.int64)
                        curr_data = torch.tensor(curr_data, dtype=torch.float32)

                        scalars_indices.append(curr_indices)
                        scalars_values.append(curr_data)

                        assert len(curr_data.shape) == 1

                    # keep the running token index aligned: advance past this literals segment,
                    # counting masked and unmasked scalars alike
                    prev_index_end += len(curr_str_data)
elif "SCALARS_FROM_DICT" == tokenizer_name: | ||
if sample_dict is None: | ||
raise Exception( | ||
"SCALARS_FROM_DICT used but the provided sample_dict is None" | ||
) | ||
curr_data = sample_dict[curr_str_data] | ||
assert len(curr_data.shape) == 1 | ||
curr_indices = torch.arange( | ||
prev_index_end + 1, prev_index_end + 1 + curr_data.shape[0] | ||
) | ||
|
||
scalars_indices.append(curr_indices) | ||
scalars_values.append(curr_data) | ||
|
||
prev_index_end += curr_data.shape[0] | ||
|
||
else: | ||
raise Exception( | ||
"Only supported SCALARS_* tokenizers are SCALARS_LITERALS and SCALARS_FROM_DICT" | ||
) | ||
|
||
elif tokenizer_name.startswith("VECTORS_"): | ||
raise NotImplementedError | ||
else: | ||
prev_index_end += len(curr_placeholder_encoding.ids) | ||
|
||
if len(scalars_indices) > 0: | ||
scalars_indices = torch.concat(scalars_indices) | ||
scalars_values = torch.concat(scalars_values) | ||
|
||
if len(scalars_masked_indices) > 0: | ||
scalars_masked_indices = torch.tensor( | ||
scalars_masked_indices, dtype=torch.int64 | ||
) | ||
else: | ||
scalars_masked_indices = None | ||
|
||
return { | ||
"scalars_indices": scalars_indices, # 1d - its length is the number of actual scalars (provided) found | ||
"scalars_values": scalars_values, # 1d - values of provided scalars | ||
"scalars_masked_indices": scalars_masked_indices, # 1d - indices of masked scalars | ||
} |
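To make the placeholder substitution concrete, here is a minimal standalone sketch of the idea behind build_placeholder_meta_tokenization. It mirrors the SCALARS_LITERALS handling shown above, but uses only the standard library so it runs in isolation; the name sketch_placeholder_meta_tokenization is hypothetical and this is not the fusedrug API itself.

import re

def sketch_placeholder_meta_tokenization(sequence: str) -> str:
    # split into alternating [tokenizer hint, sub-sequence, tokenizer hint, sub-sequence, ...]
    hints_and_subseq = re.split("<@TOKENIZER-TYPE=([^>]*)>", sequence)[1:]
    assert len(hints_and_subseq) % 2 == 0

    out = []
    for tokenizer_type, subseq in zip(hints_and_subseq[::2], hints_and_subseq[1::2]):
        if tokenizer_type == "SCALARS_LITERALS":
            # each literal (or <MASK>) becomes exactly one placeholder token,
            # so the modular tokenizer sees a fixed, known number of tokens
            out.append("<@TOKENIZER-TYPE=AA>")
            out.append(
                "".join(
                    "<MASKED_SCALAR>" if x == "<MASK>" else "<SCALAR>"
                    for x in subseq.split(",")
                )
            )
        else:
            out.append(f"<@TOKENIZER-TYPE={tokenizer_type}>{subseq}")
    return "".join(out)

print(sketch_placeholder_meta_tokenization(
    "<@TOKENIZER-TYPE=AA><BINDING_AFFINITY_NANOMOLAR>"
    "<@TOKENIZER-TYPE=SCALARS_LITERALS>2.19,<MASK>,3.19"
    "<@TOKENIZER-TYPE=AA><SEQUENCE_NATURAL_START>ISGG<SEQUENCE_NATURAL_END>"
))
# -> <@TOKENIZER-TYPE=AA><BINDING_AFFINITY_NANOMOLAR><@TOKENIZER-TYPE=AA><SCALAR><MASKED_SCALAR><SCALAR><@TOKENIZER-TYPE=AA><SEQUENCE_NATURAL_START>ISGG<SEQUENCE_NATURAL_END>

Because every literal (or <MASK>) maps to exactly one placeholder token, the total length is known up front and the modular tokenizer can apply its usual padding and max-length logic without knowing anything about scalars.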
@@ -1006,7 +1006,13 @@ def encode_list(
         return_overflow_info: Optional[bool] = False,
         on_unknown: Optional[str] = "warn",
         verbose: int = 1,
-    ) -> Union[Encoding, Tuple[Encoding, str]]:
+        also_return_split: bool = False,
+    ) -> Union[
+        Encoding,
+        Tuple[Encoding, str],
+        Tuple[Encoding, List[Encoding]],
+        Tuple[Encoding, str, List[Encoding]],
+    ]:
         """_summary_
 
         Args:
@@ -1025,6 +1031,7 @@ def encode_list(
             on_unknown: (Optional[str], optional): What happens if unknown tokens (i.e. ones mapped to <UNK>) are encountered: 'raise' or 'warn'
             verbose (Optional[int], optional): verbosity level. 0: no notification, 1: warning notification, 2: warning with partial data, 3: warning
                 with full data. Defaults to 1.
+            also_return_split: defaults to False. If set to True, the return value will also contain a list with one Encoding element per meta-tokenizer instruction
[Review comment] Should it be set to True if we want scalar support? Or is it just for debug?

[Author reply] You don't call this directly; injector_tokenizer_op does it automatically for you. If we only get the final merged Encoding we can't understand:
the only way we can do that externally is by effectively duplicating the entire logic of the modular tokenizer, including the actual tokenization, padding and cropping, which is both code duplication and would also be slower. If this isn't completely clear yet, let's talk.
 
         Returns:
             Encoding: _description_
         """
@@ -1150,9 +1157,15 @@ def encode_list(
                 f"Unexpected on_unknown value {on_unknown}. Should be 'warn' or 'raise'"
             )
 
-        if return_overflow_info:
-            return merged_encoding, overflow_info
-        return merged_encoding
+        if (not return_overflow_info) and (not also_return_split):
+            return merged_encoding
+        ans = [merged_encoding]
+        if return_overflow_info:
+            ans += [overflow_info]
+        if also_return_split:
+            ans += [encoded_list]
+
+        return tuple(ans)
 
     def decode(self, ids: Iterable, skip_special_tokens: Optional[bool] = False) -> str:
         """Receives a list of IDs and returns a string of tokens
@@ -1190,6 +1203,7 @@ def encode(
         return_overflow_info: Optional[bool] = False,
         on_unknown: Optional[str] = "warn",
         verbose: Optional[int] = 1,
+        also_return_split: bool = False,
     ) -> Encoding:
         # (self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True)
         """Receives a user-supplied string that contains, in addition to the text that is to be tokenized, special delimiters signifying the type
@@ -1210,6 +1224,7 @@ def encode(
             on_unknown: (Optional[str], optional): What happens if unknown tokens (i.e. ones mapped to <UNK>) are encountered: 'raise' or 'warn'
             verbose (int, optional): verbosity level. 0: no notification, 1: warning notification, 2: warning with partial data, 3: warning
                 with full data. Defaults to 1.
+            also_return_split: also return the per-meta-instruction encoded parts as a list of Encoding elements
         Returns:
             Encoding: _description_
             str: _description_ information on overflow, if return_overflow_info=True
@@ -1251,6 +1266,7 @@ def encode(
             return_overflow_info=return_overflow_info,
             on_unknown=on_unknown,
             verbose=verbose,
+            also_return_split=also_return_split,
         )
 
     def get_tokenizer_types(self) -> List:
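As a reference for the return shapes introduced by also_return_split, here is a small standalone sketch of the conditional packing used at the end of encode_list. The function pack_encode_result and its string arguments are hypothetical stand-ins (real calls return tokenizers.Encoding objects); it only illustrates which tuple elements to expect for each flag combination.

from typing import Any, List, Tuple, Union

def pack_encode_result(
    merged_encoding: Any,
    overflow_info: str,
    encoded_list: List[Any],
    return_overflow_info: bool = False,
    also_return_split: bool = False,
) -> Union[Any, Tuple]:
    # mirrors the new encode_list return logic: a bare encoding in the simple case,
    # otherwise a tuple whose extra elements depend on the requested flags
    if (not return_overflow_info) and (not also_return_split):
        return merged_encoding
    ans = [merged_encoding]
    if return_overflow_info:
        ans += [overflow_info]
    if also_return_split:
        ans += [encoded_list]
    return tuple(ans)

# example: asking for both extras yields (merged, overflow_info, per-part list)
print(pack_encode_result("merged", "no overflow", ["part_a", "part_b"],
                         return_overflow_info=True, also_return_split=True))

A caller that sets also_return_split=True therefore receives the per-meta-instruction Encoding list as the last tuple element, which is what InjectorTokenizerOp relies on to know how many final tokens each scalar segment ended up with after padding and cropping.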
[Review comment] So we should write "," and not "?

[Author reply] Yes, the scalars tokenizer requires that you split them with ','.
If you have an alternative you prefer, do suggest it.
I will add some description of the expected format to the injector files' docstrings.

[Author reply] Added docstrings with a format description for both injector_tokenizer.py and injector_tokenizer_op.
Also renamed InjectorTokenizer to InjectorTokenizerHelpers and stopped inheriting from ModularTokenizer in it, because that was misleading - it's just 2 static helper methods.

[Author reply] This is part of the docstrings I've added.
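To make the ','-separated format discussed above concrete, here is a minimal standalone sketch of how a SCALARS_LITERALS string can be parsed into provided values, their token indices, and the indices of masked scalars. It mirrors the literals handling in prepare_info_for_model_step; the helper name parse_scalars_literals is hypothetical and not part of the library.

import torch

def parse_scalars_literals(literals: str, prev_index_end: int = -1):
    # ','-separated floats and/or <MASK> tokens, e.g. "2.19,<MASK>,3.19,<MASK>"
    values, indices, masked_indices = [], [], []
    for i, val in enumerate(literals.strip().split(",")):
        if val == "<MASK>":
            masked_indices.append(i + prev_index_end + 1)  # position of a masked (to-be-predicted) scalar
        else:
            indices.append(i + prev_index_end + 1)  # position of a provided scalar
            values.append(float(val))
    return (
        torch.tensor(indices, dtype=torch.int64),
        torch.tensor(values, dtype=torch.float32),
        torch.tensor(masked_indices, dtype=torch.int64),
    )

idx, vals, masked = parse_scalars_literals("2.19,<MASK>,3.19,<MASK>")
print(idx)     # tensor([0, 2])
print(vals)    # tensor([2.1900, 3.1900])
print(masked)  # tensor([1, 3])

Masked scalars are simply <MASK> entries in the same comma-separated list as the provided values, so their positions are recovered from the same enumeration.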