From 2460cb7a85ee21da0529bfe43d164abb2270da44 Mon Sep 17 00:00:00 2001
From: YoelShoshan
Date: Fri, 19 Jul 2024 17:42:33 -0400
Subject: [PATCH 1/5] adding injector tokenizer to support scalars and vectors
 injection

---
 .../injectortokenizer/injector_tokenizer.py        | 192 ++++++++++++++++++
 fusedrug/data/tokenizer/ops/__init__.py            |   1 +
 .../tokenizer/ops/injector_tokenizer_ops.py        | 144 +++++++++++++
 3 files changed, 337 insertions(+)
 create mode 100644 fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py
 create mode 100644 fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py

diff --git a/fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py b/fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py
new file mode 100644
index 00000000..accf5b45
--- /dev/null
+++ b/fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py
@@ -0,0 +1,192 @@
+from fusedrug.data.tokenizer.modulartokenizer.modular_tokenizer import ModularTokenizer
+from typing import Dict
+from typing import Optional, List, Union, Tuple, Any
+from tokenizers import Encoding
+import omegaconf
+import torch
+from collections.abc import Iterable
+import re
+
+
+class InjectorTokenizer(ModularTokenizer):
+    """
+    InjectorTokenizer builds on top of ModularTokenizer.
+
+    Its purpose is to extend model inputs beyond "standard" integer tokens.
+    Instead, it provides control over the *vectors* that are used as input to a model.
+
+    Example use cases:
+    1. Providing scalars (floating point) as inputs
+    2. Providing vectors of embeddings - for example of a protein embedding
+
+    Each input "token" becomes a tensor of a defined size, and is built of:
+    1. Header
+        made of 4 floats
+        [
+            0.0 or 1.0  # is this a sentinel/mask or not
+            0.0 or 1.0  # is this a standard vocabulary token
+            0.0 or 1.0  # is this a scalar
+            0.0 or 1.0  # is this a full injected vector (e.g. an embedding)
+        ]
+    2. Content
+        the rest of each input vector is made of input_dim-4 float elements.
+
+    Note - for "standard vocabulary tokens" we support providing an external embedding layer
+    (like in vanilla T5), as it is part of the trained weights.
+
+    """
+
+    def __init__(
+        self,
+        input_dim: int,
+        embedding_layer: torch.nn.Module,
+        tokenizers_info: Union[List, omegaconf.listconfig.ListConfig],
+        load_adjusted_jsons: Optional[bool] = False,
+        special_tokens_dict: Optional[Dict] = None,
+        additional_tokens_list: Optional[List] = None,
+        max_possible_token_id: Optional[int] = None,
+        max_special_token_id: Optional[int] = None,
+        **kwargs: Any,
+    ) -> None:
+        """
+        input_dim: the size of the vector of each input.
The total output will be [sequence length, input_dim] + + + """ + self._input_dim = input_dim + self._embedding_layer = embedding_layer + self._modular_tokenizer = ModularTokenizer( + tokenizers_info=tokenizers_info, + load_adjusted_jsons=load_adjusted_jsons, + special_tokens_dict=special_tokens_dict, + additional_tokens_list=additional_tokens_list, + max_possible_token_id=max_possible_token_id, + max_special_token_id=max_special_token_id, + **kwargs, + ) + + print("") + + def encode_list( + self, + typed_input_list: List, + max_len: Optional[int] = None, + padding_token_id: Optional[int] = None, + padding_token: Optional[str] = "", + pad_type_id: Optional[int] = None, + return_overflow_info: Optional[bool] = False, + on_unknown: Optional[str] = "warn", + verbose: int = 1, + ) -> Union[Encoding, Tuple[Encoding, str]]: + """_summary_ + + Args: + typed_input_list (List): list of collections.namedtuple("input_type", ["input_string", "max_len"]), with + input type: the name of input type, + input_string: the string to be encoded + max_len: maximal length of the encoding (in tokens). Only relevant for truncation, as we do not need to + pad individual sub-tokenizer encodings - we only pad the final encoding of the ModularTokenizer. + The smallest value between config-defined and tuple-defined is used. If None, the max_len + that was defined for the sub-tokenizer in the config is used. + max_len (Optional[int], optional): _description_. Defaults to None. + padding_token_id (Optional[str], optional): _description_. Defaults to 0. TODO: default to None and infer it + padding_token (Optional[str], optional): _description_. Defaults to "". + pad_type_id (Optional[int], optional): _description_. Defaults to 0. (TODO: raise exception) + return_overflow_info (Optional[bool], optional): If True return an additional string with overflow information. Defaults to False. + on_unknown: (Optional[str], optional): What happens if unknown tokens (i.e. ones mapped to ) are encountered: 'raise' or 'warn' + verbose (Optional[int], optional): verbosity level. 0: no notification, 1: warning notification, 2: warning with partial data, 3: warning + with full data. Defaults to 1. + Returns: + Encoding: _description_ + """ + + raise NotImplementedError + + def encode( + self, + sequence: str, + max_len: Optional[int] = None, + padding_token_id: Optional[int] = 0, + padding_token: Optional[str] = "", + pad_type_id: Optional[int] = 0, + return_overflow_info: Optional[bool] = False, + on_unknown: Optional[str] = "warn", + verbose: Optional[int] = 1, + ) -> Encoding: + # (self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True) + """Receives a user-supplied string that contains, in addition to the text that is to be tokenized, special delimiters signifying the type + of input within each span of text (e.g. <@TOKENIZER-TYPE=AA> sequence, <@TOKENIZER-TYPE=SMILES>, etc.). These determine the type of tokenizer to use on each span, + and are not encoded. + Optionaly, you may also describe maximum length per section, for example: + "<@TOKENIZER-TYPE=AA>QKPGQAPRLLIYG<@TOKENIZER-TYPE=AA@MAX-LEN=122>SGSDFSDFSFD" + would not have a local limitation of the first AA section, but will have a local maximum length of 122 on the second section. + local in this context means that the maximum length will be imposed on the individual section prior to applying any global "entire sequence" maximum size limitations (if any). + + Args: + input_string (str): _description_ + max_len (Optional[int], optional): _description_. 
Defaults to None. + padding_token_id (Optional[str], optional): _description_. Defaults to 0. + padding_token (Optional[str], optional): _description_. Defaults to "". + pad_type_id (Optional[int], optional): _description_. Defaults to 0. + return_overflow_info (Optional[bool], optional): _description_. If True return an additional string with overflow information. Defaults to False. + on_unknown: (Optional[str], optional): What happens if unknown tokens (i.e. ones mapped to ) are encountered: 'raise' or 'warn' + verbose (int, optional): verbosity level. 0: no notification, 1: warning notification, 2: warning with partial data, 3: warning + with full data. Defaults to 1. + Returns: + Encoding: _description_ + str: _description_ information on overflow, if return_overflow_info=True + """ + + raise NotImplementedError + + def decode(self, ids: Iterable, skip_special_tokens: Optional[bool] = False) -> str: + """Receives a list of IDs and returns a string of tokens + TODO: possibly output also the type of token (AA, SMILES, etc) + Args: + ids (Iterable): _description_ + skip_special_tokens (Optional[bool], optional): _description_. Defaults to False. + + Returns: + str: _description_ + """ + + raise NotImplementedError + + @staticmethod + def build_placeholder_meta_tokenization(sequence: str) -> Tuple[str, List[str]]: + """ + In order to avoid modifying and rewriting the logic in modular tokenizer, especially regarding padding, limitation of max length of certain sub-parts, + we put placeholders to make sure that the total size is known/fixed and respects the meta instructions to the modular tokenizer + """ + hints_and_subseq = re.split("<@TOKENIZER-TYPE=([^>]*)>", sequence)[ + 1: + ] # the first element is blank - removing it + assert ( + len(hints_and_subseq) > 0 and len(hints_and_subseq) % 2 == 0 + ), f"Error: expecting leading modular tokenizer hints followed by a sequence to tokenize, got {sequence}" + + with_placeholders = [] + + for tokenizer_type, subseq in zip( + hints_and_subseq[::2], hints_and_subseq[1::2] + ): + if tokenizer_type == "FLOAT": + with_placeholders.append( + "<@TOKENIZER-TYPE=AA>" + ) # won't use AA tokens, just an arbitrary one to be able to use a token like <1> + values = subseq.split(",") + seq = "<1>" * len(values) + with_placeholders.append(seq) + elif tokenizer_type == "VECTOR": + with_placeholders.append( + "<@TOKENIZER-TYPE=AA>" + ) # won't use AA tokens, just an arbitrary one to be able to use a token like <1> + values = subseq.split("@") + seq = "<1>" * len(values) + with_placeholders.append(seq) + else: + with_placeholders.append(tokenizer_type) + with_placeholders.append(subseq) + + return "".join(with_placeholders), with_placeholders diff --git a/fusedrug/data/tokenizer/ops/__init__.py b/fusedrug/data/tokenizer/ops/__init__.py index d7f4adfd..3bd6c89f 100644 --- a/fusedrug/data/tokenizer/ops/__init__.py +++ b/fusedrug/data/tokenizer/ops/__init__.py @@ -1,5 +1,6 @@ from .fast_tokenizer_ops import FastTokenizer from .modular_tokenizer_ops import FastModularTokenizer +from .injector_tokenizer_ops import InjectorTokenizerOp try: from .pytoda_tokenizer import Op_pytoda_SMILESTokenizer, Op_pytoda_ProteinTokenizer diff --git a/fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py b/fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py new file mode 100644 index 00000000..c5222609 --- /dev/null +++ b/fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py @@ -0,0 +1,144 @@ +from fuse.utils import NDict + +# from fuse.data import OpBase, get_sample_id +from 
fusedrug.data.tokenizer.injectortokenizer.injector_tokenizer import (
+    InjectorTokenizer,
+)
+
+# from fusedrug.data.tokenizer.modulartokenizer.modular_tokenizer import ModularTokenizer
+from fusedrug.data.tokenizer.ops import FastModularTokenizer
+
+# from warnings import warn
+# from collections import defaultdict
+from typing import Optional, Union, Any
+
+# import os
+# import re
+# import torch
+
+
+class InjectorTokenizerOp(FastModularTokenizer):
+    """
+    Applies an injector tokenizer.
+
+    The injector tokenizer builds on top of the modular tokenizer.
+    Its purpose is to build inputs_emb for the model (instead of input_ids).
+    This allows supporting more advanced inputs beyond token ids, like:
+    * scalar inputs
+    * embedding vectors within a single input
+    """
+
+    def __init__(
+        self,
+        input_dim: int,
+        tokenizer_path: str,
+        max_size: Union[int, None] = None,
+        pad_token: Union[str, None] = None,
+        pad_type_id: Union[int, None] = None,
+        validate_ends_with_eos: Optional[bool] = True,
+        eos: Optional[str] = "<EOS>",
+        verbose: Optional[bool] = False,
+        **kwargs: Any,
+    ) -> None:
+        """
+
+        Args:
+            tokenizer_path: full path to a directory that the tokenizer will be loaded from
+            max_size: sequences below this size will be padded, and above this size will be truncated
+            pad_token: a string of the pad token
+            pad_type_id: see tokenizers.Tokenizer.enable_padding() docstring
+            validate_ends_with_eos: during an encode request (a __call__ to the op) make sure that it ends with the provided eos token, and raise an exception otherwise.
+                having an eos (end of sentence) token at the end is useful in multiple scenarios, for example in a generative transformer (like a T5 encoder-decoder)
+            verbose:
+        """
+        if verbose:
+            print(
+                f"DEBUG:InjectorTokenizerOp __init__ called for path {tokenizer_path}"
+            )
+
+        super().__init__(
+            tokenizer_path=tokenizer_path,
+            max_size=max_size,
+            pad_token=pad_token,
+            pad_type_id=pad_type_id,
+            validate_ends_with_eos=validate_ends_with_eos,
+            eos=eos,
+            verbose=verbose,
+            **kwargs,
+        )
+
+        self._input_dim = input_dim
+
+    def __call__(
+        self,
+        sample_dict: NDict,
+        embedding_layer_key_in: str,  # should point to a torch.nn.Module of an embedding layer
+        key_in: str,
+        key_out_tokenized_object: Optional[str] = None,
+        key_out_tokens_ids: Optional[str] = None,
+        key_out_attention_mask: Optional[str] = None,
+        convert_attention_mask_to_bool: Optional[bool] = True,
+        max_seq_len: Optional[int] = None,
+        on_unknown: Optional[str] = "warn",
+        verbose: Optional[int] = 1,
+        validate_ends_with_eos: Optional[bool] = None,
+    ) -> NDict:
+        """_summary_
+
+        Args:
+            sample_dict (NDict): _description_
+            key_in (str): key to either a:
+                (1) string that contains, in addition to the text that is to be tokenized, special delimiters signifying the type
+                    of input within each span of text (e.g. <@TOKENIZER-TYPE=AA> sequence, <@TOKENIZER-TYPE=SMILES>, etc.).
+                (2) list of modular_tokenizer.TypedInput specifying the tokenizer type and the subsequence to tokenize
+            key_out_tokenized_object (Optional[str], optional): _description_. Defaults to None.
+            key_out_tokens_ids (Optional[str], optional): _description_. Defaults to None.
+            key_out_attention_mask (Optional[str], optional): _description_. Defaults to None.
+            convert_attention_mask_to_bool (Optional[bool], optional): _description_. Defaults to True.
+            max_seq_len (Optional[int], optional): set maximum sequence len dynamically, used for both padding and truncation. Defaults to None.
+ on_unknown (Optional[str], optional): What happens if unknown tokens (i.e. ones mapped to ) are encountered: 'raise' or 'warn'. Defaults to "warn". + verbose (Optional[int], optional): verbosity level. 0: no notification, 1: warning notification, 2: warning with partial data, 3: warning + with full data. Defaults to 1. + validate_ends_with_eos (Optional[bool], optional): if not None, overrides self._validate_ends_with_eos + + Raises: + Exception: _description_ + Exception: _description_ + + Returns: + NDict: _description_ + """ + + print("FOR DEBUGGING! REMOVE !!!!!!") + # orig: '<@TOKENIZER-TYPE=AA> + # <@TOKENIZER-TYPE=AA>KSSCKRIPLYVDFSDVGWNDWIVAPPGYIAMYCIGECPFPLADILNSTNIAIVQTLVNSVNSKIPKACCVPTELSAISMLMLDENEKVVLKNYQDMVVEGCGCR + # <@TOKENIZER-TYPE=AA>WLITGTEASCENEGEVLIIPNITDNPCISCVCLNQKAECKQEKCAPLAEDCALVVKQTGACCEKCKG + # <@TOKENIZER-TYPE=AA>WAITGTEASCENEGEVLAIPNITDNPCISCVCLNQKAECKQEKCAPLAEDCALVVKQTGACCEKCKG' + sample_dict[key_in] = ( + "<@TOKENIZER-TYPE=AA>" + + "<@TOKENIZER-TYPE=FLOAT>12.7,3.2,14.8,99,9" + + "<@TOKENIZER-TYPE=AA>KSSCKRIPLYVDFSDVGWNDWIVAPPGYIAMYCIGECPFPLADILNSTNIAIVQTLVNSVNSKIPKACCVPTELSAISMLMLDENEKVVLKNYQDMVVEGCGCR" + + "<@TOKENIZER-TYPE=AA>WLITGTEASCENEGEVLIIPNITDNPCISCVCLNQKAECKQEKCAPLAEDCALVVKQTGACCEKCKG" + + "<@TOKENIZER-TYPE=AA>WAITGTEASCENEGEVLAIPNITDNPCISCVCLNQKAECKQEKCAPLAEDCALVVKQTGACCEKCKG" + ) + + ( + with_placeholders_str, + with_placeholders_per_meta, + ) = InjectorTokenizer.build_placeholder_meta_tokenization(sample_dict[key_in]) + sample_dict[key_in + "@with_placeholders"] = with_placeholders_str + + super().__call__( + sample_dict=sample_dict, + key_in=key_in + "@with_placeholders", + key_out_tokenized_object=key_out_tokenized_object, + key_out_tokens_ids=key_out_tokens_ids, + key_out_attention_mask=key_out_attention_mask, + convert_attention_mask_to_bool=convert_attention_mask_to_bool, + max_seq_len=max_seq_len, + on_unknown=on_unknown, + verbose=verbose, + validate_ends_with_eos=validate_ends_with_eos, + ) + + print("") From 3df904c0894f497109413dedf509eff5febce6bc Mon Sep 17 00:00:00 2001 From: YoelShoshan Date: Sat, 20 Jul 2024 16:09:22 -0400 Subject: [PATCH 2/5] FLOAT and VECTOR meta tokenizers support --- .../injectortokenizer/injector_tokenizer.py | 2 +- .../modulartokenizer/modular_tokenizer.py | 22 ++++++++++++++++--- .../tokenizer/ops/injector_tokenizer_ops.py | 15 +++++++++++-- .../tokenizer/ops/modular_tokenizer_ops.py | 14 ++++++++++-- 4 files changed, 45 insertions(+), 8 deletions(-) diff --git a/fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py b/fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py index accf5b45..df781cca 100644 --- a/fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py +++ b/fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py @@ -186,7 +186,7 @@ def build_placeholder_meta_tokenization(sequence: str) -> Tuple[str, List[str]]: seq = "<1>" * len(values) with_placeholders.append(seq) else: - with_placeholders.append(tokenizer_type) + with_placeholders.append("<@TOKENIZER-TYPE=" + tokenizer_type + ">") with_placeholders.append(subseq) return "".join(with_placeholders), with_placeholders diff --git a/fusedrug/data/tokenizer/modulartokenizer/modular_tokenizer.py b/fusedrug/data/tokenizer/modulartokenizer/modular_tokenizer.py index 6a929d8e..01d40b97 100644 --- a/fusedrug/data/tokenizer/modulartokenizer/modular_tokenizer.py +++ b/fusedrug/data/tokenizer/modulartokenizer/modular_tokenizer.py @@ -1006,7 +1006,13 @@ def encode_list( return_overflow_info: Optional[bool] = False, 
on_unknown: Optional[str] = "warn", verbose: int = 1, - ) -> Union[Encoding, Tuple[Encoding, str]]: + also_return_split: bool = False, + ) -> Union[ + Encoding, + Tuple[Encoding, str], + Tuple[Encoding, List[Encoding]], + Tuple[Encoding, str, List[Encoding]], + ]: """_summary_ Args: @@ -1025,6 +1031,7 @@ def encode_list( on_unknown: (Optional[str], optional): What happens if unknown tokens (i.e. ones mapped to ) are encountered: 'raise' or 'warn' verbose (Optional[int], optional): verbosity level. 0: no notification, 1: warning notification, 2: warning with partial data, 3: warning with full data. Defaults to 1. + also_return_split: defaults to False. If set to True, the return value will also contain a list that contains per meta-tokenizer-instruction element of Encoding Returns: Encoding: _description_ """ @@ -1150,9 +1157,15 @@ def encode_list( f"Unexpected on_unknown value {on_unknown}. Should be 'warn' or 'raise'" ) + if (not return_overflow_info) and (not also_return_split): + return merged_encoding + ans = [merged_encoding] if return_overflow_info: - return merged_encoding, overflow_info - return merged_encoding + ans += [overflow_info] + if also_return_split: + ans += [encoded_list] + + return tuple(ans) def decode(self, ids: Iterable, skip_special_tokens: Optional[bool] = False) -> str: """Receives a list of IDs and returns a string of tokens @@ -1190,6 +1203,7 @@ def encode( return_overflow_info: Optional[bool] = False, on_unknown: Optional[str] = "warn", verbose: Optional[int] = 1, + also_return_split: bool = False, ) -> Encoding: # (self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True) """Receives a user-supplied string that contains, in addition to the text that is to be tokenized, special delimiters signifying the type @@ -1210,6 +1224,7 @@ def encode( on_unknown: (Optional[str], optional): What happens if unknown tokens (i.e. ones mapped to ) are encountered: 'raise' or 'warn' verbose (int, optional): verbosity level. 0: no notification, 1: warning notification, 2: warning with partial data, 3: warning with full data. Defaults to 1. 
+ also_return_split: also return the per-meta-instruction encoded parts as a list of Encoding elements Returns: Encoding: _description_ str: _description_ information on overflow, if return_overflow_info=True @@ -1251,6 +1266,7 @@ def encode( return_overflow_info=return_overflow_info, on_unknown=on_unknown, verbose=verbose, + also_return_split=also_return_split, ) def get_tokenizer_types(self) -> List: diff --git a/fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py b/fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py index c5222609..7dea654e 100644 --- a/fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py +++ b/fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py @@ -126,11 +126,11 @@ def __call__( with_placeholders_str, with_placeholders_per_meta, ) = InjectorTokenizer.build_placeholder_meta_tokenization(sample_dict[key_in]) - sample_dict[key_in + "@with_placeholders"] = with_placeholders_str + sample_dict[key_in + ".with_placeholders"] = with_placeholders_str super().__call__( sample_dict=sample_dict, - key_in=key_in + "@with_placeholders", + key_in=key_in + ".with_placeholders", key_out_tokenized_object=key_out_tokenized_object, key_out_tokens_ids=key_out_tokens_ids, key_out_attention_mask=key_out_attention_mask, @@ -139,6 +139,17 @@ def __call__( on_unknown=on_unknown, verbose=verbose, validate_ends_with_eos=validate_ends_with_eos, + key_out_encoding_per_meta=key_in + + ".per_meta_part_encoding", # using the key_in as base for the name because key_out_* are optional ) + # TODO 1: call embedding layer on all tokens to get a [sequence_length, model_dim] matrix. Make sure that gradients are allowed to flow to it when needed + # what is the best way to provide the model embedding layer here? the data-pipeline seems to be created BEFORE the model is constructed + # if we want to call the model with the entire minibatch, then we can go with option 1: + # option 1 - only prepare data towards that, and actually run the last part of the logic inside the *_step in pl_module + # option 2 - call the model embedding layer per individual sample, and also somehow load the model BEFORE the data pipeline (less likely we'll go with this option...) + # TODO 2: override per injecting meta tokenizer type (FLOAT and VECTOR) the + print("") + + return sample_dict diff --git a/fusedrug/data/tokenizer/ops/modular_tokenizer_ops.py b/fusedrug/data/tokenizer/ops/modular_tokenizer_ops.py index ffdf4b0f..ca2134dc 100644 --- a/fusedrug/data/tokenizer/ops/modular_tokenizer_ops.py +++ b/fusedrug/data/tokenizer/ops/modular_tokenizer_ops.py @@ -193,6 +193,7 @@ def __call__( on_unknown: Optional[str] = "warn", verbose: Optional[int] = 1, validate_ends_with_eos: Optional[bool] = None, + key_out_encoding_per_meta: Optional[str] = None, ) -> NDict: """_summary_ @@ -211,6 +212,7 @@ def __call__( verbose (Optional[int], optional): verbosity level. 0: no notification, 1: warning notification, 2: warning with partial data, 3: warning with full data. Defaults to 1. validate_ends_with_eos (Optional[bool], optional): if not None, overrides self._validate_ends_with_eos + key_out_encoding_per_meta: optional key out. 
If set to a string will put in it the per-meta-instruction encoded parts as a list of Encoding elements Raises: Exception: _description_ @@ -240,22 +242,30 @@ def __call__( ) if isinstance(data, str): - encoded, overflow_info = self._tokenizer.encode( + _ans = self._tokenizer.encode( data, max_len=max_seq_len, return_overflow_info=True, on_unknown=on_unknown, verbose=verbose, + also_return_split=key_out_encoding_per_meta is not None, ) else: - encoded, overflow_info = self._tokenizer.encode_list( + _ans = self._tokenizer.encode_list( data, max_len=max_seq_len, return_overflow_info=True, on_unknown=on_unknown, verbose=verbose, + also_return_split=key_out_encoding_per_meta is not None, ) + if key_out_encoding_per_meta is not None: + encoded, overflow_info, per_meta_encoded = _ans + sample_dict[key_out_encoding_per_meta] = per_meta_encoded + else: + encoded, overflow_info = _ans + expected_max_len = self.get_max_len(override_max_len=max_seq_len) if ( expected_max_len is not None From 44419b308c8754e770b8729629b1f4018fc6974f Mon Sep 17 00:00:00 2001 From: YoelShoshan Date: Sun, 21 Jul 2024 15:47:10 -0400 Subject: [PATCH 3/5] scalars support --- .../injectortokenizer/injector_tokenizer.py | 331 +++++++++++------- .../tokenizer/ops/injector_tokenizer_ops.py | 32 +- 2 files changed, 237 insertions(+), 126 deletions(-) diff --git a/fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py b/fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py index df781cca..821363e6 100644 --- a/fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py +++ b/fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py @@ -1,11 +1,9 @@ from fusedrug.data.tokenizer.modulartokenizer.modular_tokenizer import ModularTokenizer -from typing import Dict -from typing import Optional, List, Union, Tuple, Any +from typing import Optional, List, Tuple, Dict from tokenizers import Encoding -import omegaconf import torch -from collections.abc import Iterable import re +from fuse.utils import NDict class InjectorTokenizer(ModularTokenizer): @@ -37,127 +35,136 @@ class InjectorTokenizer(ModularTokenizer): """ - def __init__( - self, - input_dim: int, - embedding_layer: torch.nn.Module, - tokenizers_info: Union[List, omegaconf.listconfig.ListConfig], - load_adjusted_jsons: Optional[bool] = False, - special_tokens_dict: Optional[Dict] = None, - additional_tokens_list: Optional[List] = None, - max_possible_token_id: Optional[int] = None, - max_special_token_id: Optional[int] = None, - **kwargs: Any, - ) -> None: - """ - input_dim: the size of a vector of each input. The total output will be [sequence length, input_dim] + # def __init__( + # self, + # input_dim: int, + # embedding_layer: torch.nn.Module, + # tokenizers_info: Union[List, omegaconf.listconfig.ListConfig], + # load_adjusted_jsons: Optional[bool] = False, + # special_tokens_dict: Optional[Dict] = None, + # additional_tokens_list: Optional[List] = None, + # max_possible_token_id: Optional[int] = None, + # max_special_token_id: Optional[int] = None, + # **kwargs: Any, + # ) -> None: + # """ + # input_dim: the size of a vector of each input. 
The total output will be [sequence length, input_dim] + # """ + # self._input_dim = input_dim + # self._embedding_layer = embedding_layer + # self._modular_tokenizer = ModularTokenizer( + # tokenizers_info=tokenizers_info, + # load_adjusted_jsons=load_adjusted_jsons, + # special_tokens_dict=special_tokens_dict, + # additional_tokens_list=additional_tokens_list, + # max_possible_token_id=max_possible_token_id, + # max_special_token_id=max_special_token_id, + # **kwargs, + # ) - """ - self._input_dim = input_dim - self._embedding_layer = embedding_layer - self._modular_tokenizer = ModularTokenizer( - tokenizers_info=tokenizers_info, - load_adjusted_jsons=load_adjusted_jsons, - special_tokens_dict=special_tokens_dict, - additional_tokens_list=additional_tokens_list, - max_possible_token_id=max_possible_token_id, - max_special_token_id=max_special_token_id, - **kwargs, - ) - - print("") - - def encode_list( - self, - typed_input_list: List, - max_len: Optional[int] = None, - padding_token_id: Optional[int] = None, - padding_token: Optional[str] = "", - pad_type_id: Optional[int] = None, - return_overflow_info: Optional[bool] = False, - on_unknown: Optional[str] = "warn", - verbose: int = 1, - ) -> Union[Encoding, Tuple[Encoding, str]]: - """_summary_ + # print("") - Args: - typed_input_list (List): list of collections.namedtuple("input_type", ["input_string", "max_len"]), with - input type: the name of input type, - input_string: the string to be encoded - max_len: maximal length of the encoding (in tokens). Only relevant for truncation, as we do not need to - pad individual sub-tokenizer encodings - we only pad the final encoding of the ModularTokenizer. - The smallest value between config-defined and tuple-defined is used. If None, the max_len - that was defined for the sub-tokenizer in the config is used. - max_len (Optional[int], optional): _description_. Defaults to None. - padding_token_id (Optional[str], optional): _description_. Defaults to 0. TODO: default to None and infer it - padding_token (Optional[str], optional): _description_. Defaults to "". - pad_type_id (Optional[int], optional): _description_. Defaults to 0. (TODO: raise exception) - return_overflow_info (Optional[bool], optional): If True return an additional string with overflow information. Defaults to False. - on_unknown: (Optional[str], optional): What happens if unknown tokens (i.e. ones mapped to ) are encountered: 'raise' or 'warn' - verbose (Optional[int], optional): verbosity level. 0: no notification, 1: warning notification, 2: warning with partial data, 3: warning - with full data. Defaults to 1. - Returns: - Encoding: _description_ - """ + # def encode_list( + # self, + # typed_input_list: List, + # max_len: Optional[int] = None, + # padding_token_id: Optional[int] = None, + # padding_token: Optional[str] = "", + # pad_type_id: Optional[int] = None, + # return_overflow_info: Optional[bool] = False, + # on_unknown: Optional[str] = "warn", + # verbose: int = 1, + # ) -> Union[Encoding, Tuple[Encoding, str]]: + # """_summary_ - raise NotImplementedError + # Args: + # typed_input_list (List): list of collections.namedtuple("input_type", ["input_string", "max_len"]), with + # input type: the name of input type, + # input_string: the string to be encoded + # max_len: maximal length of the encoding (in tokens). Only relevant for truncation, as we do not need to + # pad individual sub-tokenizer encodings - we only pad the final encoding of the ModularTokenizer. 
+ # The smallest value between config-defined and tuple-defined is used. If None, the max_len + # that was defined for the sub-tokenizer in the config is used. + # max_len (Optional[int], optional): _description_. Defaults to None. + # padding_token_id (Optional[str], optional): _description_. Defaults to 0. TODO: default to None and infer it + # padding_token (Optional[str], optional): _description_. Defaults to "". + # pad_type_id (Optional[int], optional): _description_. Defaults to 0. (TODO: raise exception) + # return_overflow_info (Optional[bool], optional): If True return an additional string with overflow information. Defaults to False. + # on_unknown: (Optional[str], optional): What happens if unknown tokens (i.e. ones mapped to ) are encountered: 'raise' or 'warn' + # verbose (Optional[int], optional): verbosity level. 0: no notification, 1: warning notification, 2: warning with partial data, 3: warning + # with full data. Defaults to 1. + # Returns: + # Encoding: _description_ + # """ - def encode( - self, - sequence: str, - max_len: Optional[int] = None, - padding_token_id: Optional[int] = 0, - padding_token: Optional[str] = "", - pad_type_id: Optional[int] = 0, - return_overflow_info: Optional[bool] = False, - on_unknown: Optional[str] = "warn", - verbose: Optional[int] = 1, - ) -> Encoding: - # (self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True) - """Receives a user-supplied string that contains, in addition to the text that is to be tokenized, special delimiters signifying the type - of input within each span of text (e.g. <@TOKENIZER-TYPE=AA> sequence, <@TOKENIZER-TYPE=SMILES>, etc.). These determine the type of tokenizer to use on each span, - and are not encoded. - Optionaly, you may also describe maximum length per section, for example: - "<@TOKENIZER-TYPE=AA>QKPGQAPRLLIYG<@TOKENIZER-TYPE=AA@MAX-LEN=122>SGSDFSDFSFD" - would not have a local limitation of the first AA section, but will have a local maximum length of 122 on the second section. - local in this context means that the maximum length will be imposed on the individual section prior to applying any global "entire sequence" maximum size limitations (if any). + # raise NotImplementedError - Args: - input_string (str): _description_ - max_len (Optional[int], optional): _description_. Defaults to None. - padding_token_id (Optional[str], optional): _description_. Defaults to 0. - padding_token (Optional[str], optional): _description_. Defaults to "". - pad_type_id (Optional[int], optional): _description_. Defaults to 0. - return_overflow_info (Optional[bool], optional): _description_. If True return an additional string with overflow information. Defaults to False. - on_unknown: (Optional[str], optional): What happens if unknown tokens (i.e. ones mapped to ) are encountered: 'raise' or 'warn' - verbose (int, optional): verbosity level. 0: no notification, 1: warning notification, 2: warning with partial data, 3: warning - with full data. Defaults to 1. 
- Returns: - Encoding: _description_ - str: _description_ information on overflow, if return_overflow_info=True - """ + # def encode( + # self, + # sequence: str, + # max_len: Optional[int] = None, + # padding_token_id: Optional[int] = 0, + # padding_token: Optional[str] = "", + # pad_type_id: Optional[int] = 0, + # return_overflow_info: Optional[bool] = False, + # on_unknown: Optional[str] = "warn", + # verbose: Optional[int] = 1, + # ) -> Encoding: + # # (self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True) + # """Receives a user-supplied string that contains, in addition to the text that is to be tokenized, special delimiters signifying the type + # of input within each span of text (e.g. <@TOKENIZER-TYPE=AA> sequence, <@TOKENIZER-TYPE=SMILES>, etc.). These determine the type of tokenizer to use on each span, + # and are not encoded. + # Optionaly, you may also describe maximum length per section, for example: + # "<@TOKENIZER-TYPE=AA>QKPGQAPRLLIYG<@TOKENIZER-TYPE=AA@MAX-LEN=122>SGSDFSDFSFD" + # would not have a local limitation of the first AA section, but will have a local maximum length of 122 on the second section. + # local in this context means that the maximum length will be imposed on the individual section prior to applying any global "entire sequence" maximum size limitations (if any). - raise NotImplementedError + # Args: + # input_string (str): _description_ + # max_len (Optional[int], optional): _description_. Defaults to None. + # padding_token_id (Optional[str], optional): _description_. Defaults to 0. + # padding_token (Optional[str], optional): _description_. Defaults to "". + # pad_type_id (Optional[int], optional): _description_. Defaults to 0. + # return_overflow_info (Optional[bool], optional): _description_. If True return an additional string with overflow information. Defaults to False. + # on_unknown: (Optional[str], optional): What happens if unknown tokens (i.e. ones mapped to ) are encountered: 'raise' or 'warn' + # verbose (int, optional): verbosity level. 0: no notification, 1: warning notification, 2: warning with partial data, 3: warning + # with full data. Defaults to 1. + # Returns: + # Encoding: _description_ + # str: _description_ information on overflow, if return_overflow_info=True + # """ - def decode(self, ids: Iterable, skip_special_tokens: Optional[bool] = False) -> str: - """Receives a list of IDs and returns a string of tokens - TODO: possibly output also the type of token (AA, SMILES, etc) - Args: - ids (Iterable): _description_ - skip_special_tokens (Optional[bool], optional): _description_. Defaults to False. + # raise NotImplementedError - Returns: - str: _description_ - """ + # def decode(self, ids: Iterable, skip_special_tokens: Optional[bool] = False) -> str: + # """Receives a list of IDs and returns a string of tokens + # TODO: possibly output also the type of token (AA, SMILES, etc) + # Args: + # ids (Iterable): _description_ + # skip_special_tokens (Optional[bool], optional): _description_. Defaults to False. 
+ + # Returns: + # str: _description_ + # """ - raise NotImplementedError + # raise NotImplementedError @staticmethod - def build_placeholder_meta_tokenization(sequence: str) -> Tuple[str, List[str]]: + def build_placeholder_meta_tokenization( + *, + sequence: str, + sample_dict: Optional[NDict] = None, + ) -> Tuple[str, List[str]]: """ In order to avoid modifying and rewriting the logic in modular tokenizer, especially regarding padding, limitation of max length of certain sub-parts, we put placeholders to make sure that the total size is known/fixed and respects the meta instructions to the modular tokenizer + + Returns: a tuple with 2 elements + ( + a single string with the full query containing placeholder tokens for FLOAT and VECTOR meta tokenizer parts, + a list of [meta-tokenizer name, data, meta-tokenizer name, data, meta-tokenizer name, data, ...] + ) """ hints_and_subseq = re.split("<@TOKENIZER-TYPE=([^>]*)>", sequence)[ 1: @@ -171,22 +178,104 @@ def build_placeholder_meta_tokenization(sequence: str) -> Tuple[str, List[str]]: for tokenizer_type, subseq in zip( hints_and_subseq[::2], hints_and_subseq[1::2] ): - if tokenizer_type == "FLOAT": - with_placeholders.append( - "<@TOKENIZER-TYPE=AA>" - ) # won't use AA tokens, just an arbitrary one to be able to use a token like <1> - values = subseq.split(",") - seq = "<1>" * len(values) - with_placeholders.append(seq) - elif tokenizer_type == "VECTOR": + if tokenizer_type.startswith("SCALARS_"): with_placeholders.append( "<@TOKENIZER-TYPE=AA>" ) # won't use AA tokens, just an arbitrary one to be able to use a token like <1> - values = subseq.split("@") - seq = "<1>" * len(values) + + if tokenizer_type == "SCALARS_LITERALS": + values = subseq.split(",") + elif tokenizer_type == "SCALARS_FROM_DICT": + if sample_dict is None: + raise Exception( + "SCALARS_FROM_DICT used but the provided sample_dict is None" + ) + values = sample_dict[subseq] + assert len(values.shape) == 1 + seq = "<1>" * len(values) # TODO: put a token instead with_placeholders.append(seq) + elif tokenizer_type.startswith("VECTORS_"): + raise Exception("VECTOR_* are not supported yet") else: with_placeholders.append("<@TOKENIZER-TYPE=" + tokenizer_type + ">") with_placeholders.append(subseq) - return "".join(with_placeholders), with_placeholders + return "".join(with_placeholders), hints_and_subseq + + @staticmethod + def prepare_info_for_model_step( + *, + per_meta_tokenizer_data: List[str], + per_meta_encoding_including_placeholders: List[Encoding], + sample_dict: Optional[NDict] = None, + ) -> Dict: + """ + since we: + 1. Need to use the model embedding layer (allowing gradients flow if needed) + 2. We prefer not to use the model during the data pipeline + + In this function we prepare everything so that during the train/val/test_step we'll be able to do what's needed before doing the forward pass + + Args: + per_meta_tokenizer_data: a list of [meta-tokenizer name, data, meta-tokenizer name, data, meta-tokenizer name, data, ...] + per_meta_encoding_including_placeholders: a list of Encoding elements. This is used to extract per tokenizer final tokens num (after all of the padding and cropping logic was already done) + sample_dict: a fuse sample_dict - optional. 
+ needed only if the meta tokenizer instruction uses a syntax of lookup from the dictionary + + + """ + scalars_indices = None + scalars_values = None + prev_index_end = -1 + + for tokenizer_name, curr_str_data, curr_placeholder_encoding in zip( + per_meta_tokenizer_data[::2], + per_meta_tokenizer_data[1::2], + per_meta_encoding_including_placeholders, + ): + if tokenizer_name.startswith("SCALARS_"): + if "SCALARS_LITERALS" == tokenizer_name: + curr_str_data = curr_str_data.strip().split(",") + if len(curr_str_data) != len(curr_placeholder_encoding.ids): + raise Exception( + f"should match expected length. Found length {len(curr_str_data)} but placeholders length was {len(curr_placeholder_encoding.ids)}" + ) + curr_data = [float(_) for _ in curr_str_data] + curr_data = torch.tensor(curr_data, dtype=torch.float32) + assert len(curr_data.shape) == 1 + elif "SCALARS_FROM_DICT" == tokenizer_name: + if sample_dict is None: + raise Exception( + "SCALARS_FROM_DICT used but the provided sample_dict is None" + ) + curr_data = sample_dict[curr_str_data] + assert len(curr_data.shape) == 1 + else: + raise Exception( + "Only supported SCALARS_* tokenizers are SCALARS_LITERALS and SCALARS_FROM_DICT" + ) + + curr_indices = torch.arange( + prev_index_end + 1, prev_index_end + 1 + curr_data.shape[0] + ) + scalars_indices = ( + curr_indices + if scalars_indices is None + else torch.concat([scalars_indices, curr_indices]) + ) + scalars_values = ( + curr_data + if scalars_values is None + else torch.concat([scalars_values, curr_data]) + ) + + prev_index_end += curr_data.shape[0] + if tokenizer_name.startswith("VECTORS_"): + raise NotImplementedError + else: + prev_index_end += len(curr_placeholder_encoding.ids) + + return { + "scalars_indices": scalars_indices, + "scalars_values": scalars_values, + } diff --git a/fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py b/fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py index 7dea654e..a6cdbb84 100644 --- a/fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py +++ b/fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py @@ -15,6 +15,7 @@ # import os # import re # import torch +import torch class InjectorTokenizerOp(FastModularTokenizer): @@ -72,7 +73,6 @@ def __init__( def __call__( self, sample_dict: NDict, - embedding_layer_key_in: str, # should point to a torch.nn.Module of an embedding layer key_in: str, key_out_tokenized_object: Optional[str] = None, key_out_tokens_ids: Optional[str] = None, @@ -82,6 +82,8 @@ def __call__( on_unknown: Optional[str] = "warn", verbose: Optional[int] = 1, validate_ends_with_eos: Optional[bool] = None, + key_out_scalars_indices: Optional[str] = None, + key_out_scalars_values: Optional[str] = None, ) -> NDict: """_summary_ @@ -100,6 +102,11 @@ def __call__( verbose (Optional[int], optional): verbosity level. 0: no notification, 1: warning notification, 2: warning with partial data, 3: warning with full data. Defaults to 1. validate_ends_with_eos (Optional[bool], optional): if not None, overrides self._validate_ends_with_eos + key_out_scalars_indices:str optional + if provided, will write to sample_dict in this key a 1D torch tensor with indices of all scalar elements. + key_out_scalars_values:str optional + if provided, will write to sample_dict in this key a 1D torch tensor with indices of all scalar values. 
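+
+            Example (a hedged sketch - the key names and values below are made up for illustration):
+                sample_dict["data.query"] = (
+                    "<@TOKENIZER-TYPE=SCALARS_LITERALS>12.7,3.2"
+                    "<@TOKENIZER-TYPE=AA>KSSCKRIPL"
+                )
+                calling the op with key_in="data.query",
+                key_out_scalars_indices="data.scalars_indices" and
+                key_out_scalars_values="data.scalars_values"
+                writes a 1D int64 tensor with the positions of the two scalar placeholders
+                and a matching 1D float32 tensor with their values (12.7 and 3.2) into sample_dict.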
+ Raises: Exception: _description_ @@ -116,16 +123,22 @@ def __call__( # <@TOKENIZER-TYPE=AA>WAITGTEASCENEGEVLAIPNITDNPCISCVCLNQKAECKQEKCAPLAEDCALVVKQTGACCEKCKG' sample_dict[key_in] = ( "<@TOKENIZER-TYPE=AA>" - + "<@TOKENIZER-TYPE=FLOAT>12.7,3.2,14.8,99,9" + + "<@TOKENIZER-TYPE=SCALARS_LITERALS>12.7,3.2,14.8,99,9" + "<@TOKENIZER-TYPE=AA>KSSCKRIPLYVDFSDVGWNDWIVAPPGYIAMYCIGECPFPLADILNSTNIAIVQTLVNSVNSKIPKACCVPTELSAISMLMLDENEKVVLKNYQDMVVEGCGCR" + + "<@TOKENIZER-TYPE=SCALARS_FROM_DICT>blah.model.banana" + "<@TOKENIZER-TYPE=AA>WLITGTEASCENEGEVLIIPNITDNPCISCVCLNQKAECKQEKCAPLAEDCALVVKQTGACCEKCKG" + "<@TOKENIZER-TYPE=AA>WAITGTEASCENEGEVLAIPNITDNPCISCVCLNQKAECKQEKCAPLAEDCALVVKQTGACCEKCKG" ) + sample_dict["blah.model.banana"] = torch.tensor( + [100.0, 200.0, 300.0], dtype=torch.float32 + ) ( with_placeholders_str, - with_placeholders_per_meta, - ) = InjectorTokenizer.build_placeholder_meta_tokenization(sample_dict[key_in]) + per_meta_orig, + ) = InjectorTokenizer.build_placeholder_meta_tokenization( + sequence=sample_dict[key_in], sample_dict=sample_dict + ) sample_dict[key_in + ".with_placeholders"] = with_placeholders_str super().__call__( @@ -150,6 +163,15 @@ def __call__( # option 2 - call the model embedding layer per individual sample, and also somehow load the model BEFORE the data pipeline (less likely we'll go with this option...) # TODO 2: override per injecting meta tokenizer type (FLOAT and VECTOR) the - print("") + prepared_data = InjectorTokenizer.prepare_info_for_model_step( + per_meta_tokenizer_data=per_meta_orig, + per_meta_encoding_including_placeholders=sample_dict[ + key_in + ".per_meta_part_encoding" + ], + sample_dict=sample_dict, + ) + + sample_dict[key_out_scalars_indices] = prepared_data["scalars_indices"] + sample_dict[key_out_scalars_values] = prepared_data["scalars_values"] return sample_dict From f642a078f94de822bb2605743b8ad4015dbad8a2 Mon Sep 17 00:00:00 2001 From: YoelShoshan Date: Wed, 24 Jul 2024 03:30:44 -0400 Subject: [PATCH 4/5] merge --- .../injectortokenizer/injector_tokenizer.py | 204 ++++++------------ ...with_aug_4272372_samples_balanced_1_1.json | 40 ++++ .../cell_attributes_tokenizer.json | 40 ++++ .../gene_tokenizer.json | 40 ++++ .../t5_tokenizer_AA_special.json | 40 ++++ ...with_aug_4272372_samples_balanced_1_1.json | 40 ++++ .../cell_attributes_tokenizer.json | 40 ++++ .../t5_tokenizer_AA_special.json | 40 ++++ ...with_aug_4272372_samples_balanced_1_1.json | 40 ++++ .../cell_attributes_tokenizer.json | 40 ++++ .../t5_tokenizer_AA_special.json | 40 ++++ .../modulartokenizer/special_tokens.py | 4 + .../tokenizer/ops/injector_tokenizer_ops.py | 62 +++--- 13 files changed, 500 insertions(+), 170 deletions(-) diff --git a/fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py b/fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py index 821363e6..f0b720a9 100644 --- a/fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py +++ b/fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py @@ -35,121 +35,6 @@ class InjectorTokenizer(ModularTokenizer): """ - # def __init__( - # self, - # input_dim: int, - # embedding_layer: torch.nn.Module, - # tokenizers_info: Union[List, omegaconf.listconfig.ListConfig], - # load_adjusted_jsons: Optional[bool] = False, - # special_tokens_dict: Optional[Dict] = None, - # additional_tokens_list: Optional[List] = None, - # max_possible_token_id: Optional[int] = None, - # max_special_token_id: Optional[int] = None, - # **kwargs: Any, - # ) -> None: - # """ - # input_dim: the size of a vector of each 
input. The total output will be [sequence length, input_dim] - - # """ - # self._input_dim = input_dim - # self._embedding_layer = embedding_layer - # self._modular_tokenizer = ModularTokenizer( - # tokenizers_info=tokenizers_info, - # load_adjusted_jsons=load_adjusted_jsons, - # special_tokens_dict=special_tokens_dict, - # additional_tokens_list=additional_tokens_list, - # max_possible_token_id=max_possible_token_id, - # max_special_token_id=max_special_token_id, - # **kwargs, - # ) - - # print("") - - # def encode_list( - # self, - # typed_input_list: List, - # max_len: Optional[int] = None, - # padding_token_id: Optional[int] = None, - # padding_token: Optional[str] = "", - # pad_type_id: Optional[int] = None, - # return_overflow_info: Optional[bool] = False, - # on_unknown: Optional[str] = "warn", - # verbose: int = 1, - # ) -> Union[Encoding, Tuple[Encoding, str]]: - # """_summary_ - - # Args: - # typed_input_list (List): list of collections.namedtuple("input_type", ["input_string", "max_len"]), with - # input type: the name of input type, - # input_string: the string to be encoded - # max_len: maximal length of the encoding (in tokens). Only relevant for truncation, as we do not need to - # pad individual sub-tokenizer encodings - we only pad the final encoding of the ModularTokenizer. - # The smallest value between config-defined and tuple-defined is used. If None, the max_len - # that was defined for the sub-tokenizer in the config is used. - # max_len (Optional[int], optional): _description_. Defaults to None. - # padding_token_id (Optional[str], optional): _description_. Defaults to 0. TODO: default to None and infer it - # padding_token (Optional[str], optional): _description_. Defaults to "". - # pad_type_id (Optional[int], optional): _description_. Defaults to 0. (TODO: raise exception) - # return_overflow_info (Optional[bool], optional): If True return an additional string with overflow information. Defaults to False. - # on_unknown: (Optional[str], optional): What happens if unknown tokens (i.e. ones mapped to ) are encountered: 'raise' or 'warn' - # verbose (Optional[int], optional): verbosity level. 0: no notification, 1: warning notification, 2: warning with partial data, 3: warning - # with full data. Defaults to 1. - # Returns: - # Encoding: _description_ - # """ - - # raise NotImplementedError - - # def encode( - # self, - # sequence: str, - # max_len: Optional[int] = None, - # padding_token_id: Optional[int] = 0, - # padding_token: Optional[str] = "", - # pad_type_id: Optional[int] = 0, - # return_overflow_info: Optional[bool] = False, - # on_unknown: Optional[str] = "warn", - # verbose: Optional[int] = 1, - # ) -> Encoding: - # # (self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True) - # """Receives a user-supplied string that contains, in addition to the text that is to be tokenized, special delimiters signifying the type - # of input within each span of text (e.g. <@TOKENIZER-TYPE=AA> sequence, <@TOKENIZER-TYPE=SMILES>, etc.). These determine the type of tokenizer to use on each span, - # and are not encoded. - # Optionaly, you may also describe maximum length per section, for example: - # "<@TOKENIZER-TYPE=AA>QKPGQAPRLLIYG<@TOKENIZER-TYPE=AA@MAX-LEN=122>SGSDFSDFSFD" - # would not have a local limitation of the first AA section, but will have a local maximum length of 122 on the second section. 
-    #     local in this context means that the maximum length will be imposed on the individual section prior to applying any global "entire sequence" maximum size limitations (if any).
-
-    #     Args:
-    #         input_string (str): _description_
-    #         max_len (Optional[int], optional): _description_. Defaults to None.
-    #         padding_token_id (Optional[str], optional): _description_. Defaults to 0.
-    #         padding_token (Optional[str], optional): _description_. Defaults to "<PAD>".
-    #         pad_type_id (Optional[int], optional): _description_. Defaults to 0.
-    #         return_overflow_info (Optional[bool], optional): _description_. If True return an additional string with overflow information. Defaults to False.
-    #         on_unknown: (Optional[str], optional): What happens if unknown tokens (i.e. ones mapped to <UNK>) are encountered: 'raise' or 'warn'
-    #         verbose (int, optional): verbosity level. 0: no notification, 1: warning notification, 2: warning with partial data, 3: warning
-    #             with full data. Defaults to 1.
-    #     Returns:
-    #         Encoding: _description_
-    #         str: _description_ information on overflow, if return_overflow_info=True
-    #     """
-
-    #     raise NotImplementedError
-
-    # def decode(self, ids: Iterable, skip_special_tokens: Optional[bool] = False) -> str:
-    #     """Receives a list of IDs and returns a string of tokens
-    #     TODO: possibly output also the type of token (AA, SMILES, etc)
-    #     Args:
-    #         ids (Iterable): _description_
-    #         skip_special_tokens (Optional[bool], optional): _description_. Defaults to False.
-
-    #     Returns:
-    #         str: _description_
-    #     """
-
-    #     raise NotImplementedError
-
     @staticmethod
     def build_placeholder_meta_tokenization(
         *,
             if tokenizer_type.startswith("SCALARS_"):
                 with_placeholders.append(
                     "<@TOKENIZER-TYPE=AA>"
-                )  # won't use AA tokens, just an arbitrary one to be able to use a token like <1>
+                )  # won't use AA tokens, just an arbitrary one to be able to use a token like <SCALAR>

-                if tokenizer_type == "SCALARS_LITERALS":
+                if (
+                    tokenizer_type == "SCALARS_LITERALS"
+                ):  # note: masking is only supported in literals (not in "from dict")
                     values = subseq.split(",")
+                    # seq = "<SCALAR>" * len(values)
+                    seq = "".join(
+                        [
+                            "<MASKED_SCALAR>" if x == "<MASK>" else "<SCALAR>"
+                            for x in values
+                        ]
+                    )
                 elif tokenizer_type == "SCALARS_FROM_DICT":
                     if sample_dict is None:
                         raise Exception(
                             "SCALARS_FROM_DICT used but the provided sample_dict is None"
                         )
                     values = sample_dict[subseq]
                     assert len(values.shape) == 1
-                    seq = "<1>" * len(values)  # TODO: put a <SCALAR> token instead
+                    seq = "<SCALAR>" * len(values)
+                else:
+                    raise Exception(f"tokenizer_type={tokenizer_type} is not supported")

+                # elif tokenizer_type == "SCALARS_MASKED":
+                #     values = subseq.split(",")
+                #     assert all([x == '<MASK>' for x in values])  # only <MASK> is currently supported
+                #     seq = "<MASKED_SCALAR>" * len(values)

                 with_placeholders.append(seq)

             elif tokenizer_type.startswith("VECTORS_"):
                 raise Exception("VECTOR_* are not supported yet")
             else:
                 with_placeholders.append("<@TOKENIZER-TYPE=" + tokenizer_type + ">")
                 with_placeholders.append(subseq)

         return "".join(with_placeholders), hints_and_subseq

     @staticmethod
     def prepare_info_for_model_step(
         """
-        scalars_indices = None
-        scalars_values = None
+        scalars_indices = []
+        scalars_values = []
+        scalars_masked_indices = []
         prev_index_end = -1

         for tokenizer_name, curr_str_data, curr_placeholder_encoding in zip(
             per_meta_tokenizer_data[::2],
             per_meta_tokenizer_data[1::2],
             per_meta_encoding_including_placeholders,
         ):
             if tokenizer_name.startswith("SCALARS_"):
                 if "SCALARS_LITERALS" == tokenizer_name:
                     curr_str_data = curr_str_data.strip().split(",")
                     if len(curr_str_data) != len(curr_placeholder_encoding.ids):
                         raise Exception(
                             f"should match expected length. Found length {len(curr_str_data)} but placeholders length was {len(curr_placeholder_encoding.ids)}"
                         )
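+                    # index bookkeeping for the loop below: prev_index_end tracks the last
+                    # absolute position already consumed in the final tokenized sequence, so
+                    # (i + prev_index_end + 1) is the absolute position of the i-th literal.
+                    # masked literals (<MASK>) contribute their position to
+                    # scalars_masked_indices but no value to scalars_values.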
-                    curr_data = [float(_) for _ in curr_str_data]
-                    curr_data = torch.tensor(curr_data, dtype=torch.float32)
-                    assert len(curr_data.shape) == 1
+
+                    curr_indices = []
+                    curr_data = []
+
+                    for i, val in enumerate(curr_str_data):
+                        if val != "<MASK>":
+                            curr_indices.append(i + prev_index_end + 1)
+                            curr_data.append(float(val))
+                        else:
+                            scalars_masked_indices.append(i + prev_index_end + 1)
+
+                    if len(curr_indices) > 0:
+                        curr_indices = torch.tensor(curr_indices, dtype=torch.int64)
+                        curr_data = torch.tensor(curr_data, dtype=torch.float32)
+
+                        scalars_indices.append(curr_indices)
+                        scalars_values.append(curr_data)
+
+                        assert len(curr_data.shape) == 1
+
+                    prev_index_end += len(curr_str_data)
                 elif "SCALARS_FROM_DICT" == tokenizer_name:
                     if sample_dict is None:
                         raise Exception(
                             "SCALARS_FROM_DICT used but the provided sample_dict is None"
                         )
                     curr_data = sample_dict[curr_str_data]
                     assert len(curr_data.shape) == 1
+                    curr_indices = torch.arange(
+                        prev_index_end + 1, prev_index_end + 1 + curr_data.shape[0]
+                    )
+
+                    scalars_indices.append(curr_indices)
+                    scalars_values.append(curr_data)
+
+                    prev_index_end += curr_data.shape[0]
+
                 else:
                     raise Exception(
                         "Only supported SCALARS_* tokenizers are SCALARS_LITERALS and SCALARS_FROM_DICT"
                     )
-                curr_indices = torch.arange(
-                    prev_index_end + 1, prev_index_end + 1 + curr_data.shape[0]
-                )
-                scalars_indices = (
-                    curr_indices
-                    if scalars_indices is None
-                    else torch.concat([scalars_indices, curr_indices])
-                )
-                scalars_values = (
-                    curr_data
-                    if scalars_values is None
-                    else torch.concat([scalars_values, curr_data])
-                )
-
-                prev_index_end += curr_data.shape[0]
-            if tokenizer_name.startswith("VECTORS_"):
+            elif tokenizer_name.startswith("VECTORS_"):
                 raise NotImplementedError
             else:
                 prev_index_end += len(curr_placeholder_encoding.ids)

+        if len(scalars_indices) > 0:
+            scalars_indices = torch.concat(scalars_indices)
+            scalars_values = torch.concat(scalars_values)
+
+        if len(scalars_masked_indices) > 0:
+            scalars_masked_indices = torch.tensor(
+                scalars_masked_indices, dtype=torch.int64
+            )
+        else:
+            scalars_masked_indices = None
+
         return {
             "scalars_indices": scalars_indices,
             "scalars_values": scalars_values,
+            "scalars_masked_indices": scalars_masked_indices,
         }
diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json
index 55ce4bd4..4932902a 100644
--- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json
+++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json
@@ -2747,6 +2747,42 @@
       "rstrip": false,
       "normalized": false,
       "special": true
+    },
+    {
+      "id": 305,
+      "content": "",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 306,
+      "content": "",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 307,
+      "content": "",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 308,
+
"content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true } ], "normalizer": null, @@ -3067,6 +3103,10 @@ "": 302, "": 303, "": 304, + "": 305, + "": 306, + "": 307, + "": 308, "#": 527, "%": 528, "(": 529, diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/cell_attributes_tokenizer.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/cell_attributes_tokenizer.json index e5d2ec64..b1f00797 100644 --- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/cell_attributes_tokenizer.json +++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/cell_attributes_tokenizer.json @@ -2747,6 +2747,42 @@ "rstrip": false, "normalized": false, "special": true + }, + { + "id": 305, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 306, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 307, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 308, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true } ], "normalizer": null, @@ -3073,6 +3109,10 @@ "": 302, "": 303, "": 304, + "": 305, + "": 306, + "": 307, + "": 308, "[CL:0000499]": 3522, "[CL:2000060]": 3523, "[CL:0000235]": 3524, diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/gene_tokenizer.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/gene_tokenizer.json index 6a0ed97b..34977c75 100644 --- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/gene_tokenizer.json +++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/gene_tokenizer.json @@ -2747,6 +2747,42 @@ "rstrip": false, "normalized": false, "special": true + }, + { + "id": 305, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 306, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 307, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 308, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true } ], "normalizer": null, @@ -3073,6 +3109,10 @@ "": 302, "": 303, "": 304, + "": 305, + "": 306, + "": 307, + "": 308, "[100130093]": 5000, "[100133445]": 5001, "[100286793]": 5002, diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/t5_tokenizer_AA_special.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/t5_tokenizer_AA_special.json index ced94e24..9e796ab9 100644 --- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/t5_tokenizer_AA_special.json +++ 
@@ -2747,6 +2747,42 @@
       "rstrip": false,
       "normalized": false,
       "special": true
+    },
+    {
+      "id": 305,
+      "content": "<SCALAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 306,
+      "content": "<VECTOR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 307,
+      "content": "<MASKED_SCALAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 308,
+      "content": "<MASKED_VECTOR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
     }
   ],
   "normalizer": null,
@@ -3073,6 +3109,10 @@
     "<GENERAL_CHAIN>": 302,
     "<CDR3_REGION>": 303,
     "<MUTATED>": 304,
+    "<SCALAR>": 305,
+    "<VECTOR>": 306,
+    "<MASKED_SCALAR>": 307,
+    "<MASKED_VECTOR>": 308,
     "A": 501,
     "B": 502,
     "C": 503,
diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json
index 55ce4bd4..4932902a 100644
--- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json
+++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json
@@ -2747,6 +2747,42 @@
       "rstrip": false,
       "normalized": false,
       "special": true
+    },
+    {
+      "id": 305,
+      "content": "<SCALAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 306,
+      "content": "<VECTOR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 307,
+      "content": "<MASKED_SCALAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 308,
+      "content": "<MASKED_VECTOR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
     }
   ],
   "normalizer": null,
@@ -3067,6 +3103,10 @@
     "<GENERAL_CHAIN>": 302,
     "<CDR3_REGION>": 303,
     "<MUTATED>": 304,
+    "<SCALAR>": 305,
+    "<VECTOR>": 306,
+    "<MASKED_SCALAR>": 307,
+    "<MASKED_VECTOR>": 308,
     "#": 527,
     "%": 528,
     "(": 529,
diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/cell_attributes_tokenizer.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/cell_attributes_tokenizer.json
index e5d2ec64..b1f00797 100644
--- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/cell_attributes_tokenizer.json
+++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/cell_attributes_tokenizer.json
@@ -2747,6 +2747,42 @@
       "rstrip": false,
       "normalized": false,
       "special": true
+    },
+    {
+      "id": 305,
+      "content": "<SCALAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 306,
+      "content": "<VECTOR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 307,
+      "content": "<MASKED_SCALAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 308,
+      "content": "<MASKED_VECTOR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
     }
   ],
   "normalizer": null,
@@ -3073,6 +3109,10 @@
     "<GENERAL_CHAIN>": 302,
     "<CDR3_REGION>": 303,
     "<MUTATED>": 304,
+    "<SCALAR>": 305,
+    "<VECTOR>": 306,
+    "<MASKED_SCALAR>": 307,
+    "<MASKED_VECTOR>": 308,
     "[CL:0000499]": 3522,
     "[CL:2000060]": 3523,
     "[CL:0000235]": 3524,
"rstrip": false, + "normalized": false, + "special": true } ], "normalizer": null, @@ -3073,6 +3109,10 @@ "": 302, "": 303, "": 304, + "": 305, + "": 306, + "": 307, + "": 308, "[CL:0000499]": 3522, "[CL:2000060]": 3523, "[CL:0000235]": 3524, diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/t5_tokenizer_AA_special.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/t5_tokenizer_AA_special.json index ced94e24..9e796ab9 100644 --- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/t5_tokenizer_AA_special.json +++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/t5_tokenizer_AA_special.json @@ -2747,6 +2747,42 @@ "rstrip": false, "normalized": false, "special": true + }, + { + "id": 305, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 306, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 307, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 308, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true } ], "normalizer": null, @@ -3073,6 +3109,10 @@ "": 302, "": 303, "": 304, + "": 305, + "": 306, + "": 307, + "": 308, "A": 501, "B": 502, "C": 503, diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json index 55ce4bd4..4932902a 100644 --- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json +++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json @@ -2747,6 +2747,42 @@ "rstrip": false, "normalized": false, "special": true + }, + { + "id": 305, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 306, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 307, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 308, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true } ], "normalizer": null, @@ -3067,6 +3103,10 @@ "": 302, "": 303, "": 304, + "": 305, + "": 306, + "": 307, + "": 308, "#": 527, "%": 528, "(": 529, diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/cell_attributes_tokenizer.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/cell_attributes_tokenizer.json index e5d2ec64..b1f00797 100644 --- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/cell_attributes_tokenizer.json +++ 
diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/cell_attributes_tokenizer.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/cell_attributes_tokenizer.json
index e5d2ec64..b1f00797 100644
--- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/cell_attributes_tokenizer.json
+++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/cell_attributes_tokenizer.json
@@ -2747,6 +2747,42 @@
       "rstrip": false,
       "normalized": false,
       "special": true
+    },
+    {
+      "id": 305,
+      "content": "<SCALAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 306,
+      "content": "<VECTOR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 307,
+      "content": "<MASKED_SCALAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 308,
+      "content": "<MASKED_VECTOR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
     }
   ],
   "normalizer": null,
@@ -3073,6 +3109,10 @@
     "<GENERAL_CHAIN>": 302,
     "<CDR3_REGION>": 303,
     "<MUTATED>": 304,
+    "<SCALAR>": 305,
+    "<VECTOR>": 306,
+    "<MASKED_SCALAR>": 307,
+    "<MASKED_VECTOR>": 308,
     "[CL:0000499]": 3522,
     "[CL:2000060]": 3523,
     "[CL:0000235]": 3524,
diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/t5_tokenizer_AA_special.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/t5_tokenizer_AA_special.json
index ced94e24..9e796ab9 100644
--- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/t5_tokenizer_AA_special.json
+++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/t5_tokenizer_AA_special.json
@@ -2747,6 +2747,42 @@
       "rstrip": false,
       "normalized": false,
       "special": true
+    },
+    {
+      "id": 305,
+      "content": "<SCALAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 306,
+      "content": "<VECTOR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 307,
+      "content": "<MASKED_SCALAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 308,
+      "content": "<MASKED_VECTOR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
     }
   ],
   "normalizer": null,
@@ -3073,6 +3109,10 @@
     "<GENERAL_CHAIN>": 302,
     "<CDR3_REGION>": 303,
     "<MUTATED>": 304,
+    "<SCALAR>": 305,
+    "<VECTOR>": 306,
+    "<MASKED_SCALAR>": 307,
+    "<MASKED_VECTOR>": 308,
     "A": 501,
     "B": 502,
     "C": 503,
diff --git a/fusedrug/data/tokenizer/modulartokenizer/special_tokens.py b/fusedrug/data/tokenizer/modulartokenizer/special_tokens.py
index 5ef9159a..61bd9b94 100644
--- a/fusedrug/data/tokenizer/modulartokenizer/special_tokens.py
+++ b/fusedrug/data/tokenizer/modulartokenizer/special_tokens.py
@@ -326,6 +326,10 @@
     "GENERAL_CHAIN",
     "CDR3_REGION",
     "MUTATED",
+    "SCALAR",
+    "VECTOR",
+    "MASKED_SCALAR",
+    "MASKED_VECTOR",
 ]
 
 AA_tokens = [
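The four names added above are materialized as angle-bracketed special tokens in the tokenizer JSON files patched earlier. A quick sanity-check sketch, assuming (as the vocab hunks suggest) that they were assigned ids 305-308 in order:

from tokenizers import Tokenizer

# Load one of the patched files; any of the ten updated JSONs should behave the same.
t = Tokenizer.from_file(
    "fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/"
    "bmfm_modular_tokenizer/t5_tokenizer_AA_special.json"
)
for tok in ("<SCALAR>", "<VECTOR>", "<MASKED_SCALAR>", "<MASKED_VECTOR>"):
    print(tok, t.token_to_id(tok))  # expected ids: 305, 306, 307, 308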
diff --git a/fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py b/fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py
index a6cdbb84..22b4df2f 100644
--- a/fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py
+++ b/fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py
@@ -15,7 +15,6 @@
 # import os
 # import re
 # import torch
-import torch
 
 
 class InjectorTokenizerOp(FastModularTokenizer):
@@ -84,6 +83,7 @@ def __call__(
         validate_ends_with_eos: Optional[bool] = None,
         key_out_scalars_indices: Optional[str] = None,
         key_out_scalars_values: Optional[str] = None,
+        key_out_masked_scalars_indices: Optional[str] = None,
     ) -> NDict:
         """_summary_
 
@@ -102,10 +102,11 @@ def __call__(
             verbose (Optional[int], optional): verbosity level.
                 0: no notification, 1: warning notification, 2: warning with partial data, 3: warning with full data. Defaults to 1.
             validate_ends_with_eos (Optional[bool], optional): if not None, overrides self._validate_ends_with_eos
-            key_out_scalars_indices:str optional
-                if provided, will write to sample_dict in this key a 1D torch tensor with indices of all scalar elements.
-            key_out_scalars_values:str optional
-                if provided, will write to sample_dict in this key a 1D torch tensor with indices of all scalar values.
+            key_out_scalars_indices: str, optional
+                if provided, will write to sample_dict under this key a 1D torch tensor with the indices of all input scalar elements.
+            key_out_scalars_values: str, optional
+                if provided, will write to sample_dict under this key a 1D torch tensor with the values of all input scalar elements.
+            key_out_masked_scalars_indices: str, optional
+                if provided, will write to sample_dict under this key a 1D torch tensor with the indices of all masked scalar elements.
 
         Raises:
@@ -116,23 +117,6 @@ def __call__(
             NDict: _description_
         """
 
-        print("FOR DEBUGGING! REMOVE !!!!!!")
-        # orig: '<@TOKENIZER-TYPE=AA>
-        # <@TOKENIZER-TYPE=AA>KSSCKRIPLYVDFSDVGWNDWIVAPPGYIAMYCIGECPFPLADILNSTNIAIVQTLVNSVNSKIPKACCVPTELSAISMLMLDENEKVVLKNYQDMVVEGCGCR
-        # <@TOKENIZER-TYPE=AA>WLITGTEASCENEGEVLIIPNITDNPCISCVCLNQKAECKQEKCAPLAEDCALVVKQTGACCEKCKG
-        # <@TOKENIZER-TYPE=AA>WAITGTEASCENEGEVLAIPNITDNPCISCVCLNQKAECKQEKCAPLAEDCALVVKQTGACCEKCKG'
-        sample_dict[key_in] = (
-            "<@TOKENIZER-TYPE=AA>"
-            + "<@TOKENIZER-TYPE=SCALARS_LITERALS>12.7,3.2,14.8,99,9"
-            + "<@TOKENIZER-TYPE=AA>KSSCKRIPLYVDFSDVGWNDWIVAPPGYIAMYCIGECPFPLADILNSTNIAIVQTLVNSVNSKIPKACCVPTELSAISMLMLDENEKVVLKNYQDMVVEGCGCR"
-            + "<@TOKENIZER-TYPE=SCALARS_FROM_DICT>blah.model.banana"
-            + "<@TOKENIZER-TYPE=AA>WLITGTEASCENEGEVLIIPNITDNPCISCVCLNQKAECKQEKCAPLAEDCALVVKQTGACCEKCKG"
-            + "<@TOKENIZER-TYPE=AA>WAITGTEASCENEGEVLAIPNITDNPCISCVCLNQKAECKQEKCAPLAEDCALVVKQTGACCEKCKG"
-        )
-        sample_dict["blah.model.banana"] = torch.tensor(
-            [100.0, 200.0, 300.0], dtype=torch.float32
-        )
-
         (
             with_placeholders_str,
             per_meta_orig,
@@ -153,13 +137,6 @@ def __call__(
             key_out_encoding=key_in
             + ".per_meta_part_encoding",  # using the key_in as base for the name because key_out_* are optional
         )
 
-        # TODO 1: call embedding layer on all tokens to get a [sequence_length, model_dim] matrix. Make sure that gradients are allowed to flow to it when needed
-        #    what is the best way to provide the model embedding layer here? the data-pipeline seems to be created BEFORE the model is constructed
-        #    if we want to call the model with the entire minibatch, then we can go with option 1:
-        #    option 1 - only prepare data towards that, and actually run the last part of the logic inside the *_step in pl_module
-        #    option 2 - call the model embedding layer per individual sample, and also somehow load the model BEFORE the data pipeline (less likely we'll go with this option...)
-
-        # TODO 2: override per injecting meta tokenizer type (FLOAT and VECTOR) the
-
         prepared_data = InjectorTokenizer.prepare_info_for_model_step(
             per_meta_tokenizer_data=per_meta_orig,
             per_meta_encoding_including_placeholders=sample_dict[
                 key_in + ".per_meta_part_encoding"
             ],
             sample_dict=sample_dict,
         )
 
-        sample_dict[key_out_scalars_indices] = prepared_data["scalars_indices"]
-        sample_dict[key_out_scalars_values] = prepared_data["scalars_values"]
+        if key_out_scalars_indices is not None:
+            sample_dict[key_out_scalars_indices] = prepared_data["scalars_indices"]
+        else:
+            if prepared_data["scalars_indices"] is not None:
+                raise Exception(
+                    "non-None scalars_indices found but no key_out_scalars_indices was provided"
+                )
+
+        if key_out_scalars_values is not None:
+            sample_dict[key_out_scalars_values] = prepared_data["scalars_values"]
+        else:
+            if prepared_data["scalars_values"] is not None:
+                raise Exception(
+                    "non-None scalars_values found but no key_out_scalars_values was provided"
+                )
+
+        if key_out_masked_scalars_indices is not None:
+            sample_dict[key_out_masked_scalars_indices] = prepared_data[
+                "scalars_masked_indices"
+            ]
+        else:
+            if prepared_data["scalars_masked_indices"] is not None:
+                raise Exception(
+                    "non-None scalars_masked_indices found but no key_out_masked_scalars_indices was provided"
+                )
 
         return sample_dict
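To make the data flow concrete: downstream, a model step can consume the three optional key_out_* outputs to build input embeddings, as the removed TODO intended. A minimal sketch with hypothetical shapes; overwriting one embedding channel stands in for whatever learned projection the model actually uses:

import torch

# Hypothetical per-sample outputs written by the op (see key_out_* above).
scalars_indices = torch.tensor([2, 5], dtype=torch.int64)        # positions of provided scalars
scalars_values = torch.tensor([0.3, 12.4], dtype=torch.float32)  # their values
scalars_masked_indices = torch.tensor([6], dtype=torch.int64)    # positions of masked scalars

seq_len, d_model = 8, 16
inputs_emb = torch.zeros(seq_len, d_model)  # stand-in for the embedding layer's output

# Inject the provided scalar values at their positions; masked positions are left
# untouched so the model has to predict them.
inputs_emb[scalars_indices, 0] = scalars_values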
From ab1672271538b21e2d770c88d7c00192c5c9804e Mon Sep 17 00:00:00 2001
From: YoelShoshan
Date: Fri, 26 Jul 2024 02:20:28 -0400
Subject: [PATCH 5/5] PR comments

---
 .../injectortokenizer/injector_tokenizer.py   | 56 +++++++++----------
 .../tokenizer/ops/injector_tokenizer_ops.py   | 40 ++++++-------
 2 files changed, 48 insertions(+), 48 deletions(-)

diff --git a/fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py b/fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py
index f0b720a9..6088b15f 100644
--- a/fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py
+++ b/fusedrug/data/tokenizer/injectortokenizer/injector_tokenizer.py
@@ -1,4 +1,3 @@
-from fusedrug.data.tokenizer.modulartokenizer.modular_tokenizer import ModularTokenizer
 from typing import Optional, List, Tuple, Dict
 from tokenizers import Encoding
 import torch
@@ -6,32 +5,38 @@
 from fuse.utils import NDict
 
 
-class InjectorTokenizer(ModularTokenizer):
+class InjectorTokenizerHelpers:
     """
-    InjectorTokenizer builds on top of ModularTokenizer.
+    !!!!
+    Note - this file contains only a few utility (static) functions for InjectorTokenizerOp;
+    as a user, you are not expected to use it directly. Instead, use
+    fusedrug.data.tokenizer.ops.injector_tokenizer_ops.InjectorTokenizerOp
+    !!!!
 
-    Its purpose is to extend beyond "standard" input tokens as integers as input for a model.
-    Instead, it provides control on *vectors* that are to be used as input for a model.
+    applies an injector tokenizer
 
-    Example use cases:
-        1. Providing scalars (floating point) as inputs
-        2. Providing vectors of embeddings - for example of a protein embedding
+    an injector tokenizer builds on top of a modular tokenizer.
+    its purpose is to build inputs_emb for the model (instead of input_ids),
+    which allows supporting more advanced inputs beyond token ids, like:
+    * scalar inputs
+    * embedding vectors within a single input
 
-    Each input "token" becomes a tensor of a defined size, and is built of:
-    1. Header
-        made of 4 floats
-        [
-            0.0 or 1.0 #is this a sentinel/mask or not
-            0.0 or 1.0 #is this a standard vocabulary token
-            0.0 or 1.0 #is this a scalar
-            0.0 or 1.0 #is this a full injected vector (e.g. an embedding)
-        ]
-    2. Content
-        the rest of each input vector is made of input_dim-4 float elements.
+    supported syntax/format:
 
-    Note - in the "standard vocabulary token" - we support providing an external embeding layer (like in vanilla T5),
-    as it's part of the trained weights.
+    the text following <@TOKENIZER-TYPE=SCALARS_LITERALS> is expected to use the following format:
+        ',' separated float values and/or <MASK> tokens -
+        for example: "2.7,3.99,-12.9" or "<MASK>" or "2.19,<MASK>,3.19,<MASK>"
+
+    the text following <@TOKENIZER-TYPE=SCALARS_FROM_DICT> is expected to be a key into the sample NDict
+        for example: "blah.boo.banana" or "data.input.encoder_input"
+        note: with SCALARS_FROM_DICT you can't describe masked scalars (outputs); you can only describe inputs
+
+    example usage:
+
+    encoder_input:
+    <@TOKENIZER-TYPE=AA><@TOKENIZER-TYPE=SCALARS_LITERALS>0.3<@TOKENIZER-TYPE=AA><@TOKENIZER-TYPE=SCALARS_LITERALS><MASK><@TOKENIZER-TYPE=AA>ISGGDAIYSSTGRCSLGFNVRSGSTYYFLTAGICTDGATTWWANSARTTVLGTTSGSSFPNNDYGIVRYTNTTIPKDGTVGGQDITSAANATVGMAVTRRGSTTGTISGSVTALNATVNYGGGDVVYGMIRTNVCAEPGDSGGPLYSGTRAIGLTSGGSGNCSSGGTTFFQPVTEALVAYGVSVY
+    labels:
+    <@TOKENIZER-TYPE=AA><@TOKENIZER-TYPE=SCALARS_LITERALS>0.3<@TOKENIZER-TYPE=AA><@TOKENIZER-TYPE=SCALARS_LITERALS>12.4<@TOKENIZER-TYPE=AA>ISGGDAIYSSTGRCSLGFNVRSGSTYYFLTAGICTDGATTWWANSARTTVLGTTSGSSFPNNDYGIVRYTNTTIPKDGTVGGQDITSAANATVGMAVTRRGSTTGTISGSVTALNATVNYGGGDVVYGMIRTNVCAEPGDSGGPLYSGTRAIGLTSGGSGNCSSGGTTFFQPVTEALVAYGVSVY
     """
 
@@ -90,11 +95,6 @@ def build_placeholder_meta_tokenization(
             else:
                 raise Exception(f"tokenizer_type={tokenizer_type} is not supported")
 
-            # elif tokenizer_type == "SCALARS_MASKED":
-            #     values = subseq.split(",")
-            #     assert all([x == "<MASK>" for x in values])  # only <MASK> is currently supported
-            #     seq = "<MASK>" * len(values)
-
             with_placeholders.append(seq)
 
         elif tokenizer_type.startswith("VECTORS_"):
@@ -201,7 +201,7 @@ def prepare_info_for_model_step(
             scalars_masked_indices = None
 
         return {
-            "scalars_indices": scalars_indices,
-            "scalars_values": scalars_values,
-            "scalars_masked_indices": scalars_masked_indices,
+            "scalars_indices": scalars_indices,  # 1d - its length equals the number of actual (provided) scalars found
+            "scalars_values": scalars_values,  # 1d - the values of the provided scalars
+            "scalars_masked_indices": scalars_masked_indices,  # 1d - the indices of masked scalars
         }
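Following the documented syntax, a sample for this pipeline can be assembled as below; a minimal sketch in which the NDict keys, the short AA fragments, and the tensor contents are all hypothetical:

import torch
from fuse.utils import NDict

# A query mixing AA text, literal scalars (one masked), and dict-sourced scalars.
sample = NDict()
sample["data.query"] = (
    "<@TOKENIZER-TYPE=SCALARS_LITERALS>0.3,<MASK>"
    "<@TOKENIZER-TYPE=AA>KSSCKRIPLY"
    "<@TOKENIZER-TYPE=SCALARS_FROM_DICT>data.precomputed_scalars"
    "<@TOKENIZER-TYPE=AA>WLITGTEASC"
)
# SCALARS_FROM_DICT points at this 1D float tensor (inputs only - no masking allowed here).
sample["data.precomputed_scalars"] = torch.tensor([100.0, 200.0], dtype=torch.float32)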
diff --git a/fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py b/fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py
index 22b4df2f..aa28cef1 100644
--- a/fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py
+++ b/fusedrug/data/tokenizer/ops/injector_tokenizer_ops.py
@@ -1,21 +1,13 @@
 from fuse.utils import NDict
 
-# from fuse.data import OpBase, get_sample_id
 from fusedrug.data.tokenizer.injectortokenizer.injector_tokenizer import (
-    InjectorTokenizer,
+    InjectorTokenizerHelpers,
 )
 
-# from fusedrug.data.tokenizer.modulartokenizer.modular_tokenizer import ModularTokenizer
 from fusedrug.data.tokenizer.ops import FastModularTokenizer
 
-# from warnings import warn
-# from collections import defaultdict
 from typing import Optional, Union, Any
 
-# import os
-# import re
-# import torch
 
 class InjectorTokenizerOp(FastModularTokenizer):
     """
@@ -26,16 +18,31 @@ class InjectorTokenizerOp(FastModularTokenizer):
     applies an injector tokenizer
 
     an injector tokenizer builds on top of a modular tokenizer.
     its purpose is to build inputs_emb for the model (instead of input_ids),
     which allows supporting more advanced inputs beyond token ids, like:
     * scalar inputs
     * embedding vectors within a single input
+
+    supported syntax/format:
+
+    the text following <@TOKENIZER-TYPE=SCALARS_LITERALS> is expected to use the following format:
+        ',' separated float values and/or <MASK> tokens -
+        for example: "2.7,3.99,-12.9" or "<MASK>" or "2.19,<MASK>,3.19,<MASK>"
+
+    the text following <@TOKENIZER-TYPE=SCALARS_FROM_DICT> is expected to be a key into the sample NDict
+        for example: "blah.boo.banana" or "data.input.encoder_input"
+        note: with SCALARS_FROM_DICT you can't describe masked scalars (outputs); you can only describe inputs
+
+    example usage:
+
+    encoder_input:
+    <@TOKENIZER-TYPE=AA><@TOKENIZER-TYPE=SCALARS_LITERALS>0.3<@TOKENIZER-TYPE=AA><@TOKENIZER-TYPE=SCALARS_LITERALS><MASK><@TOKENIZER-TYPE=AA>ISGGDAIYSSTGRCSLGFNVRSGSTYYFLTAGICTDGATTWWANSARTTVLGTTSGSSFPNNDYGIVRYTNTTIPKDGTVGGQDITSAANATVGMAVTRRGSTTGTISGSVTALNATVNYGGGDVVYGMIRTNVCAEPGDSGGPLYSGTRAIGLTSGGSGNCSSGGTTFFQPVTEALVAYGVSVY
+    labels:
+    <@TOKENIZER-TYPE=AA><@TOKENIZER-TYPE=SCALARS_LITERALS>0.3<@TOKENIZER-TYPE=AA><@TOKENIZER-TYPE=SCALARS_LITERALS>12.4<@TOKENIZER-TYPE=AA>ISGGDAIYSSTGRCSLGFNVRSGSTYYFLTAGICTDGATTWWANSARTTVLGTTSGSSFPNNDYGIVRYTNTTIPKDGTVGGQDITSAANATVGMAVTRRGSTTGTISGSVTALNATVNYGGGDVVYGMIRTNVCAEPGDSGGPLYSGTRAIGLTSGGSGNCSSGGTTFFQPVTEALVAYGVSVY
     """
 
     def __init__(
         self,
-        input_dim: int,
         tokenizer_path: str,
         max_size: Union[int, None] = None,
         pad_token: Union[str, None] = None,
@@ -75,8 +82,6 @@ def __init__(
             **kwargs,
         )
 
-        self._input_dim = input_dim
-
     def __call__(
         self,
         sample_dict: NDict,
@@ -107,12 +112,6 @@ def __call__(
             key_out_scalars_values: str, optional
                 if provided, will write to sample_dict under this key a 1D torch tensor with the values of all input scalar elements.
             key_out_masked_scalars_indices: str, optional
                 if provided, will write to sample_dict under this key a 1D torch tensor with the indices of all masked scalar elements.
 
-
-        Raises:
-            Exception: _description_
-            Exception: _description_
-
         Returns:
             NDict: _description_
         """
 
@@ -120,7 +119,7 @@ def __call__(
         (
             with_placeholders_str,
             per_meta_orig,
-        ) = InjectorTokenizer.build_placeholder_meta_tokenization(
+        ) = InjectorTokenizerHelpers.build_placeholder_meta_tokenization(
             sequence=sample_dict[key_in], sample_dict=sample_dict
         )
         sample_dict[key_in + ".with_placeholders"] = with_placeholders_str
@@ -140,7 +139,7 @@ def __call__(
             + ".per_meta_part_encoding",  # using the key_in as base for the name because key_out_* are optional
         )
 
-        prepared_data = InjectorTokenizer.prepare_info_for_model_step(
+        prepared_data = InjectorTokenizerHelpers.prepare_info_for_model_step(
             per_meta_tokenizer_data=per_meta_orig,
             per_meta_encoding_including_placeholders=sample_dict[
                 key_in + ".per_meta_part_encoding"