From 7fc56d97ffa91757ab4fd5af9421bc40fe0ffacb Mon Sep 17 00:00:00 2001 From: floccinauc <92428874+floccinauc@users.noreply.github.com> Date: Sun, 2 Jun 2024 13:58:38 +0300 Subject: [PATCH] TCR chain type special tokens (#127) Co-authored-by: VADIM RATNER VADIMRA@il.ibm.com Co-authored-by: VADIM RATNER VADIMRA@il.ibm.com Co-authored-by: VADIM RATNER VADIMRA@il.ibm.com Co-authored-by: VADIM RATNER VADIMRA@il.ibm.com Co-authored-by: VADIM RATNER VADIMRA@il.ibm.com Co-authored-by: VADIM RATNER VADIMRA@il.ibm.com Co-authored-by: VADIM RATNER VADIMRA@il.ibm.com Co-authored-by: VADIM RATNER VADIMRA@il.ibm.com --- ...with_aug_4272372_samples_balanced_1_1.json | 50 +++++++++++++++++++ .../cell_attributes_tokenizer.json | 50 +++++++++++++++++++ .../gene_tokenizer.json | 50 +++++++++++++++++++ .../t5_tokenizer_AA_special.json | 50 +++++++++++++++++++ ...with_aug_4272372_samples_balanced_1_1.json | 50 +++++++++++++++++++ .../cell_attributes_tokenizer.json | 50 +++++++++++++++++++ .../t5_tokenizer_AA_special.json | 50 +++++++++++++++++++ ...with_aug_4272372_samples_balanced_1_1.json | 50 +++++++++++++++++++ .../cell_attributes_tokenizer.json | 50 +++++++++++++++++++ .../t5_tokenizer_AA_special.json | 50 +++++++++++++++++++ .../modulartokenizer/special_tokens.py | 9 +++- 11 files changed, 507 insertions(+), 2 deletions(-) diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json index 60790c73..55ce4bd4 100644 --- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json +++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json @@ -2702,6 +2702,51 @@ "rstrip": false, "normalized": false, "special": true + }, + { + "id": 300, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 301, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 302, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 303, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 304, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true } ], "normalizer": null, @@ -3017,6 +3062,11 @@ "": 297, "": 298, "": 299, + "": 300, + "": 301, + "": 302, + "": 303, + "": 304, "#": 527, "%": 528, "(": 529, diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/cell_attributes_tokenizer.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/cell_attributes_tokenizer.json index a7d4e535..e5d2ec64 100644 --- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/cell_attributes_tokenizer.json +++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/cell_attributes_tokenizer.json @@ -2702,6 +2702,51 @@ "rstrip": false, "normalized": false, "special": true + }, + { + "id": 300, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 301, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 302, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 303, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 304, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true } ], "normalizer": null, @@ -3023,6 +3068,11 @@ "": 297, "": 298, "": 299, + "": 300, + "": 301, + "": 302, + "": 303, + "": 304, "[CL:0000499]": 3522, "[CL:2000060]": 3523, "[CL:0000235]": 3524, diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/gene_tokenizer.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/gene_tokenizer.json index 64375885..6a0ed97b 100644 --- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/gene_tokenizer.json +++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/gene_tokenizer.json @@ -2702,6 +2702,51 @@ "rstrip": false, "normalized": false, "special": true + }, + { + "id": 300, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 301, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 302, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 303, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 304, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true } ], "normalizer": null, @@ -3023,6 +3068,11 @@ "": 297, "": 298, "": 299, + "": 300, + "": 301, + "": 302, + "": 303, + "": 304, "[100130093]": 5000, "[100133445]": 5001, "[100286793]": 5002, diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/t5_tokenizer_AA_special.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/t5_tokenizer_AA_special.json index dbb3d5ee..ced94e24 100644 --- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/t5_tokenizer_AA_special.json +++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/t5_tokenizer_AA_special.json @@ -2702,6 +2702,51 @@ "rstrip": false, "normalized": false, "special": true + }, + { + "id": 300, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 301, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 302, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 303, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 304, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true } ], "normalizer": null, @@ -3023,6 +3068,11 @@ "": 297, "": 298, "": 299, + "": 300, + "": 301, + "": 302, + "": 303, + "": 304, "A": 501, "B": 502, "C": 503, diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json index 60790c73..55ce4bd4 100644 --- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json +++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json @@ -2702,6 +2702,51 @@ "rstrip": false, "normalized": false, "special": true + }, + { + "id": 300, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 301, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 302, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 303, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 304, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true } ], "normalizer": null, @@ -3017,6 +3062,11 @@ "": 297, "": 298, "": 299, + "": 300, + "": 301, + "": 302, + "": 303, + "": 304, "#": 527, "%": 528, "(": 529, diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/cell_attributes_tokenizer.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/cell_attributes_tokenizer.json index a7d4e535..e5d2ec64 100644 --- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/cell_attributes_tokenizer.json +++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/cell_attributes_tokenizer.json @@ -2702,6 +2702,51 @@ "rstrip": false, "normalized": false, "special": true + }, + { + "id": 300, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 301, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 302, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 303, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 304, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true } ], "normalizer": null, @@ -3023,6 +3068,11 @@ "": 297, "": 298, "": 299, + "": 300, + "": 301, + "": 302, + "": 303, + "": 304, "[CL:0000499]": 3522, "[CL:2000060]": 3523, "[CL:0000235]": 3524, diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/t5_tokenizer_AA_special.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/t5_tokenizer_AA_special.json index dbb3d5ee..ced94e24 100644 --- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/t5_tokenizer_AA_special.json +++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/t5_tokenizer_AA_special.json @@ -2702,6 +2702,51 @@ "rstrip": false, "normalized": false, "special": true + }, + { + "id": 300, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 301, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 302, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 303, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 304, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true } ], "normalizer": null, @@ -3023,6 +3068,11 @@ "": 297, "": 298, "": 299, + "": 300, + "": 301, + "": 302, + "": 303, + "": 304, "A": 501, "B": 502, "C": 503, diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json index 60790c73..55ce4bd4 100644 --- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json +++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json @@ -2702,6 +2702,51 @@ "rstrip": false, "normalized": false, "special": true + }, + { + "id": 300, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 301, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 302, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 303, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 304, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true } ], "normalizer": null, @@ -3017,6 +3062,11 @@ "": 297, "": 298, "": 299, + "": 300, + "": 301, + "": 302, + "": 303, + "": 304, "#": 527, "%": 528, "(": 529, diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/cell_attributes_tokenizer.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/cell_attributes_tokenizer.json index a7d4e535..e5d2ec64 100644 --- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/cell_attributes_tokenizer.json +++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/cell_attributes_tokenizer.json @@ -2702,6 +2702,51 @@ "rstrip": false, "normalized": false, "special": true + }, + { + "id": 300, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 301, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 302, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 303, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 304, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true } ], "normalizer": null, @@ -3023,6 +3068,11 @@ "": 297, "": 298, "": 299, + "": 300, + "": 301, + "": 302, + "": 303, + "": 304, "[CL:0000499]": 3522, "[CL:2000060]": 3523, "[CL:0000235]": 3524, diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/t5_tokenizer_AA_special.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/t5_tokenizer_AA_special.json index dbb3d5ee..ced94e24 100644 --- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/t5_tokenizer_AA_special.json +++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/t5_tokenizer_AA_special.json @@ -2702,6 +2702,51 @@ "rstrip": false, "normalized": false, "special": true + }, + { + "id": 300, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 301, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 302, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 303, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 304, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true } ], "normalizer": null, @@ -3023,6 +3068,11 @@ "": 297, "": 298, "": 299, + "": 300, + "": 301, + "": 302, + "": 303, + "": 304, "A": 501, "B": 502, "C": 503, diff --git a/fusedrug/data/tokenizer/modulartokenizer/special_tokens.py b/fusedrug/data/tokenizer/modulartokenizer/special_tokens.py index c6e102c0..5ef9159a 100644 --- a/fusedrug/data/tokenizer/modulartokenizer/special_tokens.py +++ b/fusedrug/data/tokenizer/modulartokenizer/special_tokens.py @@ -37,9 +37,14 @@ "MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR1", "MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR2", "MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR3", - "MOLECULAR_ENTITY_TCR_ALPHA_CHAIN", # TCR "light" chain - only V, J and C segments + "MOLECULAR_ENTITY_TCR_ALPHA_CHAIN", # TCR "light" chain - only V, J and C segments (variable region) "MOLECULAR_ENTITY_TCR_BETA_VDJ", # TCR "heavy" chain - V(ariable), D(iversity), and J(oining) segments, as well as the C(onstant) segment - "MOLECULAR_ENTITY_TCR_BETA_CDR3", + "MOLECULAR_ENTITY_TCR_BETA_CDR3", # TCR beta chain CDR3 region + "MOLECULAR_ENTITY_TCR_GAMMA_VAR", # TCR gamma chain variable region + "MOLECULAR_ENTITY_TCR_DELTA_VAR", # TCR delta chain variable region + "MOLECULAR_ENTITY_TCR_ALPHA_CDR3", # TCR alpha chain CDR3 region + "MOLECULAR_ENTITY_TCR_GAMMA_CDR3", # TCR gamma chain CDR3 region + "MOLECULAR_ENTITY_TCR_DELTA_CDR3", # TCR delta chain CDR3 region "TARGETED_ANTIBODY_DESIGN_ENCODER_ONLY_MODE", # A prefix to our T5 model to inform it that it will run in "encoder only" mode (so only # the encoder-stack is used, plus the encoder-output-tokens-classification-head) "DECODER_START",