From 7fc56d97ffa91757ab4fd5af9421bc40fe0ffacb Mon Sep 17 00:00:00 2001
From: floccinauc <92428874+floccinauc@users.noreply.github.com>
Date: Sun, 2 Jun 2024 13:58:38 +0300
Subject: [PATCH] TCR chain type special tokens (#127)

Co-authored-by: VADIM RATNER VADIMRA@il.ibm.com <vadimra@cccxl014.pok.ibm.com>
Co-authored-by: VADIM RATNER VADIMRA@il.ibm.com <vadimra@cccxc563.pok.ibm.com>
Co-authored-by: VADIM RATNER VADIMRA@il.ibm.com <vadimra@cccxc405.pok.ibm.com>
Co-authored-by: VADIM RATNER VADIMRA@il.ibm.com <vadimra@cccxc436.pok.ibm.com>
Co-authored-by: VADIM RATNER VADIMRA@il.ibm.com <vadimra@cccxc430.pok.ibm.com>
Co-authored-by: VADIM RATNER VADIMRA@il.ibm.com <vadimra@cccxc524.pok.ibm.com>
Co-authored-by: VADIM RATNER VADIMRA@il.ibm.com <vadimra@cccxl012.pok.ibm.com>
Co-authored-by: VADIM RATNER VADIMRA@il.ibm.com <vadimra@cccxc438.pok.ibm.com>
---
 ...with_aug_4272372_samples_balanced_1_1.json | 50 +++++++++++++++++++
 .../cell_attributes_tokenizer.json            | 50 +++++++++++++++++++
 .../gene_tokenizer.json                       | 50 +++++++++++++++++++
 .../t5_tokenizer_AA_special.json              | 50 +++++++++++++++++++
 ...with_aug_4272372_samples_balanced_1_1.json | 50 +++++++++++++++++++
 .../cell_attributes_tokenizer.json            | 50 +++++++++++++++++++
 .../t5_tokenizer_AA_special.json              | 50 +++++++++++++++++++
 ...with_aug_4272372_samples_balanced_1_1.json | 50 +++++++++++++++++++
 .../cell_attributes_tokenizer.json            | 50 +++++++++++++++++++
 .../t5_tokenizer_AA_special.json              | 50 +++++++++++++++++++
 .../modulartokenizer/special_tokens.py        |  9 +++-
 11 files changed, 507 insertions(+), 2 deletions(-)
diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json
index 60790c73..55ce4bd4 100644
--- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json
+++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json
@@ -2702,6 +2702,51 @@
       "rstrip": false,
       "normalized": false,
       "special": true
+    },
+    {
+      "id": 300,
+      "content": "<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 301,
+      "content": "<MOLECULAR_ENTITY_TCR_DELTA_CDR3>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 302,
+      "content": "<MOLECULAR_ENTITY_TCR_DELTA_VAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 303,
+      "content": "<MOLECULAR_ENTITY_TCR_GAMMA_CDR3>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 304,
+      "content": "<MOLECULAR_ENTITY_TCR_GAMMA_VAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
     }
   ],
   "normalizer": null,
@@ -3017,6 +3062,11 @@
       "<GENERAL_CHAIN>": 297,
       "<SUBMOLECULAR_ENTITY>": 298,
       "<MUTATED>": 299,
+      "<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
+      "<MOLECULAR_ENTITY_TCR_DELTA_CDR3>": 301,
+      "<MOLECULAR_ENTITY_TCR_DELTA_VAR>": 302,
+      "<MOLECULAR_ENTITY_TCR_GAMMA_CDR3>": 303,
+      "<MOLECULAR_ENTITY_TCR_GAMMA_VAR>": 304,
       "#": 527,
       "%": 528,
       "(": 529,
diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/cell_attributes_tokenizer.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/cell_attributes_tokenizer.json
index a7d4e535..e5d2ec64 100644
--- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/cell_attributes_tokenizer.json
+++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/cell_attributes_tokenizer.json
@@ -2702,6 +2702,51 @@
       "rstrip": false,
       "normalized": false,
       "special": true
+    },
+    {
+      "id": 300,
+      "content": "<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 301,
+      "content": "<MOLECULAR_ENTITY_TCR_DELTA_CDR3>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 302,
+      "content": "<MOLECULAR_ENTITY_TCR_DELTA_VAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 303,
+      "content": "<MOLECULAR_ENTITY_TCR_GAMMA_CDR3>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 304,
+      "content": "<MOLECULAR_ENTITY_TCR_GAMMA_VAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
     }
   ],
   "normalizer": null,
@@ -3023,6 +3068,11 @@
       "<GENERAL_CHAIN>": 297,
       "<SUBMOLECULAR_ENTITY>": 298,
       "<MUTATED>": 299,
+      "<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
+      "<MOLECULAR_ENTITY_TCR_DELTA_CDR3>": 301,
+      "<MOLECULAR_ENTITY_TCR_DELTA_VAR>": 302,
+      "<MOLECULAR_ENTITY_TCR_GAMMA_CDR3>": 303,
+      "<MOLECULAR_ENTITY_TCR_GAMMA_VAR>": 304,
       "[CL:0000499]": 3522,
       "[CL:2000060]": 3523,
       "[CL:0000235]": 3524,
diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/gene_tokenizer.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/gene_tokenizer.json
index 64375885..6a0ed97b 100644
--- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/gene_tokenizer.json
+++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/gene_tokenizer.json
@@ -2702,6 +2702,51 @@
       "rstrip": false,
       "normalized": false,
       "special": true
+    },
+    {
+      "id": 300,
+      "content": "<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 301,
+      "content": "<MOLECULAR_ENTITY_TCR_DELTA_CDR3>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 302,
+      "content": "<MOLECULAR_ENTITY_TCR_DELTA_VAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 303,
+      "content": "<MOLECULAR_ENTITY_TCR_GAMMA_CDR3>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 304,
+      "content": "<MOLECULAR_ENTITY_TCR_GAMMA_VAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
     }
   ],
   "normalizer": null,
@@ -3023,6 +3068,11 @@
       "<GENERAL_CHAIN>": 297,
       "<SUBMOLECULAR_ENTITY>": 298,
       "<MUTATED>": 299,
+      "<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
+      "<MOLECULAR_ENTITY_TCR_DELTA_CDR3>": 301,
+      "<MOLECULAR_ENTITY_TCR_DELTA_VAR>": 302,
+      "<MOLECULAR_ENTITY_TCR_GAMMA_CDR3>": 303,
+      "<MOLECULAR_ENTITY_TCR_GAMMA_VAR>": 304,
       "[100130093]": 5000,
       "[100133445]": 5001,
       "[100286793]": 5002,
diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/t5_tokenizer_AA_special.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/t5_tokenizer_AA_special.json
index dbb3d5ee..ced94e24 100644
--- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/t5_tokenizer_AA_special.json
+++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/t5_tokenizer_AA_special.json
@@ -2702,6 +2702,51 @@
       "rstrip": false,
       "normalized": false,
       "special": true
+    },
+    {
+      "id": 300,
+      "content": "<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 301,
+      "content": "<MOLECULAR_ENTITY_TCR_DELTA_CDR3>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 302,
+      "content": "<MOLECULAR_ENTITY_TCR_DELTA_VAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 303,
+      "content": "<MOLECULAR_ENTITY_TCR_GAMMA_CDR3>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 304,
+      "content": "<MOLECULAR_ENTITY_TCR_GAMMA_VAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
     }
   ],
   "normalizer": null,
@@ -3023,6 +3068,11 @@
       "<GENERAL_CHAIN>": 297,
       "<SUBMOLECULAR_ENTITY>": 298,
       "<MUTATED>": 299,
+      "<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
+      "<MOLECULAR_ENTITY_TCR_DELTA_CDR3>": 301,
+      "<MOLECULAR_ENTITY_TCR_DELTA_VAR>": 302,
+      "<MOLECULAR_ENTITY_TCR_GAMMA_CDR3>": 303,
+      "<MOLECULAR_ENTITY_TCR_GAMMA_VAR>": 304,
       "A": 501,
       "B": 502,
       "C": 503,
diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json
index 60790c73..55ce4bd4 100644
--- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json
+++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json
@@ -2702,6 +2702,51 @@
       "rstrip": false,
       "normalized": false,
       "special": true
+    },
+    {
+      "id": 300,
+      "content": "<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 301,
+      "content": "<MOLECULAR_ENTITY_TCR_DELTA_CDR3>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 302,
+      "content": "<MOLECULAR_ENTITY_TCR_DELTA_VAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 303,
+      "content": "<MOLECULAR_ENTITY_TCR_GAMMA_CDR3>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 304,
+      "content": "<MOLECULAR_ENTITY_TCR_GAMMA_VAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
     }
   ],
   "normalizer": null,
@@ -3017,6 +3062,11 @@
       "<GENERAL_CHAIN>": 297,
       "<SUBMOLECULAR_ENTITY>": 298,
       "<MUTATED>": 299,
+      "<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
+      "<MOLECULAR_ENTITY_TCR_DELTA_CDR3>": 301,
+      "<MOLECULAR_ENTITY_TCR_DELTA_VAR>": 302,
+      "<MOLECULAR_ENTITY_TCR_GAMMA_CDR3>": 303,
+      "<MOLECULAR_ENTITY_TCR_GAMMA_VAR>": 304,
       "#": 527,
       "%": 528,
       "(": 529,
diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/cell_attributes_tokenizer.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/cell_attributes_tokenizer.json
index a7d4e535..e5d2ec64 100644
--- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/cell_attributes_tokenizer.json
+++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/cell_attributes_tokenizer.json
@@ -2702,6 +2702,51 @@
       "rstrip": false,
       "normalized": false,
       "special": true
+    },
+    {
+      "id": 300,
+      "content": "<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 301,
+      "content": "<MOLECULAR_ENTITY_TCR_DELTA_CDR3>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 302,
+      "content": "<MOLECULAR_ENTITY_TCR_DELTA_VAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 303,
+      "content": "<MOLECULAR_ENTITY_TCR_GAMMA_CDR3>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 304,
+      "content": "<MOLECULAR_ENTITY_TCR_GAMMA_VAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
     }
   ],
   "normalizer": null,
@@ -3023,6 +3068,11 @@
       "<GENERAL_CHAIN>": 297,
       "<SUBMOLECULAR_ENTITY>": 298,
       "<MUTATED>": 299,
+      "<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
+      "<MOLECULAR_ENTITY_TCR_DELTA_CDR3>": 301,
+      "<MOLECULAR_ENTITY_TCR_DELTA_VAR>": 302,
+      "<MOLECULAR_ENTITY_TCR_GAMMA_CDR3>": 303,
+      "<MOLECULAR_ENTITY_TCR_GAMMA_VAR>": 304,
       "[CL:0000499]": 3522,
       "[CL:2000060]": 3523,
       "[CL:0000235]": 3524,
diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/t5_tokenizer_AA_special.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/t5_tokenizer_AA_special.json
index dbb3d5ee..ced94e24 100644
--- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/t5_tokenizer_AA_special.json
+++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/t5_tokenizer_AA_special.json
@@ -2702,6 +2702,51 @@
       "rstrip": false,
       "normalized": false,
       "special": true
+    },
+    {
+      "id": 300,
+      "content": "<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 301,
+      "content": "<MOLECULAR_ENTITY_TCR_DELTA_CDR3>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 302,
+      "content": "<MOLECULAR_ENTITY_TCR_DELTA_VAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 303,
+      "content": "<MOLECULAR_ENTITY_TCR_GAMMA_CDR3>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 304,
+      "content": "<MOLECULAR_ENTITY_TCR_GAMMA_VAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
     }
   ],
   "normalizer": null,
@@ -3023,6 +3068,11 @@
       "<GENERAL_CHAIN>": 297,
       "<SUBMOLECULAR_ENTITY>": 298,
       "<MUTATED>": 299,
+      "<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
+      "<MOLECULAR_ENTITY_TCR_DELTA_CDR3>": 301,
+      "<MOLECULAR_ENTITY_TCR_DELTA_VAR>": 302,
+      "<MOLECULAR_ENTITY_TCR_GAMMA_CDR3>": 303,
+      "<MOLECULAR_ENTITY_TCR_GAMMA_VAR>": 304,
       "A": 501,
       "B": 502,
       "C": 503,
diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json
index 60790c73..55ce4bd4 100644
--- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json
+++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json
@@ -2702,6 +2702,51 @@
       "rstrip": false,
       "normalized": false,
       "special": true
+    },
+    {
+      "id": 300,
+      "content": "<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 301,
+      "content": "<MOLECULAR_ENTITY_TCR_DELTA_CDR3>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 302,
+      "content": "<MOLECULAR_ENTITY_TCR_DELTA_VAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 303,
+      "content": "<MOLECULAR_ENTITY_TCR_GAMMA_CDR3>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 304,
+      "content": "<MOLECULAR_ENTITY_TCR_GAMMA_VAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
     }
   ],
   "normalizer": null,
@@ -3017,6 +3062,11 @@
       "<GENERAL_CHAIN>": 297,
       "<SUBMOLECULAR_ENTITY>": 298,
       "<MUTATED>": 299,
+      "<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
+      "<MOLECULAR_ENTITY_TCR_DELTA_CDR3>": 301,
+      "<MOLECULAR_ENTITY_TCR_DELTA_VAR>": 302,
+      "<MOLECULAR_ENTITY_TCR_GAMMA_CDR3>": 303,
+      "<MOLECULAR_ENTITY_TCR_GAMMA_VAR>": 304,
       "#": 527,
       "%": 528,
       "(": 529,
diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/cell_attributes_tokenizer.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/cell_attributes_tokenizer.json
index a7d4e535..e5d2ec64 100644
--- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/cell_attributes_tokenizer.json
+++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/cell_attributes_tokenizer.json
@@ -2702,6 +2702,51 @@
       "rstrip": false,
       "normalized": false,
       "special": true
+    },
+    {
+      "id": 300,
+      "content": "<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 301,
+      "content": "<MOLECULAR_ENTITY_TCR_DELTA_CDR3>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 302,
+      "content": "<MOLECULAR_ENTITY_TCR_DELTA_VAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 303,
+      "content": "<MOLECULAR_ENTITY_TCR_GAMMA_CDR3>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 304,
+      "content": "<MOLECULAR_ENTITY_TCR_GAMMA_VAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
     }
   ],
   "normalizer": null,
@@ -3023,6 +3068,11 @@
       "<GENERAL_CHAIN>": 297,
       "<SUBMOLECULAR_ENTITY>": 298,
       "<MUTATED>": 299,
+      "<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
+      "<MOLECULAR_ENTITY_TCR_DELTA_CDR3>": 301,
+      "<MOLECULAR_ENTITY_TCR_DELTA_VAR>": 302,
+      "<MOLECULAR_ENTITY_TCR_GAMMA_CDR3>": 303,
+      "<MOLECULAR_ENTITY_TCR_GAMMA_VAR>": 304,
       "[CL:0000499]": 3522,
       "[CL:2000060]": 3523,
       "[CL:0000235]": 3524,
diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/t5_tokenizer_AA_special.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/t5_tokenizer_AA_special.json
index dbb3d5ee..ced94e24 100644
--- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/t5_tokenizer_AA_special.json
+++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/t5_tokenizer_AA_special.json
@@ -2702,6 +2702,51 @@
       "rstrip": false,
       "normalized": false,
       "special": true
+    },
+    {
+      "id": 300,
+      "content": "<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 301,
+      "content": "<MOLECULAR_ENTITY_TCR_DELTA_CDR3>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 302,
+      "content": "<MOLECULAR_ENTITY_TCR_DELTA_VAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 303,
+      "content": "<MOLECULAR_ENTITY_TCR_GAMMA_CDR3>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 304,
+      "content": "<MOLECULAR_ENTITY_TCR_GAMMA_VAR>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
     }
   ],
   "normalizer": null,
@@ -3023,6 +3068,11 @@
       "<GENERAL_CHAIN>": 297,
       "<SUBMOLECULAR_ENTITY>": 298,
       "<MUTATED>": 299,
+      "<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
+      "<MOLECULAR_ENTITY_TCR_DELTA_CDR3>": 301,
+      "<MOLECULAR_ENTITY_TCR_DELTA_VAR>": 302,
+      "<MOLECULAR_ENTITY_TCR_GAMMA_CDR3>": 303,
+      "<MOLECULAR_ENTITY_TCR_GAMMA_VAR>": 304,
       "A": 501,
       "B": 502,
       "C": 503,
diff --git a/fusedrug/data/tokenizer/modulartokenizer/special_tokens.py b/fusedrug/data/tokenizer/modulartokenizer/special_tokens.py
index c6e102c0..5ef9159a 100644
--- a/fusedrug/data/tokenizer/modulartokenizer/special_tokens.py
+++ b/fusedrug/data/tokenizer/modulartokenizer/special_tokens.py
@@ -37,9 +37,14 @@
     "MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR1",
     "MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR2",
     "MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR3",
-    "MOLECULAR_ENTITY_TCR_ALPHA_CHAIN",  # TCR "light" chain - only V, J and C segments
+    "MOLECULAR_ENTITY_TCR_ALPHA_CHAIN",  # TCR "light" chain (variable region) - only V(ariable), J(oining) and C(onstant) segments, i.e. no D(iversity) segment
     "MOLECULAR_ENTITY_TCR_BETA_VDJ",  # TCR "heavy" chain - V(ariable), D(iversity), and J(oining) segments, as well as the C(onstant) segment
-    "MOLECULAR_ENTITY_TCR_BETA_CDR3",
+    "MOLECULAR_ENTITY_TCR_BETA_CDR3",  # TCR beta chain CDR3 region
+    "MOLECULAR_ENTITY_TCR_GAMMA_VAR",  # TCR gamma chain variable region
+    "MOLECULAR_ENTITY_TCR_DELTA_VAR",  # TCR delta chain variable region
+    "MOLECULAR_ENTITY_TCR_ALPHA_CDR3",  # TCR alpha chain CDR3 region
+    "MOLECULAR_ENTITY_TCR_GAMMA_CDR3",  # TCR gamma chain CDR3 region
+    "MOLECULAR_ENTITY_TCR_DELTA_CDR3",  # TCR delta chain CDR3 region
     "TARGETED_ANTIBODY_DESIGN_ENCODER_ONLY_MODE",  # A prefix to our T5 model to inform it that it will run in "encoder only" mode (so only
     # the encoder-stack is used, plus the encoder-output-tokens-classification-head)
     "DECODER_START",