diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json index 60790c73..18ba1702 100644 --- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json +++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json @@ -2702,6 +2702,51 @@ "rstrip": false, "normalized": false, "special": true + }, + { + "id": 300, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 301, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 302, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 303, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 304, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true } ], "normalizer": null, @@ -3017,6 +3062,11 @@ "": 297, "": 298, "": 299, + "": 300, + "": 301, + "": 302, + "": 303, + "": 304, "#": 527, "%": 528, "(": 529, @@ -8958,4 +9008,4 @@ "c2n c3n(" ] } -} +} \ No newline at end of file diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/cell_attributes_tokenizer.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/cell_attributes_tokenizer.json index a7d4e535..75b888e2 100644 --- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/cell_attributes_tokenizer.json +++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/cell_attributes_tokenizer.json @@ -2702,6 +2702,51 @@ "rstrip": false, "normalized": false, "special": true + }, + { + "id": 300, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 301, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 302, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 303, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 304, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true } ], "normalizer": null, @@ -3023,6 +3068,11 @@ "": 297, "": 298, "": 299, + "": 300, + "": 301, + "": 302, + "": 303, + "": 304, "[CL:0000499]": 3522, "[CL:2000060]": 3523, "[CL:0000235]": 3524, @@ -3815,4 +3865,4 @@ }, "unk_token": "" } -} +} \ No newline at end of file diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/gene_tokenizer.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/gene_tokenizer.json index 64375885..6f1c56d7 100644 --- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/gene_tokenizer.json +++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/gene_tokenizer.json @@ -2702,6 +2702,51 @@ "rstrip": false, "normalized": false, "special": true + }, + { + "id": 300, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 301, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 302, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 303, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 304, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true } ], "normalizer": null, @@ -3023,6 +3068,11 @@ "": 297, "": 298, "": 299, + "": 300, + "": 301, + "": 302, + "": 303, + "": 304, "[100130093]": 5000, "[100133445]": 5001, "[100286793]": 5002, @@ -97930,4 +97980,4 @@ }, "unk_token": "" } -} +} \ No newline at end of file diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/t5_tokenizer_AA_special.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/t5_tokenizer_AA_special.json index dbb3d5ee..4322238a 100644 --- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/t5_tokenizer_AA_special.json +++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_extended_modular_tokenizer/t5_tokenizer_AA_special.json @@ -2702,6 +2702,51 @@ "rstrip": false, "normalized": false, "special": true + }, + { + "id": 300, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 301, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 302, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 303, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 304, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true } ], "normalizer": null, @@ -3023,6 +3068,11 @@ "": 297, "": 298, "": 299, + "": 300, + "": 301, + "": 302, + "": 303, + "": 304, "A": 501, "B": 502, "C": 503, @@ -3052,4 +3102,4 @@ }, "unk_token": "" } -} +} \ No newline at end of file diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json index 60790c73..18ba1702 100644 --- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json +++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json @@ -2702,6 +2702,51 @@ "rstrip": false, "normalized": false, "special": true + }, + { + "id": 300, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 301, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 302, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 303, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 304, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true } ], "normalizer": null, @@ -3017,6 +3062,11 @@ "": 297, "": 298, "": 299, + "": 300, + "": 301, + "": 302, + "": 303, + "": 304, "#": 527, "%": 528, "(": 529, @@ -8958,4 +9008,4 @@ "c2n c3n(" ] } -} +} \ No newline at end of file diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/cell_attributes_tokenizer.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/cell_attributes_tokenizer.json index a7d4e535..75b888e2 100644 --- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/cell_attributes_tokenizer.json +++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/cell_attributes_tokenizer.json @@ -2702,6 +2702,51 @@ "rstrip": false, "normalized": false, "special": true + }, + { + "id": 300, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 301, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 302, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 303, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 304, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true } ], "normalizer": null, @@ -3023,6 +3068,11 @@ "": 297, "": 298, "": 299, + "": 300, + "": 301, + "": 302, + "": 303, + "": 304, "[CL:0000499]": 3522, "[CL:2000060]": 3523, "[CL:0000235]": 3524, @@ -3815,4 +3865,4 @@ }, "unk_token": "" } -} +} \ No newline at end of file diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/t5_tokenizer_AA_special.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/t5_tokenizer_AA_special.json index dbb3d5ee..4322238a 100644 --- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/t5_tokenizer_AA_special.json +++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/bmfm_modular_tokenizer/t5_tokenizer_AA_special.json @@ -2702,6 +2702,51 @@ "rstrip": false, "normalized": false, "special": true + }, + { + "id": 300, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 301, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 302, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 303, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 304, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true } ], "normalizer": null, @@ -3023,6 +3068,11 @@ "": 297, "": 298, "": 299, + "": 300, + "": 301, + "": 302, + "": 303, + "": 304, "A": 501, "B": 502, "C": 503, @@ -3052,4 +3102,4 @@ }, "unk_token": "" } -} +} \ No newline at end of file diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json index 60790c73..18ba1702 100644 --- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json +++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json @@ -2702,6 +2702,51 @@ "rstrip": false, "normalized": false, "special": true + }, + { + "id": 300, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 301, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 302, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 303, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 304, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true } ], "normalizer": null, @@ -3017,6 +3062,11 @@ "": 297, "": 298, "": 299, + "": 300, + "": 301, + "": 302, + "": 303, + "": 304, "#": 527, "%": 528, "(": 529, @@ -8958,4 +9008,4 @@ "c2n c3n(" ] } -} +} \ No newline at end of file diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/cell_attributes_tokenizer.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/cell_attributes_tokenizer.json index a7d4e535..75b888e2 100644 --- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/cell_attributes_tokenizer.json +++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/cell_attributes_tokenizer.json @@ -2702,6 +2702,51 @@ "rstrip": false, "normalized": false, "special": true + }, + { + "id": 300, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 301, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 302, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 303, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 304, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true } ], "normalizer": null, @@ -3023,6 +3068,11 @@ "": 297, "": 298, "": 299, + "": 300, + "": 301, + "": 302, + "": 303, + "": 304, "[CL:0000499]": 3522, "[CL:2000060]": 3523, "[CL:0000235]": 3524, @@ -3815,4 +3865,4 @@ }, "unk_token": "" } -} +} \ No newline at end of file diff --git a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/t5_tokenizer_AA_special.json b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/t5_tokenizer_AA_special.json index dbb3d5ee..4322238a 100644 --- a/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/t5_tokenizer_AA_special.json +++ b/fusedrug/data/tokenizer/modulartokenizer/pretrained_tokenizers/modular_AA_SMILES_single_path/t5_tokenizer_AA_special.json @@ -2702,6 +2702,51 @@ "rstrip": false, "normalized": false, "special": true + }, + { + "id": 300, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 301, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 302, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 303, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 304, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true } ], "normalizer": null, @@ -3023,6 +3068,11 @@ "": 297, "": 298, "": 299, + "": 300, + "": 301, + "": 302, + "": 303, + "": 304, "A": 501, "B": 502, "C": 503, @@ -3052,4 +3102,4 @@ }, "unk_token": "" } -} +} \ No newline at end of file