Merge pull request #41 from adamoyoung/adamo5

Adding simulation stuff
pluskal-lab · Feb 10, 2025 · 00f5c03 · 00f5c03
2 parents f426a6c + 1c5fdd2
commit 00f5c03
Show file tree

Hide file tree

Showing 32 changed files with 6,861 additions and 1,743 deletions.
diff --git a/.gitignore b/.gitignore
@@ -136,3 +136,6 @@ dmypy.json
 
 # VSCode
 *.vscode
+
+# W&B
+wandb/
diff --git a/config/simulation/demo.yml b/config/simulation/demo.yml
@@ -0,0 +1,19 @@
+# wandb
+wandb_name: "demo"
+# data
+pth: "MassSpecGym.tsv"
+# candidates_pth: "molecules/MassSpecGym_retrieval_candidates_mass.json"
+split_type: "benchmark"
+subsample_frac: 0.01 # demo value; presumably 1.0 uses the full dataset (TODO confirm)
+# output
+mz_max: 1005.
+mz_bin_res: 0.01
+ints_transform: "sqrt"
+# model
+model_type: "fp"
+# optimization
+max_epochs: 1 # demo value; the fp/gnn configs use 100
+# other
+accelerator: "cpu"
+num_workers: 0
+save_ckpt: False
diff --git a/config/simulation/fp_formula.yml b/config/simulation/fp_formula.yml
@@ -0,0 +1,17 @@
+# wandb
+wandb_name: "fp_formula"
+# data
+pth: "MassSpecGym.tsv"
+candidates_pth: "molecules/MassSpecGym_retrieval_candidates_formula.json"
+split_type: "benchmark"
+# output
+mz_max: 1005.
+mz_bin_res: 0.01
+ints_transform: "sqrt"
+# model
+model_type: "fp"
+# optimization
+max_epochs: 100
+# other
+accelerator: "gpu"
+save_ckpt: True
diff --git a/config/simulation/fp_mass.yml b/config/simulation/fp_mass.yml
@@ -0,0 +1,17 @@
+# wandb
+wandb_name: "fp_mass"
+# data
+pth: "MassSpecGym.tsv"
+candidates_pth: "molecules/MassSpecGym_retrieval_candidates_mass.json"
+split_type: "benchmark"
+# output
+mz_max: 1005.
+mz_bin_res: 0.01
+ints_transform: "sqrt"
+# model
+model_type: "fp"
+# optimization
+max_epochs: 100
+# other
+accelerator: "gpu"
+save_ckpt: True
diff --git a/config/simulation/fp_noret.yml b/config/simulation/fp_noret.yml
@@ -0,0 +1,16 @@
+# wandb
+wandb_name: "fp_noret"
+# data
+pth: "MassSpecGym.tsv"
+split_type: "benchmark"
+# output
+mz_max: 1005.
+mz_bin_res: 0.01
+ints_transform: "sqrt"
+# model
+model_type: "fp"
+# optimization
+max_epochs: 100
+# other
+accelerator: "gpu"
+do_retrieval: False
diff --git a/config/simulation/gnn_formula.yml b/config/simulation/gnn_formula.yml
@@ -0,0 +1,17 @@
+# wandb
+wandb_name: "gnn_formula"
+# data
+pth: "MassSpecGym.tsv"
+candidates_pth: "molecules/MassSpecGym_retrieval_candidates_formula.json"
+split_type: "benchmark"
+# output
+mz_max: 1005.
+mz_bin_res: 0.01
+ints_transform: "sqrt"
+# model
+model_type: "gnn"
+# optimization
+max_epochs: 100
+# other
+accelerator: "gpu"
+save_ckpt: True
diff --git a/config/simulation/gnn_mass.yml b/config/simulation/gnn_mass.yml
@@ -0,0 +1,17 @@
+# wandb
+wandb_name: "gnn_mass"
+# data
+pth: "MassSpecGym.tsv"
+candidates_pth: "molecules/MassSpecGym_retrieval_candidates_mass.json"
+split_type: "benchmark"
+# output
+mz_max: 1005.
+mz_bin_res: 0.01
+ints_transform: "sqrt"
+# model
+model_type: "gnn"
+# optimization
+max_epochs: 100
+# other
+accelerator: "gpu"
+save_ckpt: True
diff --git a/config/simulation/gnn_noret.yml b/config/simulation/gnn_noret.yml
@@ -0,0 +1,16 @@
+# wandb
+wandb_name: "gnn_noret"
+# data
+pth: "MassSpecGym.tsv"
+split_type: "benchmark"
+# output
+mz_max: 1005.
+mz_bin_res: 0.01
+ints_transform: "sqrt"
+# model
+model_type: "gnn"
+# optimization
+max_epochs: 100
+# other
+accelerator: "gpu"
+do_retrieval: False
diff --git a/config/simulation/preconly_formula.yml b/config/simulation/preconly_formula.yml
@@ -0,0 +1,17 @@
+# wandb
+wandb_name: "preconly_formula"
+# data
+pth: "MassSpecGym.tsv"
+candidates_pth: "molecules/MassSpecGym_retrieval_candidates_formula.json"
+split_type: "benchmark"
+# output
+mz_max: 1005.
+mz_bin_res: 0.01
+ints_transform: "none"
+# model
+model_type: "prec_only"
+# optimization
+max_epochs: 1
+# other
+accelerator: "gpu"
+save_ckpt: True
diff --git a/config/simulation/preconly_mass.yml b/config/simulation/preconly_mass.yml
@@ -0,0 +1,17 @@
+# wandb
+wandb_name: "preconly_mass"
+# data
+pth: "MassSpecGym.tsv"
+candidates_pth: "molecules/MassSpecGym_retrieval_candidates_mass.json"
+split_type: "benchmark"
+# output
+mz_max: 1005.
+mz_bin_res: 0.01
+ints_transform: "none"
+# model
+model_type: "prec_only"
+# optimization
+max_epochs: 1
+# other
+accelerator: "gpu"
+save_ckpt: True
diff --git a/config/simulation/preconly_noret.yml b/config/simulation/preconly_noret.yml
@@ -0,0 +1,16 @@
+# wandb
+wandb_name: "preconly_noret"
+# data
+pth: "MassSpecGym.tsv"
+split_type: "benchmark"
+# output
+mz_max: 1005.
+mz_bin_res: 0.01
+ints_transform: "none"
+# model
+model_type: "prec_only"
+# optimization
+max_epochs: 1
+# other
+accelerator: "gpu"
+do_retrieval: False
diff --git a/config/simulation/template.yml b/config/simulation/template.yml
@@ -0,0 +1,74 @@
+# wandb
+wandb_entity: "your_wandb_entity" # TODO: change this to your W&B entity
+wandb_project: "your_wandb_project" # TODO: change this to your W&B project
+wandb_name: "template"
+# data
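+pth: # TODO: add path to the dataset file, e.g. "MassSpecGym.tsv"
+candidates_pth: # TODO: add path to the retrieval candidates file, e.g. "molecules/MassSpecGym_retrieval_candidates_mass.json"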
+meta_keys: ["adduct","precursor_mz","instrument_type","collision_energy"]
+fp_types: ["morgan","maccs","rdkit"]
+adducts: ["[M+H]+"]
+instrument_types: ["QTOF","QFT","Orbitrap","ITFT"]
+max_collision_energy: 200.
+mz_from: 10.
+mz_to: 1000.
+split_type: "benchmark"
+subsample_frac: 
+# input
+metadata_insert_location: "mlp"
+collision_energy_insert_size: 16
+adduct_insert_size: 16
+instrument_type_insert_size: 16
+# output
+mz_max: 1005.
+mz_bin_res: 0.1
+ints_transform: "none"
+# model
+model_type: "fp"
+mlp_hidden_size: 1024
+mlp_dropout: 0.1
+mlp_num_layers: 4
+mlp_use_residuals: True
+ff_prec_mz_offset: 5
+ff_bidirectional: True
+ff_output_map_size: 256
+mol_hidden_size: 256
+mol_num_layers: 4
+mol_gnn_type: GINE
+mol_normalization: batch
+mol_dropout: 0.2
+mol_pool_type: mean
+# optimization
+lr: 0.0003
+lr_schedule: False
+lr_decay_rate: 0.0
+lr_warmup_steps: 1000
+lr_decay_steps: 5000
+weight_decay: 0.0000001
+train_sample_weight: False #True
+eval_sample_weight: False #True
+batch_size: 128
+max_epochs: 100
+drop_last: False
+gradient_clip_val: 0.0
+gradient_clip_algorithm:
+optimizer_type: "adam"
+# other
+num_workers: 8
+accelerator: "cpu"
+log_every_n_steps: 1
+seed: 420
+cache_feats: False
+mp_sharing_strategy: "file_system"
+do_retrieval: True
+retrieval_batch_size: 8
+at_ks: [1, 5, 20]
+pin_memory: True
+persistent_workers: True
+sim_metrics:
+  - cos_sim
+  - js_sim
+  - cos_sim_sqrt
+  - cos_sim_obj
+save_df_test: True
+save_ckpt: False
-Original file line number
+Diff line change
@@ -136,3 +136,6 @@ dmypy.json
     # VSCode
     *.vscode
+    # W&B
+    wandb/