Syncs repos after successful new Trainer experiment, Data Augmentation and Portfolio sample #11

Merged
307 commits merged on Sep 6, 2023
Changes from 250 commits

Commits (307)
c4bd743
turn argument to option
Jul 18, 2023
1660ef0
revert separate preprocess command
Jul 18, 2023
4c6152c
Revert "revert separate preprocess command"
Jul 18, 2023
8a8d52f
move cache disable inside preproc
Jul 18, 2023
7e3c2d3
add timings
Jul 18, 2023
5ba41fc
Speed improvements on preprocessing
Jul 19, 2023
1628362
Adds benchmarking utility
Jul 19, 2023
669d8e2
Cleaning. Training using preprocessing output
Jul 20, 2023
9d41a3f
Cleaning. Training using preprocessing output
Jul 21, 2023
273c705
Cleaning. Training using preprocessing output
Jul 21, 2023
a774e72
Improving some steps
Jul 21, 2023
f8a10df
Improving some steps
Jul 21, 2023
3c99d25
Improving some steps
Jul 21, 2023
f5af79b
Improving some steps
Jul 21, 2023
256e07c
Improving save_to_disk
Jul 21, 2023
ee18b75
Improving save_to_disk
Jul 21, 2023
daf94fa
Removing save_to_disk
Jul 21, 2023
7e6a2b0
Adapts training to call preprocess
Jul 21, 2023
f804517
Adapts training to call preprocess
Jul 21, 2023
18fb506
Adapts training to call preprocess
Jul 21, 2023
60d05cc
Adapts training to call preprocess
Jul 21, 2023
afe04be
Allows pretraining+saving or just pretraining before train
Jul 21, 2023
6837af7
Cleaning and rearranging
Jul 24, 2023
feb0037
Updates train.sh
Jul 24, 2023
bc3cd22
Adds allMesh_2021 jsonl version
Jul 24, 2023
c383ea1
Ignores .jsonl
Jul 24, 2023
e49366b
Fixes preprocessing unit tests
Jul 24, 2023
c6c3b5b
Adapts preprocessing tests to the style of other tests
Jul 24, 2023
c8dffc2
Fixes tests in train. Adds more tests.
Jul 24, 2023
2501082
Adds some comments
Jul 24, 2023
c963136
Reformatting
josejuanmartinez Jul 24, 2023
b015f49
Reformatting
Jul 24, 2023
13822a8
Reformatting
Jul 24, 2023
3ae6143
Reformatting
Jul 24, 2023
689f323
Reformatting
Jul 24, 2023
4242884
Reformatting
Jul 24, 2023
78aebe7
Reformatting
Jul 24, 2023
55e427b
Reformatting
Jul 24, 2023
d9fd220
Reformatting
Jul 24, 2023
4db6fcf
Updates the Readme
Jul 24, 2023
d3365d0
Merge branch '3-improve-trainer' into 3-improve-trainer-juan
josejuanmartinez Jul 24, 2023
8ddb664
Reformatting
Jul 24, 2023
ac4e025
Merge pull request #12 from MantisAI/11-add-common-terms-column
Jul 26, 2023
7c60c22
Fixes training multigpu
Jul 28, 2023
f3254e9
Fixes train.sh
Jul 28, 2023
c791f8d
Merges duplicated code
Jul 28, 2023
02fcef9
Merge remote-tracking branch 'origin/3-improve-trainer-juan' into 3-i…
Jul 28, 2023
f571f58
Adds label2id to config
Jul 28, 2023
c380c62
Fixes train.sh
josejuanmartinez Jul 28, 2023
8a8aa69
Removes exit
Jul 28, 2023
5e0e8a4
Adds wandb param to train.sh
Jul 28, 2023
2b1191d
Adds report_to none for non-wandb
Jul 28, 2023
0f53dfc
Updates train.sh
josejuanmartinez Jul 28, 2023
df49e1e
Updates README.md
Jul 28, 2023
6fae3bd
Black reformatting
Jul 28, 2023
ecf213b
Modifies training
josejuanmartinez Jul 31, 2023
c31f29f
Merges train.sh
josejuanmartinez Jul 31, 2023
9fe49cd
Saving then evaluating
Jul 31, 2023
b18dd73
Updates train.sh
josejuanmartinez Jul 31, 2023
336bb15
Updates train.sh
Jul 31, 2023
0da2c00
Black reformatting
Aug 1, 2023
0ff389a
Adding training resuming
Aug 2, 2023
e791df7
Adding training resuming
Aug 2, 2023
fc90d25
Adds resume_train.sh
josejuanmartinez Aug 2, 2023
3b43ce5
Parametrization
Aug 2, 2023
303a34a
Updates documentation, script name and adds filter by years and tags …
Aug 4, 2023
5c9876e
Adding train/test split using years
Aug 4, 2023
909cee0
Adds multibatching to filtering train/test by years
Aug 4, 2023
c08da49
Updates message
Aug 4, 2023
3d73606
Changes shards to cpu_count() by default
Aug 4, 2023
07ea25d
Adds `preprocess` example
Aug 4, 2023
d74c2f6
Adds evaluation by epoch
Aug 4, 2023
4db6fb3
Adds evaluation by epoch
Aug 4, 2023
b1a6d19
Train/test split by years
Aug 4, 2023
c2d3a90
Fixes drop out and standardizes calls
Aug 5, 2023
5e04318
Fixes years
Aug 5, 2023
b1a6098
Test size frac vs row
Aug 6, 2023
384436c
Test size frac vs row
Aug 6, 2023
f19fe0f
Test size frac vs row
Aug 6, 2023
4b53208
Test size frac vs row
Aug 6, 2023
fbf941c
Test size frac vs row
Aug 6, 2023
4108f37
Test size frac vs row
Aug 6, 2023
2a95c39
Test size frac vs row
Aug 6, 2023
1097186
Logging model params
Aug 7, 2023
d69553f
Adds hidden size to train params
Aug 7, 2023
d2a5c10
Adds hidden size to train params
Aug 7, 2023
38e9026
Adds hidden size to train params
Aug 7, 2023
f5e2568
Fixes bug with number of rows as test size
Aug 7, 2023
7b1f213
Adds multilabel_attention and freeze_backbone
Aug 8, 2023
1f14e33
Adds multilabel_attention and freeze_backbone
Aug 8, 2023
51c43a8
Adds some debug info
Aug 10, 2023
188ccc1
Removes forward logs
Aug 10, 2023
08193dd
Checking last implementation of BertMesh
Aug 10, 2023
b6268da
Roll back early implementation of BertMesh
Aug 10, 2023
127ab20
Fixing bug with number of rows
Aug 11, 2023
aeaa752
Adds OpenAI augmentation
Aug 14, 2023
9b71dd3
Parallel augmentation prototype
Aug 16, 2023
a95e582
Adds batch size to augment
Aug 16, 2023
460b3da
Fixes bug
Aug 16, 2023
c61d60d
Fixes bug
Aug 16, 2023
f7c9fc0
Fixes bug
Aug 16, 2023
504075a
Fixes bug
Aug 16, 2023
8249ab9
Adds numpy
Aug 16, 2023
1a1309f
Fixes bug
Aug 16, 2023
0aee6b5
Fixes bug
Aug 16, 2023
2858c00
Fixes bug
Aug 16, 2023
612d970
Fixes bug
Aug 16, 2023
e48cc06
Args to kwargs
Aug 16, 2023
24790ae
Args to kwargs
Aug 16, 2023
c098506
Args to kwargs
Aug 16, 2023
5ef18f4
Removes frequency penalty
Aug 16, 2023
a476046
Removes frequency penalty
Aug 16, 2023
e48bfbd
Payload response fix
Aug 16, 2023
3320855
Payload response fix
Aug 16, 2023
770717e
Payload response fix
Aug 16, 2023
c35c1d3
Payload response fix
Aug 16, 2023
7d92c1c
Payload response fix
Aug 16, 2023
4285042
Fixing uuid
Aug 16, 2023
6afce7d
Adds inspiration
Aug 16, 2023
94a8310
Debugs
Aug 16, 2023
6aadf6a
Debugs
Aug 16, 2023
36e4d3e
Asks to reformat the quotes
Aug 16, 2023
f351c6f
Asks to reformat the quotes
Aug 16, 2023
f567903
Moves to a csv format
Aug 16, 2023
c484cee
Moves to a csv format
Aug 16, 2023
7f040c8
Moves to a csv format
Aug 16, 2023
6b1759e
Moves to a csv format
Aug 16, 2023
3a45e0c
Moves to a csv format
Aug 16, 2023
2dbbdcd
Json asking to escape quotes
Aug 16, 2023
3170fc9
Json asking to escape quotes
Aug 16, 2023
841477e
Json asking to escape quotes
Aug 16, 2023
e6abb7c
Json asking to escape quotes
Aug 16, 2023
6083b1b
Json asking to escape quotes
Aug 16, 2023
6bd9f53
Json asking to escape quotes
Aug 16, 2023
3555cb5
Json asking to escape quotes
Aug 16, 2023
0275938
Printing what is generating
Aug 16, 2023
d788e4e
Printing what is generating
Aug 16, 2023
345e388
Printing what is generating
Aug 16, 2023
1669cce
Printing what is generating
Aug 16, 2023
38cfa58
Printing what is generating
Aug 16, 2023
50e1cfb
Printing what is generating
Aug 16, 2023
c607037
Printing what is generating
Aug 16, 2023
ac612b6
Printing what is generating
Aug 16, 2023
ca6643c
Adds gradient clipping and cosine optimizer
Aug 17, 2023
5636545
Adds gradient clipping and cosine optimizer
Aug 17, 2023
2407546
Adds gradient clipping and cosine optimizer
Aug 17, 2023
b89e193
Adds gradient clipping and cosine optimizer
Aug 17, 2023
f2e147d
Adds gradient clipping and cosine optimizer
Aug 17, 2023
c759962
Adds gradient clipping and cosine optimizer
Aug 17, 2023
549b817
Adds gradient clipping and cosine optimizer
Aug 17, 2023
996d167
Adds gradient clipping and cosine optimizer
Aug 17, 2023
8109105
Adds gradient clipping and cosine optimizer
Aug 17, 2023
f1a5370
Adds gradient clipping and cosine optimizer
Aug 17, 2023
c2bc2ab
Adds gradient clipping and cosine optimizer
Aug 17, 2023
c7b631a
Adds gradient clipping and cosine optimizer
Aug 17, 2023
e8e8591
Freezing bias
Aug 18, 2023
00f71d6
Freezing bias
Aug 18, 2023
21c9a66
Freezing bias
Aug 18, 2023
e3f8d9e
Removes cosine scheduler
Aug 19, 2023
a041935
Removes cosine scheduler
Aug 19, 2023
1ed9060
Removes cosine scheduler
Aug 19, 2023
ae4386b
Sending metadata
Aug 19, 2023
a54bd52
Refactors
Aug 19, 2023
7135c1e
Refactors
Aug 19, 2023
f68785b
Adds sleep
Aug 19, 2023
4c06616
Adds sleep
Aug 19, 2023
72d080b
Adds sleep
Aug 19, 2023
60fa413
Adds sleep
Aug 19, 2023
53452f8
Adds sleep
Aug 19, 2023
808e251
Sends one by one
Aug 19, 2023
dd3d763
Sends one by one
Aug 19, 2023
9a1d245
Sends one by one
Aug 19, 2023
d980d8c
Sends one by one
Aug 19, 2023
5830f70
Changes from static to global
Aug 19, 2023
29f32f9
Removes param
Aug 19, 2023
82ef9e9
Removes param
Aug 19, 2023
efb2ef5
Removes param
Aug 19, 2023
9319a9d
Checks error
Aug 19, 2023
22993e1
Fixes bug with metadata field name
Aug 19, 2023
be78b3c
Adds JsonParser
Aug 19, 2023
425c12b
Adds JsonParser
Aug 19, 2023
a45a0f1
Removes sleep
Aug 19, 2023
62f76e2
Prevents locks
Aug 19, 2023
e674dd8
Write > Append
Aug 19, 2023
a81760b
Write > Append
Aug 19, 2023
780475a
Adds different schedulers
Aug 19, 2023
7a2ebe1
Parametrizes temperature
Aug 19, 2023
94809f3
Fixes schedule name bug
Aug 19, 2023
c664a0a
Fixes schedule name bug
Aug 19, 2023
0cd4d1e
Fixes schedule name bug
Aug 19, 2023
4ac7e09
Refactors and adds augment script
Aug 19, 2023
7661575
Adds 25 concurrent calls by default
Aug 19, 2023
4a1571d
Adds 25 concurrent calls by default
Aug 19, 2023
6405184
Adds 25 concurrent calls by default
Aug 19, 2023
be3b7a1
Adds 25 concurrent calls by default
Aug 19, 2023
006c864
Adds more examples
Aug 21, 2023
c0fdc5a
Adds scheduler type
Aug 22, 2023
21b0d6c
Freezes everything except weights
Aug 23, 2023
8b92dae
Changes threshold, evaluation on tags, freezing backbone
Aug 24, 2023
52f1303
Changes threshold, evaluation on tags, freezing backbone
Aug 24, 2023
6ab4876
Changes threshold, evaluation on tags, freezing backbone
Aug 24, 2023
54a754f
Changes threshold, evaluation on tags, freezing backbone
Aug 24, 2023
1fdeada
Changes threshold, evaluation on tags, freezing backbone
Aug 24, 2023
608a9a7
Changes threshold, evaluation on tags, freezing backbone
Aug 24, 2023
063caa4
Changes threshold, evaluation on tags, freezing backbone
Aug 24, 2023
9c518d7
Changes threshold, evaluation on tags, freezing backbone
Aug 24, 2023
25c1a9b
Changes threshold, evaluation on tags, freezing backbone
Aug 24, 2023
a0bc3ed
Adds filtering by tags
Aug 24, 2023
e28b683
Adds edge case for the remaining X end texts to augment
Aug 24, 2023
93d5ad5
Removes metrics from tags absent in training
Aug 24, 2023
1d4cb0f
Removes metrics from tags absent in training
Aug 24, 2023
644d651
Rolls back filtering tags in metrics
Aug 25, 2023
ad40350
Adds back tag filtering
Aug 25, 2023
6cf244b
Adds weight_decay, correct_bias, dropout probs, attention dropout...
Aug 25, 2023
f784566
Adds weight_decay, correct_bias, dropout probs, attention dropout...
Aug 25, 2023
7da5ac4
Adds best params
josejuanmartinez Aug 25, 2023
fbdecd6
Updates resume_train_by_steps.sh
Aug 26, 2023
5758b42
Updates resume_train_by_steps.sh
Aug 26, 2023
9582a6a
Adds tags-based augmentation
Aug 28, 2023
60c9fca
Adds id2label for augmentation
Aug 28, 2023
eb2d0f3
Adds `dataset` folder
Aug 28, 2023
9815027
Decodes id back into labels
Aug 28, 2023
56ef40c
Decodes id back into labels
Aug 28, 2023
d771918
Generates examples also for not-underrepresented
Aug 28, 2023
99e1067
Generates examples also for not-underrepresented
Aug 28, 2023
fa17233
Generates examples also for not-underrepresented
Aug 28, 2023
c811f8a
Adds more columns to preprocessing
Aug 28, 2023
6d58ea9
Adds more columns to preprocessing
Aug 28, 2023
c75c36a
Adds more columns to preprocessing
Aug 28, 2023
2e0474e
Adds more columns to preprocessing
Aug 28, 2023
1c173e1
Adds more columns to preprocessing
Aug 28, 2023
726837f
Prevents crashes
Aug 28, 2023
60a3ec9
Better hyperparams
Aug 28, 2023
4f70bbc
Check if fixes tests
Aug 31, 2023
8b080eb
Tries to fix torch-cpu recent issue
Aug 31, 2023
52231e0
Refactors augmentation
Sep 1, 2023
69b56ea
Refactors augmentation
Sep 1, 2023
77761be
Refactors augmentation
Sep 1, 2023
d6247db
Fixes tests
Sep 1, 2023
8298b5a
Fixes black
Sep 1, 2023
6a440fa
Fixes black
Sep 1, 2023
37c9a2b
Fixes ruff
Sep 1, 2023
3a8fd48
Fixes ruff
Sep 1, 2023
06c5479
Fixes black
Sep 1, 2023
e1fbe2b
Fixes black
Sep 1, 2023
8ae9f7a
Merge pull request #10 from MantisAI/3-improve-trainer-juan
josejuanmartinez Sep 1, 2023
c45c5c1
Merge branch 'wellcometrust:main' into main
josejuanmartinez Sep 4, 2023
35c54f8
Modify script to include active portfolio sample
Sep 6, 2023
d7dde65
Update data
Sep 6, 2023
5c9dd2c
Merge pull request #14 from MantisAI/active-portfolio-sample
nsorros Sep 6, 2023
5 changes: 5 additions & 0 deletions .gitignore
@@ -158,3 +158,8 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/


# Folder where training outputs are stored
bertmesh_outs/
wandb/
200 changes: 137 additions & 63 deletions README.md

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions data/grants_comparison/mesh_tree_letters_list.txt
@@ -0,0 +1,4 @@
Information Sources: L
Phenomena and Processes: G
Geographicals: Z
Diseases: C
1 change: 1 addition & 0 deletions data/raw/.gitignore
@@ -1,3 +1,4 @@
/allMeSH_2021.json
/allMeSH_2021.jsonl
/desc2021.xml
/disease_tags_validation_grants.xlsx
4 changes: 4 additions & 0 deletions data/raw/allMeSH_2021.jsonl.dvc
@@ -0,0 +1,4 @@
outs:
- md5: 94f18c3918b180728a553123edb2ee32
size: 27914288461
path: allMeSH_2021.jsonl
3 changes: 3 additions & 0 deletions examples/augment.sh
@@ -0,0 +1,3 @@
grants-tagger augment mesh [FOLDER_AFTER_PREPROCESSING] [SET_YOUR_OUTPUT_FOLDER_HERE] \
--min-examples 25 \
--concurrent-calls 25
5 changes: 5 additions & 0 deletions examples/augment_specific_tags.sh
@@ -0,0 +1,5 @@
# Augments data using a file with 1 label per line and years
grants-tagger augment mesh [FOLDER_AFTER_PREPROCESSING] [SET_YOUR_OUTPUT_FOLDER_HERE] \
--tags-file-path tags_to_augment.txt \
--examples 25 \
--concurrent-calls 25
37 changes: 37 additions & 0 deletions examples/preprocess_and_train_by_epochs.sh
@@ -0,0 +1,37 @@
# Run on g5.12xlarge instance

# Without saving (on-the-fly)
SOURCE="data/raw/allMeSH_2021.jsonl"

grants-tagger train bertmesh \
"" \
$SOURCE \
--test-size 25000 \
--train-years 2016,2017,2018,2019 \
--test-years 2020,2021 \
--output_dir bertmesh_outs/pipeline_test/ \
--per_device_train_batch_size 16 \
--per_device_eval_batch_size 1 \
--multilabel_attention True \
--freeze_backbone unfreeze \
--num_train_epochs 7 \
--learning_rate 5e-5 \
--dropout 0.1 \
--hidden_size 1024 \
--warmup_steps 5000 \
--max_grad_norm 2.0 \
--scheduler_type cosine_hard_restart \
--weight_decay 0.2 \
--correct_bias True \
--threshold 0.25 \
--prune_labels_in_evaluation True \
--hidden_dropout_prob 0.2 \
--attention_probs_dropout_prob 0.2 \
--fp16 \
--torch_compile \
--evaluation_strategy epoch \
--eval_accumulation_steps 20 \
--save_strategy epoch \
--wandb_project wellcome-mesh \
--wandb_name test-train-all \
--wandb_api_key ${WANDB_API_KEY}
39 changes: 39 additions & 0 deletions examples/preprocess_and_train_by_steps.sh
@@ -0,0 +1,39 @@
# Run on g5.12xlarge instance

# Without saving (on-the-fly)
SOURCE="data/raw/allMeSH_2021.jsonl"

grants-tagger train bertmesh \
"" \
$SOURCE \
--test-size 25000 \
--train-years 2016,2017,2018,2019 \
--test-years 2020,2021 \
--output_dir bertmesh_outs/pipeline_test/ \
--per_device_train_batch_size 16 \
--per_device_eval_batch_size 1 \
--multilabel_attention True \
--freeze_backbone unfreeze \
--num_train_epochs 7 \
--learning_rate 5e-5 \
--dropout 0.1 \
--hidden_size 1024 \
--warmup_steps 5000 \
--max_grad_norm 2.0 \
--scheduler_type cosine_hard_restart \
--weight_decay 0.2 \
--correct_bias True \
--threshold 0.25 \
--prune_labels_in_evaluation True \
--hidden_dropout_prob 0.2 \
--attention_probs_dropout_prob 0.2 \
--fp16 \
--torch_compile \
--evaluation_strategy steps \
--eval_steps 50000 \
--eval_accumulation_steps 20 \
--save_strategy steps \
--save_steps 50000 \
--wandb_project wellcome-mesh \
--wandb_name test-train-all \
--wandb_api_key ${WANDB_API_KEY}
2 changes: 2 additions & 0 deletions examples/preprocess_splitting_by_fract.sh
@@ -0,0 +1,2 @@
grants-tagger preprocess mesh data/raw/allMeSH_2021.jsonl [SET_YOUR_OUTPUT_FOLDER_HERE] '' \
--test-size 0.05
2 changes: 2 additions & 0 deletions examples/preprocess_splitting_by_rows.sh
@@ -0,0 +1,2 @@
grants-tagger preprocess mesh data/raw/allMeSH_2021.jsonl [SET_YOUR_OUTPUT_FOLDER_HERE] '' \
--test-size 25000
4 changes: 4 additions & 0 deletions examples/preprocess_splitting_by_years.sh
@@ -0,0 +1,4 @@
grants-tagger preprocess mesh data/raw/allMeSH_2021.jsonl [SET_YOUR_OUTPUT_FOLDER_HERE] '' \
--test-size 25000 \
--train-years 2016,2017,2018,2019 \
--test-years 2020,2021
37 changes: 37 additions & 0 deletions examples/resume_train_by_epoch.sh
@@ -0,0 +1,37 @@
# Run on g5.12xlarge instance

# After preprocessing
SOURCE="[SET_YOUR_PREPROCESSING_FOLDER_HERE]"

# Checkpoint
CHECKPOINT="checkpoint-100000"

grants-tagger train bertmesh \
bertmesh_outs/pipeline_test/$CHECKPOINT \
$SOURCE \
--output_dir bertmesh_outs/pipeline_test/ \
--per_device_train_batch_size 16 \
--per_device_eval_batch_size 1 \
--multilabel_attention True \
--freeze_backbone unfreeze \
--num_train_epochs 3 \
--learning_rate 5e-5 \
--dropout 0.1 \
--hidden_size 1024 \
--warmup_steps 0 \
--max_grad_norm 2.0 \
--scheduler_type cosine_hard_restart \
--weight_decay 0.2 \
--correct_bias True \
--threshold 0.25 \
--prune_labels_in_evaluation True \
--hidden_dropout_prob 0.2 \
--attention_probs_dropout_prob 0.2 \
--fp16 \
--torch_compile \
--evaluation_strategy epoch \
--eval_accumulation_steps 20 \
--save_strategy epoch \
--wandb_project wellcome-mesh \
--wandb_name test-train-all \
--wandb_api_key ${WANDB_API_KEY}
39 changes: 39 additions & 0 deletions examples/resume_train_by_steps.sh
@@ -0,0 +1,39 @@
# Run on g5.12xlarge instance

# After preprocessing
SOURCE="[SET_YOUR_PREPROCESSING_FOLDER_HERE]"

# Checkpoint
CHECKPOINT="checkpoint-100000"

grants-tagger train bertmesh \
bertmesh_outs/pipeline_test/$CHECKPOINT \
$SOURCE \
--output_dir bertmesh_outs/pipeline_test/ \
--per_device_train_batch_size 16 \
--per_device_eval_batch_size 1 \
--multilabel_attention True \
--freeze_backbone unfreeze \
--num_train_epochs 3 \
--learning_rate 5e-5 \
--dropout 0.1 \
--hidden_size 1024 \
--warmup_steps 0 \
--max_grad_norm 2.0 \
--scheduler_type cosine_hard_restart \
--weight_decay 0.2 \
--correct_bias True \
--threshold 0.25 \
--prune_labels_in_evaluation True \
--hidden_dropout_prob 0.2 \
--attention_probs_dropout_prob 0.2 \
--fp16 \
--torch_compile \
--evaluation_strategy steps \
--eval_steps 10000 \
--eval_accumulation_steps 20 \
--save_strategy steps \
--save_steps 10000 \
--wandb_project wellcome-mesh \
--wandb_name test-train-all \
--wandb_api_key ${WANDB_API_KEY}
34 changes: 34 additions & 0 deletions examples/train_by_epochs.sh
@@ -0,0 +1,34 @@
# Run on g5.12xlarge instance

# After preprocessing
SOURCE="[SET_YOUR_PREPROCESSING_FOLDER_HERE]"

grants-tagger train bertmesh \
"" \
$SOURCE \
--output_dir bertmesh_outs/pipeline_test/ \
--per_device_train_batch_size 16 \
--per_device_eval_batch_size 1 \
--multilabel_attention True \
--freeze_backbone unfreeze \
--num_train_epochs 7 \
--learning_rate 5e-5 \
--dropout 0.1 \
--hidden_size 1024 \
--warmup_steps 5000 \
--max_grad_norm 2.0 \
--scheduler_type cosine_hard_restart \
--weight_decay 0.2 \
--correct_bias True \
--threshold 0.25 \
--prune_labels_in_evaluation True \
--hidden_dropout_prob 0.2 \
--attention_probs_dropout_prob 0.2 \
--fp16 \
--torch_compile \
--evaluation_strategy epoch \
--eval_accumulation_steps 20 \
--save_strategy epoch \
--wandb_project wellcome-mesh \
--wandb_name test-train-all \
--wandb_api_key ${WANDB_API_KEY}
36 changes: 36 additions & 0 deletions examples/train_by_steps.sh
@@ -0,0 +1,36 @@
# Run on g5.12xlarge instance

# After preprocessing
SOURCE="[SET_YOUR_PREPROCESSING_FOLDER_HERE]"

grants-tagger train bertmesh \
"" \
$SOURCE \
--output_dir bertmesh_outs/pipeline_test/ \
--per_device_train_batch_size 16 \
--per_device_eval_batch_size 1 \
--multilabel_attention True \
--freeze_backbone unfreeze \
--num_train_epochs 7 \
--learning_rate 5e-5 \
--dropout 0.1 \
--hidden_size 1024 \
--warmup_steps 5000 \
--max_grad_norm 2.0 \
--scheduler_type cosine_hard_restart \
--weight_decay 0.2 \
--correct_bias True \
--threshold 0.25 \
--prune_labels_in_evaluation True \
--hidden_dropout_prob 0.2 \
--attention_probs_dropout_prob 0.2 \
--fp16 \
--torch_compile \
--evaluation_strategy steps \
--eval_steps 10000 \
--eval_accumulation_steps 20 \
--save_strategy steps \
--save_steps 10000 \
--wandb_project wellcome-mesh \
--wandb_name test-train-all \
--wandb_api_key ${WANDB_API_KEY}
67 changes: 67 additions & 0 deletions grants_tagger_light/augmentation/JsonParser.py
@@ -0,0 +1,67 @@
"""
From langchain: https://raw.githubusercontent.com/langchain-ai/langchain/master/libs/langchain/langchain/output_parsers/json.py
"""

import json
import re


class JsonParser:
def __init__(self):
"""Class to parse json produced by LLMs. Inspiration taken from langchain.
It fixes quotes, it escapes separators, etc."""
pass

@staticmethod
def _replace_new_line(match: re.Match[str]) -> str:
value = match.group(2)
value = re.sub(r"\n", r"\\n", value)
value = re.sub(r"\r", r"\\r", value)
value = re.sub(r"\t", r"\\t", value)
value = re.sub('"', r"\"", value)

return match.group(1) + value + match.group(3)

@staticmethod
def _custom_parser(multiline_string: str) -> str:
"""
The LLM response for `action_input` may be a multiline
string containing unescaped newlines, tabs or quotes. This function
replaces those characters with their escaped counterparts.
(newlines in JSON must be double-escaped: `\\n`)
"""
if isinstance(multiline_string, (bytes, bytearray)):
multiline_string = multiline_string.decode()

multiline_string = re.sub(
r'("action_input"\:\s*")(.*)(")',
JsonParser._replace_new_line,
multiline_string,
flags=re.DOTALL,
)

return multiline_string

@staticmethod
def parse_json(json_string: str) -> dict:
"""
Parse a JSON string from LLM response

Args:
json_string: The Markdown string.

Returns:
The parsed JSON object as a Python dictionary.
"""
json_str = json_string

# Strip whitespace and newlines from the start and end
json_str = json_str.strip()

# handle newlines and other special characters inside the returned value
json_str = JsonParser._custom_parser(json_str)

# Parse the JSON string into a Python dictionary
parsed = json.loads(json_str)

return parsed
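
A minimal usage sketch for JsonParser (not part of the diff; the sample payload is illustrative, following the "action_input" pattern the parser targets):

# Hypothetical example: parsing an LLM response whose value contains a raw
# newline, which plain json.loads would reject with a control-character error.
from grants_tagger_light.augmentation.JsonParser import JsonParser

raw = '{"action_input": "First line\nSecond line"}'  # real newline inside the value
parsed = JsonParser.parse_json(raw)  # escapes the newline, then json.loads
print(parsed["action_input"])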
8 changes: 8 additions & 0 deletions grants_tagger_light/augmentation/__init__.py
@@ -0,0 +1,8 @@
import typer
from .augment import augment_cli

augment_app = typer.Typer()
augment_app.command(
"mesh",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)(augment_cli)
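
For context, a hedged sketch of how this sub-app could be mounted on the root CLI; the parent app shown here is an assumption, not part of this diff, and only the `grants-tagger augment mesh ...` invocation is confirmed by the example scripts:

# Hypothetical root app; the real grants-tagger entry point may differ.
import typer
from grants_tagger_light.augmentation import augment_app

app = typer.Typer()
app.add_typer(augment_app, name="augment")  # enables `grants-tagger augment mesh ...`

if __name__ == "__main__":
    app()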