# MEDOBO
Automatically tagging MEDLINE abstracts with OBO ontologies.

### Step 1: Processing UMLS
Download the [UMLS bulk release](https://www.nlm.nih.gov/research/umls/licensedcontent/umlsknowledgesources.html) after obtaining a license (e.g., `umls-2022AA-full.zip`) and place it in the `umls` folder.

Then run the following commands in a terminal, one at a time:
```bash
unzip umls-2022AA-full.zip

mkdir META
mkdir NET
unzip 2022AA-full/2022aa-1-meta.nlm
unzip 2022AA-full/2022aa-2-meta.nlm
unzip 2022AA-full/2022aa-otherks.nlm

gunzip 2022AA/META/MRCONSO.RRF.aa.gz
gunzip 2022AA/META/MRCONSO.RRF.ab.gz
gunzip 2022AA/META/MRCONSO.RRF.ac.gz
cat 2022AA/META/MRCONSO.RRF.aa 2022AA/META/MRCONSO.RRF.ab 2022AA/META/MRCONSO.RRF.ac > META/MRCONSO.RRF

gunzip 2022AA/META/MRDEF.RRF.gz
mv 2022AA/META/MRDEF.RRF META/

gunzip 2022AA/META/MRSTY.RRF.gz
mv 2022AA/META/MRSTY.RRF META/

mv 2022AA/NET/SRDEF NET/
mv 2022AA/NET/SRSTRE1 NET/

gunzip 2022AA/META/MRXNS_ENG.RRF.aa.gz
gunzip 2022AA/META/MRXNS_ENG.RRF.ab.gz
cat 2022AA/META/MRXNS_ENG.RRF.aa 2022AA/META/MRXNS_ENG.RRF.ab > META/MRXNS_ENG.RRF

gunzip 2022AA/META/MRXNW_ENG.RRF.aa.gz
gunzip 2022AA/META/MRXNW_ENG.RRF.ab.gz
gunzip 2022AA/META/MRXNW_ENG.RRF.ac.gz
cat 2022AA/META/MRXNW_ENG.RRF.aa 2022AA/META/MRXNW_ENG.RRF.ab 2022AA/META/MRXNW_ENG.RRF.ac > META/MRXNW_ENG.RRF
```
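
Each `.RRF` file is a pipe-delimited table. As a quick sanity check that the merged files landed in `META/`, you can run something like the following (a minimal sketch, not part of the repository):

```python
import os

# Hypothetical sanity check: confirm the merged UMLS tables exist and are pipe-delimited.
for name in ['MRCONSO.RRF', 'MRDEF.RRF', 'MRSTY.RRF', 'MRXNS_ENG.RRF', 'MRXNW_ENG.RRF']:
    path = os.path.join('META', name)
    assert os.path.exists(path), 'missing %s' % path
    with open(path, encoding='utf-8') as f:
        first_row = f.readline()
    # RRF rows are |-separated fields with a trailing |
    print('%s: %d field separators in first row' % (name, first_row.count('|')))
```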

### Step 2: Create an environment
```bash
$ conda create -n medobo python=3.6
$ conda activate medobo
(medobo)$ pip install -r requirements.txt
```

### Step 3: Get OBO ontologies
Download the [OBO ontologies](https://drive.switch.ch/index.php/s/HSL9gkPfjAE77s1) folder and place it in the root of the project.

### Step 4: Processing MEDLINE
```bash
(medobo)$ python dataset.py
```
Alternatively, download the preprocessed data from the [Switch drive](https://drive.switch.ch/index.php/s/HSL9gkPfjAE77s1). For replication purposes, please do not generate a new dataset; instead, use the official splits and contents from the same Switch drive.
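
Once the splits are in place, they can be loaded with `pickle`. A minimal sketch of reading the dev split, mirroring how the Chi-square script below does it (the `umls_org/objects` layout comes from the preprocessing step):

```python
import os
import pickle

base_path = os.path.join('umls_org', 'objects')

# dev_ids.pkl holds the dev-split PMIDs and their label lists.
with open(os.path.join(base_path, 'dataset', 'dev_ids.pkl'), 'rb') as f:
    X_dev, y_dev = pickle.load(f)

print(len(X_dev), 'dev abstracts')
print('labels of the first abstract:', y_dev[0])
```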

### Step 5: Pre-processing OBO
```bash
(medobo)$ python chi_sqaure.py
```
This computes, for each class, the dev-split words with the highest Chi-square scores and writes them to `umls_org/objects/class_chi2_words_path.json`.
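
To peek at the output, a short sketch assuming the default paths used by the script:

```python
import json
import os

# class_chi2_words_path.json maps each class to its top Chi-square words.
with open(os.path.join('umls_org', 'objects', 'class_chi2_words_path.json')) as f:
    chi2_words = json.load(f)

some_class = next(iter(chi2_words))
print(some_class, '->', chi2_words[some_class][:10])
```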

### Step 6: Download embeddings
Download the [BioASQ embeddings](http://bioasq.org/news/bioasq-releases-continuous-space-word-vectors-obtained-applying-word2vec-pubmed-abstracts), unzip the archive, and place the contents in the `Resources` folder.
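
For a quick look at the vectors, you can read them directly. A minimal sketch assuming a plain-text word2vec layout (one word followed by its vector per line); the file name below is a placeholder for whatever the BioASQ archive actually contains:

```python
import os

# Placeholder file name; substitute the actual file extracted from the BioASQ archive.
embedding_path = os.path.join('Resources', 'vectors.txt')

embeddings = {}
with open(embedding_path, encoding='utf-8') as f:
    for line in f:
        parts = line.rstrip().split()
        if len(parts) <= 2:
            continue  # skip a possible word2vec header line or blank lines
        embeddings[parts[0]] = [float(x) for x in parts[1:]]

print('%d vectors loaded, dimensionality %d'
      % (len(embeddings), len(next(iter(embeddings.values())))))
```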

### Naive Bayes baseline
```bash
(medobo)$ python main_NB.py <num_of_training_data>
# e.g., train on 100,000 instances:
(medobo)$ python main_NB.py 100000
```

### Deep learning baseline
```bash
(medobo)$ python main_DL.py <num_of_training_data> <num_of_features>
# e.g., 100,000 training instances with the top 50,000 features:
(medobo)$ python main_DL.py 100000 50000
```

---

`chi_sqaure.py`, the Chi-square pre-processing script run in Step 5:
```python
import os
import json
import pickle
from tqdm import tqdm
from itertools import chain
from sklearn.feature_selection import chi2
from sklearn.feature_extraction.text import CountVectorizer

base_path = os.path.join('umls_org', 'objects')


def chisquare_dataset(dev_x_idx, dev_y, top_words):
    """
    :param dev_x_idx: list of PMIDs in the dev split
    :param dev_y: list of label collections, one per PMID in dev_x_idx
    :param top_words: number of words with the highest Chi2 score to keep per class
    :return: None; writes a {class: top words} JSON object to class_chi2_words_path.json
    """
    class_top_words = {}
    dev_x_set = set(dev_x_idx)
    pmid2pos = {pmid: i for i, pmid in enumerate(dev_x_idx)}  # avoids O(n) list.index lookups

    dev_x_text = [''] * len(dev_x_idx)
    print('Loading training data for Chi2 computation ...')
    classes = list(set(chain(*dev_y)))

    # Collect the title + abstract text of every dev PMID from the pickled content packs.
    for content_pack in tqdm(os.listdir(os.path.join(base_path, 'pmid2contents'))):
        with open(os.path.join(base_path, 'pmid2contents', content_pack), 'rb') as f:
            pmid2contents = pickle.load(f)
        for pmid, contents in pmid2contents.items():
            if pmid in dev_x_set:
                title, abstract = contents[1], contents[2]
                dev_x_text[pmid2pos[pmid]] = '%s %s' % (title, abstract)

    # The bag-of-words matrix is identical for every class, so build it once.
    vectorizer = CountVectorizer(lowercase=True, stop_words='english')
    X = vectorizer.fit_transform(dev_x_text)
    feature_names = vectorizer.get_feature_names()  # get_feature_names_out() in newer scikit-learn

    print('Generating Chi2 dictionary...\n')
    for cls in tqdm(classes):
        # One-vs-rest binary target: 1 if the abstract is labelled with this class.
        y = [1 if cls in labels else 0 for labels in dev_y]

        chi2score = chi2(X, y)[0]
        wchi2 = sorted(zip(feature_names, chi2score), key=lambda x: x[1])
        # Keep the top_words highest-scoring words, best first.
        class_top_words[cls] = [word for word, _ in wchi2[-top_words:][::-1]]

    with open(os.path.join(base_path, 'class_chi2_words_path.json'), 'w') as wr:
        json.dump(class_top_words, wr, indent=1)


if __name__ == "__main__":
    with open(os.path.join(base_path, 'dataset', 'dev_ids.pkl'), 'rb') as f:
        X_dev, y_dev = pickle.load(f)
    chisquare_dataset(X_dev, y_dev, top_words=100)
```