-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
updated to work with Python 3.8.10 and implemented DrugStandardizer c…
…lass
- Loading branch information
Showing
5 changed files
with
23,493 additions
and
93,710 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,55 +1,66 @@ | ||
# Drug Standards | ||
This package provides tools for standardizing drug names into a single format. For example, Benadryl and diphenhydramine refer to the same chemical entity. In certain tasks, such as mining the FDA Adverse Event Reporting System database, it is useful to standardize all drug names (e.g., convert all instances of Benadryl to diphenhydramine) in order to compute various statistics. This package uses a database of drug synonyms and brand names to return the generic name for a drug. To handle misspellings, the standardize function returns the generic name of the most similar match based on Jaro-Winkler similarity. A threshold can be set to specify the minimum similarity required for a term to be considered a match. | ||
This package provides tools for standardizing drug names into a single format. For example, Benadryl and diphenhydramine refer to the same chemical entity. In certain tasks, such as mining the FDA Adverse Event Reporting System database, it is useful to standardize all drug names (e.g., convert all instances of Benadryl to diphenhydramine) in order to compute various statistics. This package uses a database of drug synonyms and brand names to return the generic name for a drug. To handle misspellings, the standardize function returns the generic name of the most similar match based on Jaro-Winkler similarity. A threshold can be set to specify the minimum similarity required for a term to be considered a match. | ||
|
||
## Installation | ||
|
||
#### 1. Install drugstandards package using PIP | ||
#### Installing drugstandards from source | ||
|
||
`sudo pip install drugstandards` | ||
|
||
#### 2. Installing drugstandards from source | ||
``` | ||
# install Levenshtein package | ||
sudo pip3 install Levenshtein==0.21.0 | ||
# Download this github repository and enter the following | ||
# install drugstandards | ||
cd drugstandards | ||
sudo python setup.py install | ||
sudo python3 setup.py install | ||
``` | ||
|
||
## Usage | ||
|
||
#### 1. Import the module | ||
|
||
`import drugstandards as drugs` | ||
`import drugstandards as ds` | ||
|
||
#### 2. Standardize a single, correctly spelled drug name to generic. | ||
|
||
``` | ||
# create standardizer object | ||
s = ds.DrugStandardizer() | ||
# Note that this function is NOT case-sensitive. | ||
drugs.standardize(["lopressor"]) | ||
s.standardize(["lopressor"]) | ||
``` | ||
|
||
#### 3. Standardize a single brand name to generic. | ||
|
||
``` | ||
drugs.standardize(["Benadryl"]) | ||
s.standardize(["Benadryl"]) | ||
``` | ||
|
||
#### 4. Standardize misspelled names to generic. | ||
|
||
`drugs.standardize(["Benadril", "lopresor"])` | ||
`s.standardize(["Benadril", "lopresor"])` | ||
|
||
#### 5. Return generic name for terms that have a Jaro-Winkler similarity of at least 0.9 | ||
|
||
``` | ||
# Will return None if no match is found. | ||
drugs.standardize(["Benadril"], thresh=0.9) | ||
s.standardize(["Benadril"], thresh=0.9) | ||
``` | ||
|
||
#### 6. Add drug mapping to drug dictionary | ||
|
||
``` | ||
# If a mapping does not exist you may create your own by updating the drug-dictionary. | ||
# For example, we may be interested in mapping the term "MULTI-VITAMIN" to "VITAMIN" | ||
drugs.add_drug_mapping({"MULTI-VITAMIN":"VITAMIN"}) | ||
s.add_drug_mapping({"MULTI-VITAMIN":"VITAMIN"}) | ||
# We can also create many updates simultaneously | ||
drugs.add_drug_mapping({"MULTI-VITAMIN":"VITAMIN", "TYLENOL EXTRA STRENGTH": "ACETAMINOPHEN"}) | ||
s.add_drug_mapping({"MULTI-VITAMIN":"VITAMIN", "TYLENOL EXTRA STRENGTH": "ACETAMINOPHEN"}) | ||
``` | ||
## Questions/issues/contact | ||
|
||
[email protected] | ||
|
||
## Citing | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,78 +1,70 @@ | ||
import Levenshtein | ||
import operator | ||
import csv | ||
import pickle | ||
import os | ||
import re | ||
from pkg_resources import Requirement, resource_filename | ||
|
||
# Locate the synonym table that ships inside the installed package.
dictionary_file = resource_filename(Requirement.parse("drugstandards"), "drugstandards/data/synonyms.dat")
# NOTE(review): pickle.load can execute arbitrary code embedded in the file
# it reads; this is only safe while synonyms.dat is the trusted data file
# distributed with the package.
with open(dictionary_file, "rb") as _dictionary_handle:
    # The context manager closes the handle as soon as the table is loaded
    # instead of leaving it open until garbage collection.
    drugdict = pickle.load(_dictionary_handle)
class DrugStandardizer(): | ||
def __init__(self):
    """Load the packaged synonym table into ``self.drugdict``.

    The path of ``drugstandards/data/synonyms.dat`` is resolved through
    ``pkg_resources`` and stored in ``self.dictionary_file``.  Each
    non-blank line of that file is stripped and split on tabs; the two
    resulting fields become one key/value entry of ``self.drugdict``.

    Raises:
        ValueError: if a non-blank line does not split into exactly two
            tab-separated fields.
    """
    self.dictionary_file = resource_filename(Requirement.parse("drugstandards"), "drugstandards/data/synonyms.dat")
    # Use a context manager so the handle is closed once the table is built
    # rather than staying open until garbage collection.
    with open(self.dictionary_file, "r") as handle:
        # Blank lines (for example a trailing newline at end of file) carry
        # no mapping and would otherwise make dict() raise ValueError.
        self.drugdict = dict(
            line.strip().split("\t") for line in handle if line.strip()
        )
|
||
def create_drug_dictionary(filename):
    """Build a synonym -> generic-name dictionary from a tab-separated file.

    Every data row contributes its third column (upper-cased) as a generic
    name that maps to itself, and its fourth and fifth columns (upper-cased)
    as additional names that map to that generic, unless they hold the
    literal placeholder ``"NULL"``.  The header row, recognised by ``"WID"``
    in the first column, is skipped.  The result has the form
    ``{"SYNONYM1": "GENERIC1", "SYNONYM2": "GENERIC1"}``.

    Args:
        filename: path of the tab-delimited export to read.

    Returns:
        dict mapping each upper-cased name to its upper-cased generic name.

    Raises:
        IndexError: if a non-empty data row has fewer than three columns.
    """
    drug_dictionary = {}
    # csv.reader needs a text-mode file on Python 3 (binary mode yields bytes
    # and fails); newline="" lets the csv module handle line endings itself.
    with open(filename, "r", newline="") as handle:
        for row in csv.reader(handle, delimiter="\t"):
            # Skip empty rows and the header row.
            if not row or row[0] == "WID":
                continue
            generic = row[2].upper()
            drug_dictionary[generic] = generic
            # Slicing tolerates rows that lack the optional alias columns.
            for alias in row[3:5]:
                if alias != "NULL":
                    drug_dictionary[alias.upper()] = generic
    return drug_dictionary
|
||
def find_closest_string(query, dictionary, thresh=0.90):
    """Return the term in ``dictionary`` most similar to ``query``.

    Similarity is measured with ``Levenshtein.jaro_winkler`` (Jaro-Winkler
    similarity, not Levenshtein edit distance).

    Args:
        query: the string to look up.
        dictionary: iterable of candidate terms.
        thresh: minimum similarity the best candidate must reach.

    Returns:
        The best-scoring term when its similarity is >= ``thresh``;
        otherwise ``None``.  ``None`` is also returned when ``dictionary``
        is empty.  When several terms tie for the best score, the first one
        in iteration order is returned.
    """
    best_term = None
    best_score = -1.0
    # A single pass for the maximum replaces the full sort; the strict ">"
    # keeps the first of any tied candidates, matching a stable sort.
    for term in dictionary:
        score = Levenshtein.jaro_winkler(query, term)
        if score > best_score:
            best_term, best_score = term, score
    # best_term stays None when there were no candidates at all.
    if best_term is not None and best_score >= thresh:
        return best_term
    return None
|
||
def add_drug_mapping(mapdict):
    """Add custom drug mappings to the pickled drug dictionary on disk.

    The dictionary stored in ``drugstandards/data/synonyms.dat`` is loaded,
    every key/value pair of ``mapdict`` is written into it (overwriting
    existing keys), and the result is pickled back to the same file.

    For example, if the term "benadryl" is not found in the dictionary you
    can add a custom mapping with:
        drugs.add_drug_mapping({"benadryl":"diphenhydramine"})
    To map all instances of "multi-vitamin" to "vitamin" use:
        drugs.add_drug_mapping({"multi-vitamin":"vitamin"})

    Args:
        mapdict: dict of ``{term: generic_name}`` entries to add.
    """
    filename = resource_filename(Requirement.parse("drugstandards"), "drugstandards/data/synonyms.dat")
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file; only safe while synonyms.dat is the trusted packaged data file.
    with open(filename, "rb") as source:
        # NOTE(review): this binds a local name only; a dictionary already
        # loaded into memory elsewhere is not refreshed -- confirm whether
        # callers expect the in-memory table to change as well.
        drugdict = pickle.load(source)
    drugdict.update(mapdict)
    # Context managers make sure the rewritten file is flushed and closed.
    with open(filename, "wb") as target:
        pickle.dump(drugdict, target)
    # print() call: the Python 2 print statement is a SyntaxError on Python 3.
    print("Drug dictionary successfully updated...")
|
||
def standardize(druglist, thresh=0.90): | ||
""" This function takes a list of drugs (brand name, | ||
misspelled drugs, generic names) and converts them | ||
to the generic names. It is used to provide naming | ||
consistency to the FAERS reports. | ||
""" | ||
splitter = re.compile("\\W+|\d+") | ||
standardized_druglist = [] | ||
for drug in druglist: | ||
drug = drug.upper() | ||
drug = " ".join(splitter.split(drug)).strip() | ||
gen = drugdict.get(drug) | ||
if gen: | ||
standardized_druglist.append(gen) | ||
continue | ||
def create_drug_dictionary(self, filename, delimiter="\t"):
    """Replace ``self.drugdict`` with mappings read from a two-column file.

    Each non-empty row of ``filename`` must hold exactly two fields: a name
    (brand, generic or synonym) and the value it maps to.  The name is
    upper-cased and used as the key; the second field is stored unchanged,
    giving a dictionary of the form
    ``{"SYNONYM1": "generic1", "SYNONYM2": "generic1"}``.

    Args:
        filename: path of the delimited text file to read.
        delimiter: field separator used in the file (tab by default).

    Raises:
        ValueError: if a non-empty row does not contain exactly two fields.
    """
    drugdict = {}
    # csv.reader is not a context manager; the file object is what must be
    # managed.  newline="" lets the csv module handle line endings itself.
    with open(filename, "r", newline="") as handle:
        for row in csv.reader(handle, delimiter=delimiter):
            if not row:
                # Blank lines carry no mapping.
                continue
            name, generic = row
            drugdict[name.upper()] = generic
    # Assign only after a complete, successful read so a malformed file does
    # not leave the instance with a half-built dictionary.
    self.drugdict = drugdict
|
||
def find_closest_string(self, query, dictionary, thresh=0.90): | ||
""" This function returns the closest match for | ||
a query string against a dictionary of terms | ||
using levenstein distance | ||
""" | ||
dist = {i:Levenshtein.jaro_winkler(query, i) for i in dictionary} | ||
dist = sorted(dist.items(), key=operator.itemgetter(1), reverse=True) | ||
if dist[0][1] >= thresh: | ||
return dist[0][0] | ||
else: | ||
close_match = find_closest_string(str(drug), drugdict.keys(), thresh=thresh) | ||
close_match = drugdict.get(close_match) | ||
standardized_druglist.append(close_match) | ||
return standardized_druglist | ||
return None | ||
|
||
def add_drug_mapping(self, mapdict):
    """Add custom term -> generic mappings to ``self.drugdict``.

    For example, if the term "benadryl" is not found in the dictionary you
    can add it on a standardizer instance ``s`` with:
        s.add_drug_mapping({"benadryl": "diphenhydramine"})
    To map all instances of "multi-vitamin" to "vitamin" use:
        s.add_drug_mapping({"multi-vitamin": "vitamin"})

    Each term is stored under its upper-cased form and, additionally,
    under the same normalised form that ``standardize`` computes for its
    queries (upper-case, with runs of non-word characters and digits
    replaced by a space), so hyphenated or punctuated terms are found by
    exact lookup.  Existing entries with the same key are overwritten.

    Args:
        mapdict: dict of ``{term: generic_name}`` entries to add.
    """
    # Same pattern standardize() uses to normalise query terms.
    splitter = re.compile(r"\W+|\d+")
    for term, generic in mapdict.items():
        upper_term = term.upper()
        # Keep the plain upper-cased key for backward compatibility.
        self.drugdict[upper_term] = generic
        normalized = " ".join(splitter.split(upper_term)).strip()
        # A term made only of digits/punctuation normalises to "", which is
        # not a useful key.
        if normalized:
            self.drugdict[normalized] = generic
|
||
def standardize(self, druglist, thresh=0.90):
    """Convert a list of drug names to their generic names.

    Each name (brand name, misspelled name or generic name) is upper-cased
    and has runs of non-word characters and digits replaced by a space.
    The normalised name is looked up in ``self.drugdict``; when there is
    no exact entry, ``self.find_closest_string`` is asked for the closest
    dictionary key at similarity threshold ``thresh`` and that key's value
    is used instead.

    Args:
        druglist: iterable of drug-name strings.
        thresh: similarity threshold passed to ``find_closest_string``.

    Returns:
        list with one entry per input name, in order: the generic name, or
        ``None`` when neither an exact nor a close match was found.
    """
    # Raw string: "\d" in a normal literal is an invalid escape sequence.
    splitter = re.compile(r"\W+|\d+")
    standardized_druglist = []
    for drug in druglist:
        normalized = " ".join(splitter.split(drug.upper())).strip()
        generic = self.drugdict.get(normalized)
        if generic:
            standardized_druglist.append(generic)
            continue
        if not self.drugdict:
            # Nothing to compare against, so there can be no close match.
            standardized_druglist.append(None)
            continue
        close_match = self.find_closest_string(normalized, self.drugdict.keys(), thresh=thresh)
        # .get(None) yields None when no candidate reached the threshold.
        standardized_druglist.append(self.drugdict.get(close_match))
    return standardized_druglist
Binary file not shown.
Oops, something went wrong.