-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathChEBI_Indexer.py
103 lines (88 loc) · 3.55 KB
/
ChEBI_Indexer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
"""
Usage:
ChEBI_Indexer.py [options] [JSON_LINE_FILES...]
Options:
--index PATH Location of folder to store index. [default: index]
"""
import sqlite3 as sql
import json
import time
import collections
from whoosh.fields import Schema, TEXT, STORED, NUMERIC
import whoosh.index as index
import sys
import mmap
import os.path
from docopt import docopt as magic_docopt
# Parse the command line against the module docstring and bind the result
# explicitly. The original bare ``magic_docopt()`` call relied on the docopt
# implementation injecting a global named ``arguments`` as a side effect.
# NOTE(review): attribute access such as ``arguments.index`` presumes the
# installed ``docopt`` returns an attribute-capable dict (docopt-ng) -- confirm.
arguments = magic_docopt(__doc__)

# Whoosh schema for one ChEBI entity. ``InChI`` and ``definition`` are only
# stored (returned with hits, not searchable); the remaining fields are
# indexed and stored.
chebi = Schema(
    ChEBI=NUMERIC(int, 32, signed=False, stored=True),
    InChI=STORED,
    InChIKey=TEXT(stored=True),
    SMILES=TEXT(stored=True),
    mass=NUMERIC(float, 32, stored=True),
    charge=NUMERIC(int, 32, signed=True, stored=True),
    formula=TEXT(stored=True),
    names=TEXT(stored=True),
    CAS=TEXT(stored=True),
    definition=STORED
)

# Open the index when the folder already contains one; otherwise create the
# folder (including missing parents) and a fresh index in it. Checking for an
# actual index rather than for the folder alone keeps an existing-but-empty
# folder from making ``open_dir`` fail.
if index.exists_in(arguments.index):
    ix = index.open_dir(arguments.index)
else:
    os.makedirs(arguments.index, exist_ok=True)
    ix = index.create_in(arguments.index, chebi)

# With no input files on the command line, fall back to the split JSON-lines
# files under ``chebi_split/``.
if not arguments.JSON_LINE_FILES:
    import glob
    arguments["JSON_LINE_FILES"] = glob.glob("chebi_split/*.json")

# Shared index writer used by the ingestion loop; it must be committed there.
writer = ix.writer(procs=2, limitmb=512, multisegment=True)
class ChEBIDatastore(object):
    """SQLite-backed store of ChEBI entities, one row per entity in ``chebi_table``.

    On construction the table and an ascending index on its ``ChEBI`` column
    are created if they do not exist yet.
    """

    # Values are bound by column name, so the key order of the mapping passed
    # to ``add_chebi`` cannot silently put a value into the wrong column.
    _INSERT = (
        "INSERT INTO chebi_table "
        "(ChEBI, names, mass, charge, InChI, InChIKey, SMILES, formula, CAS, definition) "
        "VALUES (:ChEBI, :names, :mass, :charge, :InChI, :InChIKey, :SMILES, "
        ":formula, :CAS, :definition)"
    )

    def __init__(self, db_name="chebi_v0.db"):
        """Open (or create) the database file *db_name* and ensure the schema exists."""
        self.conn = sql.connect(db_name)
        init_table = "CREATE TABLE IF NOT EXISTS chebi_table (ChEBI INT, names TEXT, mass REAL, charge INT, InChI TEXT, InChIKey TEXT, SMILES TEXT, formula TEXT, CAS TEXT, definition TEXT)"
        self.conn.execute(init_table)
        create_index = "CREATE INDEX IF NOT EXISTS chebi_index ON chebi_table (ChEBI ASC)"
        self.conn.execute(create_index)
        self.conn.commit()
        self.cursor = self.conn.cursor()

    def get_chebi_id(self, startid):
        """Return every row whose ``ChEBI`` column equals *startid*.

        Returns:
            A list of row tuples in table column order; empty when no row matches.
        """
        # Bound parameter instead of string formatting: no SQL is built from input.
        self.cursor.execute("SELECT * FROM chebi_table WHERE ChEBI = ?", (startid,))
        return self.cursor.fetchall()

    def get_chebi_name(self, name):
        """Return every row whose ``names`` column matches the LIKE pattern *name*.

        *name* is handed to SQL ``LIKE`` unchanged, so the caller supplies any
        ``%`` / ``_`` wildcards (e.g. ``"%water%"``).

        Returns:
            A list of row tuples in table column order; empty when no row matches.
        """
        # The original statement referenced ``chebi-table`` and a ``name``
        # column, neither of which exists in the schema created above.
        self.cursor.execute("SELECT * FROM chebi_table WHERE names LIKE ?", (name,))
        return self.cursor.fetchall()

    def add_chebi(self, entity):
        """Insert one entity and commit.

        Args:
            entity: mapping with the keys ``ChEBI``, ``names``, ``mass``,
                ``charge``, ``InChI``, ``InChIKey``, ``SMILES``, ``formula``,
                ``CAS`` and ``definition``; key order is irrelevant.

        Returns:
            None (the result of ``commit``).
        """
        self.cursor.execute(self._INSERT, entity)
        return self.conn.commit()
def chebi_record_to_document(res):
    """Flatten one parsed ChEBI JSON record into the field mapping that gets indexed and stored.

    Args:
        res: dict decoded from one JSON line. ``ChEBI ID``, ``ChEBI Name``,
            ``Mass``, ``Charge`` and a non-empty ``SMILES`` list are required;
            ``IUPAC Names``, ``Synonyms``, ``InChI``, ``InChIKey``,
            ``Formulae``, ``Definition`` and ``CAS Registry Numbers`` are
            optional and default to empty.

    Returns:
        A dict keyed by field name. Keys are emitted in the column order of
        ``chebi_table`` (``CAS`` before ``definition``) so that a positional
        insert of ``values()`` lands every value in its own column.

    Raises:
        KeyError: a required key is missing.
        IndexError: ``SMILES`` is present but empty.
    """
    all_names = [res["ChEBI Name"], *res.get("IUPAC Names", []), *res.get("Synonyms", [])]
    return {
        "ChEBI": res["ChEBI ID"],
        "names": "; ".join(all_names),
        "mass": res["Mass"],
        "charge": res["Charge"],
        # join() accepts either a plain string or a list of string fragments.
        "InChI": "".join(res.get("InChI", "")),
        "InChIKey": "".join(res.get("InChIKey", "")),
        # Only the first SMILES string is kept.
        "SMILES": res["SMILES"][0],
        "formula": " ".join(res.get("Formulae", [])),
        "CAS": " ".join(res.get("CAS Registry Numbers", [])),
        "definition": res.get("Definition", ""),
    }


if __name__ == "__main__":
    datastore = ChEBIDatastore()
    for JL_FILE in arguments.JSON_LINE_FILES:
        skipped = 0
        first_error = None
        # Plain buffered line iteration: needs only read permission, works on
        # empty files, and the handle is closed by the ``with`` block.
        with open(JL_FILE, "r", encoding="utf-8") as f:
            for line in f:
                # A blank line is skipped rather than ending the whole file.
                if not line.strip():
                    continue
                try:
                    out = chebi_record_to_document(json.loads(line))
                    writer.add_document(**out)
                    # The SQLite store keeps at most one row per ChEBI id.
                    if not datastore.get_chebi_id(out["ChEBI"]):
                        datastore.add_chebi(out)
                except Exception as exc:
                    # Ingestion stays best-effort: a bad record is skipped, but
                    # it is counted and reported instead of vanishing silently.
                    # KeyboardInterrupt / SystemExit are no longer swallowed.
                    skipped += 1
                    if first_error is None:
                        first_error = exc
        if skipped:
            print(
                "%s: skipped %d record(s); first error: %r" % (JL_FILE, skipped, first_error),
                file=sys.stderr,
            )
    writer.commit()