-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathSca_extraction.py
99 lines (91 loc) · 3.37 KB
/
Sca_extraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import numpy as np
import scaffoldgraph as sg
import networkx as nx
import os
from rdkit.Chem import Draw
from rdkit import Chem
from utils import get_mol
import matplotlib.pyplot as plt
import random
import argparse
def filter_sca(sca):
mol = Chem.MolFromSmiles(sca)
if mol == None:
return None
print("WUXIAO")
ri = mol.GetRingInfo()
benzene_smarts = Chem.MolFromSmiles("c1ccccc1")
if ri.NumRings() == 1 and mol.HasSubstructMatch(benzene_smarts) == True:
return None
elif mol.GetNumHeavyAtoms() > 20:
return None
elif Chem.rdMolDescriptors.CalcNumRotatableBonds(mol) > 3:
return None
else:
return True
def mol_if_equal_sca (mol,sca):
S_sca = []
#Chem.Kekulize(mol)
smile = get_mol(mol)
sca_mol = get_mol(sca)
# smile = Chem.MolFromSmiles(mol)
# sca_mol = Chem.MolFromSmarts(sca)
if smile == None or sca_mol == None :
return False
else:
n_atoms = smile.GetNumAtoms()
index = smile.GetSubstructMatch(sca_mol)
for i in range(n_atoms):
if i in index:
S_sca.append(1)
else:
S_sca.append(0)
arr = np.array(S_sca)
if (arr == 1).all() == True or (arr == 0).all() == True:
return False
else:
return True
def main(args):
network = sg.ScaffoldNetwork.from_smiles_file(args.data)
n_scaffolds = network.num_scaffold_nodes
n_molecules = network.num_molecule_nodes
print('\nGenerated scaffold network from {} molecules with {} scaffolds\n'.format(n_molecules, n_scaffolds))
scaffolds = list(network.get_scaffold_nodes())
molecules = list(network.get_molecule_nodes())
counts = network.get_hierarchy_sizes() # returns a collections Counter object
lists = sorted(counts.items())
x, y = zip(*lists)
plt.figure(figsize=(8, 6))
plt.bar(x, y)
plt.xlabel('Hierarchy')
plt.ylabel('Scaffold Count')
plt.title('Number of Scaffolds per Hierarchy (Network)')
plt.show()
with open(args.save_dir, "w") as f:
total = 0
for pubchem_id in molecules:
predecessors = list(nx.bfs_tree(network, pubchem_id, reverse=True))
smile = network.nodes[predecessors[0]]['smiles']
# 获取smiles_list中元素为smile的索引号
sca = []
for i in range(1, len(predecessors)):
if filter_sca(predecessors[i]) is not None:
sca.append(predecessors[i])
if len(sca) != 0:
random_sca = random.choice(sca)
# for i in range(len(sca)):
# if mol_if_equal_sca(smile,sca[i]) is True:
# f.write(smile + " " + sca[i] +'\n')
# total +=1
if mol_if_equal_sca(smile, random_sca) is True:
f.write(smile + " " + random_sca + '\n')
total += 1
# index +=1
print('\nthe pairs of molecula and scaffold have {} \n'.format(total))
if __name__ == "__main__":
# Argument parser
parser = argparse.ArgumentParser(description='Neural message passing and rnn')
parser.add_argument('--data', default='./data/chembl31.smi', help='dataset path')
parser.add_argument('--save_dir', default='./data/chembl31_sca_randone.smi', help='save model path')
args = parser.parse_args()
main(args)