forked from conmeehan/pathophy
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpairwiseCluster.py
177 lines (161 loc) · 5.11 KB
/
pairwiseCluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
#! /usr/bin/env python
import sys
import re
from collections import defaultdict
#script to cluster sequences based on a pairwise distance file
#Script takes the output from pairwiseDist.py and a cut-off
#Output is
#A file listing all sequences that are within the cut-off from another sequence
#A file where each line is a list of sequences within cut-off distance of each other (tight cluster)
#A file where each line is a list of sequences where at each sequence is within the cut-off from at least 1 other (e.g. if cutoff is 5 then seq1 and seq2 could be 5 apart and seq3 could be 10 from seq2 and 3 from seq1 but all on same line) (loose cluster)
#NOTE: cut-off is used as <= so if cutoff is 5 then those a distance of 5 will be counted as clustered
#NOTE2: NA can be an entry (e.g. for a tip distance that is not supported by bootstrap or unequal lengths of patterns) so if it is, count as not within cutoff
#usage
#python pairwiseCluster.py pairwiseDistFile cut-off
#subroutine to remove duplicates from an array
def unique(array):
uniq = list(set(array))
seen = {}
for item in array:
seen[item] = seen.get(item, 0) + 1
uniq = seen.keys()
seen = {}
uniq = []
for item in array:
count = seen.get(item, 0)
if count == 0:
uniq.append(item)
seen[item] = count + 1
return uniq
#read in file
try:
pwMxF=open(sys.argv[1], 'r')
except IndexError:
print("\n pairwise distance matrix not supplied.")
sys.exit()
except IOError:
print("\n pairwise distance matrix not found in directory.")
sys.exit()
#get the cut-off
try:
cutoff = float(sys.argv[2])
except IndexError:
print("\ncluster distance cut-off not supplied.")
sys.exit()
except ValueError:
print("\ncluster distance cut-off is not a number.")
sys.exit()
#create save files
try:
saveList=open("samplesInCluster_"+sys.argv[2]+"cutoff.txt",'w')
except IOError:
print('no room for save file')
sys.exit()
try:
saveTightCluster=open("tightClusters_"+sys.argv[2]+"cutoff.txt",'w')
except IOError:
print('no room for save file')
sys.exit()
try:
saveLooseCluster=open("looseClusters_"+sys.argv[2]+"cutoff.txt",'w')
except IOError:
print('no room for save file')
sys.exit()
#read in the pairwise matrix and for each sequence keep a dictionary of what others they are in the cluster with
#keep a list of all that are in a cluster
first=0
names=0
clusters=defaultdict(list)
inCluster=[]
while 1:
s=pwMxF.readline()
if not s:
break
s=s.rstrip()
sections=s.split("\t")
if first==0:#header line
names=sections
names.pop(0)
first=1
continue
dists=list(sections)
seqName=dists.pop(0)
seqPos=names.index(seqName)
for i in range(len(dists)):
if i==seqPos: #skip the self hits
continue
elif dists[i]=="NA": #is a distance that is not applicable so skip
continue
elif float(dists[i])<=cutoff:
clusters[seqName].append(names[i])
inCluster.append(names[i])
pwMxF.close()
#create a unique list of those in clusters and print to file
inCluster=unique(inCluster)
inCluster.sort()
saveList.write("\n".join(inCluster)+"\n")
saveList.close()
#turn the dictionary into a unique sorted list of sorted strings (tight clusters) and sorted lists(loose clusters)
#the strings are made unique and these are the tight clusters
stringClust=[]
for seq in clusters:
temp=list(clusters[seq])
temp.append(seq)
temp.sort()
stringClust.append("\t".join(temp))
stringClust=unique(stringClust)
stringClust.sort()
#save to file the clusters
for sc in stringClust:
saveTightCluster.write(sc+"\n")
saveTightCluster.close()
#loose groups are formed by going sample by sample and adding all the other samples they are in a cluster with to a loose cluster. If none are in an existing cluster, a new one is started
looseClusters=[]
for sample in inCluster:
newClust=list(clusters[sample])
newClust.append(sample)
sNC=frozenset(newClust) #create a set version of the list
if looseClusters==[]: #no existing clusters
looseClusters.append(sNC)
else:
#check all samples in this cluster against samples already in the loose clusters list. If any are there, add all to that loose cluster
found=0
for lcNum in range(len(looseClusters)):
slc=looseClusters[lcNum]
if sNC.intersection(slc):
newLC=sNC.union(slc)
looseClusters[lcNum]=newLC
found=1
break
if found==0:
looseClusters.append(sNC)
#there is a possibility that some overlaps were missed due to order of adding them together so do one extra round of additions
looseClusters_final=[]
skip=[]
for lcNum in range(len(looseClusters)):
if lcNum in skip:
continue
else:
found=0
lc1=looseClusters[lcNum]
for lcNum2 in range(lcNum+1,len(looseClusters)):
lc2=looseClusters[lcNum2]
if lc1.intersection(lc2):
newLC=lc1.union(lc2)
looseClusters_final.append(newLC)
found=1
skip.append(lcNum2)
break
if found==0:
looseClusters_final.append(lc1)
#create a unique list for each loose cluster and save as strings
looseClustersStrings=[]
for slc in looseClusters_final:
lc=unique(list(slc))
lc.sort()
looseClustersStrings.append("\t".join(lc))
looseClustersStrings.sort()
#save to file the clusters
for lcs in looseClustersStrings:
saveLooseCluster.write(lcs+"\n")
saveLooseCluster.close()