-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathP3.py
122 lines (108 loc) · 3.92 KB
/
P3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
"""
PAD Project 2022: P3
Ekaterina Golubeva
Input: dict(tuple(int,int) -> tuple(string,string))
Output: square matrix of pairwise distances (numpy array of floats)
Outline :
- Exception_ACTG_ - input check helper function (allowed nucleotide characters)
- Exception_Format - input check helper function
- Extract_Pairs_Seqs - helper function extracts pairs of sequences from the input dictionary
- Length - calculates the length of the alignment (without dashes)
- P_Dist - calculates p-distance between two sequences
- D_Dist - calculates d-distance between two sequences
- ComputeDistMatrix - main function
"""
import numpy as np
def Exception_ACTG_(string):
    """Report whether a nucleotide sequence contains a disallowed character.

    Allowed characters are the four bases in either case (A, C, G, T)
    and the gap symbol '-'.

    input : string, sequence of nucleotides with gaps
    output: True if there's malformed input, False if everything is ok
            (an empty sequence is considered ok)
    """
    allowed = frozenset("ACGTacgt-")
    # The isinstance test runs first so that non-string (possibly unhashable)
    # items are flagged without ever reaching the set lookup.
    return any(
        not isinstance(letter, str) or letter not in allowed
        for letter in string
    )
def Exception_Format(dict_tuples):
    """Check that the input has the expected type and layout.

    input : dict{tuple(int,int):tuple(string,string)}
    output: True if there's malformed input, False if everything is ok

    The input is reported as malformed when it is not a dict, when a key is
    not a pair of ints, when a value is not a pair of strings, or when
    either string fails the Exception_ACTG_ character check.
    """
    if not isinstance(dict_tuples, dict):
        return True
    for key, value in dict_tuples.items():
        if not isinstance(key, tuple) or not isinstance(value, tuple):
            return True
        # Reject wrong arity up front so the element checks below cannot
        # raise IndexError and over-long tuples do not slip through.
        if len(key) != 2 or len(value) != 2:
            return True
        # Exact type test: bool is a subclass of int but is not a valid index.
        if any(type(index) is not int for index in key):
            return True
        if not all(isinstance(seq, str) for seq in value):
            return True
        # The verdict of the character check must be propagated; ignoring it
        # would let sequences with invalid symbols pass validation.
        if any(Exception_ACTG_(seq) for seq in value):
            return True
    return False
def Extract_Pairs_Seqs(dict, i, j):
    """Look up the sequence pair stored under the key (i, j).

    input : dict, int, int
    output : the first and second sequence of that entry, as a 2-tuple
    """
    entry = dict[i, j]
    return entry[0], entry[1]
def Length(seq1, seq2):
    """Count the alignment columns in which neither sequence has a gap.

    input : seq1,seq2
    output: length (in int type) of the alignment

    Columns are paired with zip, so only positions present in both
    sequences are considered: unpaired trailing positions of a longer
    sequence are neither counted nor cause an IndexError.
    """
    return sum(1 for a, b in zip(seq1, seq2) if a != '-' and b != '-')
def P_Dist(l, seq1, seq2):
    """Compute the p-distance: mismatching gap-free columns divided by l.

    input: length, seq,seq2
           (length is the number of gap-free alignment columns, used as
           the denominator)
    output: p-distance between sequences
    raises: ZeroDivisionError if length is 0, i.e. there is no gap-free
            column to compare
    """
    if l == 0:
        raise ZeroDivisionError(
            "p-distance is undefined: no gap-free alignment columns"
        )
    # Columns containing a gap are skipped. Bases are compared
    # case-insensitively so that 'a' and 'A' are treated as the same
    # nucleotide.
    mismatches = sum(
        1
        for a, b in zip(seq1, seq2)
        if a != '-' and b != '-' and a.upper() != b.upper()
    )
    return mismatches / l
def D_Dist(p, saturation=30.0):
    """Convert a p-distance into an evolutionary distance.

    Uses the Jukes-Cantor style correction d = -3/4 * ln(1 - 4/3 * p).

    input : p-distance
            saturation (optional) - value returned when p >= 3/4, where
            the logarithm is no longer defined; defaults to 30
    output: d-distance (float)
    """
    if p >= 3/4:
        return float(saturation)
    # For p == 0 the product is -0.0; adding 0.0 normalises it to +0.0 so
    # identical sequences do not show up with a negative-zero distance.
    return float(-3/4 * np.log(1 - 4/3*p)) + 0.0
def ComputeDistMatrix(dict):
    """Calculate the evolutionary distances of pairwise aligned sequences.

    Input: dict(tuple(int,int) -> tuple(string,string))
           one entry (i, j) with i < j per pair of sequences
    Output: n x n numpy array of floats; entry [i, j] == [j, i] is the
            d-distance of pair (i, j) and the diagonal is 0
    Raises: Exception("malformed input") if Exception_Format rejects the
            input; KeyError if an expected pair (i, j) is absent
    """
    if Exception_Format(dict):
        raise Exception("malformed input")
    pair_count = len(dict)
    # n sequences give n*(n-1)/2 pairs; solve that for n.
    n = round(1/2 + ((1+8*pair_count)**0.5)/2)  # number of sequences, matrix dimension
    matrix = np.zeros((n, n))
    for i in range(n):
        # Only the upper triangle is computed; each value is mirrored below.
        for j in range(i+1, n):
            seq1, seq2 = Extract_Pairs_Seqs(dict, i, j)
            aligned = Length(seq1, seq2)
            p = P_Dist(aligned, seq1, seq2)
            d = D_Dist(p)
            matrix[i, j] = d
            matrix[j, i] = d
    return matrix