-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathAccessiontoAlignment.py
82 lines (72 loc) · 5.04 KB
/
AccessiontoAlignment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#! python
#
# Code 2023 by Jamel Simpson
from os import system
from pathlib import Path
from Bio import Entrez
def accession_to_fasta(monomer_file_name, accession, email_for_Bio,subunits, multimer_name='NA'):
"""Takes an accession number and creates a fasta file with the sequence that corresponds with the accession given.
A monomeric file is always created by default for alignment purposes even if a multimer file is requested"""
Entrez.email = email_for_Bio
# Pulling the sequence corresponding with accession numer specified
handle = Entrez.efetch(db='protein', id=accession, retmode='text', rettype='fasta').readlines()
# Turning the retrieved sequence into a single string with no breaks
sequence = ''.join([x for x in handle if x[0] != '>' if x != '']).strip().replace('\n', '')
# Creating a monomer file by default for alignment purposes, if a multimer is requested it's made later
fasta_creation(monomer_file_name, sequence, 1)
if subunits != 1:
fasta_creation(multimer_name, sequence, subunits)
def multiple_sequence_alignment(list_of_fastas, fasta_for_alignment, new_alignment_file, reference_protein_fasta,muscle_command):
"""Creates a multiple sequence alignment using muscle and a concatenated fasta file with a reference fasta as the base,
joined with all fastas specified in list_of_fastas."""
# Creating a copy of the fasta file of a reference_protein_fasta to be added into
system(f'cp {reference_protein_fasta} {fasta_for_alignment}')
# Concatenating the fasta files found in list_of_fasta
for fasta in list_of_fastas:
system(f"cat {fasta} >> {fasta_for_alignment}")
# Using muscle to perform the alignment
system(f'{muscle_command} -in {fasta_for_alignment} -out {new_alignment_file}')
def run_emboss_needle(new_emboss_file, sequence_one, sequence_two, needle_directory):
"""This runs EMBOSS on the command line."""
system(f'{needle_directory} -sprotein -gapopen 10 -gapextend 0.5 '
f'-outfile {new_emboss_file} -asequence asis:{sequence_one} -bsequence asis:{sequence_two}')
def alignment_finder(alignment_file, sequence_of_interest, comparison_protein,
reference_protein, run_emboss='', new_emboss_file=''):
"""Takes a fasta style alignment and a sequence_of_interest from a reference_protein and returns the sequence of the
comparison_protein that is outlined in the boundaries of the sequence_of_interest, as well as the python index boundaries
for the found_alignment of the comparison_protein. reference_protein and comparison_protein must the names following
'>' in the alignment file"""
#Must be in fasta format
with open(alignment_file, "r") as alignment:
alignment = alignment.read().split('>')
# Splitting up the sequences names and sequences into a dictionary
sequence_dictionary = {sequence.split('\n')[0]: ''.join(sequence.split('\n')[1:]) for sequence in alignment if
len(sequence) != 0}
# Recording the specified sequences as variables
reference_sequence = sequence_dictionary[reference_protein]
comparison_sequence = sequence_dictionary[comparison_protein]
# Matching python indexing for the indexing from the alignment with some amount of '-' and indexing in the regular sequence
reference_sequence_indexing = [ind for ind, x in enumerate(reference_sequence) if x != '-']
# Creating a regular sequence without '-'
no_gap_reference_sequence = ''.join([x for ind, x in enumerate(reference_sequence) if x != '-'])
# Boundaries are given in python index
reference_start = reference_sequence_indexing[no_gap_reference_sequence.find(sequence_of_interest)]
reference_end = reference_sequence_indexing[no_gap_reference_sequence.find(sequence_of_interest) + len(sequence_of_interest)]
# Pulling the section of the comparison_sequence that overlaps with the sequence_of_interest
found_alignment = comparison_sequence[reference_start:reference_end].replace('-', '')
no_gap_reference_start = no_gap_reference_sequence.find(sequence_of_interest)
no_gap_reference_end = no_gap_reference_start + len(sequence_of_interest)
# Recording the indexes of the found_alignment
# Additionally the splice_start is the first residue that is spliced,
# and splice_end is the first residue that's not spliced
splice_start = comparison_sequence.replace('-', '').find(found_alignment)
splice_end = splice_start + len(found_alignment)
if run_emboss != '':
run_emboss_needle(new_emboss_file, found_alignment, sequence_of_interest, run_emboss)
return found_alignment, (splice_start, splice_end), (no_gap_reference_start, no_gap_reference_end)
def fasta_creation(file_name, sequence, subunits):
"""Creates a fasta file with the given file_name, and replicates the sequence within it the specified number of times
to create a homo multimer if subunits is greater than 1."""
with open(file_name, 'w') as outfile:
for _unused_x in range(subunits):
outfile.write('>{0}\n{1}\n'.format(Path(file_name).stem, sequence))