Skip to content

Latest commit

 

History

History
246 lines (186 loc) · 8.71 KB

spacer_finder_Burstein.md

File metadata and controls

246 lines (186 loc) · 8.71 KB
jupyter
jupytext kernelspec
formats text_representation
ipynb,md
extension format_name format_version jupytext_version
.md
markdown
1.3
1.13.0
display_name language name
Python 3 (ipykernel)
python
python3

Detect repeats and spacers in Burstein2016 data

  • BLAST their detected CRISPR repeats against metagenomic data
  • Detect spacers, get abundance, measure turnover
  • Detect protospacers?
from Bio.Blast.Applications import NcbiblastnCommandline
#help(NcbiblastnCommandline)

from Bio.Seq import Seq
from Bio import SeqIO
import os
from tqdm import tqdm
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
def extract_spacers(fasta_file, blast_data, spacer_len = 30, similarity_threshold = 0.85, progress_bar = True):
    """
    First draft copied from spacer_finder.ipynb, designed for S. thermophilus 100 read length data
    
    Given the BLAST output from blasting the CRISPR repeat against a fasta file, take that fasta file and the 
    BLAST output and extract the neighbouring spacers.
    
    Inputs:
    fasta_file : the fasta file of reads corresponding to blast_data, i.e. SRR1873863.fasta. This is assumed to have reads
                      of length 100. 
    blast_data : a pandas dataframe of the BLAST results from blasting the repeat against fasta_file
    spacer_len : the expected spacer length - the shortest spacer will be length >= similarity_threshold * spacer_len, 
                      the longest spacer will be length spacer_len unless the distance between two repeats on the 
                      same read is longer.
                      
    Returns:
    spacers : a list of all the spacers detected next to a CRISPR repeat
    """
    
    min_length = round(spacer_len * similarity_threshold)
    
    reverse = False

    spacers = []
    ids = []

    if progress_bar == True:
        wrapper = tqdm
    if progress_bar == False:
        def wrapper(arg):
            return arg
    
    hit_ids = list(blast_data['subject_id'])
    
    for record in wrapper(SeqIO.parse(fasta_file, "fasta")):
        
        if record.id not in hit_ids:
            continue
        
        data = blast_data[blast_data['subject_id'] == record.id] # select matches from blast result

        if len(data) == 1: # only one match

            subject_start = int(data['subject_start'])
            subject_end = int(data['subject_end'])

            # check if reversed
            if subject_start > subject_end: # match is to reverse complement
                reverse = True 
            else:
                reverse = False

            start = min(subject_start, subject_end)
            end = max(subject_start, subject_end)

            repeat = record.seq[start -1 : end]

            # get the spacer at the start of the read
            if start > min_length + 1: 
                spacer_start = max(start - spacer_len - 1, 0)
                spacer = record.seq[spacer_start : start -1]
                if reverse == True:
                    spacers.append(spacer.reverse_complement())
                else:
                    spacers.append(spacer)

                ids.append(str(record.id) + "_a")
                
            # get the spacer at the end of the read
            if 100 - end > min_length:
                spacer = record.seq[end : end + spacer_len]
                if reverse == True:
                    spacers.append(spacer.reverse_complement())
                else:
                    spacers.append(spacer)
                
                ids.append(str(record.id) + "_b")

        elif len(data) == 2: # multiple repeat matches, get the spacer in between

            # check if reversed       
            if int(data.iloc[0]['subject_start']) > int(data.iloc[0]['subject_end']): # match is to reverse complement
                reverse = True    

                spacer_start = min(data['subject_start'])
                spacer_end = max(data['subject_end'])

                spacer = record.seq[spacer_start : spacer_end - 1]
                spacers.append(spacer.reverse_complement())
                #print(spacer)

            else:
                reverse = False
                spacer_start = min(data['subject_end'])
                spacer_end = max(data['subject_start'])

                spacer = record.seq[spacer_start : spacer_end - 1]
                spacers.append(spacer)

                #print(spacer)

            ids.append(str(record.id))
            
        else: # can't think that it's possible to have more than 2 hits, but if it happens check it out
            print(record.id)
            print(data)
            
    return spacers, ids

BLAST Output format:

http://www.metagenomics.wiki/tools/blast/blastn-output-format-6

Column headers: qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore

1. qseqid query (e.g., gene) sequence id
2. sseqid subject (e.g., reference genome) sequence id
3. pident percentage of identical matches
4. length alignment length
5. mismatch number of mismatches
6. gapopen number of gap openings
7. qstart start of alignment in query
8. qend end of alignment in query
9. sstart start of alignment in subject
10. send end of alignment in subject
11. evalue expect value
12. bitscore bit score
# load detected CRISPR repeats from paper
# source: https://static-content.springer.com/esm/art%3A10.1038%2Fncomms10613/MediaObjects/41467_2016_BFncomms10613_MOESM952_ESM.xlsx
Burstein_repeats = pd.read_csv("41467_2016_BFncomms10613_MOESM952_ESM.csv", skiprows = [0,1])
Burstein_repeats['query_id'] = Burstein_repeats['Genome bin'] + "_" + Burstein_repeats['Array location']
# get reverse complement of repeat sequence
repeat_reverse_complements = []
for repeat in Burstein_repeats['Repeat sequence']:
    repeat = Seq(repeat)
    repeat_reverse_complements.append(str(repeat.reverse_complement()))
    
Burstein_repeats['Repeat reverse complement'] = repeat_reverse_complements
# there are 197 genomes and 144 unique repeats
Burstein_repeats_unique = Burstein_repeats.drop_duplicates(subset = 'Repeat sequence')
Burstein_repeats_unique
Burstein_repeats_unique[Burstein_repeats_unique['Repeat sequence'].isin(Burstein_repeats_unique['Repeat reverse complement'])]
# these are the queries not included in the blast - duplicated repeats
duplicated_queries = Burstein_repeats[~Burstein_repeats['query_id'].isin(Burstein_repeats_unique['query_id'])]['query_id']
# make fasta file from repeat data
datapath = "/media/madeleine/My Passport/Data/Burstein2016"

with open("%s/Burstein_repeats_unique.fasta" %(datapath), 'w') as f:
    for i, row in Burstein_repeats_unique.iterrows(): # duplicates occur at different time points
        f.write(">" + str(row['Genome bin']) + "_" + str(row['Array location']) + "\n")
        f.write(str(row['Repeat sequence']) + "\n")

Note: I ran up to 173/346 of query suffix 1 for SRR1658462 with the repeat files in "Burstein_repeats_split", but after that I removed duplicate repeats and created new split files to run on. This should not affect the results because the query naming convention has not changed, just the total number of queries and which ones are in which query suffix split file.

query_suffixes = ["1", "2", "3", "4"]

accession_list = ["SRR1658343", "SRR1658462", "SRR1658465", "SRR1658467", "SRR1658469", "SRR1658472"]

datapath = "/media/madeleine/My Passport1/Data/Burstein2016"

for accession in accession_list:
    
    done_files = !ls -t "$datapath/$accession"_repeat_blast_out | tail -n +2 | cut -d "." -f1 # exclude the most recent file - might be partially complete
    
    for s in query_suffixes:
        query = "%s/Burstein_repeats_%s" %(datapath, s)
        for fn in tqdm(os.listdir("%s/%s" %(datapath, accession))):
            
            if "%s_blast_%s" %(fn, s) in done_files:
                continue
            
            outfile = "%s/%s_repeat_blast_out/%s_blast_%s.txt" %(datapath, accession, fn, s)
            filename = "%s/%s/%s" %(datapath, accession, fn)
            
            blastn_obj = NcbiblastnCommandline(query=query, subject=filename, 
                               evalue=10**-4, #num_descriptions = 100, 
                               max_target_seqs = 500000, dust = "no", 
                               # see http://www.metagenomics.wiki/tools/blast/blastn-output-format-6 for outformat
                               outfmt = 6, 
                               task = "blastn",
                               out = outfile)

            stdout, stderr = blastn_obj()