Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Load entire chromosome at once rather than indvidual sequences #51

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 14 additions & 5 deletions liftoff/extract_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,11 @@ def create_feature_db_connections(args):
disable_genes = True
if args.db is None:
try:
print("Creating GFF db")
feature_db = gffutils.create_db(args.g, args.g + "_db", merge_strategy="create_unique", force=True,
disable_infer_transcripts=disable_transcripts,
disable_infer_genes=disable_genes, verbose=True)

print("Done creating GFF db")
except:
find_problem_line(args.g)
feature_db_name = args.db
Expand Down Expand Up @@ -144,8 +145,10 @@ def add_intermediates(intermediate_ids, intermediate_dict, feature_db):

def get_gene_sequences(parent_dict, ref_chroms, args, liftover_type):
fai = Fasta(args.reference)
print("Getting gene sequences")
if liftover_type == "unplaced":
open(args.dir + "/unplaced_genes.fa", 'w')

for chrom in ref_chroms:
fasta_out = get_fasta_out(chrom, args.reference, liftover_type, args.dir)
sorted_parents = sorted(list(parent_dict.values()), key=lambda x: x.seqid)
Expand All @@ -154,6 +157,7 @@ def get_gene_sequences(parent_dict, ref_chroms, args, liftover_type):
"GFF does not contain any gene features. Use -f to provide a list of other feature types to lift over.")
write_gene_sequences_to_file(chrom, args.reference, fai, sorted_parents, fasta_out, args)
fasta_out.close()
print("Done getting gene sequences")


def get_fasta_out(chrom_name, reference_fasta_name, liftover_type, inter_files):
Expand All @@ -173,18 +177,23 @@ def get_fasta_out(chrom_name, reference_fasta_name, liftover_type, inter_files):


def write_gene_sequences_to_file(chrom_name, reference_fasta_name, reference_fasta_idx, parents, fasta_out, args):
import time
pkerpedjiev marked this conversation as resolved.
Show resolved Hide resolved
if chrom_name == reference_fasta_name:
current_chrom = parents[0].seqid
else:
current_chrom = chrom_name
chrom_seq = reference_fasta_idx[current_chrom]
for parent in parents:

print(f"Loading chrom seq: {current_chrom}")
chrom_seq = reference_fasta_idx[current_chrom][:].seq
for i,parent in enumerate(parents):
if parent.seqid != current_chrom and chrom_name == reference_fasta_name:
current_chrom = parent.seqid
chrom_seq = reference_fasta_idx[current_chrom]
print(f"Loading chrom seq: {current_chrom}")
chrom_seq = reference_fasta_idx[current_chrom][:].seq
print(f"Done loading chrom seq: {current_chrom}")
if parent.seqid == chrom_name or chrom_name == reference_fasta_name:
gene_length = parent.end - parent.start + 1
parent.start = round(max(1, parent.start - args.flank * gene_length))
parent.end = round(min(parent.end + args.flank * gene_length, len(chrom_seq) - 1))
parent_seq = chrom_seq[parent.start - 1: parent.end].seq
parent_seq = chrom_seq[parent.start - 1: parent.end]
fasta_out.write(">" + parent.id + "\n" + str(parent_seq) + "\n")