From 0298d7438db0c97f0e1abef3a7165ecbfd47d827 Mon Sep 17 00:00:00 2001 From: Peter Kerpedjiev Date: Fri, 30 Oct 2020 16:27:50 +0000 Subject: [PATCH 1/3] Load entire chrom before individual genes --- liftoff/extract_features.py | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/liftoff/extract_features.py b/liftoff/extract_features.py index 29661c8..3812723 100644 --- a/liftoff/extract_features.py +++ b/liftoff/extract_features.py @@ -31,10 +31,11 @@ def create_feature_db_connections(args): disable_genes = True if args.db is None: try: + print("creating db") feature_db = gffutils.create_db(args.g, args.g + "_db", merge_strategy="create_unique", force=True, disable_infer_transcripts=disable_transcripts, disable_infer_genes=disable_genes, verbose=True) - + print('done') except: find_problem_line(args.g) feature_db_name = args.db @@ -144,19 +145,27 @@ def add_intermediates(intermediate_ids, intermediate_dict, feature_db): def get_gene_sequences(parent_dict, ref_chroms, args, liftover_type): fai = Fasta(args.reference) + print("getting gene sequences") if liftover_type == "unplaced": open(args.dir + "/unplaced_genes.fa", 'w') + + print("ref_chroms:", ref_chroms) for chrom in ref_chroms: + print("chrom:", chrom) fasta_out = get_fasta_out(chrom, args.reference, liftover_type, args.dir) + print("fasta_out:", fasta_out) sorted_parents = sorted(list(parent_dict.values()), key=lambda x: x.seqid) + print("len(sorted_parents)", len(sorted_parents)) if len(sorted_parents) == 0: sys.exit( "GFF does not contain any gene features. Use -f to provide a list of other feature types to lift over.") write_gene_sequences_to_file(chrom, args.reference, fai, sorted_parents, fasta_out, args) fasta_out.close() + print("done getting gene sequences") def get_fasta_out(chrom_name, reference_fasta_name, liftover_type, inter_files): + print("liftover_type:", liftover_type) if chrom_name == reference_fasta_name and (liftover_type == "chrm_by_chrm" or liftover_type == "copies"): fasta_out_name = "reference_all" elif liftover_type == "unmapped": @@ -173,18 +182,28 @@ def get_fasta_out(chrom_name, reference_fasta_name, liftover_type, inter_files): def write_gene_sequences_to_file(chrom_name, reference_fasta_name, reference_fasta_idx, parents, fasta_out, args): + import time if chrom_name == reference_fasta_name: current_chrom = parents[0].seqid else: current_chrom = chrom_name - chrom_seq = reference_fasta_idx[current_chrom] - for parent in parents: + t1 = time.time() + chrom_seq = reference_fasta_idx[current_chrom][:].seq + t2 = time.time() + print(f"loaded chrom_seq: {t2 - t1}") + for i,parent in enumerate(parents): + print("i:", i) if parent.seqid != current_chrom and chrom_name == reference_fasta_name: current_chrom = parent.seqid - chrom_seq = reference_fasta_idx[current_chrom] + chrom_seq = reference_fasta_idx[current_chrom][:].seq + print("chrom_seq:", chrom_seq) if parent.seqid == chrom_name or chrom_name == reference_fasta_name: gene_length = parent.end - parent.start + 1 parent.start = round(max(1, parent.start - args.flank * gene_length)) parent.end = round(min(parent.end + args.flank * gene_length, len(chrom_seq) - 1)) - parent_seq = chrom_seq[parent.start - 1: parent.end].seq + t1 = time.time() + parent_seq = chrom_seq[parent.start - 1: parent.end] + t2 = time.time() + length = parent.end - parent.start + 1 + print(f'read: {parent.start - 1} {parent.end} ({length}) time: {t2 - t1}') fasta_out.write(">" + parent.id + "\n" + str(parent_seq) + "\n") From 981c48f39dd24ee8b35e4083cbdb672118b0d2c1 Mon Sep 17 00:00:00 2001 From: Peter Kerpedjiev Date: Fri, 30 Oct 2020 19:24:44 +0000 Subject: [PATCH 2/3] Added logging --- liftoff/extract_features.py | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/liftoff/extract_features.py b/liftoff/extract_features.py index 3812723..183c069 100644 --- a/liftoff/extract_features.py +++ b/liftoff/extract_features.py @@ -31,11 +31,11 @@ def create_feature_db_connections(args): disable_genes = True if args.db is None: try: - print("creating db") + print("Creating GFF db") feature_db = gffutils.create_db(args.g, args.g + "_db", merge_strategy="create_unique", force=True, disable_infer_transcripts=disable_transcripts, disable_infer_genes=disable_genes, verbose=True) - print('done') + print("Done creating GFF db") except: find_problem_line(args.g) feature_db_name = args.db @@ -145,27 +145,22 @@ def add_intermediates(intermediate_ids, intermediate_dict, feature_db): def get_gene_sequences(parent_dict, ref_chroms, args, liftover_type): fai = Fasta(args.reference) - print("getting gene sequences") + print("Getting gene sequences") if liftover_type == "unplaced": open(args.dir + "/unplaced_genes.fa", 'w') - print("ref_chroms:", ref_chroms) for chrom in ref_chroms: - print("chrom:", chrom) fasta_out = get_fasta_out(chrom, args.reference, liftover_type, args.dir) - print("fasta_out:", fasta_out) sorted_parents = sorted(list(parent_dict.values()), key=lambda x: x.seqid) - print("len(sorted_parents)", len(sorted_parents)) if len(sorted_parents) == 0: sys.exit( "GFF does not contain any gene features. Use -f to provide a list of other feature types to lift over.") write_gene_sequences_to_file(chrom, args.reference, fai, sorted_parents, fasta_out, args) fasta_out.close() - print("done getting gene sequences") + print("Done getting gene sequences") def get_fasta_out(chrom_name, reference_fasta_name, liftover_type, inter_files): - print("liftover_type:", liftover_type) if chrom_name == reference_fasta_name and (liftover_type == "chrm_by_chrm" or liftover_type == "copies"): fasta_out_name = "reference_all" elif liftover_type == "unmapped": @@ -187,23 +182,18 @@ def write_gene_sequences_to_file(chrom_name, reference_fasta_name, reference_fas current_chrom = parents[0].seqid else: current_chrom = chrom_name - t1 = time.time() + + print(f"Loading chrom seq: {current_chrom}") chrom_seq = reference_fasta_idx[current_chrom][:].seq - t2 = time.time() - print(f"loaded chrom_seq: {t2 - t1}") for i,parent in enumerate(parents): - print("i:", i) if parent.seqid != current_chrom and chrom_name == reference_fasta_name: current_chrom = parent.seqid + print(f"Loading chrom seq: {current_chrom}") chrom_seq = reference_fasta_idx[current_chrom][:].seq - print("chrom_seq:", chrom_seq) + print(f"Done loading chrom seq: {current_chrom}") if parent.seqid == chrom_name or chrom_name == reference_fasta_name: gene_length = parent.end - parent.start + 1 parent.start = round(max(1, parent.start - args.flank * gene_length)) parent.end = round(min(parent.end + args.flank * gene_length, len(chrom_seq) - 1)) - t1 = time.time() parent_seq = chrom_seq[parent.start - 1: parent.end] - t2 = time.time() - length = parent.end - parent.start + 1 - print(f'read: {parent.start - 1} {parent.end} ({length}) time: {t2 - t1}') fasta_out.write(">" + parent.id + "\n" + str(parent_seq) + "\n") From 07f34193e95b83886433702e136454b8ba2cda40 Mon Sep 17 00:00:00 2001 From: Peter Kerpedjiev Date: Fri, 30 Oct 2020 15:31:20 -0400 Subject: [PATCH 3/3] Update liftoff/extract_features.py --- liftoff/extract_features.py | 1 - 1 file changed, 1 deletion(-) diff --git a/liftoff/extract_features.py b/liftoff/extract_features.py index 183c069..a60d4d9 100644 --- a/liftoff/extract_features.py +++ b/liftoff/extract_features.py @@ -177,7 +177,6 @@ def get_fasta_out(chrom_name, reference_fasta_name, liftover_type, inter_files): def write_gene_sequences_to_file(chrom_name, reference_fasta_name, reference_fasta_idx, parents, fasta_out, args): - import time if chrom_name == reference_fasta_name: current_chrom = parents[0].seqid else: