Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
sebastian-luna-valero committed May 29, 2018
1 parent 8941d34 commit 30c2eeb
Show file tree
Hide file tree
Showing 8 changed files with 505 additions and 3 deletions.
56 changes: 55 additions & 1 deletion CGAT/tools/bed2bed.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,14 @@
of end of the contig, the interval is only moved as much as is
possible without doing this.
rename-chr
++++++++++
Renames chromosome names. Source and target names are supplied as a file
with two columns. Examples are available at:
https://github.com/dpryan79/ChromosomeMappings
Note that unmapped chromosomes are dropped from the output file.
Other options
+++++++++++++
Expand Down Expand Up @@ -138,6 +146,10 @@
cgat bed2bed --method=block -I transcripts.bed.gz -O transcripts.blocked.bed.gz
Rename UCSC chromosomes to ENSEMBL.
cat ucsc.bed | cgat bed2bed --method=rename-chr --rename-chr-file=ucsc2ensembl.txt > ensembl.bed
Usage
-----
Expand All @@ -156,6 +168,7 @@
import CGAT.Intervals as Intervals
from collections import defaultdict as defaultdict
import pysam
import csv


def filterNames(iterator, names):
Expand Down Expand Up @@ -464,6 +477,26 @@ def extendInterval(iterator, contigs, distance):
(ninput, noutput, nskipped))


def renameChromosomes(iterator, chr_map):

ninput, noutput, nskipped = 0, 0, 0

for bed in iterator:
ninput += 1

if bed.contig in chr_map.keys():
bed.contig = chr_map[bed.contig]
else:
nskipped += 1
continue

noutput += 1
yield bed

E.info("ninput = %i, noutput=%i, nskipped=%i" %
(ninput, noutput, nskipped))


def main(argv=sys.argv):

parser = E.OptionParser(version="%prog version: $Id: bed2bed.py 2861 2010-02-23 17:36:32Z andreas $",
Expand All @@ -474,7 +507,7 @@ def main(argv=sys.argv):
action="append",
choices=("merge", "filter-genome", "bins",
"block", "sanitize-genome", "shift", "extend",
"filter-names"),
"filter-names", "rename-chr"),
help="method to apply [default=%default]")

parser.add_option("--num-bins", dest="num_bins", type="int",
Expand Down Expand Up @@ -535,11 +568,16 @@ def main(argv=sys.argv):
parser.add_option("--filter-names-file", dest="names", type="string",
help="list of names to keep. One per line")

parser.add_option("--rename-chr-file", dest="rename_chr_file", type="string",
help="mapping table between old and new chromosome names."
"TAB separated 2-column file.")

parser.set_defaults(methods=[],
merge_distance=0,
binning_method="equal-bases",
merge_by_name=False,
genome_file=None,
rename_chr_file=None,
bam_file=None,
num_bins=5,
merge_min_intervals=1,
Expand All @@ -553,6 +591,7 @@ def main(argv=sys.argv):
(options, args) = E.start(parser, add_pipe_options=True)

contigs = None
chr_map = None

# Why provide full indexed genome, when a tsv of contig sizes would do?
if options.genome_file:
Expand All @@ -563,6 +602,17 @@ def main(argv=sys.argv):
samfile = pysam.AlignmentFile(options.bam_file)
contigs = dict(list(zip(samfile.references, samfile.lengths)))

if options.rename_chr_file:
chr_map = {}
with open(options.rename_chr_file, 'r') as filein:
reader = csv.reader(filein, delimiter='\t')
for row in reader:
if len(row) != 2:
raise ValueError("Mapping table must have exactly two columns")
chr_map[row[0]] = row[1]
if not len(chr_map.keys()) > 0:
raise ValueError("Empty mapping dictionnary")

processor = Bed.iterator(options.stdin)

for method in options.methods:
Expand Down Expand Up @@ -618,6 +668,10 @@ def main(argv=sys.argv):
raise ValueError("please supply list of names to filter")
names = [name.strip() for name in open(options.names)]
processor = filterNames(processor, names)
elif method == "rename-chr":
if not chr_map:
raise ValueError("please supply mapping file")
processor = renameChromosomes(processor, chr_map)

noutput = 0
for bed in processor:
Expand Down
65 changes: 63 additions & 2 deletions CGAT/tools/gff2gff.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,12 @@
agp file to map coordinates from contigs to scaffolds
``rename-chr``
Renames chromosome names. Source and target names are supplied as a file
with two columns. Examples are available at:
https://github.com/dpryan79/ChromosomeMappings
Note that unmapped chromosomes are dropped from the output file.
Usage
Expand All @@ -110,10 +116,15 @@
The "--skip-missing" option prevents an exception being
raised if entries are found on missing chromosomes
Another example, to rename UCSC chromosomes to ENSEMBL.
cat ucsc.gff
| gff2gff.py --method=rename-chr
--rename-chr-file=ucsc2ensembl.txt > ensembl.gff
Type::
python gff2gff.py --help
cgat gff2gff --help
for command line help.
Expand All @@ -135,6 +146,7 @@
import CGAT.Genomics as Genomics
import CGAT.IndexedFasta as IndexedFasta
import CGAT.Intervals as Intervals
import csv


def combineGFF(gffs,
Expand Down Expand Up @@ -359,6 +371,27 @@ def cropGFF(gffs, filename_gff):
(ninput, noutput, ncropped, ndeleted))


def renameChromosomes(gffs, chr_map):

ninput, noutput, nskipped = 0, 0, 0

for gff in gffs:

ninput += 1

if gff.contig in chr_map.keys():
gff.contig = chr_map[gff.contig]
else:
nskipped += 1
continue

noutput += 1
yield gff

E.info("ninput = %i, noutput=%i, nskipped=%i" %
(ninput, noutput, nskipped))


def main(argv=None):

if argv is None:
Expand All @@ -381,7 +414,8 @@ def main(argv=None):
"merge-features",
"sanitize",
"to-forward-coordinates",
"to-forward-strand"),
"to-forward-strand",
"rename-chr"),
help="method to apply [%default]")

parser.add_option(
Expand Down Expand Up @@ -495,11 +529,17 @@ def main(argv=None):
"--max-features", dest="max_features", type="int",
help="maximum number of features to merge/join [%default].")

parser.add_option(
"--rename-chr-file", dest="rename_chr_file", type="string",
help="mapping table between old and new chromosome names."
"TAB separated 2-column file.")

parser.set_defaults(
input_filename_contigs=False,
filename_crop_gff=None,
input_filename_agp=False,
genome_file=None,
rename_chr_file=None,
add_up_flank=None,
add_down_flank=None,
complement_groups=False,
Expand Down Expand Up @@ -531,6 +571,8 @@ def main(argv=None):

contigs = None
genome_fasta = None
chr_map = None

if options.input_filename_contigs:
contigs = Genomics.readContigSizes(
IOTools.open_file(options.input_filename_contigs, "r"))
Expand All @@ -539,6 +581,18 @@ def main(argv=None):
genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
contigs = genome_fasta.getContigSizes()

if options.rename_chr_file:
chr_map = {}
with open(options.rename_chr_file, 'r') as filein:
reader = csv.reader(filein, delimiter='\t')
for row in reader:
if len(row) != 2:
raise ValueError(
"Mapping table must have exactly two columns")
chr_map[row[0]] = row[1]
if not len(chr_map.keys()) > 0:
raise ValueError("Empty mapping dictionnary")

if options.assembly_report:
df = pd.read_csv(options.assembly_report, comment="#",
header=None, sep="\t")
Expand Down Expand Up @@ -836,6 +890,13 @@ def assemblyReport(id):
len(list(filtered_contigs.keys())),
str(filtered_contigs)))

elif options.method == "rename-chr":
if not chr_map:
raise ValueError("please supply mapping file")

for gff in renameChromosomes(gffs, chr_map):
options.stdout.write(str(gff) + "\n")

else:

for gff in gffs:
Expand Down
Loading

0 comments on commit 30c2eeb

Please sign in to comment.