Skip to content

Commit

Permalink
new method rename-chr (#389)
Browse files Browse the repository at this point in the history
  • Loading branch information
kevinrue authored and sebastian-luna-valero committed Apr 5, 2018
1 parent 1663098 commit 4165d66
Showing 1 changed file with 57 additions and 2 deletions.
59 changes: 57 additions & 2 deletions CGAT/scripts/bed2bed.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,14 @@
of end of the contig, the interval is only moved as much as is
possible without doing this.
rename-chr
+++++
Renames chromosome names. Source and target names are supplied as a file
with two columns. Examples are available at:
https://github.com/dpryan79/ChromosomeMappings
Note that unmapped chromosomes are dropped from the output file.
Other options
+++++++++++++
Expand Down Expand Up @@ -138,6 +146,11 @@
cgat bed2bed --method=block -I transcripts.bed.gz -O transcripts.blocked.bed.gz
Rename UCSC chromosomes to ENSEMBL.
cat ucsc.bed | cgat bed2bed --method=rename-chr --rename-chr-file=ucsc2ensembl.txt > ensembl.bed
Usage
-----
Expand All @@ -156,7 +169,7 @@
import CGAT.Intervals as Intervals
from collections import defaultdict as defaultdict
import pysam

import csv

def filterNames(iterator, names):
""" Select only those intervals whose name is in names """
Expand Down Expand Up @@ -464,6 +477,26 @@ def extendInterval(iterator, contigs, distance):
(ninput, noutput, nskipped))


def renameChromosomes(iterator, chr_map):

ninput, noutput, nskipped = 0, 0, 0

for bed in iterator:
ninput += 1

if bed.contig in chr_map.keys():
bed.contig = chr_map[bed.contig]
else:
nskipped += 1
continue

noutput += 1
yield bed

E.info("ninput = %i, noutput=%i, nskipped=%i" %
(ninput, noutput, nskipped))


def main(argv=sys.argv):

parser = E.OptionParser(version="%prog version: $Id: bed2bed.py 2861 2010-02-23 17:36:32Z andreas $",
Expand All @@ -474,7 +507,7 @@ def main(argv=sys.argv):
action="append",
choices=("merge", "filter-genome", "bins",
"block", "sanitize-genome", "shift", "extend",
"filter-names"),
"filter-names", "rename-chr"),
help="method to apply [default=%default]")

parser.add_option("--num-bins", dest="num_bins", type="int",
Expand Down Expand Up @@ -535,11 +568,16 @@ def main(argv=sys.argv):
parser.add_option("--filter-names-file", dest="names", type="string",
help="list of names to keep. One per line")

parser.add_option("--rename-chr-file", dest="rename_chr_file", type="string",
help="mapping table between old and new chromosome names."
"TAB separated 2-column file.")

parser.set_defaults(methods=[],
merge_distance=0,
binning_method="equal-bases",
merge_by_name=False,
genome_file=None,
rename_chr_file=None,
bam_file=None,
num_bins=5,
merge_min_intervals=1,
Expand All @@ -553,6 +591,7 @@ def main(argv=sys.argv):
(options, args) = E.Start(parser, add_pipe_options=True)

contigs = None
chr_map = None

# Why provide full indexed genome, when a tsv of contig sizes would do?
if options.genome_file:
Expand All @@ -562,6 +601,18 @@ def main(argv=sys.argv):
if options.bam_file:
samfile = pysam.AlignmentFile(options.bam_file)
contigs = dict(list(zip(samfile.references, samfile.lengths)))

if options.rename_chr_file:
chr_map = {}
with open(options.rename_chr_file, 'r') as filein:
reader = csv.reader(filein, delimiter='\t')
for row in reader:
if len(row) != 2:
raise ValueError("Mapping table must have exactly two columns")
chr_map[row[0]] = row[1]
if not len(chr_map.keys()) > 0:
raise ValueError("Empty mapping dictionnary")


processor = Bed.iterator(options.stdin)

Expand Down Expand Up @@ -618,6 +669,10 @@ def main(argv=sys.argv):
raise ValueError("please supply list of names to filter")
names = [name.strip() for name in open(options.names)]
processor = filterNames(processor, names)
elif method == "rename-chr":
if not chr_map:
raise ValueError("please supply mapping file")
processor = renameChromosomes(processor, chr_map)

noutput = 0
for bed in processor:
Expand Down

0 comments on commit 4165d66

Please sign in to comment.