migrating CGATOxford/cgat#389 and CGATOxford/cgat#390

cgat-developers · May 29, 2018 · 30c2eeb · 30c2eeb
1 parent 8941d34
commit 30c2eeb
Show file tree

Hide file tree

Showing 8 changed files with 505 additions and 3 deletions.
diff --git a/CGAT/tools/bed2bed.py b/CGAT/tools/bed2bed.py
@@ -104,6 +104,14 @@
 of end of the contig, the interval is only moved as much as is
 possible without doing this.
 
+rename-chr
+++++++++++
+
+Renames chromosome names. Source and target names are supplied as a file
+with two columns. Examples are available at:
+https://github.com/dpryan79/ChromosomeMappings
+Note that unmapped chromosomes are dropped from the output file.
+
 Other options
 +++++++++++++
 
@@ -138,6 +146,10 @@
 
     cgat bed2bed --method=block -I transcripts.bed.gz -O transcripts.blocked.bed.gz
 
+Rename UCSC chromosomes to ENSEMBL.
+
+    cat ucsc.bed | cgat bed2bed --method=rename-chr --rename-chr-file=ucsc2ensembl.txt > ensembl.bed
+
 Usage
 -----
 
@@ -156,6 +168,7 @@
 import CGAT.Intervals as Intervals
 from collections import defaultdict as defaultdict
 import pysam
+import csv
 
 
 def filterNames(iterator, names):
@@ -464,6 +477,26 @@ def extendInterval(iterator, contigs, distance):
            (ninput, noutput, nskipped))
 
 
+def renameChromosomes(iterator, chr_map):
+
+    ninput, noutput, nskipped = 0, 0, 0
+
+    for bed in iterator:
+        ninput += 1
+
+        if bed.contig in chr_map.keys():
+            bed.contig = chr_map[bed.contig]
+        else:
+            nskipped += 1
+            continue
+
+        noutput += 1
+        yield bed
+
+    E.info("ninput = %i, noutput=%i, nskipped=%i" %
+           (ninput, noutput, nskipped))
+
+
 def main(argv=sys.argv):
 
     parser = E.OptionParser(version="%prog version: $Id: bed2bed.py 2861 2010-02-23 17:36:32Z andreas $",
@@ -474,7 +507,7 @@ def main(argv=sys.argv):
                       action="append",
                       choices=("merge", "filter-genome", "bins",
                                "block", "sanitize-genome", "shift", "extend",
-                               "filter-names"),
+                               "filter-names", "rename-chr"),
                       help="method to apply [default=%default]")
 
     parser.add_option("--num-bins", dest="num_bins", type="int",
@@ -535,11 +568,16 @@ def main(argv=sys.argv):
     parser.add_option("--filter-names-file", dest="names", type="string",
                       help="list of names to keep. One per line")
 
+    parser.add_option("--rename-chr-file", dest="rename_chr_file", type="string",
+                      help="mapping table between old and new chromosome names."
+                      "TAB separated 2-column file.")
+
     parser.set_defaults(methods=[],
                         merge_distance=0,
                         binning_method="equal-bases",
                         merge_by_name=False,
                         genome_file=None,
+                        rename_chr_file=None,
                         bam_file=None,
                         num_bins=5,
                         merge_min_intervals=1,
@@ -553,6 +591,7 @@ def main(argv=sys.argv):
     (options, args) = E.start(parser, add_pipe_options=True)
 
     contigs = None
+    chr_map = None
 
     # Why provide full indexed genome, when a tsv of contig sizes would do?
     if options.genome_file:
@@ -563,6 +602,17 @@ def main(argv=sys.argv):
         samfile = pysam.AlignmentFile(options.bam_file)
         contigs = dict(list(zip(samfile.references, samfile.lengths)))
 
+    if options.rename_chr_file:
+        chr_map = {}
+        with open(options.rename_chr_file, 'r') as filein:
+            reader = csv.reader(filein, delimiter='\t')
+            for row in reader:
+                if len(row) != 2:
+                    raise ValueError("Mapping table must have exactly two columns")
+                chr_map[row[0]] = row[1]
+        if not len(chr_map.keys()) > 0:
+            raise ValueError("Empty mapping dictionnary")
+
     processor = Bed.iterator(options.stdin)
 
     for method in options.methods:
@@ -618,6 +668,10 @@ def main(argv=sys.argv):
                 raise ValueError("please supply list of names to filter")
             names = [name.strip() for name in open(options.names)]
             processor = filterNames(processor, names)
+        elif method == "rename-chr":
+            if not chr_map:
+                raise ValueError("please supply mapping file")
+            processor = renameChromosomes(processor, chr_map)
 
     noutput = 0
     for bed in processor:

diff --git a/CGAT/tools/gff2gff.py b/CGAT/tools/gff2gff.py
@@ -89,6 +89,12 @@
 
     agp file to map coordinates from contigs to scaffolds
 
+``rename-chr``
+
+    Renames chromosome names. Source and target names are supplied as a file
+    with two columns. Examples are available at:
+    https://github.com/dpryan79/ChromosomeMappings
+    Note that unmapped chromosomes are dropped from the output file.
 
 
 Usage
@@ -110,10 +116,15 @@
 The "--skip-missing" option prevents an exception being
 raised if entries are found on missing chromosomes
 
+Another example, to rename UCSC chromosomes to ENSEMBL.
+
+    cat ucsc.gff
+    | gff2gff.py --method=rename-chr
+                 --rename-chr-file=ucsc2ensembl.txt > ensembl.gff
 
 Type::
 
-   python gff2gff.py --help
+   cgat gff2gff --help
 
 for command line help.
 
@@ -135,6 +146,7 @@
 import CGAT.Genomics as Genomics
 import CGAT.IndexedFasta as IndexedFasta
 import CGAT.Intervals as Intervals
+import csv
 
 
 def combineGFF(gffs,
@@ -359,6 +371,27 @@ def cropGFF(gffs, filename_gff):
            (ninput, noutput, ncropped, ndeleted))
 
 
+def renameChromosomes(gffs, chr_map):
+
+    ninput, noutput, nskipped = 0, 0, 0
+
+    for gff in gffs:
+
+        ninput += 1
+
+        if gff.contig in chr_map.keys():
+            gff.contig = chr_map[gff.contig]
+        else:
+            nskipped += 1
+            continue
+
+        noutput += 1
+        yield gff
+
+    E.info("ninput = %i, noutput=%i, nskipped=%i" %
+           (ninput, noutput, nskipped))
+
+
 def main(argv=None):
 
     if argv is None:
@@ -381,7 +414,8 @@ def main(argv=None):
                           "merge-features",
                           "sanitize",
                           "to-forward-coordinates",
-                          "to-forward-strand"),
+                          "to-forward-strand",
+                          "rename-chr"),
                       help="method to apply [%default]")
 
     parser.add_option(
@@ -495,11 +529,17 @@ def main(argv=None):
         "--max-features", dest="max_features", type="int",
         help="maximum number of features to merge/join [%default].")
 
+    parser.add_option(
+        "--rename-chr-file", dest="rename_chr_file", type="string",
+        help="mapping table between old and new chromosome names."
+        "TAB separated 2-column file.")
+
     parser.set_defaults(
         input_filename_contigs=False,
         filename_crop_gff=None,
         input_filename_agp=False,
         genome_file=None,
+        rename_chr_file=None,
         add_up_flank=None,
         add_down_flank=None,
         complement_groups=False,
@@ -531,6 +571,8 @@ def main(argv=None):
 
     contigs = None
     genome_fasta = None
+    chr_map = None
+
     if options.input_filename_contigs:
         contigs = Genomics.readContigSizes(
             IOTools.open_file(options.input_filename_contigs, "r"))
@@ -539,6 +581,18 @@ def main(argv=None):
         genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
         contigs = genome_fasta.getContigSizes()
 
+    if options.rename_chr_file:
+        chr_map = {}
+        with open(options.rename_chr_file, 'r') as filein:
+            reader = csv.reader(filein, delimiter='\t')
+            for row in reader:
+                if len(row) != 2:
+                    raise ValueError(
+                        "Mapping table must have exactly two columns")
+                chr_map[row[0]] = row[1]
+        if not len(chr_map.keys()) > 0:
+            raise ValueError("Empty mapping dictionnary")
+
     if options.assembly_report:
         df = pd.read_csv(options.assembly_report, comment="#",
                          header=None, sep="\t")
@@ -836,6 +890,13 @@ def assemblyReport(id):
                     len(list(filtered_contigs.keys())),
                     str(filtered_contigs)))
 
+    elif options.method == "rename-chr":
+        if not chr_map:
+                raise ValueError("please supply mapping file")
+
+        for gff in renameChromosomes(gffs, chr_map):
+            options.stdout.write(str(gff) + "\n")
+
     else:
 
         for gff in gffs: