CGATOxford · sebastian-luna-valero · Aug 29, 2017 · Jul 31, 2017 · Aug 10, 2017 · Aug 10, 2017
diff --git a/CGAT/Biomart.py b/CGAT/Biomart.py
@@ -128,7 +128,7 @@ def biomart_iterator(columns,
                          path=path, archive=archive)
 
     if filters is not None:
-        filter_names = rpy2.robjects.vectors.StrVector(filters)
+        filter_names = rpy2.robjects.vectors.StrVector(list(filters))
     else:
         filter_names = ""
 
@@ -139,7 +139,7 @@ def biomart_iterator(columns,
 
     # result is a dataframe
     result = R.getBM(
-        attributes=rpy2.robjects.vectors.StrVector(columns),
+        attributes=rpy2.robjects.vectors.StrVector(list(columns)),
         filters=filter_names,
         values=filter_values,
         mart=mart)

diff --git a/CGAT/GTF.py b/CGAT/GTF.py
@@ -117,10 +117,11 @@ def transcript_iterator(gff_iterator, strict=True):
     found = set()
 
     for gff in gff_iterator:
-
         # ignore entries without transcript or gene id
         try:
             this = gff.transcript_id + gff.gene_id
+        except KeyError:
+            continue
         except AttributeError:
             continue
 
@@ -936,8 +937,12 @@ def fromGTF(self, other, gene_id=None, transcript_id=None):
         self.start = other.start
         self.end = other.end
         self.score = other.score
+
         self.strand = other.strand
-        self.frame = other.frame
+        try:
+            self.frame = other.frame
+        except:
+            pass
         if gene_id is not None:
             self.gene_id = gene_id
         else:
@@ -975,7 +980,11 @@ def copy(self, other):
         self.end = other.end
         self.score = other.score
         self.strand = other.strand
-        self.frame = other.frame
+
+        try:
+            self.frame = other.frame
+        except ValueError:
+            pass
         # gene_id and transcript_id can be optional
         try:
             self.gene_id = other.gene_id
@@ -988,6 +997,7 @@ def copy(self, other):
 
         self.attributes = collections.OrderedDict(other.asDict().items())
         # from gff - remove gene_id and transcript_id from attributes
+
         try:
             del self.attributes["gene_id"]
             del self.attributes["transcript_id"]

diff --git a/CGAT/IndexedFasta.py b/CGAT/IndexedFasta.py
@@ -740,6 +740,11 @@ def _add(src, target):
         elif "chrM" not in self.mSynonyms and "MT" in self.mSynonyms:
             self.mSynonyms["chrM"] = "MT"
 
+        # this does the same thing for "chrMito" used in the yeast
+        # genome
+        if "chrM" in self.mIndex and "chrMito" not in self.mIndex:
+            self.mSynonyms["chrMito"] = "chrM"
+
         for key, val in k:
             _add(key, val)
 

diff --git a/CGAT/scripts/bed2bed.py b/CGAT/scripts/bed2bed.py
@@ -93,7 +93,7 @@
 filter-names
 ++++++++++++
 
-Output intervals whose names are in list of desired names. Names are 
+Output intervals whose names are in list of desired names. Names are
 supplied as a file with one name on each line.
 
 shift
@@ -222,7 +222,7 @@ def iterate_chunks(iterator):
             if bed.contig != last.contig:
 
                 for s in to_join:
-                    if sorted(to_join[s]):
+                    if to_join[s]:
                         yield to_join[s]
                     to_join[s] = []
                     max_end[s] = 0
@@ -429,7 +429,6 @@ def shiftIntervals(iterator, contigs, offset):
            (ninput, noutput, nskipped_contig, nskipped_range))
 
 
-
 def extendInterval(iterator, contigs, distance):
 
     ninput, noutput, nskipped = 0, 0, 0

diff --git a/CGAT/scripts/gff2gff.py b/CGAT/scripts/gff2gff.py
@@ -95,7 +95,7 @@
 -----
 
 For many downstream applications it is helpful to make sure
-that a :term:`gff` formatted file contains only features on 
+that a :term:`gff` formatted file contains only features on
 placed chromosomes.
 
 As an example, to sanitise hg38 chromosome names and remove
@@ -302,7 +302,9 @@ def cropGFF(gffs, filename_gff):
     E.info("reading gff for cropping: started.")
 
     other_gffs = GTF.iterator(IOTools.openFile(filename_gff, "r"))
+
     cropper = GTF.readAsIntervals(other_gffs)
+
     ntotal = 0
     for contig in list(cropper.keys()):
         intersector = bx.intervals.intersection.Intersecter()
@@ -323,10 +325,12 @@ def cropGFF(gffs, filename_gff):
         ninput += 1
 
         if gff.contig in cropper:
+
             start, end = gff.start, gff.end
             overlaps = cropper[gff.contig].find(start, end)
 
             if overlaps:
+
                 l = end - start
                 a = numpy.ones(l)
                 for i in overlaps:
@@ -335,7 +339,6 @@ def cropGFF(gffs, filename_gff):
                     a[s:e] = 0
 
                 segments = Intervals.fromArray(a)
-
                 if len(segments) == 0:
                     ndeleted += 1
                 else:
@@ -349,6 +352,7 @@ def cropGFF(gffs, filename_gff):
                 continue
 
         noutput += 1
+
         yield(gff)
 
     E.info("ninput=%i, noutput=%i, ncropped=%i, ndeleted=%i" %
@@ -443,6 +447,30 @@ def main(argv=None):
         help="path to assembly report file which allows mapping of "
         "ensembl to ucsc contigs when running method sanitize [%default].")
 
+    parser.add_option(
+        "--assembly-report-hasids",
+        dest="assembly_report_hasIDs", type="int",
+        help="path to assembly report file which allows mapping of "
+        "ensembl to ucsc contigs when running method sanitize [%default].")
+
+    parser.add_option(
+        "--assembly-report-ucsccol", dest="assembly_report_ucsccol",
+        type="int",
+        help="column in the assembly report containing ucsc contig ids"
+        "[%default].")
+
+    parser.add_option(
+        "--assembly-report-ensemblcol", dest="assembly_report_ensemblcol",
+        type="int",
+        help="column in the assembly report containing ensembl contig ids"
+        "[%default].")
+
+    parser.add_option(
+        "--assembly-extras", dest="assembly_extras",
+        type="str",
+        help="additional mismatches between gtf and fasta to fix when"
+        "sanitizing the genome [%default].")
+
     parser.add_option(
         "--extension-upstream", dest="extension_upstream", type="float",
         help="extension for upstream end [%default].")
@@ -493,6 +521,10 @@ def main(argv=None):
         group_field=None,
         contig_pattern=None,
         assembly_report=None,
+        assembly_report_hasIDs=1,
+        assembly_report_ensemblcol=4,
+        assembly_report_ucsccol=9,
+        assembly_extras=None
     )
 
     (options, args) = E.Start(parser, argv=argv)
@@ -513,16 +545,27 @@ def main(argv=None):
         # fixes naming inconsistency in assembly report: ensembl chromosome
         # contigs found in columnn 0, ensembl unassigned contigs found in
         # column 4.
-        df.ix[df[1] == "assembled-molecule", 4] = df.ix[df[1] == "assembled-molecule", 0]
-        if options.sanitize_method == "ucsc":
-            assembly_dict = df.set_index(4)[9].to_dict()
-        elif options.sanitize_method == "ensembl":
-            assembly_dict = df.set_index(9)[4].to_dict()
+        if options.assembly_report_hasIDs == 1:
+            ucsccol = options.assembly_report_ucsccol
+            ensemblcol = options.assembly_report_ensemblcol
+            df.ix[df[1] == "assembled-molecule", ensemblcol] = df.ix[
+                df[1] == "assembled-molecule", 0]
+            if options.sanitize_method == "ucsc":
+                assembly_dict = df.set_index(ensemblcol)[ucsccol].to_dict()
+            elif options.sanitize_method == "ensembl":
+                assembly_dict = df.set_index(ucsccol)[ensemblcol].to_dict()
+            else:
+                raise ValueError(''' When using assembly report,
+                please specify sanitize method as either
+                "ucsc" or "ensembl" to specify direction of conversion
+                ''')
         else:
-            raise ValueError(''' When using assembly report,
-            please specify sanitize method as either
-            "ucsc" or "ensembl" to specify direction of conversion
-            ''')
+            assembly_dict = {}
+        if options.assembly_extras is not None:
+            assembly_extras = options.assembly_extras.split(",")
+            for item in assembly_extras:
+                item = item.split("-")
+                assembly_dict[item[0]] = item[1]
 
     if options.method in ("forward_coordinates", "forward_strand",
                           "add-flank", "add-upstream-flank",

diff --git a/CGAT/scripts/gtf2gff.py b/CGAT/scripts/gtf2gff.py
@@ -754,7 +754,6 @@ def annotateRegulons(iterator, fasta, tss, options):
            (ngenes, ntranscripts, nregulons))
 
 
-
 def annotateGREATDomains(iterator, fasta, options):
     """build great domains
 
@@ -1076,7 +1075,6 @@ def _add(interval, anno):
            (ngenes, ntranscripts, nskipped))
 
 
-
 def main(argv=None):
 
     if not argv:

diff --git a/CGAT/scripts/gtf2gtf.py b/CGAT/scripts/gtf2gtf.py
@@ -287,7 +287,6 @@
 import CGAT.IOTools as IOTools
 
 
-
 def find_retained_introns(gene):
     '''Given a bundle of transcripts, find intervals matching retained
     introns. A retained intron is defined as an interval from an exon/intron
@@ -1290,6 +1289,8 @@ def selectRepresentativeTranscript(gene):
             strands = set([x.strand for x in gffs])
             contigs = set([x.contig for x in gffs])
             if len(strands) > 1:
+                E.warn('Skipping gene %s on multiple strands' % gffs[0].gene_id)
+                break
                 raise ValueError(
                     "can not merge gene '%s' on multiple strands: %s" % (
                         gffs[0].gene_id, str(strands)))

diff --git a/CGAT/scripts/gtf2tsv.py b/CGAT/scripts/gtf2tsv.py
@@ -292,7 +292,6 @@ def main(argv=None):
         options.stdout.write("\t".join(header) + "\n")
 
         for gtf in GTF.iterator(options.stdin):
-
             attributes = []
             for a in list(gtf.keys()):
                 if a in ("gene_id", "transcript_id"):

diff --git a/CGAT/scripts/runGO.py b/CGAT/scripts/runGO.py
@@ -176,14 +176,12 @@ def connectToEnsembl(options):
         dbhandle = MySQLdb.connect(
             host=options.database_host,
             user=options.database_username,
-            passwd=options.database_password,
             db=options.database_name,
             port=options.database_port)
     else:
         dbhandle = MySQLdb.connect(
             host=options.database_host,
             user=options.database_username,
-            passwd=options.database_password,
             db=options.database_name,
             unix_socket=options.database_socket)
     return dbhandle
@@ -766,7 +764,7 @@ def _g():
                 "genes_in_bg\t%i\tinput background\n" % nbackground)
             outfile.write(
                 "genes_in_bg_with_assignment\t%i\tgenes in background with GO assignments\n" % (
-                len(go_results.mBackgroundGenes)))
+                    len(go_results.mBackgroundGenes)))
             outfile.write(
                 "associations_in_fg\t%i\tassociations in sample\n" %
                 go_results.mSampleCountsTotal)