Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Kb debugging scripts python3 annotations #348

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions CGAT/Biomart.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ def biomart_iterator(columns,
path=path, archive=archive)

if filters is not None:
filter_names = rpy2.robjects.vectors.StrVector(filters)
filter_names = rpy2.robjects.vectors.StrVector(list(filters))
else:
filter_names = ""

Expand All @@ -139,7 +139,7 @@ def biomart_iterator(columns,

# result is a dataframe
result = R.getBM(
attributes=rpy2.robjects.vectors.StrVector(columns),
attributes=rpy2.robjects.vectors.StrVector(list(columns)),
filters=filter_names,
values=filter_values,
mart=mart)
Expand Down
16 changes: 13 additions & 3 deletions CGAT/GTF.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,10 +117,11 @@ def transcript_iterator(gff_iterator, strict=True):
found = set()

for gff in gff_iterator:

# ignore entries without transcript or gene id
try:
this = gff.transcript_id + gff.gene_id
except KeyError:
continue
except AttributeError:
continue

Expand Down Expand Up @@ -936,8 +937,12 @@ def fromGTF(self, other, gene_id=None, transcript_id=None):
self.start = other.start
self.end = other.end
self.score = other.score

self.strand = other.strand
self.frame = other.frame
try:
self.frame = other.frame
except:
pass
if gene_id is not None:
self.gene_id = gene_id
else:
Expand Down Expand Up @@ -975,7 +980,11 @@ def copy(self, other):
self.end = other.end
self.score = other.score
self.strand = other.strand
self.frame = other.frame

try:
self.frame = other.frame
except ValueError:
pass
# gene_id and transcript_id can be optional
try:
self.gene_id = other.gene_id
Expand All @@ -988,6 +997,7 @@ def copy(self, other):

self.attributes = collections.OrderedDict(other.asDict().items())
# from gff - remove gene_id and transcript_id from attributes

try:
del self.attributes["gene_id"]
del self.attributes["transcript_id"]
Expand Down
5 changes: 5 additions & 0 deletions CGAT/IndexedFasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -740,6 +740,11 @@ def _add(src, target):
elif "chrM" not in self.mSynonyms and "MT" in self.mSynonyms:
self.mSynonyms["chrM"] = "MT"

# this does the same thing for "chrMito" used in the yeast
# genome
if "chrM" in self.mIndex and "chrMito" not in self.mIndex:
self.mSynonyms["chrMito"] = "chrM"

for key, val in k:
_add(key, val)

Expand Down
5 changes: 2 additions & 3 deletions CGAT/scripts/bed2bed.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@
filter-names
++++++++++++

Output intervals whose names are in list of desired names. Names are
Output intervals whose names are in list of desired names. Names are
supplied as a file with one name on each line.

shift
Expand Down Expand Up @@ -222,7 +222,7 @@ def iterate_chunks(iterator):
if bed.contig != last.contig:

for s in to_join:
if sorted(to_join[s]):
if to_join[s]:
yield to_join[s]
to_join[s] = []
max_end[s] = 0
Expand Down Expand Up @@ -429,7 +429,6 @@ def shiftIntervals(iterator, contigs, offset):
(ninput, noutput, nskipped_contig, nskipped_range))



def extendInterval(iterator, contigs, distance):

ninput, noutput, nskipped = 0, 0, 0
Expand Down
65 changes: 54 additions & 11 deletions CGAT/scripts/gff2gff.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@
-----

For many downstream applications it is helpful to make sure
that a :term:`gff` formatted file contains only features on
that a :term:`gff` formatted file contains only features on
placed chromosomes.

As an example, to sanitise hg38 chromosome names and remove
Expand Down Expand Up @@ -302,7 +302,9 @@ def cropGFF(gffs, filename_gff):
E.info("reading gff for cropping: started.")

other_gffs = GTF.iterator(IOTools.openFile(filename_gff, "r"))

cropper = GTF.readAsIntervals(other_gffs)

ntotal = 0
for contig in list(cropper.keys()):
intersector = bx.intervals.intersection.Intersecter()
Expand All @@ -323,10 +325,12 @@ def cropGFF(gffs, filename_gff):
ninput += 1

if gff.contig in cropper:

start, end = gff.start, gff.end
overlaps = cropper[gff.contig].find(start, end)

if overlaps:

l = end - start
a = numpy.ones(l)
for i in overlaps:
Expand All @@ -335,7 +339,6 @@ def cropGFF(gffs, filename_gff):
a[s:e] = 0

segments = Intervals.fromArray(a)

if len(segments) == 0:
ndeleted += 1
else:
Expand All @@ -349,6 +352,7 @@ def cropGFF(gffs, filename_gff):
continue

noutput += 1

yield(gff)

E.info("ninput=%i, noutput=%i, ncropped=%i, ndeleted=%i" %
Expand Down Expand Up @@ -443,6 +447,30 @@ def main(argv=None):
help="path to assembly report file which allows mapping of "
"ensembl to ucsc contigs when running method sanitize [%default].")

parser.add_option(
"--assembly-report-hasids",
dest="assembly_report_hasIDs", type="int",
help="path to assembly report file which allows mapping of "
"ensembl to ucsc contigs when running method sanitize [%default].")

parser.add_option(
"--assembly-report-ucsccol", dest="assembly_report_ucsccol",
type="int",
help="column in the assembly report containing ucsc contig ids"
"[%default].")

parser.add_option(
"--assembly-report-ensemblcol", dest="assembly_report_ensemblcol",
type="int",
help="column in the assembly report containing ensembl contig ids"
"[%default].")

parser.add_option(
"--assembly-extras", dest="assembly_extras",
type="str",
help="additional mismatches between gtf and fasta to fix when"
"sanitizing the genome [%default].")

parser.add_option(
"--extension-upstream", dest="extension_upstream", type="float",
help="extension for upstream end [%default].")
Expand Down Expand Up @@ -493,6 +521,10 @@ def main(argv=None):
group_field=None,
contig_pattern=None,
assembly_report=None,
assembly_report_hasIDs=1,
assembly_report_ensemblcol=4,
assembly_report_ucsccol=9,
assembly_extras=None
)

(options, args) = E.Start(parser, argv=argv)
Expand All @@ -513,16 +545,27 @@ def main(argv=None):
# fixes naming inconsistency in assembly report: ensembl chromosome
# contigs found in columnn 0, ensembl unassigned contigs found in
# column 4.
df.ix[df[1] == "assembled-molecule", 4] = df.ix[df[1] == "assembled-molecule", 0]
if options.sanitize_method == "ucsc":
assembly_dict = df.set_index(4)[9].to_dict()
elif options.sanitize_method == "ensembl":
assembly_dict = df.set_index(9)[4].to_dict()
if options.assembly_report_hasIDs == 1:
ucsccol = options.assembly_report_ucsccol
ensemblcol = options.assembly_report_ensemblcol
df.ix[df[1] == "assembled-molecule", ensemblcol] = df.ix[
df[1] == "assembled-molecule", 0]
if options.sanitize_method == "ucsc":
assembly_dict = df.set_index(ensemblcol)[ucsccol].to_dict()
elif options.sanitize_method == "ensembl":
assembly_dict = df.set_index(ucsccol)[ensemblcol].to_dict()
else:
raise ValueError(''' When using assembly report,
please specify sanitize method as either
"ucsc" or "ensembl" to specify direction of conversion
''')
else:
raise ValueError(''' When using assembly report,
please specify sanitize method as either
"ucsc" or "ensembl" to specify direction of conversion
''')
assembly_dict = {}
if options.assembly_extras is not None:
assembly_extras = options.assembly_extras.split(",")
for item in assembly_extras:
item = item.split("-")
assembly_dict[item[0]] = item[1]

if options.method in ("forward_coordinates", "forward_strand",
"add-flank", "add-upstream-flank",
Expand Down
2 changes: 0 additions & 2 deletions CGAT/scripts/gtf2gff.py
Original file line number Diff line number Diff line change
Expand Up @@ -754,7 +754,6 @@ def annotateRegulons(iterator, fasta, tss, options):
(ngenes, ntranscripts, nregulons))



def annotateGREATDomains(iterator, fasta, options):
"""build great domains

Expand Down Expand Up @@ -1076,7 +1075,6 @@ def _add(interval, anno):
(ngenes, ntranscripts, nskipped))



def main(argv=None):

if not argv:
Expand Down
3 changes: 2 additions & 1 deletion CGAT/scripts/gtf2gtf.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,6 @@
import CGAT.IOTools as IOTools



def find_retained_introns(gene):
'''Given a bundle of transcripts, find intervals matching retained
introns. A retained intron is defined as an interval from an exon/intron
Expand Down Expand Up @@ -1290,6 +1289,8 @@ def selectRepresentativeTranscript(gene):
strands = set([x.strand for x in gffs])
contigs = set([x.contig for x in gffs])
if len(strands) > 1:
E.warn('Skipping gene %s on multiple strands' % gffs[0].gene_id)
break
raise ValueError(
"can not merge gene '%s' on multiple strands: %s" % (
gffs[0].gene_id, str(strands)))
Expand Down
1 change: 0 additions & 1 deletion CGAT/scripts/gtf2tsv.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,6 @@ def main(argv=None):
options.stdout.write("\t".join(header) + "\n")

for gtf in GTF.iterator(options.stdin):

attributes = []
for a in list(gtf.keys()):
if a in ("gene_id", "transcript_id"):
Expand Down
4 changes: 1 addition & 3 deletions CGAT/scripts/runGO.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,14 +176,12 @@ def connectToEnsembl(options):
dbhandle = MySQLdb.connect(
host=options.database_host,
user=options.database_username,
passwd=options.database_password,
db=options.database_name,
port=options.database_port)
else:
dbhandle = MySQLdb.connect(
host=options.database_host,
user=options.database_username,
passwd=options.database_password,
db=options.database_name,
unix_socket=options.database_socket)
return dbhandle
Expand Down Expand Up @@ -766,7 +764,7 @@ def _g():
"genes_in_bg\t%i\tinput background\n" % nbackground)
outfile.write(
"genes_in_bg_with_assignment\t%i\tgenes in background with GO assignments\n" % (
len(go_results.mBackgroundGenes)))
len(go_results.mBackgroundGenes)))
outfile.write(
"associations_in_fg\t%i\tassociations in sample\n" %
go_results.mSampleCountsTotal)
Expand Down