Skip to content

Commit

Permalink
Added command line arguments --pctmarker_thresh and --pctORFscore_thr…
Browse files Browse the repository at this point in the history
…esh to ShortBRED-Quantify. Added utils folder, with script to help rename input fasta files when needed
  • Loading branch information
jimkaminski committed Feb 4, 2016
1 parent d5d0897 commit 04d884f
Show file tree
Hide file tree
Showing 8 changed files with 130,113 additions and 16 deletions.
37,319 changes: 37,319 additions & 0 deletions example/ecoli_K12_W3110.faa

Large diffs are not rendered by default.

92,681 changes: 92,681 additions & 0 deletions example/ecoli_K12_W3110.fna

Large diffs are not rendered by default.

13 changes: 12 additions & 1 deletion example/sources.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,15 @@ wgs.fna - This file contains 100 simulated Illumina reads. They were created usi

McElroy et al.: GemSIM: general, error-model based simulator of next-generation sequencing data. BMC Genomics (2012) 13:74.

Kanehisa, M. and Goto, S.; (2000). KEGG: Kyoto Encyclopedia of Genes and Genomes. Nucleic Acids Res. 28, 27-30
Kanehisa, M. and Goto, S.; (2000). KEGG: Kyoto Encyclopedia of Genes and Genomes. Nucleic Acids Res. 28, 27-30

ecoli_K12_W3110.faa - This file of ORFs corresponds to entry 637000110 from
version 3.5 of the Integrated Microbial Genomes (IMG).

ecoli_K12_W3110.fna - This file of nucleotide sequences corresponds to entry 637000110 from
version 3.5 of the Integrated Microbial Genomes (IMG).

Markowitz VM, Chen IM, Palaniappan K, Chu K, Szeto E, Grechkin Y, et al. IMG:
the Integrated Microbial Genomes database and comparative analysis system.
Nucleic Acids Res. 2012;40(Database issue):D115–22. doi: 10.1093/nar/gkr1044.
pmid:22194640
7 changes: 4 additions & 3 deletions shortbred_identify.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@
from Bio.Data import CodonTable
from Bio import SeqIO

VERSION="0.9.2"
VERSION="0.9.3"

###############################################################################
#COMMAND LINE ARGUMENTS
Expand Down Expand Up @@ -204,7 +204,8 @@
iMode = 0

################################################################################
# Step Zero: Choose program mode based on files supplied by the user.
# Step Zero: Choose program mode based on files supplied by the user.


if (args.sGOIProts!="" and args.sRefProts!=""):
log.write("Mode 1: Building everything..." + "\n")
Expand Down Expand Up @@ -237,7 +238,7 @@

#Make directories for clustfile and database.
if(iMode==1 or iMode==2):

pb.CheckFastaForBadProtNames(args.sGOIProts)
dirClust = src.check_create_dir( dirTmp + os.sep + "clust" )
dirClustDB = src.check_create_dir( dirTmp + os.sep + "clustdb" )

Expand Down
21 changes: 15 additions & 6 deletions shortbred_quantify.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
from Bio.Seq import Seq
from Bio import SeqIO

VERSION="0.9.2"
VERSION="0.9.3"


################################################################################
Expand Down Expand Up @@ -105,6 +105,11 @@
grpParam.add_argument('--maxrejects', type=float, dest='iMaxRejects', help='Enter the number of markers allowed to hit read.', default = 32)
grpParam.add_argument('--unannotated', action='store_const',dest='bUnannotated', help='Indicates genome is unannotated. ShortBRED will use tblastn to \
search AA markers against the db of six possible translations of your genome data. ', const=True, default = False)
grpParam.add_argument('--pctmarker_thresh',dest='dPctMarkerThresh', type=float,help='Indicates the share of a familiy\'s markers that must map to ORF to be counted. ', default = 0.1)
grpParam.add_argument('--pctORFscore_thresh',dest='dPctORFScoreThresh', type=float,help='Indicates the share of total ORF score that a family must receive to be counted. ', default = 0.1)



grpParam.add_argument('--EM', action='store_const',dest='bEM', help='Indicates user would like to run EM algorithm \
on the quasi-markers. ', const=True, default = False)
grpParam.add_argument('--bayes', type=str,dest='strBayes', help='Output files for Bayes Results', default = "")
Expand Down Expand Up @@ -152,6 +157,7 @@
dirTmp = ("tmp" + str(os.getpid()) + '%.0f' % round((time.time()*1000), 1))

dirTmp = src.check_create_dir( dirTmp )
dirTmp = os.path.abspath(dirTmp)

# Assign file names
if args.strHits != "":
Expand Down Expand Up @@ -187,6 +193,7 @@
strMethod = "unannotated_genome"

src.CheckDependency(args.strTBLASTN,"","tblastn")
src.CheckDependency(args.strMakeBlastDB,"","makeblastdb")
#We assume that genomes will be a single fasta file, and that they will be
# smaller than 900 MB, the upper bound for passing a single file to usearch.
strSize = "small"
Expand Down Expand Up @@ -233,7 +240,7 @@
dictQMPossibleOverlap = {}
dictType = {}

#FIX THIS ONE!

if (args.strBlast == ""):
strBlast = str(dirTmp) + os.sep + strMethod+ "full_results.tab"
else:
Expand Down Expand Up @@ -282,7 +289,7 @@
astrFams =[]
# Only retain those families which could validly map to this QM at the given settings.
for strFam in astrAllFams:
print strFam
#print strFam
mtchFam = re.search(r'(.*)_w=(.*)',strFam)
strID = mtchFam.group(1)
dProp = float(mtchFam.group(2))
Expand Down Expand Up @@ -588,11 +595,11 @@
##########################################################################

if strMethod=="annotated_genome":
dictFinalCounts = sq.NormalizeGenomeCounts(strHitsFile,dictFamCounts,bUnannotated=False)
dictFinalCounts = sq.NormalizeGenomeCounts(strHitsFile,dictFamCounts,bUnannotated=False,dPctMarkerThresh=args.dPctMarkerThresh)
sys.stderr.write("Normalizing hits to genome... \n")

elif strMethod=="unannotated_genome":
dictFinalCounts = sq.NormalizeGenomeCounts(strHitsFile,dictFamCounts,bUnannotated=True)
dictFinalCounts = sq.NormalizeGenomeCounts(strHitsFile,dictFamCounts,bUnannotated=True,dPctMarkerThresh=args.dPctMarkerThresh)
sys.stderr.write("Normalizing hits to genome... \n")


Expand All @@ -606,9 +613,11 @@
# Add final details to log
with open(str(dirTmp + os.sep + os.path.basename(args.strMarkers)+ ".log"), "a") as log:
log.write("Total Reads Processed: " + str(iTotalReadCount) + "\n")
log.write("Average Read Length: " + str(dAvgReadLength) + "\n")
log.write("Average Read Length Specified by User: " + str(args.iAvgReadBP) + "\n")
log.write("Average Read Length Calculated by ShortBRED: " + str(dAvgReadLength) + "\n")
log.write("Min Read Length: " + str(iMin) + "\n")


sys.stderr.write("Processing complete. \n")
########################################################################################
# This is part of a possible EM application that is not fully implemented yet.
Expand Down
18 changes: 18 additions & 0 deletions src/process_blast.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,24 @@ def FindJMMarker( setGenes, dictGenes, dictGOIHits,dictRefHits,iShortRegion=25,i

return atupJM

###############################################################################
def CheckFastaForBadProtNames(fileFasta):
reBadChars=re.compile(r'[\\\/\*]')
setProtNames = set()

for gene in SeqIO.parse(fileFasta, "fasta"):
mtchBad = reBadChars.search(gene.id)
assert (mtchBad == None),("\nOne or more of the sequences in your "+
"input file has an id that ShortBRED cannot use as a valid folder "+
"name during the clustering step, so ShortBRED has stopped. Please edit ** "+
fileFasta + " ** to remove any slashes,asterisks, etc. from the fasta ids. The program utils/AdjustFastaHeadersForShortBRED.py "+
"in the ShortBRED folder can do this for you. ShortBRED halted on this gene/protein:" + gene.id)
assert (gene.id not in setProtNames),("\nShortBRED uses the first word of the seq id to identify each "+
"input sequence, and two or more of your sequences share the same starting word. Please edit ** "+
fileFasta + " ** to avoid duplicate ids. The program utils/AdjustFastaHeadersForShortBRED.py can add unique identifiers to your data if needed. ShortBRED halted on this gene/protein: " + gene.id)
setProtNames.add(gene.id)


###############################################################################
def MakeFamilyFastaFiles ( strMapFile, fileFasta, dirOut):
#Makes a fasta file containing genes for each family in dictFams.
Expand Down
24 changes: 18 additions & 6 deletions src/quantify_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@ def MakedbRapsearch2 ( strMarkers, strDBName,strPrerapPath):
return

def MakedbBLASTnuc ( strMakeBlastDB, strDBName,strGenome,dirTmp):
# This functions calls usearch to make a database of the ShortBRED markers.
print "Making blastdb in " + dirTmp
# This functions calls usearch to make a database of the ShortBRED markers.
p = subprocess.check_call([strMakeBlastDB, "-in", strGenome, "-out", strDBName,
"-dbtype", "nucl", "-logfile", dirTmp + os.sep + "blast_nuc_db.log"])
return
Expand Down Expand Up @@ -110,9 +111,13 @@ def MakeDictFamilyCounts (strMarkers,strFamilyOut):
dictFamMarkerCounts[strFam] = 1
return dictFamMarkerCounts

def CalcFinalCount (dictORFMatches,dictFamMarkerCounts,bUnannotated,dThresh=.10,):
def CalcFinalCount (dictORFMatches,dictFamMarkerCounts,bUnannotated,dPctORFScoreThresh,dPctMarkerThresh):
# Takes two dictionaries, each have protein families has the keys.
# One has the number of markers hitting the ORF, the other has all possible markers.

# Make two paramaters her avaialble as arguments:
# 1) dPctORFScoreThresh, which controls how much a share of an ORF score a fam must receive
# 2) dPctMarkerThresh, which controls the threshhold for even being counted as a hit


aaCounts = []
Expand All @@ -121,6 +126,11 @@ def CalcFinalCount (dictORFMatches,dictFamMarkerCounts,bUnannotated,dThresh=.10,

for strFam in dictORFMatches:
dScore = dictORFMatches[strFam] / float(dictFamMarkerCounts[strFam])
#print strFam,dScore,dPctMarkerThresh
if (dScore < dPctMarkerThresh):
dScore = 0
#print strFam,dScore,dPctMarkerThresh

aFamScore = [strFam,dScore]
aaCounts.append(aFamScore)

Expand All @@ -131,8 +141,10 @@ def CalcFinalCount (dictORFMatches,dictFamMarkerCounts,bUnannotated,dThresh=.10,
if (bUnannotated==False):
aaAboveThresh = []
for aFamScore in aaCounts:
if (aFamScore[1] / dSum)>=dThresh:
aaAboveThresh.append(aFamScore)
if dSum >0:
if (aFamScore[1] / dSum)>=dPctORFScoreThresh:
aaAboveThresh.append(aFamScore)
#print aaAboveThresh, dSum

for aFamScore in aaAboveThresh:
aNewScore = [aFamScore[0],aFamScore[1] * (aFamScore[1]/dSum) ]
Expand All @@ -157,7 +169,7 @@ def CalcFinalCount (dictORFMatches,dictFamMarkerCounts,bUnannotated,dThresh=.10,
"""

def NormalizeGenomeCounts (strValidHits,dictFamCounts,bUnannotated,dThresh=.1):
def NormalizeGenomeCounts (strValidHits,dictFamCounts,bUnannotated,dPctORFScoreThresh=.1,dPctMarkerThresh=.1):
dictFinalCounts = {}
for strFam in dictFamCounts.keys():
dictFinalCounts[strFam] = 0
Expand Down Expand Up @@ -200,7 +212,7 @@ def NormalizeGenomeCounts (strValidHits,dictFamCounts,bUnannotated,dThresh=.1):
dictFamMatches[strFam] = 1

# Normalize Counts
aaCount = CalcFinalCount (dictFamMatches,dictFamCounts,bUnannotated,dThresh=.1)
aaCount = CalcFinalCount (dictFamMatches,dictFamCounts,bUnannotated,dPctORFScoreThresh,dPctMarkerThresh)

for aFamScore in aaCount:
dictFinalCounts[aFamScore[0]] = dictFinalCounts[aFamScore[0]] + aFamScore[1]
Expand Down
46 changes: 46 additions & 0 deletions utils/AdjustFastaHeadersForShortBRED.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#########################################################################
# Jim Kaminski
# Huttenhower Lab
# 2/3/2016
#########################################################################



"""
This script makes small changes to an input fasta file to format the sequence
ids for ShortBRED. It will do the following:
* Convert the first --numspaces (args.iSpacesToChange) to "_"'s
* Add an unique ID when two seqs have the exact same name.
* Replace characters like "*,[,:" with _ .
"""
import sys
import argparse
import re
from argparse import RawTextHelpFormatter

parser = argparse.ArgumentParser(description='This script makes small changes to an input fasta file to format the sequence \
ids for ShortBRED. It will do the following: \
* Convert the first --numspaces (args.iSpace) to "_"\'s \
* Add an unique ID when two seqs have the exact same name. \
* Replace characters like "*,[,:" with _ . ',formatter_class=RawTextHelpFormatter)
parser.add_argument("--numspaces", default = 2, type=int, dest='iSpacesToChange')
args = parser.parse_args()

dictHeaderCounts = {}
reBadChars=re.compile(r'[\\\/\*\=\:\'\[\]\.\;]')

for strLine in sys.stdin:
strLine = strLine.strip()
if strLine[0]==">":
strHeader = strLine[1:].replace(" ","_",args.iSpacesToChange)
strHeader = re.sub(reBadChars,"_",strHeader)
dictHeaderCounts[strHeader] = dictHeaderCounts.get(strHeader,0)+1
if dictHeaderCounts[strHeader] > 1:
strHeader = strHeader + "___Copy"+str(dictHeaderCounts[strHeader]).zfill(4)
print ">" + strHeader
else:
print strLine



0 comments on commit 04d884f

Please sign in to comment.