Added command line arguments --pctmarker_thresh and --pctORFscore_thr…

…esh to ShortBRED-Quantify. Added utils folder, with script to help rename input fasta files when needed
biobakery · Feb 4, 2016 · 04d884f · 04d884f
1 parent d5d0897
commit 04d884f
Show file tree

Hide file tree

Showing 8 changed files with 130,113 additions and 16 deletions.
diff --git a/example/ecoli_K12_W3110.faa b/example/ecoli_K12_W3110.faa
diff --git a/example/ecoli_K12_W3110.fna b/example/ecoli_K12_W3110.fna
diff --git a/example/sources.txt b/example/sources.txt
@@ -9,4 +9,15 @@ wgs.fna - This file contains 100 simulated Illumina reads. They were created usi
 
 McElroy et al.: GemSIM: general, error-model based simulator of next-generation sequencing data. BMC Genomics (2012) 13:74.
 
-Kanehisa, M. and Goto, S.; (2000). KEGG: Kyoto Encyclopedia of Genes and Genomes. Nucleic Acids Res. 28, 27-30
+Kanehisa, M. and Goto, S.; (2000). KEGG: Kyoto Encyclopedia of Genes and Genomes. Nucleic Acids Res. 28, 27-30
+
+ecoli_K12_W3110.faa - This file of ORFs corresponds to entry 637000110 from
+version 3.5 of the Integrated Microbial Genomes (IMG). 
+
+ecoli_K12_W3110.fna - This file of nucleotide sequences corresponds to entry 637000110 from
+version 3.5 of the Integrated Microbial Genomes (IMG).
+
+Markowitz VM, Chen IM, Palaniappan K, Chu K, Szeto E, Grechkin Y, et al. IMG:
+the Integrated Microbial Genomes database and comparative analysis system.
+Nucleic Acids Res. 2012;40(Database issue):D115–22. doi: 10.1093/nar/gkr1044.
+pmid:22194640
diff --git a/shortbred_identify.py b/shortbred_identify.py
@@ -57,7 +57,7 @@
 from Bio.Data import CodonTable
 from Bio import SeqIO
 
-VERSION="0.9.2"
+VERSION="0.9.3"
 
 ###############################################################################
 #COMMAND LINE ARGUMENTS
@@ -204,7 +204,8 @@
 iMode = 0
 
 ################################################################################
-# Step Zero: Choose program mode based on files supplied by the user.
+# Step Zero: Choose program mode based on files supplied by the user. 
+
 
 if (args.sGOIProts!="" and args.sRefProts!=""):
 	log.write("Mode 1: Building everything..." + "\n")
@@ -237,7 +238,7 @@
 
 #Make directories for clustfile and database.
 if(iMode==1 or iMode==2):
-
+	pb.CheckFastaForBadProtNames(args.sGOIProts)    
 	dirClust = src.check_create_dir( dirTmp + os.sep + "clust" )
 	dirClustDB = src.check_create_dir( dirTmp + os.sep + "clustdb" )
 

diff --git a/shortbred_quantify.py b/shortbred_quantify.py
@@ -48,7 +48,7 @@
 from Bio.Seq import Seq
 from Bio import SeqIO
 
-VERSION="0.9.2"
+VERSION="0.9.3"
 
 
 ################################################################################
@@ -105,6 +105,11 @@
 grpParam.add_argument('--maxrejects', type=float, dest='iMaxRejects', help='Enter the number of markers allowed to hit read.', default = 32)
 grpParam.add_argument('--unannotated', action='store_const',dest='bUnannotated', help='Indicates genome is unannotated. ShortBRED will use tblastn to \
 search AA markers against the db of six possible translations of your genome data. ', const=True, default = False)
+grpParam.add_argument('--pctmarker_thresh',dest='dPctMarkerThresh', type=float,help='Indicates the share of a familiy\'s markers that must map to ORF to be counted. ', default = 0.1)
+grpParam.add_argument('--pctORFscore_thresh',dest='dPctORFScoreThresh', type=float,help='Indicates the share of total ORF score that a family must receive to be counted. ', default = 0.1)
+
+
+
 grpParam.add_argument('--EM', action='store_const',dest='bEM', help='Indicates user would like to run EM algorithm \
  on the quasi-markers. ', const=True, default = False)
 grpParam.add_argument('--bayes', type=str,dest='strBayes', help='Output files for Bayes Results', default = "")
@@ -152,6 +157,7 @@
     dirTmp = ("tmp" + str(os.getpid()) + '%.0f' % round((time.time()*1000), 1))
 
 dirTmp = src.check_create_dir( dirTmp )
+dirTmp = os.path.abspath(dirTmp)
 
 # Assign file names
 if args.strHits != "":
@@ -187,6 +193,7 @@
 	strMethod = "unannotated_genome"
 
 	src.CheckDependency(args.strTBLASTN,"","tblastn")
+	src.CheckDependency(args.strMakeBlastDB,"","makeblastdb")
     #We assume that genomes will be a single fasta file, and that they will be
 	# smaller than 900 MB, the upper bound for passing a single file to usearch.
 	strSize = "small"
@@ -233,7 +240,7 @@
 dictQMPossibleOverlap = {}
 dictType = {}
 
-#FIX THIS ONE!
+
 if (args.strBlast == ""):
 	strBlast = str(dirTmp) + os.sep + strMethod+ "full_results.tab"
 else:
@@ -282,7 +289,7 @@
 			astrFams =[]
 			# Only retain those families which could validly map to this QM at the given settings.
 			for strFam in astrAllFams:
-				print strFam
+				#print strFam
 				mtchFam = re.search(r'(.*)_w=(.*)',strFam)
 				strID = mtchFam.group(1)
 				dProp = float(mtchFam.group(2))
@@ -588,11 +595,11 @@
 ##########################################################################
 
 if strMethod=="annotated_genome":
-	dictFinalCounts = sq.NormalizeGenomeCounts(strHitsFile,dictFamCounts,bUnannotated=False)
+	dictFinalCounts = sq.NormalizeGenomeCounts(strHitsFile,dictFamCounts,bUnannotated=False,dPctMarkerThresh=args.dPctMarkerThresh)
 	sys.stderr.write("Normalizing hits to genome... \n")
 
 elif strMethod=="unannotated_genome":
-	dictFinalCounts = sq.NormalizeGenomeCounts(strHitsFile,dictFamCounts,bUnannotated=True)
+	dictFinalCounts = sq.NormalizeGenomeCounts(strHitsFile,dictFamCounts,bUnannotated=True,dPctMarkerThresh=args.dPctMarkerThresh)
 	sys.stderr.write("Normalizing hits to genome... \n")
 
 
@@ -606,9 +613,11 @@
 # Add final details to log
 with open(str(dirTmp + os.sep + os.path.basename(args.strMarkers)+ ".log"), "a") as log:
 	log.write("Total Reads Processed: " + str(iTotalReadCount) + "\n")
-	log.write("Average Read Length: " + str(dAvgReadLength) + "\n")
+	log.write("Average Read Length Specified by User: " + str(args.iAvgReadBP) + "\n")
+	log.write("Average Read Length Calculated by ShortBRED: " + str(dAvgReadLength) + "\n")
 	log.write("Min Read Length: " + str(iMin) + "\n")
 
+
 sys.stderr.write("Processing complete. \n")
 ########################################################################################
 # This is part of a possible EM application that is not fully implemented yet.

diff --git a/src/process_blast.py b/src/process_blast.py
@@ -241,6 +241,24 @@ def FindJMMarker( setGenes, dictGenes, dictGOIHits,dictRefHits,iShortRegion=25,i
 
 	return atupJM
 
+###############################################################################
+def CheckFastaForBadProtNames(fileFasta):
+    reBadChars=re.compile(r'[\\\/\*]')
+    setProtNames = set()
+
+    for gene in SeqIO.parse(fileFasta, "fasta"):
+        mtchBad = reBadChars.search(gene.id)
+        assert (mtchBad == None),("\nOne or more of the sequences in your "+
+        "input file has an id that ShortBRED cannot use as a valid folder "+
+        "name during the clustering step, so ShortBRED has stopped. Please edit  ** "+
+        fileFasta + " ** to remove any slashes,asterisks, etc. from the fasta ids. The program utils/AdjustFastaHeadersForShortBRED.py "+
+        "in the ShortBRED folder can do this for you.  ShortBRED halted on this gene/protein:" + gene.id)
+        assert (gene.id not in setProtNames),("\nShortBRED uses the first word of the seq id to identify each "+
+        "input sequence, and two or more of your sequences share the same starting word. Please edit ** "+
+        fileFasta + " ** to avoid duplicate ids. The program utils/AdjustFastaHeadersForShortBRED.py can add unique identifiers to your data if needed. ShortBRED halted on this gene/protein: " + gene.id)
+        setProtNames.add(gene.id)
+
+
 ###############################################################################
 def MakeFamilyFastaFiles ( strMapFile, fileFasta, dirOut):
 #Makes a fasta file containing genes for each family in dictFams.

diff --git a/src/quantify_functions.py b/src/quantify_functions.py
@@ -52,7 +52,8 @@ def MakedbRapsearch2 ( strMarkers, strDBName,strPrerapPath):
 	return
 
 def MakedbBLASTnuc ( strMakeBlastDB, strDBName,strGenome,dirTmp):
-    # This functions calls usearch to make a database of the ShortBRED markers.
+	print "Making blastdb in " + dirTmp
+	# This functions calls usearch to make a database of the ShortBRED markers.
 	p = subprocess.check_call([strMakeBlastDB, "-in", strGenome, "-out", strDBName,
 		"-dbtype", "nucl", "-logfile", dirTmp + os.sep + "blast_nuc_db.log"])
 	return
@@ -110,9 +111,13 @@ def MakeDictFamilyCounts (strMarkers,strFamilyOut):
 				dictFamMarkerCounts[strFam] = 1
 	return dictFamMarkerCounts
 
-def CalcFinalCount (dictORFMatches,dictFamMarkerCounts,bUnannotated,dThresh=.10,):
+def CalcFinalCount (dictORFMatches,dictFamMarkerCounts,bUnannotated,dPctORFScoreThresh,dPctMarkerThresh):
     # Takes two dictionaries, each have protein families has the keys.
 	# One has the number of markers hitting the ORF, the other has all possible markers.
+
+    # Make two paramaters her avaialble as arguments:
+    # 1) dPctORFScoreThresh, which controls how much a share of an ORF score a fam must receive
+    # 2) dPctMarkerThresh, which controls the threshhold for even being counted as a hit
 
 
 	aaCounts = []
@@ -121,6 +126,11 @@ def CalcFinalCount (dictORFMatches,dictFamMarkerCounts,bUnannotated,dThresh=.10,
 
 	for strFam in dictORFMatches:
 		dScore = dictORFMatches[strFam] / float(dictFamMarkerCounts[strFam])
+		#print strFam,dScore,dPctMarkerThresh
+		if (dScore < dPctMarkerThresh):
+			dScore = 0
+			#print strFam,dScore,dPctMarkerThresh
+
 		aFamScore = [strFam,dScore]
 		aaCounts.append(aFamScore)
 
@@ -131,8 +141,10 @@ def CalcFinalCount (dictORFMatches,dictFamMarkerCounts,bUnannotated,dThresh=.10,
 	if (bUnannotated==False):
 		aaAboveThresh = []
 		for aFamScore in aaCounts:
-			if (aFamScore[1] / dSum)>=dThresh:
-				aaAboveThresh.append(aFamScore)
+			if dSum >0:
+				if (aFamScore[1] / dSum)>=dPctORFScoreThresh:
+					aaAboveThresh.append(aFamScore)
+					#print aaAboveThresh, dSum
 
 		for aFamScore in aaAboveThresh:
 			aNewScore = [aFamScore[0],aFamScore[1] * (aFamScore[1]/dSum) ]
@@ -157,7 +169,7 @@ def CalcFinalCount (dictORFMatches,dictFamMarkerCounts,bUnannotated,dThresh=.10,
 
 	"""
 
-def NormalizeGenomeCounts (strValidHits,dictFamCounts,bUnannotated,dThresh=.1):
+def NormalizeGenomeCounts (strValidHits,dictFamCounts,bUnannotated,dPctORFScoreThresh=.1,dPctMarkerThresh=.1):
 	dictFinalCounts = {}
 	for strFam in dictFamCounts.keys():
 		dictFinalCounts[strFam] = 0
@@ -200,7 +212,7 @@ def NormalizeGenomeCounts (strValidHits,dictFamCounts,bUnannotated,dThresh=.1):
 					dictFamMatches[strFam] = 1
 
 		# Normalize Counts
-		aaCount = CalcFinalCount (dictFamMatches,dictFamCounts,bUnannotated,dThresh=.1)
+		aaCount = CalcFinalCount (dictFamMatches,dictFamCounts,bUnannotated,dPctORFScoreThresh,dPctMarkerThresh)
 
 		for aFamScore in aaCount:
 			dictFinalCounts[aFamScore[0]] = dictFinalCounts[aFamScore[0]] + aFamScore[1]

diff --git a/utils/AdjustFastaHeadersForShortBRED.py b/utils/AdjustFastaHeadersForShortBRED.py
@@ -0,0 +1,46 @@
+#########################################################################
+# Jim Kaminski
+# Huttenhower Lab
+# 2/3/2016
+#########################################################################
+
+
+
+"""
+This script makes small changes to an input fasta file to format the sequence
+ids for ShortBRED. It will do the following:
+    * Convert the first --numspaces (args.iSpacesToChange) to "_"'s
+    * Add an unique ID when two seqs have the exact same name.
+    * Replace characters like "*,[,:" with _ .
+
+"""
+import sys
+import argparse
+import re
+from argparse import RawTextHelpFormatter
+
+parser = argparse.ArgumentParser(description='This script makes small changes to an input fasta file to format the sequence \
+ids for ShortBRED. It will do the following: \
+    * Convert the first --numspaces (args.iSpace) to "_"\'s \
+    * Add an unique ID when two seqs have the exact same name. \
+    * Replace characters like "*,[,:" with _ . ',formatter_class=RawTextHelpFormatter) 
+parser.add_argument("--numspaces", default = 2, type=int, dest='iSpacesToChange')
+args = parser.parse_args()
+
+dictHeaderCounts = {}
+reBadChars=re.compile(r'[\\\/\*\=\:\'\[\]\.\;]')
+
+for strLine in sys.stdin:
+    strLine = strLine.strip()
+    if strLine[0]==">":
+        strHeader = strLine[1:].replace(" ","_",args.iSpacesToChange)
+        strHeader = re.sub(reBadChars,"_",strHeader)
+        dictHeaderCounts[strHeader] = dictHeaderCounts.get(strHeader,0)+1
+        if dictHeaderCounts[strHeader] > 1:
+            strHeader = strHeader + "___Copy"+str(dictHeaderCounts[strHeader]).zfill(4)
+        print ">" + strHeader
+    else:
+        print strLine
+
+
+