- scorefile instead of add_suffix parameter

- branch filter is turned on by default - filter branches upon scores being <= instead of < threshold - removed deprecated function of motif specific filter
bjlang · Mar 8, 2019 · a66221e · a66221e
1 parent 237f720
commit a66221e
Show file tree

Hide file tree

Showing 5 changed files with 88 additions and 79 deletions.
diff --git a/README.md b/README.md
@@ -36,14 +36,14 @@ cd example
 
 # Create a joblist file 'alljobs_simulation' containing the branch_scoring jobs
 REforge.py data/tree_simulation.nwk data/motifs.wtmx data/species_lost_simulation.txt elements_simul.ls \
-  --windowsize 200 --add_suffix _simulation -bg=background/
+  --windowsize 200 --scorefile scores_simulation -bg=background/
 
 # Run job list as batch or in parallel 
 bash alljobs_simulation > scores_simulation
 
 # Run the association test
 REforge_statistics.py data/tree_simulation.nwk data/motifs.wtmx data/species_lost_simulation.txt \
-   elements_simul.ls --add_suffix _simulation
+   elements_simul.ls --scorefile scores_simulation
 # This generates a file 'significant_elements_simulation' that alphabetically lists the elements, their P-value and the number of branches
 ```
 
@@ -60,17 +60,16 @@ Generate the REforge branch_scoring commands for all CREs.
 ```
 REforge.py <tree> <motiffile> <lost_species_list> <element_list>
 ```
-This creates for every CRE a REforge_branch_scoring.py job. Each line in alljobs<suffix> consists a single job. Each job is completely independent of any other job, thus each job can be run in parallel to others.
+This creates for every CRE a REforge_branch_scoring.py job. Each line in alljobs_<scorefile> consists a single job. Each job is completely independent of any other job, thus each job can be run in parallel to others.
 
-Execute that alljobs file. Either sequentially by doing
+Execute that alljobs file. Either sequentially via
 ```
-chmod +x alljobs_simulation
-./alljobs_simulation > scores_simulation
+bash alljobs_simulation > scores_simulation
 ```
 or run it in parallel by using a compute cluster.
 Every job returns a line in the following format:
 motif_file	CRE_file	(branch_start>branch_end:branch_score	)*	<
-which should be concatenated into a file called "scores<suffix>".
+which should be concatenated into a file called "<scorefile>".
 
 ## Step 2: Association test
 ```
@@ -81,19 +80,19 @@ REforge_statistics.py classifies branches into trait-loss and trait-preserving a
 ## Common Parameters
 #### REforge.py
 ```
---add_suffix <suffix>
-Appends suffix to every generated file
+--scorefile <name>
+Name file <name> instead of "scores"
 --windowsize/-w <n>
 Scoring window used in sequence scoring
 --background/-bg <folder>
-Background used for sequence scoring. Either a file or a folder with backgrounds for different GC contents
+Background used for sequence scoring. Either a file or a folder structure with backgrounds for different GC contents
 --scrCrrIter <n>
-Number score correction iterations. 0 turns the score correction off
+Stubb score is corrected with the average score of <number> of shuffled sequences. Default is 10. 0 turns the score correction off
 ```
 #### REforge_statistics.py
 ```
---add_suffix <suffix>
-Appends suffix to every generated file
+--scorefile <name>
+Use <name> as scorefile instead of "scores"
 --filterspecies <comma separated list>
 Exclude species from analyses
 --elements <file>
@@ -110,12 +109,12 @@ Analyse only the elements specified <file>
 ```
 --no_ancestral_filter
 Turn of ancestral score filtering
+--no_branch_filter
+Turn of branch score filtering
 --no_fixed_TP
 Do not fix transition probabilites while computing branch scores
 --filter_branch_threshold <x>
-Filter branches if start and end node are below <x>
---filter_branches <file>
-Like --filter_branch_threshold but with motif specific branch thresholds from <file>
+Threshold for branch filter; By default branches are filtered if start and end node score are below 0
 --filter_GC_change <x>
 Filter branches with a GC content change above <x>
 --filter_length_change <x>

diff --git a/REforge.py b/REforge.py
@@ -18,55 +18,55 @@
 __description__ = "Analyses transcription factor motifs with respect to their differences in binding within a phylogeny of CRM sequences and the associated phenotype"
 __default_windowsize__ = 500
 windowsize = __default_windowsize__
-suffix = ""									# suffix ending for newly generated files
-
-
-BPCOUNT = False
-SIMPLE_SCORING = False
-
+scorefile = "scores"
 
 def __get_arguments():
-	global suffix, windowsize
+	global scorefile, windowsize
 	import argparse
 	app = argparse.ArgumentParser(description=__description__)
-
 	app.add_argument("treefile", 		type=str, help="phylogenetic tree given in newick format")
 	app.add_argument("motiffile",		type=str, help="wtmx file containing all transcription factor motifs")
 	app.add_argument("lossfile", 		type=str, help="file listing trait loss species; one line per species")
 	app.add_argument("elementfile", 	type=str, help="file listing putative elements; one line per fasta file (.fa or .fasta)")
 
-	app.add_argument("--add_suffix", 	type=str, help="add this suffix to files")
+	app.add_argument("--scorefile", 		type=str, help="name of score file")
 	app.add_argument("--windowsize", "-w",	type=str, help="windowsize to be used for element scoring")
-	app.add_argument("--background", "-bg",	type=str, help="use given file as background file for rescoring (set this to activate rescoring)")
+	app.add_argument("--background", "-bg",	type=str, help="background file - passed into the score function")
 	app.add_argument("--scrCrrIter", 		type=int, help="corrects stubb score with average score of <number> of shuffled sequences. Default is 10", default=10) 
-	app.add_argument("--no_ancestral_filter", 	action="store_true", help="elements are filtered if the ancestral element scores <= 0")
-	app.add_argument("--no_fixed_TP", 			action="store_true", help="don't fix stubb's transition probablities on ancestral sequence") 
-	app.add_argument("--filter_branch_threshold", type=float, help="if set, branches with score difference below threshold are filtered (behavior is changed by --relaxed_filter)")
-	app.add_argument("--filter_branches", 		type=str, help="like --filter_branch_threshold but with motif specific branch thresholds")
+	app.add_argument("--no_ancestral_filter", 	action="store_true", help="elements not are filtered if the ancestral element scores <= 0")
+	app.add_argument("--no_branch_filter", 	action="store_true", help="branches are not filtered if start and end score <= filter_branch_threshold") 
+	app.add_argument("--no_fixed_TP", 		action="store_true", help="Stubb's transition probablities are not fixed on the ancestral sequence") 
+
+	app.add_argument("--filter_branch_threshold", 	type=float, help="Default is 0")
+	# app.add_argument("--filter_branches", 		type=str, help="like --filter_branch_threshold but with motif specific branch thresholds")
 	app.add_argument("--filter_GC_change",		type=float, help="if set, branches with a GC content change above the threshold are filtered")
 	app.add_argument("--filter_length_change", 	type=float, help="if set, branches with a relative length change above the threshold are filtered")
-
-
+
+	app.add_argument("--alignment", type=str)	
+
 	app.add_argument("--verbose", "-v", action="store_true")
 	app.add_argument("--debug", "-d", action="store_true")
 
 	args = app.parse_args()
+	if args.no_branch_filter:			assert args.filter_branch_threshold is None
 	if args.windowsize is not None:		windowsize = args.windowsize
-	if args.add_suffix is not None: 	suffix = args.add_suffix
+	if args.scorefile is not None: 		scorefile = args.scorefile
 	return args
 
-def create_branch_scoring_job_list(elements, tf, windowsize, args, filter_branch_threshold=None, filter_branches=None, background=None, stubb=False, append_to="jobFile"):
+def create_branch_scoring_job_list(elements, tf, windowsize, args, filter_branch_threshold=None, background=None, stubb=False, append_to="jobFile"):
 	with open(append_to, "a") as job_file:
 		for cne in elements:
 			arguments = ""
 			if args.no_fixed_TP:						arguments += "--no_fixed_TP "
 			if background is not None:					arguments += "-bg={0} ".format(background)
-			if filter_branches is not None:				arguments += "--filter_branches {0} ".format(filter_branches)
-			elif filter_branch_threshold is not None:	arguments += "--filter_branch_threshold {0} ".format(filter_branch_threshold)
+			# if filter_branches is not None:				arguments += "--filter_branches {0} ".format(filter_branches)
+			if filter_branch_threshold is not None:		arguments += "--filter_branch_threshold {0} ".format(filter_branch_threshold)
 			if args.filter_GC_change is not None:		arguments += "--filter_GC_change {0} ".format(args.filter_GC_change)
 			if args.filter_length_change is not None:	arguments += "--filter_length_change {0} ".format(args.filter_length_change)
 			if args.scrCrrIter is not None:				arguments += "--scrCrrIter {0} ".format(args.scrCrrIter)
 			if args.no_ancestral_filter:				arguments += "--no_ancestral_filter "
+			if args.no_branch_filter:					arguments += "--no_branch_filter "
+			if args.alignment:							arguments += "--alignment {0} ".format(args.alignment)
 			if args.debug:								arguments += "-d "
 			if args.verbose:							arguments += "-v "
 			job_file.write("echo -en {0}'\t'\"{1}\"'\t'; REforge_branch_scoring.py {0} {3} \"{1}\" {2} {4}; echo '<'\n".format(tf, cne, windowsize, args.treefile, arguments, ))
@@ -91,15 +91,15 @@ def __analyse_sequences():
 	#########################
 	# create a job file per transcription factor, for computing branch scores (eventually by extracting scores from the simulation) for every CNE and running them
 
-	subprocess.check_call("> alljobs%s"%suffix, shell=True)													# (re-)create a new joblist file
-	subprocess.check_call(["chmod", "+x", "alljobs%s"%suffix])
-	allJobsFile = "alljobs%s"%suffix
+	subprocess.check_call("> alljobs_%s"%scorefile, shell=True)													# (re-)create a new joblist file
+	subprocess.check_call(["chmod", "+x", "alljobs_%s"%scorefile])
+	allJobsFile = "alljobs_%s"%scorefile
 	if not args.no_fixed_TP:
 		subprocess.check_call(["mkdir", "-p", "tempDir"])
 
-	if not os.path.exists("scores%s"%suffix):	subprocess.check_call(["touch", "scores%s"%suffix])
+	if not os.path.exists(scorefile):	subprocess.check_call(["touch", scorefile])
 	create_branch_scoring_job_list(elements, args.motiffile, windowsize, args, 
-			filter_branch_threshold=args.filter_branch_threshold, filter_branches=args.filter_branches,
+			filter_branch_threshold=args.filter_branch_threshold, #filter_branches=args.filter_branches,
 			background=args.background, stubb=True, append_to=allJobsFile)
 
 if __name__ == "__main__":	

diff --git a/REforge_branch_scoring.py b/REforge_branch_scoring.py
@@ -13,26 +13,31 @@
 
 
 __description__ = "Helper functions for the motif analysis pipeline. compute branch scores on a complete phylogeny."
+__default_branch_filter_threshold__ = 0
 
 sequence_score_cache = {"" : 0}
 
 def __get_arguments():
 	import argparse
 
 	app = argparse.ArgumentParser(description=__description__)
-	app.add_argument("motiffile",			type=str, help="motif as position frequency matrix")
-	app.add_argument("treefile", 			type=str, help="phylogenetic tree. If elements might use a subtree ancestral nodes must be unnamed")
-	app.add_argument("element", 			type=str, help="path to fasta file containing element's sequence alignment")							
-	# species sequence data can be given in a BerkeleyDB file 'ALI'. In this case the parameter specifies the element name
+	app.add_argument("motiffile",		type=str, help="motif as position frequency matrix")
+	app.add_argument("treefile", 		type=str, help="phylogenetic tree. If elements might use a subtree ancestral nodes must be unnamed")
+	app.add_argument("element", 		type=str, help="path to fasta file containing element's sequence alignment")							
+	# species sequence data can be given in a BerkeleyDB file specified by --alignment. In this case the parameter specifies the element name
 	app.add_argument("scoring_window_length", 	type=str, help="scoring window length")
 	app.add_argument("--background", "-bg",	type=str, help="background file - passed into the score function")
-	app.add_argument("--no_ancestral_filter", 		action="store_true", help="elements are filtered if the ancestral element scores <= 0")
-	app.add_argument("--no_fixed_TP", 		action="store_true", help="fix stubb's transition probablities on ancestral sequence") 
-	app.add_argument("--filter_branch_threshold", "-dt",	type=float, help="if set, branches are filtered if start and end node score below the given threshold")
-	app.add_argument("--filter_branches", "-db",	type=str, help="like --filter_branch_threshold but with motif specific branch thresholds - deprecated")
+	app.add_argument("--scrCrrIter", 		type=int, help="corrects stubb score with average score of <number> of shuffled sequences. Default is 10", default=10) 
+	app.add_argument("--no_ancestral_filter", 	action="store_true", help="elements not are filtered if the ancestral element scores <= 0")
+	app.add_argument("--no_branch_filter", 	action="store_true", help="branches are not filtered if start and end score <= filter_branch_threshold") 
+	app.add_argument("--no_fixed_TP", 		action="store_true", help="Stubb's transition probablities are not fixed on the ancestral sequence") 
+
+	app.add_argument("--filter_branch_threshold", 	type=float, default=0, help="Default is 0")
+	# app.add_argument("--filter_branches", "-db",	type=str, help="like --filter_branch_threshold but with motif specific branch thresholds - deprecated")
 	app.add_argument("--filter_GC_change", "-fgc",	type=float, help="if set, branches with a GC content change above the threshold are filtered")
 	app.add_argument("--filter_length_change", "-fl",	type=float, help="if set, branches with a relative length change above the threshold are filtered")
-	app.add_argument("--scrCrrIter", 		type=int, help="corrects stubb score with average score of <number> of shuffled sequences. Default is 10", default=10) 
+
+	app.add_argument("--alignment", type=str)
 
 	app.add_argument("--verbose", "-v",		action="store_true")
 	app.add_argument("--debug", "-d",		action="store_true")
@@ -54,7 +59,7 @@ def read_threshold_collection(threshold_file_path, motif_name):		# returns {"stu
 
 
 def traverse_tree(parent, wtmx_file, scoring_window_length, sequences, background="", filter_threshold=None, 
-					filter_threshold_dict=None, filter_GC_change=None, filter_length_change=None, 
+					filter_GC_change=None, filter_length_change=None, 
 					fixProbs=None, scrCrrMthd='stubb', scrCrrIter=10, anc0filter=False):
 	""" traverses the phylegenetic tree, scores every sequence and computes branch scores """
 	# anc0Filter: should be activated only for the root node
@@ -68,10 +73,10 @@ def traverse_tree(parent, wtmx_file, scoring_window_length, sequences, backgroun
 		if sequence.replace('N','') == '':
 			return (float('nan'), 0, 0)
 		gc = computeGCContent(sequence=sequence)/100
-		if filter_threshold_dict is not None:
-			gc_rounded = roundToList( [x[0] for x in filter_threshold_dict.keys()] , gc )
-		else:
-			gc_rounded = None
+		# if filter_threshold_dict is not None:
+		# 	gc_rounded = roundToList( [x[0] for x in filter_threshold_dict.keys()] , gc )
+		# else:
+		# 	gc_rounded = None
 
 		if sequence in sequence_score_cache.keys():
 			scores = [sequence_score_cache[sequence]]
@@ -106,15 +111,16 @@ def traverse_tree(parent, wtmx_file, scoring_window_length, sequences, backgroun
 
 		for child in parent:
 			child_score, child_gc, child_length = traverse_tree(child, wtmx_file, scoring_window_length, sequences,
-						background, filter_threshold, filter_threshold_dict, filter_GC_change, filter_length_change, 
+						background, filter_threshold, filter_GC_change, filter_length_change, #filter_threshold_dict,
 						fixProbs, scrCrrMthd=scrCrrMthd, scrCrrIter=scrCrrIter, anc0filter=False)
 
 			motif_name = os.path.splitext(os.path.basename(wtmx_file))[0]
 			# branch score filter step
-			if not any([filter_threshold_dict is not None and \
-						parent_score < filter_threshold_dict[(gc_rounded, motif_name)] and \
-						child_score < filter_threshold_dict[(gc_rounded, motif_name)],
-					filter_threshold is not None and parent_score < filter_threshold and child_score < filter_threshold,
+			if not any([
+					# filter_threshold_dict is not None and \
+					# 	parent_score < filter_threshold_dict[(gc_rounded, motif_name)] and \
+					# 	child_score < filter_threshold_dict[(gc_rounded, motif_name)],
+					filter_threshold is not None and parent_score <= filter_threshold and child_score <= filter_threshold,
 					filter_GC_change is not None and abs(child_gc - gc) > filter_GC_change,
 					filter_length_change is not None and abs(child_length - length) < filter_length_change*(child_length+length)/2 ]): 
 				print("%s>%s:%f\t"%(parent, child, child_score  - parent_score), end="")
@@ -143,17 +149,17 @@ def __score_branches():
 
 
 	# read the input files
-	phylo_tree, sequences = read_sequences_and_prune_tree(args.element, args.treefile)
+	phylo_tree, sequences = read_sequences_and_prune_tree(args.element, args.treefile, alignment=args.alignment)
 
-	if args.filter_branches:
-		filter_threshold_dict = read_threshold_collection(args.filter_branches, os.path.splitext(os.path.basename(args.motiffile))[0])["stubb"]
-	else:
-		filter_threshold_dict = None
+	# if args.filter_branches:
+	# 	filter_threshold_dict = read_threshold_collection(args.filter_branches, os.path.splitext(os.path.basename(args.motiffile))[0])["stubb"]
+	# else:
+	# 	filter_threshold_dict = None
 
 	logging.info("Branch scoring: use %s as background and %s as window size"%(args.background, args.scoring_window_length) )
 	scrCrrMthd = "stubb" if args.scrCrrIter > 0 else None
 	traverse_tree(phylo_tree.root, args.motiffile, float(args.scoring_window_length), sequences, args.background, 
-					filter_threshold=args.filter_branch_threshold, filter_threshold_dict=filter_threshold_dict,
+					filter_threshold=None if args.no_branch_filter else args.filter_branch_threshold, # filter_threshold_dict=filter_threshold_dict,
 					filter_GC_change=args.filter_GC_change, filter_length_change=args.filter_length_change,
 					fixProbs=not args.no_fixed_TP, scrCrrMthd=scrCrrMthd, scrCrrIter=args.scrCrrIter,
 					anc0filter=not args.no_ancestral_filter)