Skip to content

Commit

Permalink
- scorefile instead of add_suffix parameter
Browse files Browse the repository at this point in the history
- branch filter is turned on by default
- filter branches upon scores being <= instead of < threshold
- removed deprecated function of motif specific filter
  • Loading branch information
BjoernLanger committed Mar 8, 2019
1 parent 237f720 commit a66221e
Show file tree
Hide file tree
Showing 5 changed files with 88 additions and 79 deletions.
31 changes: 15 additions & 16 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,14 @@ cd example
# Create a joblist file 'alljobs_simulation' containing the branch_scoring jobs
REforge.py data/tree_simulation.nwk data/motifs.wtmx data/species_lost_simulation.txt elements_simul.ls \
--windowsize 200 --add_suffix _simulation -bg=background/
--windowsize 200 --scorefile scores_simulation -bg=background/
# Run job list as batch or in parallel
bash alljobs_simulation > scores_simulation
# Run the association test
REforge_statistics.py data/tree_simulation.nwk data/motifs.wtmx data/species_lost_simulation.txt \
elements_simul.ls --add_suffix _simulation
elements_simul.ls --scorefile scores_simulation
# This generates a file 'significant_elements_simulation' that alphabetically lists the elements, their P-value and the number of branches
```

Expand All @@ -60,17 +60,16 @@ Generate the REforge branch_scoring commands for all CREs.
```
REforge.py <tree> <motiffile> <lost_species_list> <element_list>
```
This creates for every CRE a REforge_branch_scoring.py job. Each line in alljobs<suffix> consists a single job. Each job is completely independent of any other job, thus each job can be run in parallel to others.
This creates for every CRE a REforge_branch_scoring.py job. Each line in alljobs_<scorefile> consists a single job. Each job is completely independent of any other job, thus each job can be run in parallel to others.

Execute that alljobs file. Either sequentially by doing
Execute that alljobs file. Either sequentially via
```
chmod +x alljobs_simulation
./alljobs_simulation > scores_simulation
bash alljobs_simulation > scores_simulation
```
or run it in parallel by using a compute cluster.
Every job returns a line in the following format:
motif_file CRE_file (branch_start>branch_end:branch_score )* <
which should be concatenated into a file called "scores<suffix>".
which should be concatenated into a file called "<scorefile>".

## Step 2: Association test
```
Expand All @@ -81,19 +80,19 @@ REforge_statistics.py classifies branches into trait-loss and trait-preserving a
## Common Parameters
#### REforge.py
```
--add_suffix <suffix>
Appends suffix to every generated file
--scorefile <name>
Name file <name> instead of "scores"
--windowsize/-w <n>
Scoring window used in sequence scoring
--background/-bg <folder>
Background used for sequence scoring. Either a file or a folder with backgrounds for different GC contents
Background used for sequence scoring. Either a file or a folder structure with backgrounds for different GC contents
--scrCrrIter <n>
Number score correction iterations. 0 turns the score correction off
Stubb score is corrected with the average score of <number> of shuffled sequences. Default is 10. 0 turns the score correction off
```
#### REforge_statistics.py
```
--add_suffix <suffix>
Appends suffix to every generated file
--scorefile <name>
Use <name> as scorefile instead of "scores"
--filterspecies <comma separated list>
Exclude species from analyses
--elements <file>
Expand All @@ -110,12 +109,12 @@ Analyse only the elements specified <file>
```
--no_ancestral_filter
Turn of ancestral score filtering
--no_branch_filter
Turn of branch score filtering
--no_fixed_TP
Do not fix transition probabilites while computing branch scores
--filter_branch_threshold <x>
Filter branches if start and end node are below <x>
--filter_branches <file>
Like --filter_branch_threshold but with motif specific branch thresholds from <file>
Threshold for branch filter; By default branches are filtered if start and end node score are below 0
--filter_GC_change <x>
Filter branches with a GC content change above <x>
--filter_length_change <x>
Expand Down
50 changes: 25 additions & 25 deletions REforge.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,55 +18,55 @@
__description__ = "Analyses transcription factor motifs with respect to their differences in binding within a phylogeny of CRM sequences and the associated phenotype"
__default_windowsize__ = 500
windowsize = __default_windowsize__
suffix = "" # suffix ending for newly generated files


BPCOUNT = False
SIMPLE_SCORING = False

scorefile = "scores"

def __get_arguments():
global suffix, windowsize
global scorefile, windowsize
import argparse
app = argparse.ArgumentParser(description=__description__)

app.add_argument("treefile", type=str, help="phylogenetic tree given in newick format")
app.add_argument("motiffile", type=str, help="wtmx file containing all transcription factor motifs")
app.add_argument("lossfile", type=str, help="file listing trait loss species; one line per species")
app.add_argument("elementfile", type=str, help="file listing putative elements; one line per fasta file (.fa or .fasta)")

app.add_argument("--add_suffix", type=str, help="add this suffix to files")
app.add_argument("--scorefile", type=str, help="name of score file")
app.add_argument("--windowsize", "-w", type=str, help="windowsize to be used for element scoring")
app.add_argument("--background", "-bg", type=str, help="use given file as background file for rescoring (set this to activate rescoring)")
app.add_argument("--background", "-bg", type=str, help="background file - passed into the score function")
app.add_argument("--scrCrrIter", type=int, help="corrects stubb score with average score of <number> of shuffled sequences. Default is 10", default=10)
app.add_argument("--no_ancestral_filter", action="store_true", help="elements are filtered if the ancestral element scores <= 0")
app.add_argument("--no_fixed_TP", action="store_true", help="don't fix stubb's transition probablities on ancestral sequence")
app.add_argument("--filter_branch_threshold", type=float, help="if set, branches with score difference below threshold are filtered (behavior is changed by --relaxed_filter)")
app.add_argument("--filter_branches", type=str, help="like --filter_branch_threshold but with motif specific branch thresholds")
app.add_argument("--no_ancestral_filter", action="store_true", help="elements not are filtered if the ancestral element scores <= 0")
app.add_argument("--no_branch_filter", action="store_true", help="branches are not filtered if start and end score <= filter_branch_threshold")
app.add_argument("--no_fixed_TP", action="store_true", help="Stubb's transition probablities are not fixed on the ancestral sequence")

app.add_argument("--filter_branch_threshold", type=float, help="Default is 0")
# app.add_argument("--filter_branches", type=str, help="like --filter_branch_threshold but with motif specific branch thresholds")
app.add_argument("--filter_GC_change", type=float, help="if set, branches with a GC content change above the threshold are filtered")
app.add_argument("--filter_length_change", type=float, help="if set, branches with a relative length change above the threshold are filtered")



app.add_argument("--alignment", type=str)

app.add_argument("--verbose", "-v", action="store_true")
app.add_argument("--debug", "-d", action="store_true")

args = app.parse_args()
if args.no_branch_filter: assert args.filter_branch_threshold is None
if args.windowsize is not None: windowsize = args.windowsize
if args.add_suffix is not None: suffix = args.add_suffix
if args.scorefile is not None: scorefile = args.scorefile
return args

def create_branch_scoring_job_list(elements, tf, windowsize, args, filter_branch_threshold=None, filter_branches=None, background=None, stubb=False, append_to="jobFile"):
def create_branch_scoring_job_list(elements, tf, windowsize, args, filter_branch_threshold=None, background=None, stubb=False, append_to="jobFile"):
with open(append_to, "a") as job_file:
for cne in elements:
arguments = ""
if args.no_fixed_TP: arguments += "--no_fixed_TP "
if background is not None: arguments += "-bg={0} ".format(background)
if filter_branches is not None: arguments += "--filter_branches {0} ".format(filter_branches)
elif filter_branch_threshold is not None: arguments += "--filter_branch_threshold {0} ".format(filter_branch_threshold)
# if filter_branches is not None: arguments += "--filter_branches {0} ".format(filter_branches)
if filter_branch_threshold is not None: arguments += "--filter_branch_threshold {0} ".format(filter_branch_threshold)
if args.filter_GC_change is not None: arguments += "--filter_GC_change {0} ".format(args.filter_GC_change)
if args.filter_length_change is not None: arguments += "--filter_length_change {0} ".format(args.filter_length_change)
if args.scrCrrIter is not None: arguments += "--scrCrrIter {0} ".format(args.scrCrrIter)
if args.no_ancestral_filter: arguments += "--no_ancestral_filter "
if args.no_branch_filter: arguments += "--no_branch_filter "
if args.alignment: arguments += "--alignment {0} ".format(args.alignment)
if args.debug: arguments += "-d "
if args.verbose: arguments += "-v "
job_file.write("echo -en {0}'\t'\"{1}\"'\t'; REforge_branch_scoring.py {0} {3} \"{1}\" {2} {4}; echo '<'\n".format(tf, cne, windowsize, args.treefile, arguments, ))
Expand All @@ -91,15 +91,15 @@ def __analyse_sequences():
#########################
# create a job file per transcription factor, for computing branch scores (eventually by extracting scores from the simulation) for every CNE and running them

subprocess.check_call("> alljobs%s"%suffix, shell=True) # (re-)create a new joblist file
subprocess.check_call(["chmod", "+x", "alljobs%s"%suffix])
allJobsFile = "alljobs%s"%suffix
subprocess.check_call("> alljobs_%s"%scorefile, shell=True) # (re-)create a new joblist file
subprocess.check_call(["chmod", "+x", "alljobs_%s"%scorefile])
allJobsFile = "alljobs_%s"%scorefile
if not args.no_fixed_TP:
subprocess.check_call(["mkdir", "-p", "tempDir"])

if not os.path.exists("scores%s"%suffix): subprocess.check_call(["touch", "scores%s"%suffix])
if not os.path.exists(scorefile): subprocess.check_call(["touch", scorefile])
create_branch_scoring_job_list(elements, args.motiffile, windowsize, args,
filter_branch_threshold=args.filter_branch_threshold, filter_branches=args.filter_branches,
filter_branch_threshold=args.filter_branch_threshold, #filter_branches=args.filter_branches,
background=args.background, stubb=True, append_to=allJobsFile)

if __name__ == "__main__":
Expand Down
56 changes: 31 additions & 25 deletions REforge_branch_scoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,26 +13,31 @@


__description__ = "Helper functions for the motif analysis pipeline. compute branch scores on a complete phylogeny."
__default_branch_filter_threshold__ = 0

sequence_score_cache = {"" : 0}

def __get_arguments():
import argparse

app = argparse.ArgumentParser(description=__description__)
app.add_argument("motiffile", type=str, help="motif as position frequency matrix")
app.add_argument("treefile", type=str, help="phylogenetic tree. If elements might use a subtree ancestral nodes must be unnamed")
app.add_argument("element", type=str, help="path to fasta file containing element's sequence alignment")
# species sequence data can be given in a BerkeleyDB file 'ALI'. In this case the parameter specifies the element name
app.add_argument("motiffile", type=str, help="motif as position frequency matrix")
app.add_argument("treefile", type=str, help="phylogenetic tree. If elements might use a subtree ancestral nodes must be unnamed")
app.add_argument("element", type=str, help="path to fasta file containing element's sequence alignment")
# species sequence data can be given in a BerkeleyDB file specified by --alignment. In this case the parameter specifies the element name
app.add_argument("scoring_window_length", type=str, help="scoring window length")
app.add_argument("--background", "-bg", type=str, help="background file - passed into the score function")
app.add_argument("--no_ancestral_filter", action="store_true", help="elements are filtered if the ancestral element scores <= 0")
app.add_argument("--no_fixed_TP", action="store_true", help="fix stubb's transition probablities on ancestral sequence")
app.add_argument("--filter_branch_threshold", "-dt", type=float, help="if set, branches are filtered if start and end node score below the given threshold")
app.add_argument("--filter_branches", "-db", type=str, help="like --filter_branch_threshold but with motif specific branch thresholds - deprecated")
app.add_argument("--scrCrrIter", type=int, help="corrects stubb score with average score of <number> of shuffled sequences. Default is 10", default=10)
app.add_argument("--no_ancestral_filter", action="store_true", help="elements not are filtered if the ancestral element scores <= 0")
app.add_argument("--no_branch_filter", action="store_true", help="branches are not filtered if start and end score <= filter_branch_threshold")
app.add_argument("--no_fixed_TP", action="store_true", help="Stubb's transition probablities are not fixed on the ancestral sequence")

app.add_argument("--filter_branch_threshold", type=float, default=0, help="Default is 0")
# app.add_argument("--filter_branches", "-db", type=str, help="like --filter_branch_threshold but with motif specific branch thresholds - deprecated")
app.add_argument("--filter_GC_change", "-fgc", type=float, help="if set, branches with a GC content change above the threshold are filtered")
app.add_argument("--filter_length_change", "-fl", type=float, help="if set, branches with a relative length change above the threshold are filtered")
app.add_argument("--scrCrrIter", type=int, help="corrects stubb score with average score of <number> of shuffled sequences. Default is 10", default=10)

app.add_argument("--alignment", type=str)

app.add_argument("--verbose", "-v", action="store_true")
app.add_argument("--debug", "-d", action="store_true")
Expand All @@ -54,7 +59,7 @@ def read_threshold_collection(threshold_file_path, motif_name): # returns {"stu


def traverse_tree(parent, wtmx_file, scoring_window_length, sequences, background="", filter_threshold=None,
filter_threshold_dict=None, filter_GC_change=None, filter_length_change=None,
filter_GC_change=None, filter_length_change=None,
fixProbs=None, scrCrrMthd='stubb', scrCrrIter=10, anc0filter=False):
""" traverses the phylegenetic tree, scores every sequence and computes branch scores """
# anc0Filter: should be activated only for the root node
Expand All @@ -68,10 +73,10 @@ def traverse_tree(parent, wtmx_file, scoring_window_length, sequences, backgroun
if sequence.replace('N','') == '':
return (float('nan'), 0, 0)
gc = computeGCContent(sequence=sequence)/100
if filter_threshold_dict is not None:
gc_rounded = roundToList( [x[0] for x in filter_threshold_dict.keys()] , gc )
else:
gc_rounded = None
# if filter_threshold_dict is not None:
# gc_rounded = roundToList( [x[0] for x in filter_threshold_dict.keys()] , gc )
# else:
# gc_rounded = None

if sequence in sequence_score_cache.keys():
scores = [sequence_score_cache[sequence]]
Expand Down Expand Up @@ -106,15 +111,16 @@ def traverse_tree(parent, wtmx_file, scoring_window_length, sequences, backgroun

for child in parent:
child_score, child_gc, child_length = traverse_tree(child, wtmx_file, scoring_window_length, sequences,
background, filter_threshold, filter_threshold_dict, filter_GC_change, filter_length_change,
background, filter_threshold, filter_GC_change, filter_length_change, #filter_threshold_dict,
fixProbs, scrCrrMthd=scrCrrMthd, scrCrrIter=scrCrrIter, anc0filter=False)

motif_name = os.path.splitext(os.path.basename(wtmx_file))[0]
# branch score filter step
if not any([filter_threshold_dict is not None and \
parent_score < filter_threshold_dict[(gc_rounded, motif_name)] and \
child_score < filter_threshold_dict[(gc_rounded, motif_name)],
filter_threshold is not None and parent_score < filter_threshold and child_score < filter_threshold,
if not any([
# filter_threshold_dict is not None and \
# parent_score < filter_threshold_dict[(gc_rounded, motif_name)] and \
# child_score < filter_threshold_dict[(gc_rounded, motif_name)],
filter_threshold is not None and parent_score <= filter_threshold and child_score <= filter_threshold,
filter_GC_change is not None and abs(child_gc - gc) > filter_GC_change,
filter_length_change is not None and abs(child_length - length) < filter_length_change*(child_length+length)/2 ]):
print("%s>%s:%f\t"%(parent, child, child_score - parent_score), end="")
Expand Down Expand Up @@ -143,17 +149,17 @@ def __score_branches():


# read the input files
phylo_tree, sequences = read_sequences_and_prune_tree(args.element, args.treefile)
phylo_tree, sequences = read_sequences_and_prune_tree(args.element, args.treefile, alignment=args.alignment)

if args.filter_branches:
filter_threshold_dict = read_threshold_collection(args.filter_branches, os.path.splitext(os.path.basename(args.motiffile))[0])["stubb"]
else:
filter_threshold_dict = None
# if args.filter_branches:
# filter_threshold_dict = read_threshold_collection(args.filter_branches, os.path.splitext(os.path.basename(args.motiffile))[0])["stubb"]
# else:
# filter_threshold_dict = None

logging.info("Branch scoring: use %s as background and %s as window size"%(args.background, args.scoring_window_length) )
scrCrrMthd = "stubb" if args.scrCrrIter > 0 else None
traverse_tree(phylo_tree.root, args.motiffile, float(args.scoring_window_length), sequences, args.background,
filter_threshold=args.filter_branch_threshold, filter_threshold_dict=filter_threshold_dict,
filter_threshold=None if args.no_branch_filter else args.filter_branch_threshold, # filter_threshold_dict=filter_threshold_dict,
filter_GC_change=args.filter_GC_change, filter_length_change=args.filter_length_change,
fixProbs=not args.no_fixed_TP, scrCrrMthd=scrCrrMthd, scrCrrIter=args.scrCrrIter,
anc0filter=not args.no_ancestral_filter)
Expand Down
Loading

0 comments on commit a66221e

Please sign in to comment.