Skip to content

Commit

Permalink
v3.1 commit
Browse files Browse the repository at this point in the history
  • Loading branch information
dportik committed Sep 19, 2019
1 parent 546c93b commit cbbabf1
Show file tree
Hide file tree
Showing 251 changed files with 598 additions and 51,884 deletions.
162 changes: 89 additions & 73 deletions Goodness_of_Fit/Optimize_Functions_GOF.py

Large diffs are not rendered by default.

26 changes: 14 additions & 12 deletions Goodness_of_Fit/Simulate_and_Optimize.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@
#===========================================================================

#**************
snps = "/Users/dan/Dropbox/dadi_inputs/General_Script/dadi_2pops_North_South_snps.txt"
snps = "/Users/portik/Documents/GitHub/Testing_version/dadi_pipeline/Example_Data/dadi_2pops_North_South_snps.txt"

#Create python dictionary from snps file
dd = dadi.Misc.make_data_dict(snps)
Expand All @@ -114,13 +114,12 @@
fs = dadi.Spectrum.from_data_dict(dd, pop_ids=pop_ids, projections = proj, polarized = False)

#print some useful information about the afs or jsfs
print "\n\n============================================================================\nData for site frequency spectrum\n============================================================================\n"
print "projection", proj
print "sample sizes", fs.sample_sizes
sfs_sum = numpy.around(fs.S(), 2)
print "Sum of SFS = ", sfs_sum, '\n', '\n'


print("\n\n============================================================================")
print("\nData for site frequency spectrum\n")
print("Projection: {}".format(proj))
print("Sample sizes: {}".format(fs.sample_sizes))
print("Sum of SFS: {}".format(numpy.around(fs.S(), 2)))
print("\n============================================================================\n")

#================================================================================
# Fit the empirical data based on prior optimization results, obtain model SFS
Expand Down Expand Up @@ -185,7 +184,7 @@ def sym_mig(params, ns, pts):
#================================================================================
'''
We will use a function from the Optimize_Functions_GOF.py script:
Perform_Sims(sim_number, model_fs, pts, model_name, func, rounds, param_number,
Perform_Sims(sim_number, model_fs, pts, model_name, func, rounds, param_number, fs_folded,
reps=None, maxiters=None, folds=None)
Mandatory Arguments =
Expand All @@ -196,6 +195,7 @@ def sym_mig(params, ns, pts):
func: access the model function from within this script
rounds: number of optimization rounds to perform
param_number: number of parameters in the model selected (can count in params line for the model)
fs_folded: A Boolean value indicating whether the empirical fs is folded (True) or not (False).
Optional Arguments =
reps: a list of integers controlling the number of replicates in each of the optimization rounds
Expand All @@ -205,11 +205,11 @@ def sym_mig(params, ns, pts):

#**************
#Set the number of simulations to perform here. This should be ~100 or more.
sims = 100
sims = 3

#**************
#Enter the number of parameters found in the model to test.
p_num = 4
pnum = 4

#**************
#Set the number of rounds here.
Expand All @@ -220,8 +220,10 @@ def sym_mig(params, ns, pts):
maxiters = [5,10,20]
folds = [3,2,1]


#Execute the optimization routine for each of the simulated SFS.
#Here, you will want to change the "sym_mig" and sym_mig arguments to match your model, but
#everything else can stay as it is (as the actual values can be changed above).
Optimize_Functions_GOF.Perform_Sims(sims, scaled_fs, pts, "sym_mig", sym_mig, rounds, p_num, fs_folded=fs_folded, reps=reps, maxiters=maxiters, folds=folds)
Optimize_Functions_GOF.Perform_Sims(sims, scaled_fs, pts, "sym_mig", sym_mig, rounds, pnum, fs_folded=fs_folded,
reps=reps, maxiters=maxiters, folds=folds)

130 changes: 79 additions & 51 deletions Optimize_Functions.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,17 @@
'''
-------------------------
Written for Python 2.7 and 3.7
Python modules required:
-Numpy
-Scipy
-dadi
-------------------------
Daniel Portik
[email protected]
https://github.com/dportik
Updated September 2019
'''
import sys
import os
import numpy
Expand All @@ -15,28 +29,31 @@ def parse_params(param_number, in_params=None, in_upper=None, in_lower=None):
# in_lower: a list of lower bound values
#--------------------------------------------------------------------------------------
param_number = int(param_number)

#param set
if in_params is None:
params = [1] * param_number
elif len(in_params) != param_number:
raise ValueError("Set of input parameters does not contain the correct number of values: {}".format(param_number))
else:
params = in_params

#upper bound
if in_upper is None:
upper_bound = [30] * param_number
elif len(in_upper) != param_number:
raise ValueError("Upper bound set for parameters does not contain the correct number of values: {}".format(param_number))
else:
upper_bound = in_upper

#lower bounds
if in_lower is None:
lower_bound = [0.01] * param_number
elif len(in_lower) != param_number:
raise ValueError("Lower bound set for parameters does not contain the correct number of values: {}".format(param_number))
else:
lower_bound = in_lower
#send back values
return params, upper_bound, lower_bound

def parse_opt_settings(rounds, reps=None, maxiters=None, folds=None):
Expand All @@ -50,6 +67,7 @@ def parse_opt_settings(rounds, reps=None, maxiters=None, folds=None):
# folds: a list of integers controlling the fold argument when perturbing input parameter values
#--------------------------------------------------------------------------------------
rounds = int(rounds)

#rep set
#create scheme where final replicates will be 20, and all previous 10
if reps is None:
Expand All @@ -62,13 +80,15 @@ def parse_opt_settings(rounds, reps=None, maxiters=None, folds=None):
raise ValueError("List length of replicate values does match the number of rounds: {}".format(rounds))
else:
reps_list = reps

#maxiters
if maxiters is None:
maxiters_list = [5] * rounds
elif len(maxiters) != rounds:
raise ValueError("List length of maxiter values does match the number of rounds: {}".format(rounds))
else:
maxiters_list = maxiters

#folds
#create scheme so if rounds is greater than three, will always end with two fold and then one fold
if folds is None:
Expand All @@ -85,7 +105,7 @@ def parse_opt_settings(rounds, reps=None, maxiters=None, folds=None):
raise ValueError("List length of fold values does match the number of rounds: {}".format(rounds))
else:
folds_list = folds
#send back values
return reps_list, maxiters_list, folds_list

def collect_results(fs, sim_model, params_opt, roundrep, fs_folded):
Expand All @@ -99,48 +119,36 @@ def collect_results(fs, sim_model, params_opt, roundrep, fs_folded):
# fs_folded: a Boolean (True, False) for whether empirical spectrum is folded or not
#--------------------------------------------------------------------------------------

#create empty list to store results
temp_results = []

#calculate likelihood
ll = dadi.Inference.ll_multinom(sim_model, fs)
ll = numpy.around(ll, 2)
print "\t\t\tLikelihood = ", ll
print("\t\t\tLikelihood = {:,}".format(ll))

#calculate AIC
aic = ( -2*( float(ll))) + (2*len(params_opt))
print "\t\t\tAIC = ", aic
print("\t\t\tAIC = {:,}".format(aic))

#calculate theta
theta = dadi.Inference.optimal_sfs_scaling(sim_model, fs)
theta = numpy.around(theta, 2)
print "\t\t\tTheta = ", theta
print("\t\t\tTheta = {:,}".format(theta))

#get Chi^2
scaled_sim_model = sim_model*theta
if fs_folded is True:
#calculate Chi^2 statistic for folded
scaled_sim_model = sim_model*theta
folded_sim_model = scaled_sim_model.fold()
chi2 = numpy.sum((folded_sim_model - fs)**2/folded_sim_model)
chi2 = numpy.around(chi2, 2)
print "\t\t\tChi-Squared = ", chi2

elif fs_folded is False:
#calculate Chi^2 statistic for unfolded
scaled_sim_model = sim_model*theta
chi2 = numpy.sum((scaled_sim_model - fs)**2/scaled_sim_model)
chi2 = numpy.around(chi2, 2)
print "\t\t\tChi-Squared = ", chi2

print("\t\t\tChi-Squared = {:,}".format(chi2))

#store key results in temporary sublist, append to larger results list
temp_results.append(roundrep)
temp_results.append(ll)
temp_results.append(aic)
temp_results.append(chi2)
temp_results.append(theta)
temp_results.append(params_opt)
temp_results = [roundrep, ll, aic, chi2, theta, params_opt]

#send list of results back
return temp_results

def write_log(outfile, model_name, rep_results, roundrep):
Expand All @@ -162,13 +170,15 @@ def write_log(outfile, model_name, rep_results, roundrep):
fh_log.write(line)
fh_templog.close()
except IOError:
print "Nothing written to log file this replicate..."
print("Nothing written to log file this replicate...")
fh_log.write("likelihood = {}\n".format(rep_results[1]))
fh_log.write("theta = {}\n".format(rep_results[4]))
fh_log.write("Optimized parameters = {}\n".format(rep_results[5]))
fh_log.close()

def Optimize_Routine(fs, pts, outfile, model_name, func, rounds, param_number, fs_folded=True, reps=None, maxiters=None, folds=None, in_params=None, in_upper=None, in_lower=None, param_labels=" "):
def Optimize_Routine(fs, pts, outfile, model_name, func, rounds, param_number, fs_folded=True,
reps=None, maxiters=None, folds=None, in_params=None,
in_upper=None, in_lower=None, param_labels=" "):
#--------------------------------------------------------------------------------------
# Mandatory Arguments =
#(1) fs: spectrum object name
Expand Down Expand Up @@ -196,24 +206,24 @@ def Optimize_Routine(fs, pts, outfile, model_name, func, rounds, param_number, f
#call function that determines if our replicates, maxiter, and fold have been set or need to be generated for us
reps_list, maxiters_list, folds_list = parse_opt_settings(rounds, reps, maxiters, folds)

print "\n\n============================================================================\nModel {}\n============================================================================".format(model_name)
print("\n\n============================================================================"
"\nModel {}\n============================================================================\n\n".format(model_name))

#start keeping track of time it takes to complete optimizations for this model
tb_round = datetime.now()
tbr = datetime.now()

# We need an output file that will store all summary info for each replicate, across rounds
outname = "{0}.{1}.optimized.txt".format(outfile,model_name)
fh_out = open(outname, 'a')
fh_out.write("Model"+'\t'+"Replicate"+'\t'+"log-likelihood"+'\t'+"AIC"+'\t'+"chi-squared"+'\t'+"theta"+'\t'+"optimized_params({})".format(param_labels)+'\n')
fh_out.close()

outname = "{0}.{1}.optimized.txt".format(outfile, model_name)
with open(outname, 'a') as fh_out:
fh_out.write("Model\tReplicate\tlog-likelihood\tAIC\tchi-squared\ttheta\toptimized_params({})\n".format(param_labels))

#Create list to store sublists of [roundnum_repnum, log-likelihood, AIC, chi^2 test stat, theta, parameter values] for every replicate
results_list = []

#for every round, execute the assigned number of replicates with other round-defined args (maxiter, fold, best_params)
rounds = int(rounds)
for r in range(rounds):
print "\tBeginning Optimizations for Round {}:".format(r+1)
print("\tBeginning Optimizations for Round {}:".format(r+1))

#make sure first round params are assigned (either user input or auto generated)
if r == int(0):
Expand All @@ -224,28 +234,34 @@ def Optimize_Routine(fs, pts, outfile, model_name, func, rounds, param_number, f

#perform an optimization routine for each rep number in this round number
for rep in range(1, (reps_list[r]+1) ):
print "\t\tRound {0} Replicate {1} of {2}:".format(r+1, rep, (reps_list[r]))
print("\n\t\tRound {0} Replicate {1} of {2}:".format(r+1, rep, (reps_list[r])))

#keep track of start time for rep
tb_rep = datetime.now()

#create an extrapolating function
func_exec = dadi.Numerics.make_extrap_log_func(func)


#perturb starting parameters
params_perturbed = dadi.Misc.perturb_params(best_params, fold=folds_list[r], upper_bound=upper_bound, lower_bound=lower_bound)

params_perturbed = dadi.Misc.perturb_params(best_params, fold=folds_list[r],
upper_bound=upper_bound, lower_bound=lower_bound)

print("\n\t\t\tStarting parameters = [{}]".format(", ".join([str(numpy.around(x, 6)) for x in params_perturbed])))

#optimize from perturbed parameters
params_opt = dadi.Inference.optimize_log_fmin(params_perturbed, fs, func_exec, pts, lower_bound=lower_bound, upper_bound=upper_bound, verbose=1, maxiter=maxiters_list[r], output_file = "{}.log.txt".format(model_name))
print "\t\t\tOptimized parameters = ", params_opt

params_opt = dadi.Inference.optimize_log_fmin(params_perturbed, fs, func_exec, pts,
lower_bound=lower_bound, upper_bound=upper_bound,
verbose=1, maxiter=maxiters_list[r],
output_file = "{}.log.txt".format(model_name))

print("\t\t\tOptimized parameters =[{}]\n".format(", ".join([str(numpy.around(x, 6)) for x in params_opt[0]])))

#simulate the model with the optimized parameters
sim_model = func_exec(params_opt, fs.sample_sizes, pts)
sim_model = func_exec(params_opt[0], fs.sample_sizes, pts)

#collect results into a list using function above - [roundnum_repnum, log-likelihood, AIC, chi^2 test stat, theta, parameter values]
roundrep = "Round_{0}_Replicate_{1}".format(r+1, rep)
rep_results = collect_results(fs, sim_model, params_opt, roundrep, fs_folded)
rep_results = collect_results(fs, sim_model, params_opt[0], roundrep, fs_folded)

#reproduce replicate log to bigger log file, because constantly re-written
write_log(outfile, model_name, rep_results, roundrep)
Expand All @@ -254,25 +270,37 @@ def Optimize_Routine(fs, pts, outfile, model_name, func, rounds, param_number, f
results_list.append(rep_results)

#write all this info to our main results file
fh_out = open(outname, 'a')
#join the param values together with commas
easy_p = ",".join(str(numpy.around(x, 4)) for x in rep_results[5])
fh_out.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format(model_name, rep_results[0], rep_results[1], rep_results[2], rep_results[3], rep_results[4], easy_p))
fh_out.close()
with open(outname, 'a') as fh_out:
#join the param values together with commas
easy_p = ",".join([str(numpy.around(x, 4)) for x in rep_results[5]])
fh_out.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format(model_name, rep_results[0],
rep_results[1], rep_results[2],
rep_results[3], rep_results[4],
easy_p))

#calculate elapsed time for replicate
tf_rep = datetime.now()
te_rep = tf_rep - tb_rep
print "\n\t\t\tReplicate time: {0} (H:M:S)\n".format(te_rep)
print("\n\t\t\tReplicate time: {0} (H:M:S)\n".format(te_rep))

#Now that this round is over, sort results in order of likelihood score, we'll use the parameters from the best rep to start the next round as the loop continues
#Now that this round is over, sort results in order of likelihood score
#we'll use the parameters from the best rep to start the next round as the loop continues
results_list.sort(key=lambda x: float(x[1]), reverse=True)
print "\tBest so far: {0}, ll = {1}\n\n".format(results_list[0][0], results_list[0][1])
print("\n\t----------------------------------------------\n"
"\tBest replicate: {0}\n"
"\t\tLikelihood = {1:,}\n\t\tAIC = {2:,}\n"
"\t\tChi-Squared = {3:,}\n\t\tParams = [{4}]\n"
"\t----------------------------------------------\n\n".format(results_list[0][0],
results_list[0][1],
results_list[0][2],
results_list[0][3],
", ".join([str(numpy.around(x, 4)) for x in rep_results[5]])))

#Now that all rounds are over, calculate elapsed time for the whole model
tf_round = datetime.now()
te_round = tf_round - tb_round
print "\n{0} Analysis Time for Model: {1} (H:M:S)\n\n============================================================================".format(model_name, te_round)
tfr = datetime.now()
ter = tfr - tbr
print("\nAnalysis Time for Model '{0}': {1} (H:M:S)\n\n"
"============================================================================".format(model_name, ter))

#cleanup file
os.remove("{}.log.txt".format(model_name))
Loading

0 comments on commit cbbabf1

Please sign in to comment.