Skip to content

Commit

Permalink
Move contig len hist to .ctp file header
Browse files Browse the repository at this point in the history
Note: ctp files have changed and are not backwards compatible!
      We are now on CTP version 3.

Removed options to dump/load CSV of contig length histogram. This histogram
now lives in CTP path file headers.
  • Loading branch information
noporpoise committed Oct 30, 2014
1 parent c30a348 commit e33c320
Show file tree
Hide file tree
Showing 31 changed files with 300 additions and 279 deletions.
28 changes: 15 additions & 13 deletions docs/file_formats/graph_format_v7.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,23 @@ Extension: .ctx
Version: 7

{
"fileFormat": "CtxGraph",
"formatVersion": 7,
"fileid": "a1c6b3f2e9864b2c",
"kmer_size": 15,
"ncols": 1,
"file_id": "file:a1c6b3f2e9864b2c",
"file_format": "CtxGraph",
"format_version": 7,
"sorted": true,
"idx_kmers_per_bckt": 2048,
"colours": [{
"colour": 0,
"sample": "WTCHG_101",
"inferred_edges": false,
"colourid": "7621b712cf5434a1"
}],
"graph": {
"kmer_size": 15,
"num_colours": 1,
"colours": [{
"colour": 0,
"sample": "WTCHG_101",
"inferred_edges": false,
"colourid": "7621b712cf5434a1"
}]
},
"commands": [{
"key": "6d8e1b05",
"key": "cmd:6d8e1b05",
"cmd": ["../../bin/ctx31", "build", "--sample", "RoadRunner", "-1", "in.sam", "-1", "thing.sam", "mix.k11.ctx"],
"cwd": "/Users/isaac/ninja-cortex/",
"outpath": "/Users/isaac/ninja-cortex/mix.k11.ctx",
Expand Down Expand Up @@ -125,7 +127,7 @@ less, cat etc.
"formatVersion": 7,
"fileid": "a1c6b3f2e9864b2c",
"kmer_size": 15,
"ncols": 1,
"num_colours": 1,

"fileid" is a random generated 16 digit hexadecimal key used to identify this
file. This should be regenerated everytime the file content is modified.
Expand Down
4 changes: 4 additions & 0 deletions libs/cJSON/cJSON.c
Original file line number Diff line number Diff line change
Expand Up @@ -546,6 +546,10 @@ cJSON *cJSON_CreateString(const char *string) {cJSON *item=cJSON_New_Item();if(i
/* Create an empty JSON array node (children added separately). */
cJSON *cJSON_CreateArray(void) {cJSON *item=cJSON_New_Item();if(item)item->type=cJSON_Array;return item;}
/* Create an empty JSON object node (members added separately). */
cJSON *cJSON_CreateObject(void) {cJSON *item=cJSON_New_Item();if(item)item->type=cJSON_Object;return item;}

/* Isaac Added */
/* Create a JSON number node from a long.
 * NOTE: the value is stored in both valuedouble and valueint. Values outside
 * the range of int are truncated in valueint, and longs wider than a double's
 * mantissa (> 2^53) may lose precision in valuedouble. Casts are explicit to
 * match cJSON_CreateNumber and avoid narrowing-conversion warnings. */
cJSON *cJSON_CreateInt(long num) {cJSON *item=cJSON_New_Item();if(item){item->type=cJSON_Number;item->valuedouble=(double)num;item->valueint=(int)num;}return item;}


/* Create Arrays: */
/* Build a JSON array of `count` number nodes from an int buffer; returns NULL if the array node itself cannot be allocated. */
cJSON *cJSON_CreateIntArray(const int *numbers,int count) {int i;cJSON *n=0,*p=0,*a=cJSON_CreateArray();for(i=0;a && i<count;i++){n=cJSON_CreateNumber(numbers[i]);if(!i)a->child=n;else suffix_object(p,n);p=n;}return a;}
/* Same as above for a float buffer (floats are stored as JSON numbers). */
cJSON *cJSON_CreateFloatArray(const float *numbers,int count) {int i;cJSON *n=0,*p=0,*a=cJSON_CreateArray();for(i=0;a && i<count;i++){n=cJSON_CreateNumber(numbers[i]);if(!i)a->child=n;else suffix_object(p,n);p=n;}return a;}
Expand Down
2 changes: 2 additions & 0 deletions libs/cJSON/cJSON.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,8 @@ extern cJSON *cJSON_CreateString(const char *string);
extern cJSON *cJSON_CreateArray(void);
extern cJSON *cJSON_CreateObject(void);

extern cJSON *cJSON_CreateInt(long num);

/* These utilities create an Array of count items. */
extern cJSON *cJSON_CreateIntArray(const int *numbers,int count);
extern cJSON *cJSON_CreateFloatArray(const float *numbers,int count);
Expand Down
30 changes: 3 additions & 27 deletions src/alignment/correct_aln_stats.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,8 @@ void correct_aln_stats_merge(CorrectAlnStats *restrict dst,
{
size_t i, j;

// Contig histogram
zsize_buf_capacity(&dst->contig_histgrm, src->contig_histgrm.len);
dst->contig_histgrm.len = MAX2(dst->contig_histgrm.len, src->contig_histgrm.len);
// Contig histogram - extend length if needed
zsize_buf_extend(&dst->contig_histgrm, src->contig_histgrm.len);

for(i = 0; i < src->contig_histgrm.len; i++)
dst->contig_histgrm.data[i] += src->contig_histgrm.data[i];
Expand Down Expand Up @@ -77,8 +76,7 @@ void correct_aln_stats_add_mp(CorrectAlnStats *stats, size_t gap_kmers,

void correct_aln_stats_add_contig(CorrectAlnStats *stats, size_t contig_len_bp)
{
zsize_buf_capacity(&stats->contig_histgrm, contig_len_bp+1);
stats->contig_histgrm.len = MAX2(stats->contig_histgrm.len, contig_len_bp+1);
zsize_buf_extend(&stats->contig_histgrm, contig_len_bp+1);
stats->contig_histgrm.data[contig_len_bp]++;
}

Expand Down Expand Up @@ -123,24 +121,6 @@ void correct_aln_stats_dump_fraglen(const CorrectAlnStats *stats, const char *pa
fclose(fout);
}

// Save contig length histogram as tab-separated <contig_len_bp>\t<count>
// rows; zero-count lengths are omitted. Best-effort: warns and returns if
// the file cannot be opened.
void correct_aln_stats_dump_contiglen(const CorrectAlnStats *stats, const char *path)
{
  status("[CorrectAln] Saving contig length distribution to: %s", path);

  size_t i;
  FILE *fout = fopen(path, "w");
  if(fout == NULL) { warn("Cannot open: %s", path); return; }

  fprintf(fout, "contig_len_bp\tcount\n");

  // Only emit rows for contig lengths that were actually observed
  for(i = 0; i < stats->contig_histgrm.len; i++) {
    if(stats->contig_histgrm.data[i])
      fprintf(fout, "%4zu\t%4zu\n", i, stats->contig_histgrm.data[i]);
  }
  fclose(fout);
}

void correct_aln_stats_print_summary(const CorrectAlnStats *stats,
size_t num_se_reads, size_t num_read_pairs)
{
Expand Down Expand Up @@ -300,7 +280,6 @@ void correct_aln_dump_stats(const CorrectAlnStats *aln_stats,
const LoadingStats *load_stats,
const char *dump_seqgap_hist_path,
const char *dump_fraglen_hist_path,
const char *dump_contiglen_hist_path,
size_t ht_num_kmers)
{
correct_aln_stats_print_summary(aln_stats, load_stats->num_se_reads,
Expand All @@ -315,8 +294,5 @@ void correct_aln_dump_stats(const CorrectAlnStats *aln_stats,
correct_aln_stats_dump_fraglen(aln_stats, dump_fraglen_hist_path);
}

if(dump_contiglen_hist_path)
correct_aln_stats_dump_contiglen(aln_stats, dump_contiglen_hist_path);

loading_stats_print_summary(load_stats, ht_num_kmers);
}
5 changes: 1 addition & 4 deletions src/alignment/correct_aln_stats.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,7 @@
#define CORRECT_ALN_STATS_H_

#include "loading_stats.h"

#include "madcrowlib/madcrow_buffer.h"
madcrow_buffer_wipe(zsize_buf, ZeroSizeBuffer, size_t);
#include "common_buffers.h"

#define ALN_STATS_MAX_GAP 128
#define ALN_STATS_MAX_FRAGLEN 1024
Expand Down Expand Up @@ -79,7 +77,6 @@ void correct_aln_dump_stats(const CorrectAlnStats *aln_stats,
const LoadingStats *load_stats,
const char *dump_seqgap_hist_path,
const char *dump_fraglen_hist_path,
const char *dump_contiglen_hist_path,
size_t ht_num_kmers);

#endif /* CORRECT_ALN_STATS_H_ */
1 change: 1 addition & 0 deletions src/basic/common_buffers.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,6 @@ madcrow_buffer(byte_buf, ByteBuffer, uint8_t);
madcrow_buffer(uint32_buf, Uint32Buffer, uint32_t);
madcrow_buffer(int32_buf, Int32Buffer, int32_t);
madcrow_buffer(size_buf, SizeBuffer, size_t);
madcrow_buffer_wipe(zsize_buf, ZeroSizeBuffer, size_t);

#endif /* COMMON_BUFFERS_H_ */
41 changes: 15 additions & 26 deletions src/commands/ctx_contigs.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ const char contigs_usage[] =
" -L, --read-length <R> Expected read length for calc. contig confidences\n"
" -d, --depth <C> Expected depth for calc. contig confidences\n"
" -G, --genome <G> Genome size in bases\n"
" -C, --contig-hist <h.csv> Length distrib. from 'thread' [can have multiple]\n"
" -S, --confid-csv <save.csv> Save confidence table to <save.csv>\n"
"\n";

Expand All @@ -55,7 +54,6 @@ static struct option longopts[] =
{"read-length", required_argument, NULL, 'L'},
{"depth", required_argument, NULL, 'd'},
{"genome", required_argument, NULL, 'G'},
{"contig-hist", required_argument, NULL, 'C'},
{"confid-csv", required_argument, NULL, 'S'},
{NULL, 0, NULL, 0}
};
Expand All @@ -81,9 +79,6 @@ int ctx_contigs(int argc, char **argv)
GPathFileBuffer gpfiles;
gpfile_buf_alloc(&gpfiles, 8);

CharPtrBuffer contig_hist_paths;
char_ptr_buf_alloc(&contig_hist_paths, 8);

// Arg parsing
char cmd[100], shortopts[300];
cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
Expand Down Expand Up @@ -123,7 +118,6 @@ int ctx_contigs(int argc, char **argv)
case 'L': cmd_check(!exp_read_length,cmd); exp_read_length = cmd_size(cmd, optarg); break;
case 'd': cmd_check(exp_avg_bp_covg<0,cmd); exp_avg_bp_covg = cmd_udouble(cmd, optarg); break;
case 'G': cmd_check(!genome_size,cmd); genome_size = cmd_size_nonzero(cmd, optarg); break;
case 'C': char_ptr_buf_add(&contig_hist_paths, optarg); break;
case 'S': cmd_check(!conf_table_path,cmd); conf_table_path = optarg; break;
case ':': /* BADARG */
case '?': /* BADCH getopt_long has already printed error */
Expand All @@ -137,9 +131,6 @@ int ctx_contigs(int argc, char **argv)

bool sample_with_replacement = cmd_reseed;

if(contig_hist_paths.len > 0 && (exp_read_length || exp_avg_bp_covg >= 0))
cmd_print_usage("Only need --covg-hist <in.csv> (don't need --read-length / --depth)");

// Defaults
if(nthreads == 0) nthreads = DEFAULT_NTHREADS;

Expand Down Expand Up @@ -214,30 +205,29 @@ int ctx_contigs(int argc, char **argv)
total_mem = graph_mem + path_mem;
cmd_check_mem_limit(memargs.mem_to_use, total_mem);

// Load contig lengths
// Load contig hist distribution
ZeroSizeBuffer contig_hist;
memset(&contig_hist, 0, sizeof(contig_hist));

for(i = 0; i < gpfiles.len; i++) {
gpath_reader_load_contig_hist(gpfiles.data[i].json,
gpfiles.data[i].fltr.path.b,
&contig_hist);
}

// Calculate confidences
ContigConfidenceTable conf_table;
memset(&conf_table, 0, sizeof(conf_table));

if(contig_hist_paths.len > 0)
{
FILE *contig_fh;
const char *contig_path;
for(i = 0; i < contig_hist_paths.len; i++) {
contig_path = contig_hist_paths.data[i];
if((contig_fh = fopen(contig_path, "r")) == NULL)
die("Cannot open --contig-hist file: %s", contig_path);
conf_table_load_csv(&conf_table, contig_fh, contig_path);
}
conf_table_calc_csv(&conf_table, genome_size);
}
else {
conf_table_calc(&conf_table, exp_read_length, exp_avg_bp_covg);
}
conf_table_update_hist(&conf_table, genome_size,
contig_hist.data, contig_hist.len);

if(conf_table_path != NULL) {
conf_table_save(&conf_table, conf_table_path);
}

zsize_buf_dealloc(&contig_hist);

//
// Output file if printing
//
Expand Down Expand Up @@ -295,7 +285,6 @@ int ctx_contigs(int argc, char **argv)
seq_close(seed_buf.data[i]);

seq_file_ptr_buf_dealloc(&seed_buf);
char_ptr_buf_dealloc(&contig_hist_paths);

ctx_free(visited);
db_graph_dealloc(&db_graph);
Expand Down
5 changes: 3 additions & 2 deletions src/commands/ctx_correct.c
Original file line number Diff line number Diff line change
Expand Up @@ -214,8 +214,9 @@ int ctx_correct(int argc, char **argv)
// Run alignment
//
correct_reads(inputs->data, inputs->len,
args.dump_seq_sizes, args.dump_frag_sizes, args.dump_contig_sizes,
args.fq_zero, args.append_orig_seq, args.nthreads, &db_graph);
args.dump_seq_sizes, args.dump_frag_sizes,
args.fq_zero, args.append_orig_seq,
args.nthreads, &db_graph);

// Close and free output files
for(i = 0; i < inputs->len; i++)
Expand Down
16 changes: 15 additions & 1 deletion src/commands/ctx_pjoin.c
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,15 @@ int ctx_pjoin(int argc, char **argv)
for(i = 0; i < num_pfiles; i++)
gpath_reader_load_sample_names(&pfiles[i], &db_graph);

// Load contig hist distribution
ZeroSizeBuffer contig_histgrm;
memset(&contig_histgrm, 0, sizeof(contig_histgrm));

for(i = 0; i < num_pfiles; i++) {
gpath_reader_load_contig_hist(pfiles[i].json, pfiles[i].fltr.path.b,
&contig_histgrm);
}

// Load path files
for(i = 0; i < num_pfiles; i++)
gpath_reader_load(&pfiles[i], GPATH_ADD_MISSING_KMERS, &db_graph);
Expand All @@ -211,7 +220,12 @@ int ctx_pjoin(int argc, char **argv)
for(i = 0; i < num_pfiles; i++) hdrs[i] = pfiles[i].json;

// Write output file
gpath_save(gzout, out_ctp_path, output_threads, hdrs, num_pfiles, &db_graph);
gpath_save(gzout, out_ctp_path, output_threads, hdrs, num_pfiles,
contig_histgrm.data, contig_histgrm.len,
&db_graph);

zsize_buf_dealloc(&contig_histgrm);

gzclose(gzout);
ctx_free(hdrs);

Expand Down
44 changes: 24 additions & 20 deletions src/commands/ctx_thread.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ const char thread_usage[] =
" -E, --no-end-check Skip extra check after gap bridging\n"
" -g, --gap-hist <o.csv> Save size distribution of sequence gaps bridged\n"
" -G, --frag-hist <o.csv> Save size distribution of PE fragments\n"
" -C, --contig-hist <.csv> Save size distribution of assembled contigs\n"
"\n"
" -u, --use-new-paths Use paths as they are being added (higher err rate) [default: no]\n"
"\n"
Expand Down Expand Up @@ -89,7 +88,6 @@ static struct option longopts[] =
{"no-end-check", no_argument, NULL, 'E'},
{"gap-hist", required_argument, NULL, 'g'},
{"frag-hist", required_argument, NULL, 'G'},
{"contig-hist", required_argument, NULL, 'C'},
//
{"use-new-paths", required_argument, NULL, 'u'},
// Debug options
Expand Down Expand Up @@ -189,10 +187,27 @@ int ctx_thread(int argc, char **argv)

db_graph.node_in_cols = ctx_calloc(roundup_bits2bytes(kmers_in_hash), 1);

//
// Start up workers to add paths to the graph
//
GenPathWorker *workers;
workers = gen_paths_workers_alloc(args.nthreads, &db_graph);

// Setup for loading graphs graph
LoadingStats gstats;
loading_stats_init(&gstats);

// Path statistics
LoadingStats *load_stats = gen_paths_get_stats(workers);
CorrectAlnStats *aln_stats = gen_paths_get_aln_stats(workers);

// Load contig hist distribution
for(i = 0; i < gpfiles->len; i++) {
gpath_reader_load_contig_hist(gpfiles->data[i].json,
gpfiles->data[i].fltr.path.b,
&aln_stats->contig_histgrm);
}

GraphLoadingPrefs gprefs = {.db_graph = &db_graph,
.boolean_covgs = false,
.must_exist_in_graph = false,
Expand All @@ -208,19 +223,11 @@ int ctx_thread(int argc, char **argv)
for(i = 0; i < gpfiles->len; i++)
gpath_reader_load(&gpfiles->data[i], GPATH_DIE_MISSING_KMERS, &db_graph);

//
// Start up the threads, do the work
//
GenPathWorker *workers;
workers = gen_paths_workers_alloc(args.nthreads,
args.dump_contig_sizes != NULL,
&db_graph);

// Deal with a set of files at once
// Can have different numbers of inputs vs threads
size_t start, end;
for(start = 0; start < inputs->len; start += MAX_IO_THREADS)
{
// Can have different numbers of inputs vs threads
end = MIN2(inputs->len, start+MAX_IO_THREADS);
generate_paths(inputs->data+start, end-start, workers, args.nthreads);
}
Expand All @@ -229,19 +236,11 @@ int ctx_thread(int argc, char **argv)
gpath_hash_print_stats(&db_graph.gphash);
gpath_store_print_stats(&db_graph.gpstore);

// Output statistics
LoadingStats *load_stats = gen_paths_get_stats(workers);
CorrectAlnStats *aln_stats = gen_paths_get_aln_stats(workers);

correct_aln_dump_stats(aln_stats, load_stats,
args.dump_seq_sizes,
args.dump_frag_sizes,
args.dump_contig_sizes,
db_graph.ht.num_kmers);

// ins_gap, err_gap no longer allocated after this line
gen_paths_workers_dealloc(workers, args.nthreads);

// Don't need GPathHash anymore
gpath_hash_dealloc(&db_graph.gphash);

Expand All @@ -252,12 +251,17 @@ int ctx_thread(int argc, char **argv)

// Write output file
gpath_save(gzout, args.out_ctp_path, output_threads,
hdrs, gpfiles->len, &db_graph);
hdrs, gpfiles->len,
aln_stats->contig_histgrm.data, aln_stats->contig_histgrm.len,
&db_graph);
gzclose(gzout);

// Optionally run path checks for debugging
// gpath_checks_all_paths(&db_graph, args.nthreads);

// ins_gap, err_gap no longer allocated after this line
gen_paths_workers_dealloc(workers, args.nthreads);

// Close and free input files etc.
read_thread_args_dealloc(&args);
db_graph_dealloc(&db_graph);
Expand Down
Loading

0 comments on commit e33c320

Please sign in to comment.