Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Split main #9

Merged
merged 3 commits into from
Jul 4, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions MALVA
Original file line number Diff line number Diff line change
Expand Up @@ -109,8 +109,16 @@ else
(>&2 echo "[${SCRIPT_NAME}] Found KMC output")
fi

(>&2 echo "[${SCRIPT_NAME}] Running ${BIN_NAME}")
${EXECUTABLE} ${strip_chr} ${verbose} ${haploid} ${uniform} -k ${k} -r ${refk} -e ${erate} -f ${freq} -s ${samples} -c ${maxcov} -b ${bfsize} ${reference} ${vcf_file} ${kmc_out_prefix}
(>&2 echo "[${SCRIPT_NAME}] Running ${BIN_NAME} index")
if [ ! -f ${vcf_file}."c"${refk}".k"${k}".malvax" ]
then
${EXECUTABLE} index ${strip_chr} ${verbose} ${haploid} ${uniform} -k ${k} -r ${refk} -e ${erate} -f ${freq} -s ${samples} -c ${maxcov} -b ${bfsize} ${reference} ${vcf_file} ${kmc_out_prefix}
else
(>&2 echo "[${SCRIPT_NAME}] Index file exists already. Skipping creation.")
fi

(>&2 echo "[${SCRIPT_NAME}] Running ${BIN_NAME} call")
${EXECUTABLE} call ${strip_chr} ${verbose} ${haploid} ${uniform} -k ${k} -r ${refk} -e ${erate} -f ${freq} -s ${samples} -c ${maxcov} -b ${bfsize} ${reference} ${vcf_file} ${kmc_out_prefix}

(>&2 echo "[${SCRIPT_NAME}] Cleaning up")
rm -rf ${kmc_tmp_dir}
Expand Down
7 changes: 5 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,11 @@ make

## Usage
```
Usage: malva-geno [-k KMER-SIZE] [-r REF-KMER-SIZE] [-c MAX-COV] <reference> <variants> <kmc_output_prefix>
Usage: malva-geno <subcommand> [-k KMER-SIZE] [-r REF-KMER-SIZE] [-c MAX-COV] <reference> <variants> <kmc_output_prefix>

Arguments:
<subcommand> either index to create the reference index or call to call call the genotypes.

-h, --help display this help and exit
-k, --kmer-size size of the kmers to index (default:35)
-r, --ref-kmer-size size of the reference kmers to index (default:43)
Expand Down Expand Up @@ -123,7 +125,8 @@ The last command is equivalent to run:
```
mkdir -p kmc_tmp
../KMC/bin/kmc -m4 -k43 -fm chr20.sample.fa kmc.out kmc_tmp
../malva-geno -k 35 -r 43 -b 1 -f EUR_AF chr20.fa chr20.vcf kmc.out > chr20.genotyped.vcf
../malva-geno index -k 35 -r 43 -b 1 -f EUR_AF chr20.fa chr20.vcf kmc.out
../malva-geno call -k 35 -r 43 -b 1 -f EUR_AF chr20.fa chr20.vcf kmc.out > chr20.genotyped.vcf
```

This should take less than 1 minute to complete. You can also verify
Expand Down
23 changes: 22 additions & 1 deletion bloom_filter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ class BF {
}

public:
BF() : _mode(false), _bf(0, 0) { _size = 0; };
BF(const size_t size) : _mode(false), _bf(size, 0) { _size = size; }
~BF() {}

Expand Down Expand Up @@ -114,8 +115,28 @@ class BF {
return 0;
}

ostream& operator>>(ostream& stream)
{
stream.write(reinterpret_cast<const char*>(&_mode), sizeof(bool));
stream.write(reinterpret_cast<const char*>(&_size), sizeof(size_t));
_bf.serialize(stream);
// We don't serialize _brank since loading it will crash.
// TODO: this need some further investigation.
_counts.serialize(stream);
return stream;
}

istream& operator<<(istream& stream)
{
stream.read(reinterpret_cast<char*>(&_mode), sizeof(bool));
stream.read(reinterpret_cast<char*>(&_size), sizeof(size_t));
_bf.load(stream);
_brank = rank_support_v<1>(&_bf);
_counts.load(stream);
return stream;
}

private:
BF() {}
const BF &operator=(const BF &other) { return *this; }
const BF &operator=(const BF &&other) { return *this; }

Expand Down
32 changes: 32 additions & 0 deletions kmap.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,38 @@ struct KMAP {

KMAP() {}

ostream& operator>>(ostream& stream)
{
size_t size = kmers.size();
stream.write(reinterpret_cast<const char*>(&size), sizeof(size_t));
for (const auto& pair : kmers)
{
string::size_type l = pair.first.length();
stream.write(reinterpret_cast<const char*>(&l), sizeof(string::size_type));
stream.write(reinterpret_cast<const char*>((char*) pair.first.data()), l);
stream.write(reinterpret_cast<const char*>(&pair.second), sizeof(int));
}
return stream;
}

istream& operator<<(istream& stream)
{
size_t size;
stream.read(reinterpret_cast<char*>(&size), sizeof(size_t));
for (size_t i = 0; i < size; ++i)
{
string::size_type l;
string k;
int v;
stream.read(reinterpret_cast<char*>(&l), sizeof(string::size_type));
k.resize(l);
stream.read(reinterpret_cast<char*>((char*) k.data()), l);
stream.read(reinterpret_cast<char*>(&v), sizeof(int));
kmers[k] = v;
}
return stream;
}

static const char _compl(const char &c) { return RCN[c]; }

string canonical(const char* kmer) {
Expand Down
101 changes: 98 additions & 3 deletions main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ using namespace std;

auto start_t = chrono::high_resolution_clock::now();

static const char* MALVA_IDX_SUFFIX = ".malvax";

void pelapsed(const string &s = "", const bool rollback = false) {
auto now_t = chrono::high_resolution_clock::now();
cerr << "[malva-geno/" << s << "] Time elapsed "
Expand Down Expand Up @@ -146,7 +148,34 @@ void print_cleaned_header(bcf_hdr_t *vcf_header, const bool verbose) {

// ---------------------------------------------------------------------------

int index_main(int argc, char *argv[]);
int call_main(int argc, char *argv[]);

int main(int argc, char *argv[]) {
if (argc < 2)
{
cerr << "malva missing arguments" << endl;
cerr << USAGE_MESSAGE << endl;
return 1;
}

if (strncmp(argv[1], "index", 5) == 0)
{
return index_main(argc-1, argv+1);
}
else if (strncmp(argv[1], "call", 4) == 0)
{
return call_main(argc-1, argv+1);
}
else
{
cerr << "Could not interpret command " << argv[1] <<"." << endl;
cerr << "Accepted commands are index and call." << endl;
return 1;
}
}

int index_main(int argc, char*argv[]) {
hts_set_log_level(HTS_LOG_OFF);

parse_arguments(argc, argv);
Expand Down Expand Up @@ -285,6 +314,70 @@ int main(int argc, char *argv[]) {

context_bf.switch_mode();

ofstream index_stream(opt::vcf_path + ".c" + to_string(opt::ref_k) + ".k" + to_string(opt::k) + MALVA_IDX_SUFFIX);

context_bf >> index_stream;
bf >> index_stream;
ref_bf >> index_stream;

kseq_destroy(reference);
gzclose(fasta_in);
cout.flush();
return 0;
}

int call_main(int argc, char *argv[]) {
hts_set_log_level(HTS_LOG_OFF);

parse_arguments(argc, argv);

// STEP 0: open and check input files
gzFile fasta_in = gzopen(opt::fasta_path.c_str(), "r");
kseq_t *reference = kseq_init(fasta_in);

htsFile *vcf = bcf_open(opt::vcf_path.c_str(), "r");
bcf_hdr_t *vcf_header = bcf_hdr_read(vcf);
int is_file_flag = 0;
if(opt::samples != "-")
is_file_flag = 1;
int set_samples_code = bcf_hdr_set_samples(vcf_header, opt::samples.c_str(), is_file_flag);
if(set_samples_code != 0) {
cerr << "ERROR: VCF samples subset (code: " << set_samples_code << ")" << endl;
return 1;
}
bcf1_t *vcf_record = bcf_init();

CKMCFile kmer_db;
if (!kmer_db.OpenForListing(opt::kmc_sample_path)) {
cerr << "ERROR: cannot open " << opt::kmc_sample_path << endl;
return 1;
}

BF bf;
KMAP ref_bf;
BF context_bf;
{
ifstream index_stream(opt::vcf_path + ".c" + to_string(opt::ref_k) + ".k" + to_string(opt::k) + MALVA_IDX_SUFFIX);

context_bf << index_stream;
bf << index_stream;
ref_bf << index_stream;
}

// References are stored in a map
pelapsed("Reference parsing");
unordered_map<string, string> refs;
int l;
while ((l = kseq_read(reference)) >= 0) {
string id = reference->name.s;
if(id.compare(0,3,"chr") == 0 && opt::strip_chr) {
id = id.substr(3);
}
string seq (reference->seq.s);
refs[id] = seq;
}
pelapsed("Reference processed");

// STEP 2: test variants present in read sample
pelapsed("KMC output processing");
uint32 klen, mode, min_counter, pref_len, sign_len, min_c, counter;
Expand Down Expand Up @@ -321,8 +414,10 @@ int main(int argc, char *argv[]) {
vcf_record = bcf_init();

pelapsed("VCF parsing and genotyping");
i = 0;
last_seq_name = "";
int i = 0;
string last_seq_name = "";
VB vb(opt::k, opt::error_rate);

while (bcf_read(vcf, vcf_header, vcf_record) == 0) {
bcf_unpack(vcf_record, BCF_UN_STR);
Variant v(vcf_header, vcf_record, opt::freq_key, opt::uniform);
Expand Down Expand Up @@ -376,7 +471,7 @@ int main(int argc, char *argv[]) {
vb.output_variants(opt::haploid, opt::verbose);
vb.clear();
}
log_line = "Processed " + to_string(i) + " variants";
string log_line = "Processed " + to_string(i) + " variants";
pelapsed(log_line);

bcf_hdr_destroy(vcf_header);
Expand Down