diff --git a/Cargo.toml b/Cargo.toml index 26ef28e..aa9bbce 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "echtvar" -version = "0.1.1" +version = "0.1.2" edition = "2018" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/src/commands/annotate_cmd.rs b/src/commands/annotate_cmd.rs index 6951033..f3698e7 100644 --- a/src/commands/annotate_cmd.rs +++ b/src/commands/annotate_cmd.rs @@ -129,6 +129,8 @@ pub fn annotate_main( match v { Value::Int(i) => match fld.ftype { fields::FieldType::Categorical => { + // categorical missing_value must be set to the index of the missing_string + assert!(i >= 0, "can't have missing value for categorical!"); let val = [e.strings[fld.values_i][i as usize].as_bytes()]; record.push_info_string(fld.alias.as_bytes(), &val).expect( &format!("error adding string for {}", fld.alias).to_string(), diff --git a/src/lib/echtvar.rs b/src/lib/echtvar.rs index 8c919e5..efcae79 100644 --- a/src/lib/echtvar.rs +++ b/src/lib/echtvar.rs @@ -128,12 +128,9 @@ impl EchtVars { fc.read_to_string(&mut contents) .expect("eror reading config.json"); drop(fc); - let flds: Vec = json5::from_str(&contents).unwrap(); - eprintln!("fields: {:?}", flds); - for fld in flds { - let mut f = fld.clone(); - f.values_i = result.fields.len(); - result.fields.push(f); + let mut flds: Vec = json5::from_str(&contents).unwrap(); + for fld in flds.iter_mut() { + fld.values_i = result.fields.len(); if fld.ftype == fields::FieldType::Categorical { // read in the strings for this field. replace ';' with ',' to handle the filter field. let fname = format!("echtvar/strings/{}.txt", fld.alias); @@ -144,13 +141,24 @@ impl EchtVars { result .strings .push(BufReader::new(fh).lines().map(|l| l.unwrap().replace(";", ",")).collect()); + // update missing value to be the index of the missing_string + let strings_len = result.strings[result.strings.len() - 1].len(); + fld.missing_value = result.strings[result.strings.len() - 1].iter().position(|s| s == &fld.missing_string).unwrap_or(strings_len) as i32; + // if it wasn't in the list, add it. + if fld.missing_value == strings_len as i32 { + let rl = result.strings.len() - 1; + result.strings[rl].push(fld.missing_string.clone()); + } } else { result.strings.push(Vec::new()); } + let f = fld.clone(); + result.fields.push(f); } result.values.resize(result.fields.len(), vec![]); result.evalues.resize(result.fields.len(), Value::Int(0)); } + eprintln!("fields: {:?}", result.fields); result } @@ -363,6 +371,7 @@ impl EchtVars { if fld.ftype == fields::FieldType::Integer || fld.ftype == fields::FieldType::Categorical { + // for Categorical missing_value has been set to the index of missing_string let val = fld.missing_value as i32; self.evalues[fld.values_i] = Value::Int(val); expr_values[fld.values_i] = val as f64 diff --git a/tests/check-string-for-issue8.py b/tests/check-string-for-issue8.py new file mode 100644 index 0000000..c45ff3d --- /dev/null +++ b/tests/check-string-for-issue8.py @@ -0,0 +1,9 @@ +import sys +import gzip + + +for line in gzip.open(sys.argv[1], 'rt'): + if line[0] == '#': continue + + toks = line.strip().split("\t") + assert ";test_filter=OHNO" in toks[7] diff --git a/tests/make-string-test-for-issue8.py b/tests/make-string-test-for-issue8.py new file mode 100644 index 0000000..aecba09 --- /dev/null +++ b/tests/make-string-test-for-issue8.py @@ -0,0 +1,41 @@ +import itertools +import random +import sys + + +#random.seed(42) + +fhdb = open("string-issue-8.db.vcf", "w") +fhq = open("string-issue-8.query.vcf", "w") + +header = ("""##fileformat=VCFv4.2 +##FILTER= +##FILTER= +##FILTER= +##INFO= +##INFO= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO""") + +FILTERS = ["PASS", "FAIL", "OTHER"] + +for fh in (fhdb, fhq): + print(header, file=fh) + +for switch in [1]: + switch = switch<<20 + + ref = "A" + alt = "C" + i = 12345 + + val = random.randint(0, 100) + flt = random.choice(FILTERS) + print(f"chr1\t{i}\t.\t{ref}\t{alt}\t1\t{flt}\tval=s{val};num=3", file=fhdb) + alt = "T" + print(f"chr1\t{i}\t.\t{ref}\t{alt}\t1\t{flt}\tval=s{val};num=3", file=fhq) + + +with open("string-issue-8.json", "w") as fh: + fh.write("""[{"field": "FILTER", "alias":"test_filter", "missing_string": "OHNO"}]""") diff --git a/tests/string.sh b/tests/string.sh index bce97a6..afb2ded 100644 --- a/tests/string.sh +++ b/tests/string.sh @@ -16,3 +16,11 @@ python check-string.py string-anno.vcf.gz $echtvar anno -i "anno_num != 3" -e string.echtvar string.vcf string-anno.vcf.gz rm -f string.vcf string.echtvar # string-anno.vcf.gz + +python make-string-test-for-issue8.py +$echtvar encode issue8.echtvar string-issue-8.json string-issue-8.db.vcf +$echtvar anno -e issue8.echtvar string-issue-8.query.vcf issue-8.output.vcf.gz +python check-string-for-issue8.py issue-8.output.vcf.gz + +rm -r issue-8.output.vcf.gz issue8.echtvar string-issue-8.db.vcf string-issue-8.query.vcf string-issue-8.json +