Skip to content

Commit

Permalink
handle missing string value
Browse files Browse the repository at this point in the history
this closes #8. this was a major bug with a small change to fix.
  • Loading branch information
brentp committed Feb 7, 2022
1 parent 3c22d5d commit 76078f6
Show file tree
Hide file tree
Showing 6 changed files with 76 additions and 7 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "echtvar"
version = "0.1.1"
version = "0.1.2"
edition = "2018"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
Expand Down
2 changes: 2 additions & 0 deletions src/commands/annotate_cmd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,8 @@ pub fn annotate_main(
match v {
Value::Int(i) => match fld.ftype {
fields::FieldType::Categorical => {
// categorical missing_value must be set to the index of the missing_string
assert!(i >= 0, "can't have missing value for categorical!");
let val = [e.strings[fld.values_i][i as usize].as_bytes()];
record.push_info_string(fld.alias.as_bytes(), &val).expect(
&format!("error adding string for {}", fld.alias).to_string(),
Expand Down
21 changes: 15 additions & 6 deletions src/lib/echtvar.rs
Original file line number Diff line number Diff line change
Expand Up @@ -128,12 +128,9 @@ impl EchtVars {
fc.read_to_string(&mut contents)
.expect("eror reading config.json");
drop(fc);
let flds: Vec<fields::Field> = json5::from_str(&contents).unwrap();
eprintln!("fields: {:?}", flds);
for fld in flds {
let mut f = fld.clone();
f.values_i = result.fields.len();
result.fields.push(f);
let mut flds: Vec<fields::Field> = json5::from_str(&contents).unwrap();
for fld in flds.iter_mut() {
fld.values_i = result.fields.len();
if fld.ftype == fields::FieldType::Categorical {
// read in the strings for this field. replace ';' with ',' to handle the filter field.
let fname = format!("echtvar/strings/{}.txt", fld.alias);
Expand All @@ -144,13 +141,24 @@ impl EchtVars {
result
.strings
.push(BufReader::new(fh).lines().map(|l| l.unwrap().replace(";", ",")).collect());
// update missing value to be the index of the missing_string
let strings_len = result.strings[result.strings.len() - 1].len();
fld.missing_value = result.strings[result.strings.len() - 1].iter().position(|s| s == &fld.missing_string).unwrap_or(strings_len) as i32;
// if it wasn't in the list, add it.
if fld.missing_value == strings_len as i32 {
let rl = result.strings.len() - 1;
result.strings[rl].push(fld.missing_string.clone());
}
} else {
result.strings.push(Vec::new());
}
let f = fld.clone();
result.fields.push(f);
}
result.values.resize(result.fields.len(), vec![]);
result.evalues.resize(result.fields.len(), Value::Int(0));
}
eprintln!("fields: {:?}", result.fields);
result
}

Expand Down Expand Up @@ -363,6 +371,7 @@ impl EchtVars {
if fld.ftype == fields::FieldType::Integer
|| fld.ftype == fields::FieldType::Categorical
{
// for Categorical missing_value has been set to the index of missing_string
let val = fld.missing_value as i32;
self.evalues[fld.values_i] = Value::Int(val);
expr_values[fld.values_i] = val as f64
Expand Down
9 changes: 9 additions & 0 deletions tests/check-string-for-issue8.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import sys
import gzip


def check_annotations(path):
    """Check that every record of a gzipped VCF was annotated with the missing string.

    The issue-8 regression test annotates a query variant that is absent from
    the database, so each record's INFO column (the 8th tab-separated field)
    must contain ``;test_filter=OHNO``. The leading ``;`` means the annotation
    is expected to follow at least one existing INFO entry.

    Parameters:
        path: path to the gzip-compressed, annotated VCF.

    Returns:
        The number of (non-header) records that were checked.

    Raises:
        AssertionError: if a record lacks the annotation, has no INFO column,
            or if the file contains no records at all (which would otherwise
            let the check pass without testing anything).
    """
    n_records = 0
    # The context manager closes the gzip handle even when an assertion fails.
    with gzip.open(path, 'rt') as fh:
        for line in fh:
            # Skip meta-information and column-header lines.
            if line.startswith('#'):
                continue

            toks = line.strip().split("\t")
            assert len(toks) > 7, f"record has no INFO column: {line!r}"
            assert ";test_filter=OHNO" in toks[7], \
                f"expected ';test_filter=OHNO' in INFO, found: {toks[7]!r}"
            n_records += 1

    assert n_records > 0, f"no records found in {path}"
    return n_records


if __name__ == "__main__":
    check_annotations(sys.argv[1])
41 changes: 41 additions & 0 deletions tests/make-string-test-for-issue8.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import itertools
import random
import sys


#random.seed(42)

# VCF header shared by the database and query files. The column line is built
# with explicit tabs because VCF columns must be tab-separated.
header = """##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##FILTER=<ID=FAIL,Description="All filters passed">
##FILTER=<ID=OTHER,Description="All filters passed">
##INFO=<ID=num,Number=1,Type=Integer,Description="random string value">
##INFO=<ID=val,Number=.,Type=String,Description="random string value">
##contig=<ID=chr1,length=248956422>
##contig=<ID=1,length=248956422>
""" + "\t".join(("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"))

FILTERS = ["PASS", "FAIL", "OTHER"]


def main():
    """Write the fixtures for the issue-8 (missing string value) regression test.

    Creates three files in the current working directory:

    * ``string-issue-8.db.vcf``    -- one variant, chr1:12345 A>C
    * ``string-issue-8.query.vcf`` -- one variant, chr1:12345 A>T
    * ``string-issue-8.json``      -- field config mapping FILTER to the alias
      ``test_filter`` with ``missing_string`` set to ``OHNO``

    The query ALT allele differs from the database ALT allele, so the query
    variant is not present in the database and its annotation has to come
    from the configured missing string.
    """
    ref = "A"
    i = 12345

    # Both records share the same randomly chosen INFO value and FILTER.
    val = random.randint(0, 100)
    flt = random.choice(FILTERS)

    # Context managers guarantee both VCFs are flushed and closed before use.
    with open("string-issue-8.db.vcf", "w") as fhdb, \
            open("string-issue-8.query.vcf", "w") as fhq:
        for fh in (fhdb, fhq):
            print(header, file=fh)

        print(f"chr1\t{i}\t.\t{ref}\tC\t1\t{flt}\tval=s{val};num=3", file=fhdb)
        print(f"chr1\t{i}\t.\t{ref}\tT\t1\t{flt}\tval=s{val};num=3", file=fhq)

    with open("string-issue-8.json", "w") as fh:
        fh.write("""[{"field": "FILTER", "alias":"test_filter", "missing_string": "OHNO"}]""")


if __name__ == "__main__":
    main()
8 changes: 8 additions & 0 deletions tests/string.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,11 @@ python check-string.py string-anno.vcf.gz
$echtvar anno -i "anno_num != 3" -e string.echtvar string.vcf string-anno.vcf.gz

rm -f string.vcf string.echtvar # string-anno.vcf.gz

# Regression test for issue #8: the query variant is not in the encoded
# database, so the categorical FILTER field must be annotated with the
# configured missing_string.
python make-string-test-for-issue8.py
$echtvar encode issue8.echtvar string-issue-8.json string-issue-8.db.vcf
$echtvar anno -e issue8.echtvar string-issue-8.query.vcf issue-8.output.vcf.gz
python check-string-for-issue8.py issue-8.output.vcf.gz

# Use -f, matching the earlier cleanup in this script: -r is not needed for
# these files, and -f keeps cleanup from failing if one is already gone.
rm -f issue-8.output.vcf.gz issue8.echtvar string-issue-8.db.vcf string-issue-8.query.vcf string-issue-8.json

0 comments on commit 76078f6

Please sign in to comment.