Skip to content

Commit

Permalink
Fix early return and counting of mismatches with no-calls in barcodes. (
Browse files Browse the repository at this point in the history
#48)

* fix: early return and counting of mismatches when n in barcodes.
  • Loading branch information
theJasonFan authored Aug 19, 2024
1 parent 6ed98a3 commit dd803fc
Show file tree
Hide file tree
Showing 2 changed files with 134 additions and 2 deletions.
117 changes: 117 additions & 0 deletions src/bin/commands/demux.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1373,6 +1373,123 @@ mod tests {
);
}

#[test]
fn test_demux_with_catchall_barcode() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![ReadStructure::from_str("7B+T").unwrap()];
let s1_barcode = "NNNNNNN";
let sample_metadata = metadata_file(&tmp, &[s1_barcode]);
let input_files =
vec![fastq_file(&tmp, "ex", "ex", &[&(s1_barcode.to_owned() + &"A".repeat(100))])];

let output_dir = tmp.path().to_path_buf().join("output");

let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 0,
min_mismatch_delta: 2,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
};
demux_inputs.execute().unwrap();

let unmatched_path = output_dir.join("unmatched.R1.fq.gz");
let unmatched_reads = read_fastq(&unmatched_path);
assert_eq!(unmatched_reads.len(), 0);

let output_path = output_dir.join("Sample0000.R1.fq.gz");
let fq_reads = read_fastq(&output_path);

assert_eq!(fq_reads.len(), 1);
assert_equal(
&fq_reads[0],
&OwnedRecord {
head: b"ex_0 1:N:0:NNNNNNN".to_vec(),
seq: "A".repeat(100).as_bytes().to_vec(),
qual: ";".repeat(100).as_bytes().to_vec(),
},
);
}

#[test]
fn test_demux_with_ns_in_barcode() {
let tmp = TempDir::new().unwrap();
let read_structures = vec![ReadStructure::from_str("7B+T").unwrap()];
let s1_barcode = "NNAAAAA";
let s2_barcode = "NNCCCCC";
let sample_metadata = metadata_file(&tmp, &[s1_barcode, s2_barcode]);
let input_files = vec![fastq_file(
&tmp,
"ex",
"ex",
&[
&("ANAAAAA".to_owned() + &"A".repeat(5)),
&("ANCCCCC".to_owned() + &"C".repeat(5)),
&("NNNAAAA".to_owned() + &"T".repeat(5)),
],
)];

let output_dir = tmp.path().to_path_buf().join("output");

let demux_inputs = Demux {
inputs: input_files,
read_structures,
sample_metadata,
output_types: vec!['T'],
output: output_dir.clone(),
unmatched_prefix: "unmatched".to_owned(),
max_mismatches: 0,
min_mismatch_delta: 0,
threads: 5,
compression_level: 5,
skip_reasons: vec![],
};
demux_inputs.execute().unwrap();

let output_path = output_dir.join("Sample0000.R1.fq.gz");
let fq_reads = read_fastq(&output_path);
assert_eq!(fq_reads.len(), 1);
assert_equal(
&fq_reads[0],
&OwnedRecord {
head: b"ex_0 1:N:0:ANAAAAA".to_vec(),
seq: "A".repeat(5).as_bytes().to_vec(),
qual: ";".repeat(5).as_bytes().to_vec(),
},
);

let output_path = output_dir.join("Sample0001.R1.fq.gz");
let fq_reads = read_fastq(&output_path);
assert_eq!(fq_reads.len(), 1);
assert_equal(
&fq_reads[0],
&OwnedRecord {
head: b"ex_1 1:N:0:ANCCCCC".to_vec(),
seq: "C".repeat(5).as_bytes().to_vec(),
qual: ";".repeat(5).as_bytes().to_vec(),
},
);

// Should not match since it has 3 no calls, and barcodes have at maximum 1 no-call
let unmatched_path = output_dir.join("unmatched.R1.fq.gz");
let unmatched_reads = read_fastq(&unmatched_path);
assert_eq!(unmatched_reads.len(), 1);
assert_equal(
&unmatched_reads[0],
&OwnedRecord {
head: b"ex_2 1:N:0:NNNAAAA".to_vec(),
seq: "T".repeat(5).as_bytes().to_vec(),
qual: ";".repeat(5).as_bytes().to_vec(),
},
);
}

#[test]
fn test_demux_paired_reads_with_in_line_sample_barcodes() {
let tmp = TempDir::new().unwrap();
Expand Down
19 changes: 17 additions & 2 deletions src/lib/barcode_matching.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ pub struct BarcodeMatcher {
/// Note - this is to be replaced by a sample struct in task 3. For now we're keeping things
/// very simple.
sample_barcodes: Vec<BString>,
/// The maxium number of Ns in any barcode in set of sample barcodes
max_ns_in_barcodes: usize,
/// The maximum mismatches to match a sample barcode.
max_mismatches: u8,
/// The minimum difference between number of mismatches in the best and second best barcodes
Expand Down Expand Up @@ -56,12 +58,20 @@ impl BarcodeMatcher {
"Sample barcode cannot be empty string"
);

let mut max_ns_in_barcodes = 0;
let modified_sample_barcodes = sample_barcodes
.iter()
.map(|barcode| BString::from(barcode.to_ascii_uppercase()))
.map(|barcode| {
let barcode = BString::from(barcode.to_ascii_uppercase());
let num_ns = barcode.iter().filter(|&b| byte_is_nocall(*b)).count();
max_ns_in_barcodes = max_ns_in_barcodes.max(num_ns);
barcode
})
.collect::<Vec<_>>();

Self {
sample_barcodes: modified_sample_barcodes,
max_ns_in_barcodes,
max_mismatches,
min_mismatch_delta,
use_cache,
Expand Down Expand Up @@ -132,7 +142,7 @@ impl BarcodeMatcher {
return None;
}
let num_no_calls = read_bases.iter().filter(|&&b| byte_is_nocall(b)).count();
if num_no_calls > self.max_mismatches as usize {
if num_no_calls > (self.max_mismatches as usize) + self.max_ns_in_barcodes {
None
} else if self.use_cache {
if let Some(cached_match) = self.cache.get(read_bases) {
Expand Down Expand Up @@ -207,6 +217,11 @@ mod tests {
assert_eq!(BarcodeMatcher::count_mismatches("GATTACA".as_bytes(), "GANNACA".as_bytes()), 0,);
}

#[test]
fn all_ns_barcode_have_no_mismatches() {
assert_eq!(BarcodeMatcher::count_mismatches("GANNACA".as_bytes(), "NNNNNNN".as_bytes()), 0,);
}

#[test]
fn find_two_mismatches() {
assert_eq!(BarcodeMatcher::count_mismatches("GATTACA".as_bytes(), "GACCACA".as_bytes()), 2,);
Expand Down

0 comments on commit dd803fc

Please sign in to comment.