Skip to content

Commit

Permalink
Merge features in tree, so we have 1: fewer branches, 2: parity with …
Browse files Browse the repository at this point in the history
…the original htseq-count
  • Loading branch information
CedricHermansBIT committed Feb 15, 2024
1 parent f10899e commit c7bc919
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 11 deletions.
4 changes: 4 additions & 0 deletions src/feature.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,4 +41,8 @@ impl Feature {
pub fn strand(&self) -> char {
self.strand
}

pub fn set_end(&mut self, end: i32) {
self.end = end;
}
}
12 changes: 10 additions & 2 deletions src/interval.rs
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,16 @@ impl Interval {
self.start < end && self.end-1 > start
}

// as_ref() returns a reference to the Option's value
pub fn name(&self) -> Option<&str> {
self.data.as_ref().map(|f| f.name())
}

pub fn set_end(&mut self, end: i32) {
self.end = end;
self.data.as_mut().map(|f| f.set_end(end));
}

}

impl PartialOrd for Interval {
Expand Down Expand Up @@ -139,5 +149,3 @@ impl std::fmt::Display for Interval {
}
}
}


37 changes: 35 additions & 2 deletions src/intervaltree.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,39 @@ impl IntervalTree {
for interval in intervals {
unique_intervals.insert(interval);
}

// merge overlapping intervals with same name
use std::collections::HashMap;

// Create a HashMap where the keys are the names of the intervals and the values are vectors of intervals with that name.
let mut grouped_intervals: HashMap<String, Vec<Interval>> = HashMap::new();
for interval in &unique_intervals {
grouped_intervals.entry(interval.name().unwrap().to_string()).or_insert(Vec::new()).push(interval.clone());
}

// For each name in the HashMap, sort the intervals by their start position and then merge the overlapping intervals.
for intervals in grouped_intervals.values_mut() {
intervals.sort_by(|a, b| a.start.cmp(&b.start));
let mut i = 0;
while i < intervals.len() - 1 {
if intervals[i].end >= intervals[i+1].start && intervals[i+1].end >= intervals[i].end {
intervals[i].end = intervals[i+1].end;
intervals.remove(i+1);
} else {
i += 1;
}
}
}

// Create a new HashSet of intervals from the merged intervals.
let mut unique_intervals: HashSet<Interval> = HashSet::new();
for intervals in grouped_intervals.values() {
for interval in intervals {
unique_intervals.insert(interval.clone());
}
}


// convert to vector
//let unique_intervals: Vec<Interval> = unique_intervals.into_iter().collect();
//eprintln!("filtered intervals");
Expand Down Expand Up @@ -91,6 +124,7 @@ impl IntervalTree {
}
}


pub fn add(&mut self, interval: Interval) {
if interval.is_null() {
panic!("IntervalTree: Null Interval objects are not allowed in IntervalTree");
Expand Down Expand Up @@ -300,8 +334,7 @@ impl IntervalTree {
}

pub fn contains(&self, start: i32, end: i32) -> Vec<&Interval> {
let mut result = self.overlap(start, end);
// TODO: merge overlapping intervals with same name into one interval
let mut result: Vec<&Interval> = self.overlap(start, end);
result.retain(|x| x.start <= start && x.end >= end);
result
}
Expand Down
14 changes: 7 additions & 7 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,6 @@ struct Args {
_m: String,

// Stranded
// TODO: implement stranded mode
#[structopt(
short = "s",
long = "stranded",
Expand Down Expand Up @@ -363,17 +362,17 @@ fn prepare_count_hashmap(gtf: &Vec<Option<IntervalTree>>) -> HashMap<String, f32
}

// Add the special keys
counts.insert("__no_feature".to_string(), 0 as f32);
counts.insert("__ambiguous".to_string(), 0 as f32);
counts.insert("__not_aligned".to_string(), 0 as f32);
counts.insert("__too_low_aQual".to_string(), 0 as f32);
counts.insert("__alignment_not_unique".to_string(), 0 as f32);
counts.insert("__no_feature".to_string(), 0f32);
counts.insert("__ambiguous".to_string(), 0f32);
counts.insert("__not_aligned".to_string(), 0f32);
counts.insert("__too_low_aQual".to_string(), 0f32);
counts.insert("__alignment_not_unique".to_string(), 0f32);
counts
}

fn read_gtf(file_path: &str, feature_type_filter: &str, ref_names_to_id: &HashMap<String, i32>) -> Vec<Option<IntervalTree>> {
let mut map: HashMap<i32, Vec<Interval>> = HashMap::new();
let file = File::open(file_path).expect("Kan het bestand niet openen");
let file = File::open(file_path).expect("Could not open this file");
let mut reader = BufReader::new(file);
let mut counter = 0;
let mut line = String::new();
Expand Down Expand Up @@ -691,6 +690,7 @@ fn process_intersection_nonempty_read(_features: &IntervalTree, _start_pos: i32,
fn process_intersection_strict_read(features: &IntervalTree, start_pos: i32, end_pos: i32, strand: bool, overlapping_features: &mut Vec<Feature>, args: &Args) {
//todo!("process_partial_read for intersection-strict");
let new_contained = features.contains(start_pos, end_pos);
// change new_contained to a Vec<&Interval>
let strand = if strand { '-' } else { '+' };
// add all contained features to the list
add_stranded_features(new_contained, strand, overlapping_features, args);
Expand Down

0 comments on commit c7bc919

Please sign in to comment.