-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
6 changed files
with
244 additions
and
154 deletions.
There are no files selected for viewing
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,144 @@ | ||
use std::cmp::min; | ||
use std::collections::HashMap; | ||
use std::fs::File; | ||
use std::io::{self, BufReader, Read, Seek, SeekFrom}; | ||
|
||
pub fn distance( | ||
file_path1: &str, | ||
file_path2: &str, | ||
node_sizes1: HashMap<String, usize>, | ||
node_sizes2: HashMap<String, usize>, | ||
path_positions1: HashMap<String, u64>, | ||
path_positions2: HashMap<String, u64>, | ||
path_lengths1: HashMap<String, u64>, | ||
path_lengths2: HashMap<String, u64>, | ||
) -> io::Result<()> { | ||
/* | ||
Given two GFA files and their associated node sizes and path positions, this function computes the distance between the two graphs. | ||
- file_path1: the path to the first GFA file | ||
- file_path2: the path to the second GFA file | ||
- node_sizes1: a HashMap with the node names as keys and the node sizes as values for the first GFA file | ||
- node_sizes2: a HashMap with the node names as keys and the node sizes as values for the second GFA file | ||
- path_positions1: a HashMap with the path names as keys and the offset of the path description for the first GFA file | ||
- path_positions2: a HashMap with the path names as keys and the offset of the path description for the second GFA file | ||
- path_lengths1: a HashMap with the path names as keys and the number of nodes in the path for the first GFA file | ||
- path_lengths2: a HashMap with the path names as keys and the number of nodes in the path for the second GFA file | ||
Writes to standard output the operations (merges and splits) needed to transform the first graph into the second graph | ||
*/ | ||
let intersection: Vec<&String> = path_positions1 | ||
.keys() | ||
.filter(|&k| path_positions2.contains_key(k)) | ||
.collect(); | ||
|
||
println!("# Intersection of paths: {:?}", intersection); | ||
|
||
let mut equivalences_count = 0; | ||
let mut merges_count = 0; | ||
let mut splits_count = 0; | ||
|
||
println!("# Path name\tPosition\tOperation\tNodeA\tNodeB\tBreakpointA\tBreakpointB"); | ||
for path_name in intersection { | ||
// We get the positions of the path in the two files | ||
let pos1 = path_positions1[path_name]; | ||
let pos2 = path_positions2[path_name]; | ||
|
||
// We open the two files | ||
let mut file1 = BufReader::new(File::open(file_path1)?); | ||
let mut file2 = BufReader::new(File::open(file_path2)?); | ||
|
||
// We seek to the position of the path in the two files | ||
file1.seek(SeekFrom::Start(pos1))?; | ||
file2.seek(SeekFrom::Start(pos2))?; | ||
|
||
// Buffer to read the two files | ||
let mut buffer1 = [0; 1]; | ||
let mut buffer2 = [0; 1]; | ||
|
||
// Node names | ||
let mut node1 = String::new(); | ||
let mut node2 = String::new(); | ||
|
||
let mut breakpoint_a = 0; | ||
let mut breakpoint_b = 0; | ||
|
||
let mut position = 0; | ||
|
||
let max_length1 = path_lengths1[path_name.as_str()] as usize; | ||
let max_length2 = path_lengths2[path_name.as_str()] as usize; | ||
|
||
if max_length1 != max_length2 { | ||
// The two paths have different lengths, we cannot compare them | ||
println!( | ||
"# Error: the two paths representing {} have different lengths: {} and {}.", | ||
path_name, max_length1, max_length2 | ||
); | ||
continue; | ||
} else { | ||
while position < max_length1 { | ||
if breakpoint_a == breakpoint_b { | ||
// The two positions in the two paths are aligned | ||
equivalences_count += 1; | ||
// No edition operation is needed | ||
// We must read the two next nodes in the two files | ||
node1 = read_next_node(&mut file1, &mut buffer1); | ||
node2 = read_next_node(&mut file2, &mut buffer2); | ||
// Store their associated sizes in breakpoint_a and breakpoint_b | ||
breakpoint_a += node_sizes1.get(&node1).unwrap().clone(); | ||
breakpoint_b += node_sizes2.get(&node2).unwrap().clone(); | ||
} else if breakpoint_a < breakpoint_b { | ||
// The node in the first path is missing in the second path | ||
// It is a split operation | ||
splits_count += 1; | ||
// The two positions in the two paths are not aligned | ||
println!( | ||
"{}\t{}\tS\t{}\t{}\t{}\t{}", | ||
path_name, position, node1, node2, breakpoint_a, breakpoint_b | ||
); | ||
node1 = read_next_node(&mut file1, &mut buffer1); | ||
breakpoint_a += node_sizes1.get(&node1).unwrap().clone(); | ||
} else if breakpoint_a > breakpoint_b { | ||
// The node in the second path is missing in the first path | ||
// It is a merge operation | ||
merges_count += 1; | ||
// The two positions in the two paths are not aligned | ||
println!( | ||
"{}\t{}\tM\t{}\t{}\t{}\t{}", | ||
path_name, position, node1, node2, breakpoint_a, breakpoint_b | ||
); | ||
node2 = read_next_node(&mut file2, &mut buffer2); | ||
breakpoint_b += node_sizes2.get(&node2).unwrap().clone(); | ||
} | ||
|
||
// We update the position in the two paths | ||
position = min(breakpoint_a, breakpoint_b); | ||
} | ||
} | ||
} | ||
println!( | ||
"# Distance: {} (E={}, S={}, M={}).", | ||
splits_count + merges_count, | ||
equivalences_count, | ||
splits_count, | ||
merges_count | ||
); | ||
Ok(()) | ||
} | ||
|
||
fn read_next_node(file: &mut BufReader<File>, buffer: &mut [u8; 1]) -> String { | ||
/* | ||
* Read the next node in the file, until a comma or a newline is found | ||
* Return the node name and a boolean indicating if the end of the line has been reached | ||
* file: the file to read | ||
* buffer: a buffer to read the file | ||
*/ | ||
let mut node = String::new(); | ||
while file.read(buffer).unwrap() > 0 { | ||
if buffer[0] == b'+' || buffer[0] == b'-' { | ||
break; | ||
} | ||
if buffer[0] != b',' { | ||
node.push(buffer[0] as char); | ||
} | ||
} | ||
node | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
use std::collections::HashMap; | ||
use std::fs::File; | ||
use std::io::{self, BufRead, BufReader, Seek}; | ||
|
||
pub fn index_gfa( | ||
file_path: &str, | ||
) -> io::Result<( | ||
HashMap<String, usize>, | ||
HashMap<String, u64>, | ||
HashMap<String, u64>, | ||
)> { | ||
/* | ||
Given a file path, this function reads the GFA file and returns two HashMaps: | ||
- seq_lengths: a HashMap with the node names as keys and the sequence lengths as values | ||
- path_positions: a HashMap with the path names as keys and the offset of the path description as values | ||
- path_lengths: a HashMap with the path names as keys and the number of nodes in the path as values | ||
*/ | ||
let file = File::open(file_path)?; | ||
let mut reader = BufReader::new(file); | ||
|
||
let mut seq_lengths = HashMap::new(); | ||
let mut path_positions = HashMap::new(); | ||
let mut path_lengths = HashMap::new(); | ||
|
||
let mut line = String::new(); | ||
while reader.read_line(&mut line)? > 0 { | ||
let columns: Vec<&str> = line.split('\t').collect(); | ||
if let Some(first_char) = line.chars().next() { | ||
if first_char == 'S' { | ||
// In the case of an S-line, we store the node name and the sequence length | ||
let node_name = String::from(columns[1].trim()); | ||
let sequence_length = columns[2].trim().len(); | ||
seq_lengths.insert(node_name, sequence_length); | ||
} | ||
if first_char == 'P' { | ||
let mut path_length = 0; | ||
// In the case of a P-line, we store the path name and the offset of the path description | ||
// When processing paths, we can match paths in the path_positions HashMap | ||
// Then start reading the file from there and go with a buffer to read node by node the path | ||
let path_name = String::from(columns[1]); | ||
let offset = reader.seek(io::SeekFrom::Current(0))? | ||
- (line.len() as u64 - columns[0].len() as u64 - columns[1].len() as u64 - 2); | ||
path_positions.insert(path_name, offset); | ||
let path_name = String::from(columns[1]); | ||
// let path_length = columns[2].split(',').count(); | ||
|
||
let node_list: Vec<String> = columns[2] | ||
.split(',') | ||
.map(|s| s[..s.len() - 1].to_string()) | ||
.collect(); | ||
for node in node_list { | ||
let sequence_length = seq_lengths.get(&node).unwrap(); | ||
path_length += sequence_length; | ||
} | ||
|
||
path_lengths.insert(path_name, path_length as u64); | ||
} | ||
} | ||
line.clear(); // Clear the line buffer for the next read | ||
} | ||
|
||
Ok((seq_lengths, path_positions, path_lengths)) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,54 +1,50 @@ | ||
mod calculate_distance; | ||
mod compute_boundaries; | ||
mod parse_gfa_file; | ||
mod compute_distance; | ||
mod index_gfa_file; | ||
use std::env; | ||
|
||
fn main() { | ||
/* | ||
Compare two GFA files and compute the distance between them | ||
Two arguments must be given in the command line: | ||
- The path to the first GFA file | ||
- The path to the second GFA file | ||
It will print to standard output the differences between the two graphs | ||
*/ | ||
// Get the file path from command line arguments | ||
let args: Vec<String> = env::args().collect(); | ||
let file_path_a = &args[1]; | ||
let file_path_b = &args[2]; | ||
|
||
// Parse first graph | ||
let (seq_lengths_a, node_lists_a) = match parse_gfa_file::read_gfa_file(file_path_a) { | ||
Ok(result) => result, | ||
Err(error) => { | ||
eprintln!("Failed to read GFA file: {}", error); | ||
return; | ||
} | ||
}; | ||
let (seq_lengths_a, path_descriptors_a, path_lengths_a) = | ||
match index_gfa_file::index_gfa(file_path_a) { | ||
Ok(result) => result, | ||
Err(error) => { | ||
eprintln!("Failed to read GFA file: {}", error); | ||
return; | ||
} | ||
}; | ||
|
||
// Parse second graph | ||
let (seq_lengths_b, node_lists_b) = match parse_gfa_file::read_gfa_file(file_path_b) { | ||
Ok(result) => result, | ||
Err(error) => { | ||
eprintln!("Failed to read GFA file: {}", error); | ||
return; | ||
} | ||
}; | ||
let (seq_lengths_b, path_descriptors_b, path_lengths_b) = | ||
match index_gfa_file::index_gfa(file_path_b) { | ||
Ok(result) => result, | ||
Err(error) => { | ||
eprintln!("Failed to read GFA file: {}", error); | ||
return; | ||
} | ||
}; | ||
|
||
// Compute the intersection of node_list_a and node_list_b keys | ||
let common_keys: Vec<&String> = node_lists_a | ||
.keys() | ||
.filter(|key| node_lists_b.contains_key(*key)) | ||
.collect(); | ||
// Iterate on common_keys | ||
println!("# Path name\tPosition\tOperation\tNodeA\tNodeB"); | ||
for key in common_keys { | ||
let path_a_descriptor = compute_boundaries::compute_cumulative_sum( | ||
&node_lists_a.get(key).unwrap(), | ||
&seq_lengths_a, | ||
); | ||
let path_b_descriptor = compute_boundaries::compute_cumulative_sum( | ||
&node_lists_b.get(key).unwrap(), | ||
&seq_lengths_b, | ||
); | ||
calculate_distance::distance( | ||
key, | ||
path_a_descriptor, | ||
path_b_descriptor, | ||
&node_lists_a.get(key).unwrap(), | ||
&node_lists_b.get(key).unwrap(), | ||
); | ||
} | ||
// Compute the distance between the two graphs | ||
compute_distance::distance( | ||
file_path_a, | ||
file_path_b, | ||
seq_lengths_a, | ||
seq_lengths_b, | ||
path_descriptors_a, | ||
path_descriptors_b, | ||
path_lengths_a, | ||
path_lengths_b, | ||
) | ||
.unwrap(); | ||
} |
Oops, something went wrong.