Skip to content

Commit

Permalink
Update source code
Browse files Browse the repository at this point in the history
  • Loading branch information
dubssieg committed Oct 21, 2024
1 parent 82c3356 commit ce5075d
Show file tree
Hide file tree
Showing 6 changed files with 244 additions and 154 deletions.
36 changes: 0 additions & 36 deletions src/calculate_distance.rs

This file was deleted.

21 changes: 0 additions & 21 deletions src/compute_boundaries.rs

This file was deleted.

144 changes: 144 additions & 0 deletions src/compute_distance.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
use std::cmp::min;
use std::collections::HashMap;
use std::fs::File;
use std::io::{self, BufReader, Read, Seek, SeekFrom};

pub fn distance(
file_path1: &str,
file_path2: &str,
node_sizes1: HashMap<String, usize>,
node_sizes2: HashMap<String, usize>,
path_positions1: HashMap<String, u64>,
path_positions2: HashMap<String, u64>,
path_lengths1: HashMap<String, u64>,
path_lengths2: HashMap<String, u64>,
) -> io::Result<()> {
/*
Given two GFA files and their associated node sizes and path positions, this function computes the distance between the two graphs.
- file_path1: the path to the first GFA file
- file_path2: the path to the second GFA file
- node_sizes1: a HashMap with the node names as keys and the node sizes as values for the first GFA file
- node_sizes2: a HashMap with the node names as keys and the node sizes as values for the second GFA file
- path_positions1: a HashMap with the path names as keys and the offset of the path description for the first GFA file
- path_positions2: a HashMap with the path names as keys and the offset of the path description for the second GFA file
- path_lengths1: a HashMap with the path names as keys and the number of nodes in the path for the first GFA file
- path_lengths2: a HashMap with the path names as keys and the number of nodes in the path for the second GFA file
Writes to standard output the operations (merges and splits) needed to transform the first graph into the second graph
*/
let intersection: Vec<&String> = path_positions1
.keys()
.filter(|&k| path_positions2.contains_key(k))
.collect();

println!("# Intersection of paths: {:?}", intersection);

let mut equivalences_count = 0;
let mut merges_count = 0;
let mut splits_count = 0;

println!("# Path name\tPosition\tOperation\tNodeA\tNodeB\tBreakpointA\tBreakpointB");
for path_name in intersection {
// We get the positions of the path in the two files
let pos1 = path_positions1[path_name];
let pos2 = path_positions2[path_name];

// We open the two files
let mut file1 = BufReader::new(File::open(file_path1)?);
let mut file2 = BufReader::new(File::open(file_path2)?);

// We seek to the position of the path in the two files
file1.seek(SeekFrom::Start(pos1))?;
file2.seek(SeekFrom::Start(pos2))?;

// Buffer to read the two files
let mut buffer1 = [0; 1];
let mut buffer2 = [0; 1];

// Node names
let mut node1 = String::new();
let mut node2 = String::new();

let mut breakpoint_a = 0;
let mut breakpoint_b = 0;

let mut position = 0;

let max_length1 = path_lengths1[path_name.as_str()] as usize;
let max_length2 = path_lengths2[path_name.as_str()] as usize;

if max_length1 != max_length2 {
// The two paths have different lengths, we cannot compare them
println!(
"# Error: the two paths representing {} have different lengths: {} and {}.",
path_name, max_length1, max_length2
);
continue;
} else {
while position < max_length1 {
if breakpoint_a == breakpoint_b {
// The two positions in the two paths are aligned
equivalences_count += 1;
// No edition operation is needed
// We must read the two next nodes in the two files
node1 = read_next_node(&mut file1, &mut buffer1);
node2 = read_next_node(&mut file2, &mut buffer2);
// Store their associated sizes in breakpoint_a and breakpoint_b
breakpoint_a += node_sizes1.get(&node1).unwrap().clone();
breakpoint_b += node_sizes2.get(&node2).unwrap().clone();
} else if breakpoint_a < breakpoint_b {
// The node in the first path is missing in the second path
// It is a split operation
splits_count += 1;
// The two positions in the two paths are not aligned
println!(
"{}\t{}\tS\t{}\t{}\t{}\t{}",
path_name, position, node1, node2, breakpoint_a, breakpoint_b
);
node1 = read_next_node(&mut file1, &mut buffer1);
breakpoint_a += node_sizes1.get(&node1).unwrap().clone();
} else if breakpoint_a > breakpoint_b {
// The node in the second path is missing in the first path
// It is a merge operation
merges_count += 1;
// The two positions in the two paths are not aligned
println!(
"{}\t{}\tM\t{}\t{}\t{}\t{}",
path_name, position, node1, node2, breakpoint_a, breakpoint_b
);
node2 = read_next_node(&mut file2, &mut buffer2);
breakpoint_b += node_sizes2.get(&node2).unwrap().clone();
}

// We update the position in the two paths
position = min(breakpoint_a, breakpoint_b);
}
}
}
println!(
"# Distance: {} (E={}, S={}, M={}).",
splits_count + merges_count,
equivalences_count,
splits_count,
merges_count
);
Ok(())
}

fn read_next_node(file: &mut BufReader<File>, buffer: &mut [u8; 1]) -> String {
/*
* Read the next node in the file, until a comma or a newline is found
* Return the node name and a boolean indicating if the end of the line has been reached
* file: the file to read
* buffer: a buffer to read the file
*/
let mut node = String::new();
while file.read(buffer).unwrap() > 0 {
if buffer[0] == b'+' || buffer[0] == b'-' {
break;
}
if buffer[0] != b',' {
node.push(buffer[0] as char);
}
}
node
}
63 changes: 63 additions & 0 deletions src/index_gfa_file.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
use std::collections::HashMap;
use std::fs::File;
use std::io::{self, BufRead, BufReader, Seek};

pub fn index_gfa(
file_path: &str,
) -> io::Result<(
HashMap<String, usize>,
HashMap<String, u64>,
HashMap<String, u64>,
)> {
/*
Given a file path, this function reads the GFA file and returns two HashMaps:
- seq_lengths: a HashMap with the node names as keys and the sequence lengths as values
- path_positions: a HashMap with the path names as keys and the offset of the path description as values
- path_lengths: a HashMap with the path names as keys and the number of nodes in the path as values
*/
let file = File::open(file_path)?;
let mut reader = BufReader::new(file);

let mut seq_lengths = HashMap::new();
let mut path_positions = HashMap::new();
let mut path_lengths = HashMap::new();

let mut line = String::new();
while reader.read_line(&mut line)? > 0 {
let columns: Vec<&str> = line.split('\t').collect();
if let Some(first_char) = line.chars().next() {
if first_char == 'S' {
// In the case of an S-line, we store the node name and the sequence length
let node_name = String::from(columns[1].trim());
let sequence_length = columns[2].trim().len();
seq_lengths.insert(node_name, sequence_length);
}
if first_char == 'P' {
let mut path_length = 0;
// In the case of a P-line, we store the path name and the offset of the path description
// When processing paths, we can match paths in the path_positions HashMap
// Then start reading the file from there and go with a buffer to read node by node the path
let path_name = String::from(columns[1]);
let offset = reader.seek(io::SeekFrom::Current(0))?
- (line.len() as u64 - columns[0].len() as u64 - columns[1].len() as u64 - 2);
path_positions.insert(path_name, offset);
let path_name = String::from(columns[1]);
// let path_length = columns[2].split(',').count();

let node_list: Vec<String> = columns[2]
.split(',')
.map(|s| s[..s.len() - 1].to_string())
.collect();
for node in node_list {
let sequence_length = seq_lengths.get(&node).unwrap();
path_length += sequence_length;
}

path_lengths.insert(path_name, path_length as u64);
}
}
line.clear(); // Clear the line buffer for the next read
}

Ok((seq_lengths, path_positions, path_lengths))
}
78 changes: 37 additions & 41 deletions src/main.rs
Original file line number Diff line number Diff line change
@@ -1,54 +1,50 @@
mod calculate_distance;
mod compute_boundaries;
mod parse_gfa_file;
mod compute_distance;
mod index_gfa_file;
use std::env;

fn main() {
/*
Compare two GFA files and compute the distance between them
Two arguments must be given in the command line:
- The path to the first GFA file
- The path to the second GFA file
It will print to standard output the differences between the two graphs
*/
// Get the file path from command line arguments
let args: Vec<String> = env::args().collect();
let file_path_a = &args[1];
let file_path_b = &args[2];

// Parse first graph
let (seq_lengths_a, node_lists_a) = match parse_gfa_file::read_gfa_file(file_path_a) {
Ok(result) => result,
Err(error) => {
eprintln!("Failed to read GFA file: {}", error);
return;
}
};
let (seq_lengths_a, path_descriptors_a, path_lengths_a) =
match index_gfa_file::index_gfa(file_path_a) {
Ok(result) => result,
Err(error) => {
eprintln!("Failed to read GFA file: {}", error);
return;
}
};

// Parse second graph
let (seq_lengths_b, node_lists_b) = match parse_gfa_file::read_gfa_file(file_path_b) {
Ok(result) => result,
Err(error) => {
eprintln!("Failed to read GFA file: {}", error);
return;
}
};
let (seq_lengths_b, path_descriptors_b, path_lengths_b) =
match index_gfa_file::index_gfa(file_path_b) {
Ok(result) => result,
Err(error) => {
eprintln!("Failed to read GFA file: {}", error);
return;
}
};

// Compute the intersection of node_list_a and node_list_b keys
let common_keys: Vec<&String> = node_lists_a
.keys()
.filter(|key| node_lists_b.contains_key(*key))
.collect();
// Iterate on common_keys
println!("# Path name\tPosition\tOperation\tNodeA\tNodeB");
for key in common_keys {
let path_a_descriptor = compute_boundaries::compute_cumulative_sum(
&node_lists_a.get(key).unwrap(),
&seq_lengths_a,
);
let path_b_descriptor = compute_boundaries::compute_cumulative_sum(
&node_lists_b.get(key).unwrap(),
&seq_lengths_b,
);
calculate_distance::distance(
key,
path_a_descriptor,
path_b_descriptor,
&node_lists_a.get(key).unwrap(),
&node_lists_b.get(key).unwrap(),
);
}
// Compute the distance between the two graphs
compute_distance::distance(
file_path_a,
file_path_b,
seq_lengths_a,
seq_lengths_b,
path_descriptors_a,
path_descriptors_b,
path_lengths_a,
path_lengths_b,
)
.unwrap();
}
Loading

0 comments on commit ce5075d

Please sign in to comment.