Update source code

dubssieg · Oct 21, 2024 · ce5075d · ce5075d
1 parent 82c3356
commit ce5075d
Show file tree

Hide file tree

Showing 6 changed files with 244 additions and 154 deletions.
diff --git a/src/calculate_distance.rs b/src/calculate_distance.rs
diff --git a/src/compute_boundaries.rs b/src/compute_boundaries.rs
diff --git a/src/compute_distance.rs b/src/compute_distance.rs
@@ -0,0 +1,144 @@
+use std::cmp::min;
+use std::collections::HashMap;
+use std::fs::File;
+use std::io::{self, BufReader, Read, Seek, SeekFrom};
+
+pub fn distance(
+    file_path1: &str,
+    file_path2: &str,
+    node_sizes1: HashMap<String, usize>,
+    node_sizes2: HashMap<String, usize>,
+    path_positions1: HashMap<String, u64>,
+    path_positions2: HashMap<String, u64>,
+    path_lengths1: HashMap<String, u64>,
+    path_lengths2: HashMap<String, u64>,
+) -> io::Result<()> {
+    /*
+    Given two GFA files and their associated node sizes and path positions, this function computes the distance between the two graphs.
+    - file_path1: the path to the first GFA file
+    - file_path2: the path to the second GFA file
+    - node_sizes1: a HashMap with the node names as keys and the node sizes as values for the first GFA file
+    - node_sizes2: a HashMap with the node names as keys and the node sizes as values for the second GFA file
+    - path_positions1: a HashMap with the path names as keys and the offset of the path description for the first GFA file
+    - path_positions2: a HashMap with the path names as keys and the offset of the path description for the second GFA file
+    - path_lengths1: a HashMap with the path names as keys and the number of nodes in the path for the first GFA file
+    - path_lengths2: a HashMap with the path names as keys and the number of nodes in the path for the second GFA file
+    Writes to standard output the operations (merges and splits) needed to transform the first graph into the second graph
+    */
+    let intersection: Vec<&String> = path_positions1
+        .keys()
+        .filter(|&k| path_positions2.contains_key(k))
+        .collect();
+
+    println!("# Intersection of paths: {:?}", intersection);
+
+    let mut equivalences_count = 0;
+    let mut merges_count = 0;
+    let mut splits_count = 0;
+
+    println!("# Path name\tPosition\tOperation\tNodeA\tNodeB\tBreakpointA\tBreakpointB");
+    for path_name in intersection {
+        // We get the positions of the path in the two files
+        let pos1 = path_positions1[path_name];
+        let pos2 = path_positions2[path_name];
+
+        // We open the two files
+        let mut file1 = BufReader::new(File::open(file_path1)?);
+        let mut file2 = BufReader::new(File::open(file_path2)?);
+
+        // We seek to the position of the path in the two files
+        file1.seek(SeekFrom::Start(pos1))?;
+        file2.seek(SeekFrom::Start(pos2))?;
+
+        // Buffer to read the two files
+        let mut buffer1 = [0; 1];
+        let mut buffer2 = [0; 1];
+
+        // Node names
+        let mut node1 = String::new();
+        let mut node2 = String::new();
+
+        let mut breakpoint_a = 0;
+        let mut breakpoint_b = 0;
+
+        let mut position = 0;
+
+        let max_length1 = path_lengths1[path_name.as_str()] as usize;
+        let max_length2 = path_lengths2[path_name.as_str()] as usize;
+
+        if max_length1 != max_length2 {
+            // The two paths have different lengths, we cannot compare them
+            println!(
+                "# Error: the two paths representing {} have different lengths: {} and {}.",
+                path_name, max_length1, max_length2
+            );
+            continue;
+        } else {
+            while position < max_length1 {
+                if breakpoint_a == breakpoint_b {
+                    // The two positions in the two paths are aligned
+                    equivalences_count += 1;
+                    // No edition operation is needed
+                    // We must read the two next nodes in the two files
+                    node1 = read_next_node(&mut file1, &mut buffer1);
+                    node2 = read_next_node(&mut file2, &mut buffer2);
+                    // Store their associated sizes in breakpoint_a and breakpoint_b
+                    breakpoint_a += node_sizes1.get(&node1).unwrap().clone();
+                    breakpoint_b += node_sizes2.get(&node2).unwrap().clone();
+                } else if breakpoint_a < breakpoint_b {
+                    // The node in the first path is missing in the second path
+                    // It is a split operation
+                    splits_count += 1;
+                    // The two positions in the two paths are not aligned
+                    println!(
+                        "{}\t{}\tS\t{}\t{}\t{}\t{}",
+                        path_name, position, node1, node2, breakpoint_a, breakpoint_b
+                    );
+                    node1 = read_next_node(&mut file1, &mut buffer1);
+                    breakpoint_a += node_sizes1.get(&node1).unwrap().clone();
+                } else if breakpoint_a > breakpoint_b {
+                    // The node in the second path is missing in the first path
+                    // It is a merge operation
+                    merges_count += 1;
+                    // The two positions in the two paths are not aligned
+                    println!(
+                        "{}\t{}\tM\t{}\t{}\t{}\t{}",
+                        path_name, position, node1, node2, breakpoint_a, breakpoint_b
+                    );
+                    node2 = read_next_node(&mut file2, &mut buffer2);
+                    breakpoint_b += node_sizes2.get(&node2).unwrap().clone();
+                }
+
+                // We update the position in the two paths
+                position = min(breakpoint_a, breakpoint_b);
+            }
+        }
+    }
+    println!(
+        "# Distance: {} (E={}, S={}, M={}).",
+        splits_count + merges_count,
+        equivalences_count,
+        splits_count,
+        merges_count
+    );
+    Ok(())
+}
+
+fn read_next_node(file: &mut BufReader<File>, buffer: &mut [u8; 1]) -> String {
+    /*
+     * Read the next node in the file, until a comma or a newline is found
+     * Return the node name and a boolean indicating if the end of the line has been reached
+     * file: the file to read
+     * buffer: a buffer to read the file
+     */
+    let mut node = String::new();
+    while file.read(buffer).unwrap() > 0 {
+        if buffer[0] == b'+' || buffer[0] == b'-' {
+            break;
+        }
+        if buffer[0] != b',' {
+            node.push(buffer[0] as char);
+        }
+    }
+    node
+}
diff --git a/src/index_gfa_file.rs b/src/index_gfa_file.rs
@@ -0,0 +1,63 @@
+use std::collections::HashMap;
+use std::fs::File;
+use std::io::{self, BufRead, BufReader, Seek};
+
+pub fn index_gfa(
+    file_path: &str,
+) -> io::Result<(
+    HashMap<String, usize>,
+    HashMap<String, u64>,
+    HashMap<String, u64>,
+)> {
+    /*
+    Given a file path, this function reads the GFA file and returns two HashMaps:
+    - seq_lengths: a HashMap with the node names as keys and the sequence lengths as values
+    - path_positions: a HashMap with the path names as keys and the offset of the path description as values
+    - path_lengths: a HashMap with the path names as keys and the number of nodes in the path as values
+    */
+    let file = File::open(file_path)?;
+    let mut reader = BufReader::new(file);
+
+    let mut seq_lengths = HashMap::new();
+    let mut path_positions = HashMap::new();
+    let mut path_lengths = HashMap::new();
+
+    let mut line = String::new();
+    while reader.read_line(&mut line)? > 0 {
+        let columns: Vec<&str> = line.split('\t').collect();
+        if let Some(first_char) = line.chars().next() {
+            if first_char == 'S' {
+                // In the case of an S-line, we store the node name and the sequence length
+                let node_name = String::from(columns[1].trim());
+                let sequence_length = columns[2].trim().len();
+                seq_lengths.insert(node_name, sequence_length);
+            }
+            if first_char == 'P' {
+                let mut path_length = 0;
+                // In the case of a P-line, we store the path name and the offset of the path description
+                // When processing paths, we can match paths in the path_positions HashMap
+                // Then start reading the file from there and go with a buffer to read node by node the path
+                let path_name = String::from(columns[1]);
+                let offset = reader.seek(io::SeekFrom::Current(0))?
+                    - (line.len() as u64 - columns[0].len() as u64 - columns[1].len() as u64 - 2);
+                path_positions.insert(path_name, offset);
+                let path_name = String::from(columns[1]);
+                // let path_length = columns[2].split(',').count();
+
+                let node_list: Vec<String> = columns[2]
+                    .split(',')
+                    .map(|s| s[..s.len() - 1].to_string())
+                    .collect();
+                for node in node_list {
+                    let sequence_length = seq_lengths.get(&node).unwrap();
+                    path_length += sequence_length;
+                }
+
+                path_lengths.insert(path_name, path_length as u64);
+            }
+        }
+        line.clear(); // Clear the line buffer for the next read
+    }
+
+    Ok((seq_lengths, path_positions, path_lengths))
+}
diff --git a/src/main.rs b/src/main.rs
@@ -1,54 +1,50 @@
-mod calculate_distance;
-mod compute_boundaries;
-mod parse_gfa_file;
+mod compute_distance;
+mod index_gfa_file;
 use std::env;
 
 fn main() {
+    /*
+    Compare two GFA files and compute the distance between them
+    Two arguments must be given in the command line:
+    - The path to the first GFA file
+    - The path to the second GFA file
+    It will print to standard output the differences between the two graphs
+    */
     // Get the file path from command line arguments
     let args: Vec<String> = env::args().collect();
     let file_path_a = &args[1];
     let file_path_b = &args[2];
 
     // Parse first graph
-    let (seq_lengths_a, node_lists_a) = match parse_gfa_file::read_gfa_file(file_path_a) {
-        Ok(result) => result,
-        Err(error) => {
-            eprintln!("Failed to read GFA file: {}", error);
-            return;
-        }
-    };
+    let (seq_lengths_a, path_descriptors_a, path_lengths_a) =
+        match index_gfa_file::index_gfa(file_path_a) {
+            Ok(result) => result,
+            Err(error) => {
+                eprintln!("Failed to read GFA file: {}", error);
+                return;
+            }
+        };
 
     // Parse second graph
-    let (seq_lengths_b, node_lists_b) = match parse_gfa_file::read_gfa_file(file_path_b) {
-        Ok(result) => result,
-        Err(error) => {
-            eprintln!("Failed to read GFA file: {}", error);
-            return;
-        }
-    };
+    let (seq_lengths_b, path_descriptors_b, path_lengths_b) =
+        match index_gfa_file::index_gfa(file_path_b) {
+            Ok(result) => result,
+            Err(error) => {
+                eprintln!("Failed to read GFA file: {}", error);
+                return;
+            }
+        };
 
-    // Compute the intersection of node_list_a and node_list_b keys
-    let common_keys: Vec<&String> = node_lists_a
-        .keys()
-        .filter(|key| node_lists_b.contains_key(*key))
-        .collect();
-    // Iterate on common_keys
-    println!("# Path name\tPosition\tOperation\tNodeA\tNodeB");
-    for key in common_keys {
-        let path_a_descriptor = compute_boundaries::compute_cumulative_sum(
-            &node_lists_a.get(key).unwrap(),
-            &seq_lengths_a,
-        );
-        let path_b_descriptor = compute_boundaries::compute_cumulative_sum(
-            &node_lists_b.get(key).unwrap(),
-            &seq_lengths_b,
-        );
-        calculate_distance::distance(
-            key,
-            path_a_descriptor,
-            path_b_descriptor,
-            &node_lists_a.get(key).unwrap(),
-            &node_lists_b.get(key).unwrap(),
-        );
-    }
+    // Compute the distance between the two graphs
+    compute_distance::distance(
+        file_path_a,
+        file_path_b,
+        seq_lengths_a,
+        seq_lengths_b,
+        path_descriptors_a,
+        path_descriptors_b,
+        path_lengths_a,
+        path_lengths_b,
+    )
+    .unwrap();
 }