From ad3ef802fbca040c1471b98d1ee67f082a5c47c1 Mon Sep 17 00:00:00 2001 From: Jonas Marcello Date: Tue, 2 Apr 2024 07:03:07 +0200 Subject: [PATCH 1/8] refactor: Improve error handling in parser --- src/parser.rs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index 716072c..7296e04 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -11,6 +11,7 @@ pub(crate) mod hp_obo; /// Module to parse HPO - `Gene` associations from `genes_to_phenotype.txt` file pub(crate) mod genes_to_phenotype { use crate::parser::Path; + use crate::HpoError; use crate::HpoResult; use std::fs::File; use std::io::BufRead; @@ -21,7 +22,8 @@ pub(crate) mod genes_to_phenotype { /// Quick and dirty parser for development and debugging pub fn parse>(file: P, ontology: &mut Ontology) -> HpoResult<()> { - let file = File::open(file).unwrap(); + let filename = file.as_ref().display().to_string(); + let file = File::open(file).map_err(|_| HpoError::CannotOpenFile(filename))?; let reader = BufReader::new(file); for line in reader.lines() { let line = line.unwrap(); @@ -45,6 +47,7 @@ pub(crate) mod genes_to_phenotype { /// Module to parse HPO - `Gene` associations from `phenotype_to_genes.txt` file pub(crate) mod phenotype_to_genes { + use crate::HpoError; use crate::parser::Path; use crate::HpoResult; use std::fs::File; @@ -56,7 +59,8 @@ pub(crate) mod phenotype_to_genes { /// Quick and dirty parser for development and debugging pub fn parse>(file: P, ontology: &mut Ontology) -> HpoResult<()> { - let file = File::open(file).unwrap(); + let filename = file.as_ref().display().to_string(); + let file = File::open(file).map_err(|_| HpoError::CannotOpenFile(filename))?; let reader = BufReader::new(file); for line in reader.lines() { let line = line.unwrap(); @@ -132,7 +136,8 @@ pub(crate) mod phenotype_hpoa { /// Quick and dirty parser for development and debugging pub fn parse>(file: P, ontology: &mut Ontology) -> HpoResult<()> { - let file = File::open(file).unwrap(); + let filename = file.as_ref().display().to_string(); + let file = File::open(file).map_err(|_| HpoError::CannotOpenFile(filename))?; let reader = BufReader::new(file); for line in reader.lines() { let line = line.unwrap(); From 9ec4a196d939365722817e05d859de1647e8f46f Mon Sep 17 00:00:00 2001 From: Jonas Marcello Date: Tue, 2 Apr 2024 07:03:31 +0200 Subject: [PATCH 2/8] Update documentation on which gene_to_phenotype files to download --- src/ontology.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ontology.rs b/src/ontology.rs index 20875f6..1ad389e 100644 --- a/src/ontology.rs +++ b/src/ontology.rs @@ -366,7 +366,7 @@ impl Ontology { /// /// - Actual OBO data: [`hp.obo`](https://hpo.jax.org/app/data/ontology) /// - Links between HPO and OMIM diseases: [`phenotype.hpoa`](https://hpo.jax.org/app/data/annotations) - /// - Links between HPO and Genes: [`phenotype_to_genes.txt`](http://purl.obolibrary.org/obo/hp/hpoa/phenotype_to_genes.txt) + /// - Links between HPO and Genes: [`genes_to_phenotype.txt`](http://purl.obolibrary.org/obo/hp/hpoa/genes_to_phenotype.txt) /// /// and then specify the folder where the data is stored. /// @@ -424,7 +424,7 @@ impl Ontology { /// /// - Actual OBO data: [`hp.obo`](https://hpo.jax.org/app/data/ontology) /// - Links between HPO and OMIM diseases: [`phenotype.hpoa`](https://hpo.jax.org/app/data/annotations) - /// - Links between HPO and Genes: [`genes_to_phenotypes.txt`](http://purl.obolibrary.org/obo/hp/hpoa/genes_to_phenotype.txt) + /// - Links between HPO and Genes: [`phenotype_to_genes.txt`](http://purl.obolibrary.org/obo/hp/hpoa/phenotype_to_genes.txt) /// /// and then specify the folder where the data is stored. /// From d73d132281642ff4b5fa54ec856e5953ead45271 Mon Sep 17 00:00:00 2001 From: Jonas Marcello Date: Tue, 2 Apr 2024 07:10:08 +0200 Subject: [PATCH 3/8] refactor: Add better error messages in SimilarityCombiner trait --- src/similarity.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/similarity.rs b/src/similarity.rs index a85c387..33e50a5 100644 --- a/src/similarity.rs +++ b/src/similarity.rs @@ -122,24 +122,32 @@ pub trait SimilarityCombiner { } /// Returns the maximum values of each row + /// + /// # Panics + /// + /// Panics when rows don't contain any values, i.e. when the matrix is empty fn row_maxes(&self, m: &Matrix) -> Vec { m.rows() .map(|row| { // I have no idea why, but I could not get a.max(b) to work // with the borrow checker - row.reduce(|a, b| if a > b { a } else { b }).unwrap() + row.reduce(|a, b| if a > b { a } else { b }).expect("A matrix must contain values") }) .copied() .collect() } /// Returns the maximum values of each column + /// + /// # Panics + /// + /// Panics when rows don't contain any values, i.e. when the matrix is empty fn col_maxes(&self, m: &Matrix) -> Vec { m.cols() .map(|col| { // I have no idea why, but I could not get a.max(b) to work // with the borrow checker - col.reduce(|a, b| if a > b { a } else { b }).unwrap() + col.reduce(|a, b| if a > b { a } else { b }).expect("A matrix must contain values") }) .copied() .collect() From bad8fcfa30568f317cac5f27302c9337a4870430 Mon Sep 17 00:00:00 2001 From: Jonas Marcello Date: Tue, 2 Apr 2024 19:09:21 +0200 Subject: [PATCH 4/8] Add error handling to parsing of phenotype.hpoa file --- src/lib.rs | 3 ++ src/parser.rs | 119 ++++++++++++++++++++++++++++++++++-------- src/similarity.rs | 6 ++- src/term/hpotermid.rs | 11 ++++ 4 files changed, 115 insertions(+), 24 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index ee1a777..b28dd60 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -58,6 +58,9 @@ pub enum HpoError { /// Failed to convert an integer to a float #[error("cannot convert int to float")] TryFromIntError(#[from] std::num::TryFromIntError), + /// Failed to parse a line of input data from the JAX obo + #[error("invalid input data: {0}")] + InvalidInput(String), } impl From for HpoError { diff --git a/src/parser.rs b/src/parser.rs index 7296e04..d7d7f70 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -47,8 +47,8 @@ pub(crate) mod genes_to_phenotype { /// Module to parse HPO - `Gene` associations from `phenotype_to_genes.txt` file pub(crate) mod phenotype_to_genes { - use crate::HpoError; use crate::parser::Path; + use crate::HpoError; use crate::HpoResult; use std::fs::File; use std::io::BufRead; @@ -83,6 +83,14 @@ pub(crate) mod phenotype_to_genes { } /// Module to parse HPO - `OmimDisease` associations from `phenotype.hpoa` file +/// +/// # Example line +/// +/// ```text +/// OMIM:619340 Developmental and epileptic encephalopathy 96 HP:0011097 PMID:31675180 PCS 1/2 P HPO:probinson[2021-06-21] +/// OMIM:609153 Pseudohyperkalemia NOT HP:0001878 PMID:2766660 PCS P HPO:lccarmody[2018-10-03] +/// ``` +/// pub(crate) mod phenotype_hpoa { use crate::HpoError; use crate::HpoResult; @@ -92,8 +100,6 @@ pub(crate) mod phenotype_hpoa { use std::io::BufReader; use std::path::Path; - use tracing::error; - use crate::Ontology; struct Omim<'a> { @@ -102,52 +108,66 @@ pub(crate) mod phenotype_hpoa { hpo_id: HpoTermId, } - fn parse_line(line: &str) -> Option> { + fn parse_line(line: &str) -> HpoResult>> { // TODO (nice to have): Add check to skip `database_id` header row // It is not strictly needed, because we're discarding non-OMIM rows if line.starts_with('#') { - return None; + return Ok(None); } if !line.starts_with("OMIM") { - return None; + return Ok(None); } - let cols: Vec<&str> = line.trim().split('\t').collect(); - if cols[2] == "NOT" { - return None; - } + let mut cols = line.trim().splitn(5, '\t'); - let Some((_, omim_id)) = cols[0].split_once(':') else { - error!("cannot parse OMIM ID from {}", cols[0]); - return None; + let Some(id_col) = cols.next() else { + return Err(HpoError::InvalidInput(line.to_string())); + }; + let Some((_, omim_id)) = id_col.split_once(':') else { + return Err(HpoError::InvalidInput(line.to_string())); + }; + + let Some(omim_name) = cols.next() else { + return Err(HpoError::InvalidInput(line.to_string())); }; - let Ok(hpo_id) = HpoTermId::try_from(cols[3]) else { - error!("invalid HPO ID: {}", cols[3]); - return None; + if let Some("NOT") = cols.next() { + return Ok(None); }; - Some(Omim { + let hpo_id = if let Some(id) = cols.next() { + HpoTermId::try_from(id)? + } else { + return Err(HpoError::InvalidInput(line.to_string())); + }; + + Ok(Some(Omim { id: omim_id, - name: cols[1], + name: omim_name, hpo_id, - }) + })) } /// Quick and dirty parser for development and debugging + /// + /// # Errors + /// + /// - [`HpoError::CannotOpenFile`]: Source file not present or can't be opened + /// - [`HpoError::ParseIntError`]: A line contains an invalid `omim_disease_id` + /// - [`HpoError::DoesNotExist`]: A line contains a non-existing [`HpoTermId`] pub fn parse>(file: P, ontology: &mut Ontology) -> HpoResult<()> { let filename = file.as_ref().display().to_string(); let file = File::open(file).map_err(|_| HpoError::CannotOpenFile(filename))?; let reader = BufReader::new(file); for line in reader.lines() { let line = line.unwrap(); - if let Some(omim) = parse_line(&line) { + if let Some(omim) = parse_line(&line)? { let omim_disease_id = ontology.add_omim_disease(omim.name, omim.id)?; ontology.link_omim_disease_term(omim.hpo_id, omim_disease_id)?; ontology .omim_disease_mut(&omim_disease_id) - .ok_or(HpoError::DoesNotExist)? + .expect("Omim disease was just added and cannot be missing") .add_term(omim.hpo_id); } } @@ -158,10 +178,65 @@ pub(crate) mod phenotype_hpoa { mod test_omim_parsing { use super::*; + #[test] + fn test_skip_comment() { + let s = "#OMIM:600171\tGonadal agenesis\t\tHP:0000055\tOMIM:600171\tTAS\tP\tHPO:skoehler[2014-11-27]"; + assert!(parse_line(s) + .expect("This line has the correct format") + .is_none()); + } + #[test] fn test_skip_not() { let s = "OMIM:600171\tGonadal agenesis\tNOT\tHP:0000055\tOMIM:600171\tTAS\tP\tHPO:skoehler[2014-11-27]"; - assert!(parse_line(s).is_none()); + assert!(parse_line(s) + .expect("This line has the correct format") + .is_none()); + } + + #[test] + fn test_skip_orpha() { + let s = "ORPHA:600171\tGonadal agenesis\t\tHP:0000055\tOMIM:600171\tTAS\tP\tHPO:skoehler[2014-11-27]"; + assert!(parse_line(s) + .expect("This line has the correct format") + .is_none()); + } + + #[test] + fn test_skip_orpha_not() { + let s = "ORPHA:600171\tGonadal agenesis\tNOT\tHP:0000055\tOMIM:600171\tTAS\tP\tHPO:skoehler[2014-11-27]"; + assert!(parse_line(s) + .expect("This line has the correct format") + .is_none()); + } + + #[test] + fn test_correct_omim() { + let s = "OMIM:600171\tGonadal agenesis\t\tHP:0000055\tOMIM:600171\tTAS\tP\tHPO:skoehler[2014-11-27]"; + let omim = parse_line(s) + .expect("This line has the correct format") + .expect("Line describes an Omim disease"); + assert_eq!(omim.name, "Gonadal agenesis"); + assert_eq!(omim.id, "600171"); + assert_eq!(omim.hpo_id, "HP:0000055"); + } + + #[test] + fn test_invalid_omim_id() { + let s = "OMIM_600171\tGonadal agenesis\t\tHP:0000055\tOMIM:600171\tTAS\tP\tHPO:skoehler[2014-11-27]"; + assert!(parse_line(s).is_err()); + } + + #[test] + fn test_invalid_hpo_id() { + let s = "OMIM:600171\tGonadal agenesis\t\tH55\tOMIM:600171\tTAS\tP\tHPO:skoehler[2014-11-27]"; + assert!(parse_line(s).is_err()); + } + + #[test] + fn test_invalid_input() { + let s = "OMIM:600171 Gonadal agenesis HP:0000055 OMIM:600171 TAS P HPO:skoehler[2014-11-27]"; + assert!(parse_line(s).is_err()); } } } diff --git a/src/similarity.rs b/src/similarity.rs index 33e50a5..6910b03 100644 --- a/src/similarity.rs +++ b/src/similarity.rs @@ -131,7 +131,8 @@ pub trait SimilarityCombiner { .map(|row| { // I have no idea why, but I could not get a.max(b) to work // with the borrow checker - row.reduce(|a, b| if a > b { a } else { b }).expect("A matrix must contain values") + row.reduce(|a, b| if a > b { a } else { b }) + .expect("A matrix must contain values") }) .copied() .collect() @@ -147,7 +148,8 @@ pub trait SimilarityCombiner { .map(|col| { // I have no idea why, but I could not get a.max(b) to work // with the borrow checker - col.reduce(|a, b| if a > b { a } else { b }).expect("A matrix must contain values") + col.reduce(|a, b| if a > b { a } else { b }) + .expect("A matrix must contain values") }) .copied() .collect() diff --git a/src/term/hpotermid.rs b/src/term/hpotermid.rs index 3ad5f10..8f6f9d9 100644 --- a/src/term/hpotermid.rs +++ b/src/term/hpotermid.rs @@ -54,6 +54,17 @@ impl AnnotationId for HpoTermId { impl TryFrom<&str> for HpoTermId { type Error = HpoError; + + /// Parse a str into an `HpoTermId` + /// + /// This method assumes (but does not check!) + /// that the first 3 characters are `HP:`. + /// + /// # Error + /// + /// An empty string or string with less than 4 characters + /// will return an error. + /// The characters after the 4th position must be parsable to `u32`. fn try_from(s: &str) -> HpoResult { if s.len() < 4 { return Err(HpoError::ParseIntError); From 128b81bb752235b34e8d572e705166bc9d585819 Mon Sep 17 00:00:00 2001 From: Jonas Marcello Date: Wed, 3 Apr 2024 20:59:37 +0200 Subject: [PATCH 5/8] refactor: Add error handling to parsing gene_pheno file --- Cargo.toml | 4 + src/parser.rs | 266 +++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 222 insertions(+), 48 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 3d2fba8..375641d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -36,5 +36,9 @@ harness = false name = "ancestors" harness = false +[[bench]] +name = "parser" +harness = false + [profile.release] lto = true diff --git a/src/parser.rs b/src/parser.rs index d7d7f70..f7f197f 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -8,8 +8,9 @@ pub(crate) mod binary; /// Module to parse `hp.obo` file pub(crate) mod hp_obo; -/// Module to parse HPO - `Gene` associations from `genes_to_phenotype.txt` file -pub(crate) mod genes_to_phenotype { +/// Module to parse HPO - `Gene` associations +/// +pub(crate) mod gene_to_hpo { use crate::parser::Path; use crate::HpoError; use crate::HpoResult; @@ -20,66 +21,235 @@ pub(crate) mod genes_to_phenotype { use crate::HpoTermId; use crate::Ontology; - /// Quick and dirty parser for development and debugging - pub fn parse>(file: P, ontology: &mut Ontology) -> HpoResult<()> { - let filename = file.as_ref().display().to_string(); - let file = File::open(file).map_err(|_| HpoError::CannotOpenFile(filename))?; - let reader = BufReader::new(file); - for line in reader.lines() { - let line = line.unwrap(); - // TODO: Check for the header outside of the `lines` iterator - if line.starts_with('#') || line.starts_with("ncbi_gene_id") { - continue; - } - let cols: Vec<&str> = line.trim().split('\t').collect(); - let gene_id = ontology.add_gene(cols[1], cols[0])?; - let term_id = HpoTermId::try_from(cols[2])?; - ontology.link_gene_term(term_id, gene_id)?; + struct ParsedGene<'a> { + ncbi_id: &'a str, + symbol: &'a str, + hpo: HpoTermId, + } - ontology - .gene_mut(&gene_id) - .expect("Cannot find gene {gene_id}") - .add_term(term_id); + impl<'a> ParsedGene<'a> { + fn try_new(ncbi_id: &'a str, symbol: &'a str, hpo: &'a str) -> HpoResult { + let hpo = HpoTermId::try_from(hpo)?; + Ok(Self { + ncbi_id, + symbol, + hpo, + }) + } + } + + // Removes the first (header) line. + // TODO: Update this once https://doc.rust-lang.org/std/io/trait.BufRead.html#method.skip_until is stable + fn remove_header(reader: &mut R) -> HpoResult<()> { + let mut trash = String::with_capacity(80); + reader.read_line(&mut trash).map_err(|_| { + HpoError::InvalidInput("Invalid data in genes_to_phenotype.txt".to_string()) + })?; + if !trash.starts_with('#') + && !trash.starts_with("ncbi_gene_id") + && !trash.starts_with("hpo_id") + { + return Err(HpoError::InvalidInput( + "genes_to_phenotype.txt file must contain a header".to_string(), + )); } Ok(()) } -} -/// Module to parse HPO - `Gene` associations from `phenotype_to_genes.txt` file -pub(crate) mod phenotype_to_genes { - use crate::parser::Path; - use crate::HpoError; - use crate::HpoResult; - use std::fs::File; - use std::io::BufRead; - use std::io::BufReader; + /// Parses a single line of `genes_to_phenotype.txt` + /// + /// and returns a `ParsedGene` struct with gene and HPO info + fn genes_to_phenotype_line(line: &str) -> HpoResult> { + let mut cols = line.split('\t'); - use crate::HpoTermId; - use crate::Ontology; + // Column 1 is the NCBI-ID of the gene + let Some(ncbi_id) = cols.next() else { + return Err(HpoError::InvalidInput(line.to_string())); + }; - /// Quick and dirty parser for development and debugging - pub fn parse>(file: P, ontology: &mut Ontology) -> HpoResult<()> { + // Column 2 is the gene symbol + let Some(symbol) = cols.next() else { + return Err(HpoError::InvalidInput(line.to_string())); + }; + + // Column 3 is the Hpo Term ID + let Some(hpo) = cols.next() else { + return Err(HpoError::InvalidInput(line.to_string())); + }; + + ParsedGene::try_new(ncbi_id, symbol, hpo) + } + + /// Parses a single line of `phenotype_to_genes.txt` + /// + /// ```text + /// HP:0000002 Abnormality of body height 81848 SPRY4 orphadata ORPHA:432 + /// ``` + fn phenotype_to_gene_line(line: &str) -> HpoResult> { + let mut cols = line.split('\t'); + + // Column 1 is the Hpo Term ID + let Some(hpo) = cols.next() else { + return Err(HpoError::InvalidInput(line.to_string())); + }; + + if cols.next().is_none() { + return Err(HpoError::InvalidInput(line.to_string())); + }; + + // Column 1 is the NCBI-ID of the gene + let Some(ncbi_id) = cols.next() else { + return Err(HpoError::InvalidInput(line.to_string())); + }; + + // Column 2 is the gene symbol + let Some(symbol) = cols.next() else { + return Err(HpoError::InvalidInput(line.to_string())); + }; + + ParsedGene::try_new(ncbi_id, symbol, hpo) + } + + /// Parse `genes_to_phenotype.txt` file + /// + /// ```text + /// ncbi_gene_id gene_symbol hpo_id hpo_name frequency disease_id + /// 10 NAT2 HP:0000007 Autosomal recessive inheritance - OMIM:243400 + /// 10 NAT2 HP:0001939 Abnormality of metabolism/homeostasis - OMIM:243400 + /// 16 AARS1 HP:0002460 Distal muscle weakness 15/15 OMIM:613287 + /// ``` + pub fn parse_genes_to_phenotype>( + file: P, + ontology: &mut Ontology, + ) -> HpoResult<()> { + parse(file, ontology, genes_to_phenotype_line) + } + + /// Parse `phenotype_to_genes.txt` file + /// + /// ```text + /// #Format: HPO-idHPO labelentrez-gene-identrez-gene-symbolAdditional Info from G-D sourceG-D sourcedisease-ID for link + /// HP:0000002 Abnormality of body height 81848 SPRY4 orphadata ORPHA:432 + /// HP:0000002 Abnormality of body height 204219 CERS3 orphadata ORPHA:79394 + /// HP:0000002 Abnormality of body height 51360 MBTPS2 - mim2gene OMIM:308205 + /// ``` + pub fn parse_phenotype_to_genes>( + file: P, + ontology: &mut Ontology, + ) -> HpoResult<()> { + parse(file, ontology, phenotype_to_gene_line) + } + + /// Parses a file to connect genes to HPO terms + fn parse, F: Fn(&str) -> HpoResult>>( + file: P, + ontology: &mut Ontology, + parse_line: F, + ) -> HpoResult<()> { let filename = file.as_ref().display().to_string(); let file = File::open(file).map_err(|_| HpoError::CannotOpenFile(filename))?; - let reader = BufReader::new(file); + let mut reader = BufReader::new(file); + + remove_header(&mut reader)?; + for line in reader.lines() { - let line = line.unwrap(); - // TODO: Check for the header outside of the `lines` iterator - if line.starts_with('#') || line.starts_with("hpo_id") { - continue; - } - let cols: Vec<&str> = line.trim().split('\t').collect(); - let gene_id = ontology.add_gene(cols[3], cols[2])?; - let term_id = HpoTermId::try_from(cols[0])?; - ontology.link_gene_term(term_id, gene_id)?; + let line = line.map_err(|_| { + HpoError::InvalidInput("Invalid data in genes_to_phenotype.txt".to_string()) + })?; + let gene = parse_line(&line)?; + + let gene_id = ontology.add_gene(gene.symbol, gene.ncbi_id)?; + ontology.link_gene_term(gene.hpo, gene_id)?; ontology .gene_mut(&gene_id) - .expect("Cannot find gene {gene_id}") - .add_term(term_id); + .expect("Gene is present because it was just add_omim_disease") + .add_term(gene.hpo); } Ok(()) } + + #[cfg(test)] + mod test_gene_pheno_parsing { + use crate::annotations::AnnotationId; + + use super::*; + + #[test] + fn test_remove_header_ncbi_gene() { + let x = "ncbi_gene_id\txyz\n10\tNAT2\n".as_bytes(); + let mut reader = BufReader::new(x); + assert!(remove_header(&mut reader).is_ok()); + + let mut lines = reader.lines(); + assert_eq!(lines.next().unwrap().unwrap(), "10\tNAT2"); + assert!(lines.next().is_none()); + } + + #[test] + fn test_remove_header_hashtag() { + let x = "#foobar\txyz\n10\tNAT2\n".as_bytes(); + let mut reader = BufReader::new(x); + assert!(remove_header(&mut reader).is_ok()); + + let mut lines = reader.lines(); + assert_eq!(lines.next().unwrap().unwrap(), "10\tNAT2"); + assert!(lines.next().is_none()); + } + + #[test] + fn test_remove_header_fails() { + let x = "foobar\txyz\n10\tNAT2\n".as_bytes(); + let mut reader = BufReader::new(x); + assert!(remove_header(&mut reader).is_err()); + } + + #[test] + fn test_parse_correct_line() { + let line = "10\tNAT2\tHP:0000007\tfoobar\n"; + let res = genes_to_phenotype_line(line).expect("This line should parse correctly"); + assert_eq!(res.ncbi_id, "10"); + assert_eq!(res.symbol, "NAT2"); + assert_eq!(res.hpo.as_u32(), 7u32); + } + + #[test] + fn test_parse_correct_line2() { + let line = "10\tNAT2\tHP:0000007\tfoobar"; + let res = genes_to_phenotype_line(line).expect("This line should parse correctly"); + assert_eq!(res.ncbi_id, "10"); + assert_eq!(res.symbol, "NAT2"); + assert_eq!(res.hpo.as_u32(), 7u32); + } + + #[test] + fn test_parse_missing_id() { + let line = "NAT2\tHP:0000007\tfoobar\n"; + let res = genes_to_phenotype_line(line); + assert!(res.is_err()); + } + + #[test] + fn test_parse_missing_symbol() { + let line = "10\tHP:0000007\tfoobar\n"; + let res = genes_to_phenotype_line(line); + assert!(res.is_err()); + } + + #[test] + fn test_parse_missing_hpo() { + let line = "10\tNAT2\tfoobar\n"; + let res = genes_to_phenotype_line(line); + assert!(res.is_err()); + } + + #[test] + fn test_parse_invalid_hpo() { + let line = "10\tNAT2\tHP:000000A\tfoobar\n"; + let res = genes_to_phenotype_line(line); + assert!(res.is_err()); + } + } } /// Module to parse HPO - `OmimDisease` associations from `phenotype.hpoa` file @@ -248,7 +418,7 @@ pub(crate) fn load_from_jax_files_with_transivitve_genes>( ontology: &mut Ontology, ) -> HpoResult<()> { hp_obo::read_obo_file(obo_file, ontology)?; - phenotype_to_genes::parse(gene_file, ontology)?; + gene_to_hpo::parse_phenotype_to_genes(gene_file, ontology)?; phenotype_hpoa::parse(disease_file, ontology)?; Ok(()) } @@ -260,7 +430,7 @@ pub(crate) fn load_from_jax_files>( ontology: &mut Ontology, ) -> HpoResult<()> { hp_obo::read_obo_file(obo_file, ontology)?; - genes_to_phenotype::parse(gene_file, ontology)?; + gene_to_hpo::parse_genes_to_phenotype(gene_file, ontology)?; phenotype_hpoa::parse(disease_file, ontology)?; Ok(()) } From a9f17e90550d4955a3b2a36518c45860da6129b6 Mon Sep 17 00:00:00 2001 From: Jonas Marcello Date: Thu, 4 Apr 2024 08:25:04 +0200 Subject: [PATCH 6/8] Modify Ontology.add_gene due improve parsing error handling --- src/ontology.rs | 38 +++++++--------- src/parser.rs | 118 ++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 116 insertions(+), 40 deletions(-) diff --git a/src/ontology.rs b/src/ontology.rs index 1ad389e..5a3f66d 100644 --- a/src/ontology.rs +++ b/src/ontology.rs @@ -1,5 +1,5 @@ use core::fmt::Debug; -use std::collections::hash_map::Values; +use std::collections::hash_map::{Entry, Values}; use std::collections::{HashMap, HashSet}; use std::fs::File; use std::io::Read; @@ -898,11 +898,12 @@ impl Ontology { /// /// ``` /// use hpo::Ontology; + /// use hpo::annotations::GeneId; /// /// let ontology_1 = Ontology::from_binary("tests/example.hpo").unwrap(); /// let mut ontology_2 = Ontology::default(); /// - /// ontology_2.add_gene("FOOBAR", "666666").unwrap(); + /// ontology_2.add_gene("FOOBAR", GeneId::from(666666)); /// /// let compare = ontology_1.compare(&ontology_2); /// assert_eq!(compare.added_hpo_terms().len(), 0); @@ -974,13 +975,13 @@ impl Ontology { if matched_terms.is_empty() { continue; } - let gene_id = ont.add_gene( + ont.add_gene( self.gene(gene.id()).ok_or(HpoError::DoesNotExist)?.name(), - &gene.id().as_u32().to_string(), - )?; + *gene.id(), + ); for term in &matched_terms { - ont.link_gene_term(term, gene_id)?; - ont.gene_mut(&gene_id) + ont.link_gene_term(term, *gene.id())?; + ont.gene_mut(gene.id()) .ok_or(HpoError::DoesNotExist)? .add_term(term); } @@ -1270,7 +1271,7 @@ impl Ontology { } } - /// Add a gene to the Ontology. and return the [`GeneId`] + /// Add a gene to the Ontology /// /// If the gene does not yet exist, a new [`Gene`] entity is created /// and stored in the Ontology. @@ -1281,19 +1282,19 @@ impl Ontology { /// Adding a gene does not connect it to any HPO terms. /// Use [`Ontology::link_gene_term`] for creating connections. /// - /// # Errors - /// - /// If the `gene_id` is invalid, an [`HpoError::ParseIntError`] is returned + /// This method was changed to receive the `gene_id` as [`GeneId`] + /// instead of `str` in `0.10` and does not return a `Result` anymore. /// /// # Examples /// /// ``` /// use hpo::Ontology; + /// use hpo::annotations::GeneId; /// /// let mut ontology = Ontology::default(); /// assert!(ontology.gene(&1u32.into()).is_none()); /// - /// ontology.add_gene("Foo", "1"); + /// ontology.add_gene("Foo", GeneId::from(1)); /// /// // Genes can be iterated... /// let mut gene_iterator = ontology.genes(); @@ -1304,14 +1305,9 @@ impl Ontology { /// // .. or accessed directly /// assert!(ontology.gene(&1u32.into()).is_some()); /// ``` - pub fn add_gene(&mut self, gene_name: &str, gene_id: &str) -> HpoResult { - let id = GeneId::try_from(gene_id)?; - match self.genes.entry(id) { - std::collections::hash_map::Entry::Occupied(_) => Ok(id), - std::collections::hash_map::Entry::Vacant(entry) => { - entry.insert(Gene::new(id, gene_name)); - Ok(id) - } + pub fn add_gene(&mut self, gene_name: &str, gene_id: GeneId) { + if let Entry::Vacant(entry) = self.genes.entry(gene_id) { + entry.insert(Gene::new(gene_id, gene_name)); } } @@ -1383,7 +1379,7 @@ impl Ontology { /// /// let mut ontology = Ontology::default(); /// ontology.insert_term("Term-Foo".into(), 1u32); - /// ontology.add_gene("Foo", "5"); + /// ontology.add_gene("Foo", GeneId::from(5)); /// ontology.link_gene_term(1u32, GeneId::from(5u32)).unwrap(); /// /// let term = ontology.hpo(1u32).unwrap(); diff --git a/src/parser.rs b/src/parser.rs index f7f197f..a2d3c4c 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -10,7 +10,11 @@ pub(crate) mod hp_obo; /// Module to parse HPO - `Gene` associations /// +/// It contains functions to parse `genes_to_phenotype.txt` and +/// `phenotype_to_genes.txt` input files pub(crate) mod gene_to_hpo { + + use crate::annotations::GeneId; use crate::parser::Path; use crate::HpoError; use crate::HpoResult; @@ -22,7 +26,7 @@ pub(crate) mod gene_to_hpo { use crate::Ontology; struct ParsedGene<'a> { - ncbi_id: &'a str, + ncbi_id: GeneId, symbol: &'a str, hpo: HpoTermId, } @@ -30,6 +34,7 @@ pub(crate) mod gene_to_hpo { impl<'a> ParsedGene<'a> { fn try_new(ncbi_id: &'a str, symbol: &'a str, hpo: &'a str) -> HpoResult { let hpo = HpoTermId::try_from(hpo)?; + let ncbi_id = GeneId::try_from(ncbi_id)?; Ok(Self { ncbi_id, symbol, @@ -38,8 +43,9 @@ pub(crate) mod gene_to_hpo { } } - // Removes the first (header) line. - // TODO: Update this once https://doc.rust-lang.org/std/io/trait.BufRead.html#method.skip_until is stable + /// Removes the first (header) line. + /// + /// TODO: Update this once is stable fn remove_header(reader: &mut R) -> HpoResult<()> { let mut trash = String::with_capacity(80); reader.read_line(&mut trash).map_err(|_| { @@ -59,6 +65,10 @@ pub(crate) mod gene_to_hpo { /// Parses a single line of `genes_to_phenotype.txt` /// /// and returns a `ParsedGene` struct with gene and HPO info + /// + /// ```text + /// 10 NAT2 HP:0000007 Autosomal recessive inheritance - OMIM:243400 + /// ``` fn genes_to_phenotype_line(line: &str) -> HpoResult> { let mut cols = line.split('\t'); @@ -93,16 +103,17 @@ pub(crate) mod gene_to_hpo { return Err(HpoError::InvalidInput(line.to_string())); }; + // Column 2 is the HPO-name, which we don't need if cols.next().is_none() { return Err(HpoError::InvalidInput(line.to_string())); }; - // Column 1 is the NCBI-ID of the gene + // Column 3 is the NCBI-ID of the gene let Some(ncbi_id) = cols.next() else { return Err(HpoError::InvalidInput(line.to_string())); }; - // Column 2 is the gene symbol + // Column 4 is the gene symbol let Some(symbol) = cols.next() else { return Err(HpoError::InvalidInput(line.to_string())); }; @@ -159,10 +170,10 @@ pub(crate) mod gene_to_hpo { let gene = parse_line(&line)?; - let gene_id = ontology.add_gene(gene.symbol, gene.ncbi_id)?; - ontology.link_gene_term(gene.hpo, gene_id)?; + ontology.add_gene(gene.symbol, gene.ncbi_id); + ontology.link_gene_term(gene.hpo, gene.ncbi_id)?; ontology - .gene_mut(&gene_id) + .gene_mut(&gene.ncbi_id) .expect("Gene is present because it was just add_omim_disease") .add_term(gene.hpo); } @@ -170,11 +181,8 @@ pub(crate) mod gene_to_hpo { } #[cfg(test)] - mod test_gene_pheno_parsing { - use crate::annotations::AnnotationId; - + mod test { use super::*; - #[test] fn test_remove_header_ncbi_gene() { let x = "ncbi_gene_id\txyz\n10\tNAT2\n".as_bytes(); @@ -186,6 +194,17 @@ pub(crate) mod gene_to_hpo { assert!(lines.next().is_none()); } + #[test] + fn test_remove_header_hpo_id() { + let x = "hpo_id\txyz\n10\tNAT2\n".as_bytes(); + let mut reader = BufReader::new(x); + assert!(remove_header(&mut reader).is_ok()); + + let mut lines = reader.lines(); + assert_eq!(lines.next().unwrap().unwrap(), "10\tNAT2"); + assert!(lines.next().is_none()); + } + #[test] fn test_remove_header_hashtag() { let x = "#foobar\txyz\n10\tNAT2\n".as_bytes(); @@ -203,21 +222,28 @@ pub(crate) mod gene_to_hpo { let mut reader = BufReader::new(x); assert!(remove_header(&mut reader).is_err()); } + } + + #[cfg(test)] + mod test_genes_to_phenotype { + use crate::annotations::AnnotationId; + + use super::*; #[test] - fn test_parse_correct_line() { + fn test_parse_correct_line_with_newline() { let line = "10\tNAT2\tHP:0000007\tfoobar\n"; let res = genes_to_phenotype_line(line).expect("This line should parse correctly"); - assert_eq!(res.ncbi_id, "10"); + assert_eq!(res.ncbi_id.as_u32(), 10); assert_eq!(res.symbol, "NAT2"); assert_eq!(res.hpo.as_u32(), 7u32); } #[test] - fn test_parse_correct_line2() { + fn test_parse_correct_line_without_newline() { let line = "10\tNAT2\tHP:0000007\tfoobar"; let res = genes_to_phenotype_line(line).expect("This line should parse correctly"); - assert_eq!(res.ncbi_id, "10"); + assert_eq!(res.ncbi_id.as_u32(), 10); assert_eq!(res.symbol, "NAT2"); assert_eq!(res.hpo.as_u32(), 7u32); } @@ -250,6 +276,60 @@ pub(crate) mod gene_to_hpo { assert!(res.is_err()); } } + + #[cfg(test)] + mod test_phenotpye_to_genes { + use super::*; + use crate::annotations::AnnotationId; + + #[test] + fn test_parse_correct_line_with_newline() { + let line = "HP:0000007\tAbnormality of body height\t10\tNAT2\tfoobar\n"; + let res = phenotype_to_gene_line(line).expect("This line should parse correctly"); + assert_eq!(res.ncbi_id.as_u32(), 10); + assert_eq!(res.symbol, "NAT2"); + assert_eq!(res.hpo.as_u32(), 7u32); + } + + #[test] + fn test_parse_correct_line_without_newline() { + let line = "HP:0000007\tAbnormality of body height\t10\tNAT2\tfoobar"; + let res = phenotype_to_gene_line(line).expect("This line should parse correctly"); + assert_eq!(res.ncbi_id.as_u32(), 10); + assert_eq!(res.symbol, "NAT2"); + assert_eq!(res.hpo.as_u32(), 7u32); + } + + #[test] + fn test_parse_missing_id() { + let line = "HP:0000007\tAbnormality of body height\tNAT2\tfoobar\n"; + let res = phenotype_to_gene_line(line); + assert!(res.is_err()); + } + + #[test] + fn test_parse_empty_symbol() { + let line = "HP:0000007\tAbnormality of body height\t10\t\tfoobar\n"; + let res = phenotype_to_gene_line(line).expect("This line should parse correctly"); + assert_eq!(res.ncbi_id.as_u32(), 10); + assert_eq!(res.symbol, ""); + assert_eq!(res.hpo.as_u32(), 7u32); + } + + #[test] + fn test_parse_missing_hpo() { + let line = "Abnormality of body height\t10\tNAT2\tfoobar\n"; + let res = phenotype_to_gene_line(line); + assert!(res.is_err()); + } + + #[test] + fn test_parse_invalid_hpo() { + let line = "HP:0000007A\tAbnormality of body height\t10\tNAT2\tfoobar\n"; + let res = genes_to_phenotype_line(line); + assert!(res.is_err()); + } + } } /// Module to parse HPO - `OmimDisease` associations from `phenotype.hpoa` file @@ -261,7 +341,7 @@ pub(crate) mod gene_to_hpo { /// OMIM:609153 Pseudohyperkalemia NOT HP:0001878 PMID:2766660 PCS P HPO:lccarmody[2018-10-03] /// ``` /// -pub(crate) mod phenotype_hpoa { +pub(crate) mod disease_to_hpo { use crate::HpoError; use crate::HpoResult; use crate::HpoTermId; @@ -419,7 +499,7 @@ pub(crate) fn load_from_jax_files_with_transivitve_genes>( ) -> HpoResult<()> { hp_obo::read_obo_file(obo_file, ontology)?; gene_to_hpo::parse_phenotype_to_genes(gene_file, ontology)?; - phenotype_hpoa::parse(disease_file, ontology)?; + disease_to_hpo::parse(disease_file, ontology)?; Ok(()) } @@ -431,6 +511,6 @@ pub(crate) fn load_from_jax_files>( ) -> HpoResult<()> { hp_obo::read_obo_file(obo_file, ontology)?; gene_to_hpo::parse_genes_to_phenotype(gene_file, ontology)?; - phenotype_hpoa::parse(disease_file, ontology)?; + disease_to_hpo::parse(disease_file, ontology)?; Ok(()) } From b7b343e8113ac4f3fb264cf47fcffa1ec7f18a8d Mon Sep 17 00:00:00 2001 From: Jonas Marcello Date: Fri, 5 Apr 2024 08:11:05 +0200 Subject: [PATCH 7/8] Update compare-ontologies example to also compare transitive vs normal --- examples/compare_ontologies.rs | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/examples/compare_ontologies.rs b/examples/compare_ontologies.rs index 3b39ca6..ebf834e 100644 --- a/examples/compare_ontologies.rs +++ b/examples/compare_ontologies.rs @@ -16,7 +16,7 @@ fn ontology(path_arg: &str) -> Ontology { } } -fn ontology2(path_arg: &str) -> Ontology { +fn ontology_transitive(path_arg: &str) -> Ontology { let path = Path::new(path_arg); match path.is_file() { @@ -213,17 +213,22 @@ fn print_replacement_diff(replacements: Option<(Option, Option "); + println!("Usage:\ncompare_ontologies []"); println!("e.g.:\ncompare_ontologies tests/ontology.hpo tests/ontology_v2.hpo:\n"); + println!("Alternatively compare transitive vs non-transitive:\ncompare_ontologies tests/ontology.hpo\n"); process::exit(1) } let arg_old = args.nth(1).unwrap(); - let arg_new = args.next().unwrap(); - let lhs = ontology(&arg_old); - let rhs = ontology2(&arg_new); + + + let rhs = if let Some(arg_new) = args.next() { + ontology(&arg_new) + } else { + ontology_transitive(&arg_old) + }; let diffs = lhs.compare(&rhs); From 504b3c73119b9c8914d757aa49915f9e89d52d5e Mon Sep 17 00:00:00 2001 From: Jonas Marcello Date: Sat, 6 Apr 2024 10:40:48 +0200 Subject: [PATCH 8/8] Minor formatting fix --- examples/compare_ontologies.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/compare_ontologies.rs b/examples/compare_ontologies.rs index ebf834e..f32dd3d 100644 --- a/examples/compare_ontologies.rs +++ b/examples/compare_ontologies.rs @@ -223,7 +223,6 @@ fn main() { let arg_old = args.nth(1).unwrap(); let lhs = ontology(&arg_old); - let rhs = if let Some(arg_new) = args.next() { ontology(&arg_new) } else {