From 8e9d876fbb676e131a36cf64eab4d9b43c75fe70 Mon Sep 17 00:00:00 2001 From: c3rb3r3u5d3d53c Date: Sat, 21 Dec 2024 09:55:29 -0400 Subject: [PATCH] Better Comologous Chromosomes --- src/genetics/chromosome.rs | 21 +- src/lcs/lcs.rs | 585 +++++++++++++++++++++---------------- 2 files changed, 347 insertions(+), 259 deletions(-) diff --git a/src/genetics/chromosome.rs b/src/genetics/chromosome.rs index b2f409e..e603b25 100644 --- a/src/genetics/chromosome.rs +++ b/src/genetics/chromosome.rs @@ -524,24 +524,21 @@ impl Chromosome { if lhs_tlsh.is_some() && rhs_tlsh.is_some() { tlsh = TLSH::compare(lhs_tlsh.unwrap(), rhs_tlsh.unwrap()); } + if minhash.is_none() && tlsh.is_none() { return None; } let mut homologues = Vec::::new(); if self.config.chromosomes.homologues.enabled { let lhs_pattern = self.pattern(); let rhs_pattern = rhs.pattern(); - let fuzzy = FuzzyLCS::new(&lhs_pattern, &rhs_pattern); - let matches = fuzzy.compare(self.config.chromosomes.homologues.maximum); - for (score, string) in matches.iter() { - let c = Chromosome::new( - string.to_string(), - self.config.clone()); - if c.is_err() { continue; } - let homologous_chromosome = HomologousChromosome { - score: *score as f64, - chromosome: c.unwrap(), - }; - homologues.push(homologous_chromosome); + for (score, (_, _), homologue) in lhs_pattern.fuzzy_find_subyara_all(rhs_pattern, 0.25) { + if let Ok(c) = Chromosome::new(homologue.to_string().clone(), self.config.clone()) { + let homologous_chromosome = HomologousChromosome { + score: score as f64, + chromosome: c.clone(), + }; + homologues.push(homologous_chromosome); + } } } Some(ChromosomeSimilarity { diff --git a/src/lcs/lcs.rs b/src/lcs/lcs.rs index 5a24f62..c4eaf61 100644 --- a/src/lcs/lcs.rs +++ b/src/lcs/lcs.rs @@ -1,264 +1,355 @@ -// GNU LESSER GENERAL PUBLIC LICENSE -// Version 3, 29 June 2007 -// -// Copyright (C) 2007 Free Software Foundation, Inc. -// Everyone is permitted to copy and distribute verbatim copies -// of this license document, but changing it is not allowed. -// -// -// This version of the GNU Lesser General Public License incorporates -// the terms and conditions of version 3 of the GNU General Public -// License, supplemented by the additional permissions listed below. -// -// 0. Additional Definitions. -// -// As used herein, "this License" refers to version 3 of the GNU Lesser -// General Public License, and the "GNU GPL" refers to version 3 of the GNU -// General Public License. -// -// "The Library" refers to a covered work governed by this License, -// other than an Application or a Combined Work as defined below. -// -// An "Application" is any work that makes use of an interface provided -// by the Library, but which is not otherwise based on the Library. -// Defining a subclass of a class defined by the Library is deemed a mode -// of using an interface provided by the Library. -// -// A "Combined Work" is a work produced by combining or linking an -// Application with the Library. The particular version of the Library -// with which the Combined Work was made is also called the "Linked -// Version". -// -// The "Minimal Corresponding Source" for a Combined Work means the -// Corresponding Source for the Combined Work, excluding any source code -// for portions of the Combined Work that, considered in isolation, are -// based on the Application, and not on the Linked Version. -// -// The "Corresponding Application Code" for a Combined Work means the -// object code and/or source code for the Application, including any data -// and utility programs needed for reproducing the Combined Work from the -// Application, but excluding the System Libraries of the Combined Work. -// -// 1. Exception to Section 3 of the GNU GPL. -// -// You may convey a covered work under sections 3 and 4 of this License -// without being bound by section 3 of the GNU GPL. -// -// 2. Conveying Modified Versions. -// -// If you modify a copy of the Library, and, in your modifications, a -// facility refers to a function or data to be supplied by an Application -// that uses the facility (other than as an argument passed when the -// facility is invoked), then you may convey a copy of the modified -// version: -// -// a) under this License, provided that you make a good faith effort to -// ensure that, in the event an Application does not supply the -// function or data, the facility still operates, and performs -// whatever part of its purpose remains meaningful, or -// -// b) under the GNU GPL, with none of the additional permissions of -// this License applicable to that copy. -// -// 3. Object Code Incorporating Material from Library Header Files. -// -// The object code form of an Application may incorporate material from -// a header file that is part of the Library. You may convey such object -// code under terms of your choice, provided that, if the incorporated -// material is not limited to numerical parameters, data structure -// layouts and accessors, or small macros, inline functions and templates -// (ten or fewer lines in length), you do both of the following: -// -// a) Give prominent notice with each copy of the object code that the -// Library is used in it and that the Library and its use are -// covered by this License. -// -// b) Accompany the object code with a copy of the GNU GPL and this license -// document. -// -// 4. Combined Works. -// -// You may convey a Combined Work under terms of your choice that, -// taken together, effectively do not restrict modification of the -// portions of the Library contained in the Combined Work and reverse -// engineering for debugging such modifications, if you also do each of -// the following: -// -// a) Give prominent notice with each copy of the Combined Work that -// the Library is used in it and that the Library and its use are -// covered by this License. -// -// b) Accompany the Combined Work with a copy of the GNU GPL and this license -// document. -// -// c) For a Combined Work that displays copyright notices during -// execution, include the copyright notice for the Library among -// these notices, as well as a reference directing the user to the -// copies of the GNU GPL and this license document. -// -// d) Do one of the following: -// -// 0) Convey the Minimal Corresponding Source under the terms of this -// License, and the Corresponding Application Code in a form -// suitable for, and under terms that permit, the user to -// recombine or relink the Application with a modified version of -// the Linked Version to produce a modified Combined Work, in the -// manner specified by section 6 of the GNU GPL for conveying -// Corresponding Source. -// -// 1) Use a suitable shared library mechanism for linking with the -// Library. A suitable mechanism is one that (a) uses at run time -// a copy of the Library already present on the user's computer -// system, and (b) will operate properly with a modified version -// of the Library that is interface-compatible with the Linked -// Version. -// -// e) Provide Installation Information, but only if you would otherwise -// be required to provide such information under section 6 of the -// GNU GPL, and only to the extent that such information is -// necessary to install and execute a modified version of the -// Combined Work produced by recombining or relinking the -// Application with a modified version of the Linked Version. (If -// you use option 4d0, the Installation Information must accompany -// the Minimal Corresponding Source and Corresponding Application -// Code. If you use option 4d1, you must provide the Installation -// Information in the manner specified by section 6 of the GNU GPL -// for conveying Corresponding Source.) -// -// 5. Combined Libraries. -// -// You may place library facilities that are a work based on the -// Library side by side in a single library together with other library -// facilities that are not Applications and are not covered by this -// License, and convey such a combined library under terms of your -// choice, if you do both of the following: -// -// a) Accompany the combined library with a copy of the same work based -// on the Library, uncombined with any other library facilities, -// conveyed under the terms of this License. -// -// b) Give prominent notice with the combined library that part of it -// is a work based on the Library, and explaining where to find the -// accompanying uncombined form of the same work. -// -// 6. Revised Versions of the GNU Lesser General Public License. -// -// The Free Software Foundation may publish revised and/or new versions -// of the GNU Lesser General Public License from time to time. Such new -// versions will be similar in spirit to the present version, but may -// differ in detail to address new problems or concerns. +use std::cmp::{max, min}; + // -// Each version is given a distinguishing version number. If the -// Library as you received it specifies that a certain numbered version -// of the GNU Lesser General Public License "or any later version" -// applies to it, you have the option of following the terms and -// conditions either of that published version or of any later version -// published by the Free Software Foundation. If the Library as you -// received it does not specify a version number of the GNU Lesser -// General Public License, you may choose any version of the GNU Lesser -// General Public License ever published by the Free Software Foundation. +// --[ Original LCS code remains unchanged ]-- // -// If the Library as you received it specifies that a proxy can decide -// whether future versions of the GNU Lesser General Public License shall -// apply, that proxy's public statement of acceptance of any version is -// permanent authorization for you to choose that version for the -// Library. - -use std::cmp::{max, Ordering}; -use std::collections::HashMap; - -pub struct FuzzyLCS<'a> { - s1: &'a str, - s2: &'a str, + +// =========== +// 1) Helpers +// =========== + +pub fn compute_mat_ih_iv(a: &[T], b: &[T]) -> (Vec>, Vec>) +where + T: Eq, +{ + let na = a.len(); + let nb = b.len(); + let mut ih = vec![vec![0; nb + 1]; na + 1]; + let mut iv = vec![vec![0; nb + 1]; na + 1]; + + for j in 0..(nb + 1) { + ih[0][j] = j; + } + for l in 0..(na + 1) { + iv[l][0] = 0; + } + + for l in 1..(na + 1) { + for j in 1..(nb + 1) { + if a[l - 1] != b[j - 1] { + ih[l][j] = max(iv[l][j - 1], ih[l - 1][j]); + iv[l][j] = min(iv[l][j - 1], ih[l - 1][j]); + } else { + ih[l][j] = iv[l][j - 1]; + iv[l][j] = ih[l - 1][j]; + } + } + } + (ih, iv) +} + +pub fn compute_vec_ig(a: &[T], b: &[T]) -> Vec +where + T: Eq, +{ + let na = a.len(); + let nb = b.len(); + let mut ih = vec![vec![0; nb + 1], vec![0; nb + 1]]; + let mut iv = vec![vec![0; nb + 1], vec![0; nb + 1]]; + + for j in 0..(nb + 1) { + ih[0][j] = j; + } + + for l in 1..(na + 1) { + iv[1][0] = 0; + for j in 1..(nb + 1) { + if a[l - 1] != b[j - 1] { + ih[1][j] = max(iv[1][j - 1], ih[0][j]); + iv[1][j] = min(iv[1][j - 1], ih[0][j]); + } else { + ih[1][j] = iv[1][j - 1]; + iv[1][j] = ih[0][j]; + } + } + ih.swap(0, 1); + iv.swap(0, 1); + } + ih.into_iter().next().unwrap() +} + +pub fn compute_vg_dg_from_ig( + a: &[T], + b: &[T], + ig: &Vec, +) -> (Vec>, Vec>) +where + T: Eq, +{ + let na = a.len(); + let nb = b.len(); + let mut vg = vec![None; nb + 1]; + let mut dg = vec![Some(0); na + 1]; + + let mut i = 1; + for j in 1..(nb + 1) { + if ig[j] == 0 { + dg[i] = Some(j); + i += 1; + } else { + vg[ig[j]] = Some(j); + } + } + for l in i..(na + 1) { + dg[l] = None; + } + (vg, dg) +} + +pub fn compute_ig_vg_dg_from_ih_mat( + a: &[T], + b: &[T], + ih: &Vec>, +) -> (Vec, Vec>, Vec>) +where + T: Eq, +{ + let ig = ih[ih.len() - 1].clone(); + let (vg, dg) = compute_vg_dg_from_ig(a, b, &ih[ih.len() - 1]); + (ig, vg, dg) +} + +pub fn alcs(a: &[T], b: &[T]) -> (Vec, Vec>, Vec>) +where + T: Eq, +{ + let ig = compute_vec_ig(a, b); + let (vg, dg) = compute_vg_dg_from_ig(a, b, &ig); + (ig, vg, dg) +} + +pub fn alcs_mat( + a: &[T], + b: &[T], +) -> ( + Vec>, + Vec>, + Vec, + Vec>, + Vec>, +) +where + T: Eq, +{ + let (ih, iv) = compute_mat_ih_iv(a, b); + let (ig, vg, dg) = compute_ig_vg_dg_from_ih_mat(a, b, &ih); + (ih, iv, ig, vg, dg) +} + +#[derive(Debug)] +pub struct Alcs { + ig: Vec, +} + +impl Alcs { + pub fn new(a: &[T], b: &[T]) -> Self + where + T: Eq, + { + Alcs { + ig: compute_vec_ig(a, b), + } + } + + /// Returns an iterator yielding the LCS length for all substrings starting from position `i` + pub fn suffix(&self, pos: usize) -> AlcsIterator { + AlcsIterator::new(self, pos) + } +} + +#[derive(Debug)] +pub struct AlcsIterator<'a> { + alcs: &'a Alcs, + i: usize, + j: usize, + prev: usize, +} + +impl<'a> AlcsIterator<'a> { + fn new(alcs: &'a Alcs, pos: usize) -> Self { + AlcsIterator { + alcs, + i: pos, + j: pos + 1, + prev: 0, + } + } } -impl<'a> FuzzyLCS<'a> { - pub fn new(s1: &'a str, s2: &'a str) -> Self { - FuzzyLCS { s1, s2 } +impl<'a> Iterator for AlcsIterator<'a> { + type Item = (usize, usize, usize); + + fn next(&mut self) -> Option { + if self.j >= self.alcs.ig.len() { + return None; + } + let cur = self.prev + (self.alcs.ig[self.j] <= self.i) as usize; + self.prev = cur; + self.j += 1; + Some((self.i, self.j - 1, cur)) } +} - pub fn compare( - &self, - max_results: usize, - ) -> Vec<(f64, String)> { - let a: Vec = self.s1.chars().collect(); - let b: Vec = self.s2.chars().collect(); - let a_len = a.len(); - let b_len = b.len(); - - let mut unique_matches: HashMap = HashMap::new(); - - for start in 0..b_len { - for end in (start + 1)..=b_len { - let substring = &self.s2[start..end]; - - let substring_chars: Vec = substring.chars().collect(); - let sub_len = substring_chars.len(); - - let mut dp = vec![vec![0; sub_len + 1]; a_len + 1]; - - for i in 1..=a_len { - for j in 1..=sub_len { - if a[i - 1] == substring_chars[j - 1] { - dp[i][j] = dp[i - 1][j - 1] + 1; - } else { - dp[i][j] = max(dp[i - 1][j], dp[i][j - 1]); - } - } - } - - let lcs_length = dp[a_len][sub_len]; - let score = lcs_length as f64 / a_len.max(sub_len) as f64; - - if lcs_length > 0 { - let mut lcs = String::new(); - let (mut i, mut j) = (a_len, sub_len); - - while i > 0 && j > 0 { - if a[i - 1] == substring_chars[j - 1] { - lcs.push(a[i - 1]); - i -= 1; - j -= 1; - } else if dp[i - 1][j] > dp[i][j - 1] { - i -= 1; - } else { - j -= 1; - } - } - - lcs = lcs.chars().rev().collect::(); - - if let Some(existing_score) = unique_matches.get(&lcs) { - if score > *existing_score { - unique_matches.insert(lcs, score); - } - } else { - unique_matches.insert(lcs, score); - } - } +// +// =========== +// 2) The old single best "score" function (LCS-based). +// We keep it for backward-compatibility. +// =========== +// +fn score(b: &str, a: &str, tsh: Option) -> (f32, usize, usize) { + let va = a.chars().collect::>(); + let vb = b.chars().collect::>(); + let alcs = Alcs::new(&va, &vb); + let na = a.len(); + let nb = b.len(); + let many = match tsh { + None => nb, + Some(tsh) => (na as f32 / tsh) as usize, + }; + let mut best = (0., 0, 0); + for i in 0..nb { + let mut maxrow = (0., 0, 0); + for (start_i, j, lcs_len) in alcs.suffix(i).take(many) { + let cur = lcs_len as f32 / max(j - start_i, na) as f32; + if cur > maxrow.0 { + maxrow = (cur, start_i, j); } } + if maxrow >= best { + best = maxrow; + } + } + best +} + +// +// ====================================================== +// 3) New "sub-YARA–style" function: single '?' wildcard +// and returning ALL matches above threshold +// ====================================================== +// + +/// Finds all sub-YARA–style matches (one `?` mismatch allowed) +/// in `b` that align (in order) to the pattern `a`. +/// +/// Returns a vector of (score, (start, end), subyara_alignment), +/// sorted by descending score (best match first). +/// +/// Score formula ∈ [0,1]: +/// score = (matched_count / a.len()) * (matched_count / alignment_len) +/// +/// This rewards matches that cover a large portion of `a` and +/// are "dense" within the alignment slice of `b`. +pub fn find_subyara_matches( + b: &str, + a: &str, + threshold: f32 +) -> Vec<(f32, (usize, usize), String)> { + let vb: Vec = b.chars().collect(); + let va: Vec = a.chars().collect(); + let nb = vb.len(); + let na = va.len(); + + let mut results = Vec::new(); - let mut matches: Vec<(f64, String)> = unique_matches.into_iter() - .map(|(lcs, score)| (score, lcs)) - .collect(); + // Slide `a` across `b`. + for start_b in 0..nb { + let mut mismatch_used = false; + let mut alignment = String::new(); + let mut matched_count = 0; + let mut end_b = start_b; + let mut idx_a = 0; - matches.sort_by(|a, b| { - b.0.partial_cmp(&a.0).unwrap_or(Ordering::Equal) - .then_with(|| b.1.len().cmp(&a.1.len())) - }); + // As long as we have characters left in both b and a + while end_b < nb && idx_a < na { + if vb[end_b] == va[idx_a] { + // Exact match + alignment.push(va[idx_a]); + matched_count += 1; + } else if !mismatch_used { + // Use our one wildcard mismatch + alignment.push('?'); + mismatch_used = true; + } else { + // Already used our mismatch; stop + break; + } + + // Compute the alignment length so far in `b` + let alignment_len = end_b - start_b + 1; - matches.truncate(max_results); + // fraction of pattern matched: + // matched_count / a.len() + // fraction of alignment matched: + // matched_count / alignment_len + // final score = product of both => in [0,1]. + let frac_pattern = matched_count as f32 / na as f32; + let frac_dense = matched_count as f32 / alignment_len as f32; + let score = frac_pattern * frac_dense; + + if score >= threshold { + results.push((score, (start_b, end_b + 1), alignment.clone())); + } - matches + end_b += 1; + idx_a += 1; + } } - pub fn wildcard_ratio(s: &str) -> f64 { - let total_chars = s.chars().count(); - if total_chars == 0 { - return 0.0; + // Sort results by descending score (best first) + results.sort_by(|(score_a, _, _), (score_b, _, _)| { + score_b + .partial_cmp(score_a) + .unwrap_or(std::cmp::Ordering::Equal) + }); + + results +} + + + +// +// ====================================================== +// 4) Example extension of the FuzzyLCS trait to gather +// *multiple* matches using the new subyara approach. +// ====================================================== +// + +pub trait FuzzyLCS>: AsRef { + /// Old single best-match function + fn fuzzy_find_pos(&self, s: T, tsh: f32) -> Option<(f32, usize, usize)> { + let s = score(self.as_ref(), s.as_ref(), Some(tsh)); + if s.0 > tsh { + Some(s) + } else { + None } - let count = s.chars().filter(|&c| c == '?').count(); - count as f64 / total_chars as f64 } + + /// Old single best-match substring + fn fuzzy_find_str<'a>(&'a self, s: T, tsh: f32) -> Option<(f32, &'a str)> { + let r = self.fuzzy_find_pos(s, tsh); + r.map(|(score_val, start, end)| (score_val, &self.as_ref()[start..end])) + } + + /// Old single yes/no + fn fuzzy_contains(&self, s: T, tsh: f32) -> bool { + self.fuzzy_find_pos(s, tsh).is_some() + } + + // ================================ + // NEW: get multiple sub-YARA–style matches + // ================================ + fn fuzzy_find_subyara_all(&self, pat: T, tsh: f32) + -> Vec<(f32, (usize, usize), String)> + { + find_subyara_matches(self.as_ref(), pat.as_ref(), tsh) + } +} + +impl FuzzyLCS for S +where + S: AsRef, + T: AsRef, +{ }