diff --git a/CHANGELOG.md b/CHANGELOG.md index a19b706..cc00f65 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,16 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## 2.0.1 + +This only addresses a build error on crates.io. Previously, I had tried to include +a header to render LaTeX. But that proved error-prone. It is removed in favor of html +in the README. I added a line in the Cargo.toml that should build the docs on crates.io +with the --features mpi, so that all functions, including the mpi functions, have +documentation generated. Finally, I added documentation to `optimize()`. + +No code changes were made from 2.0.0. + ## 2.0.0 This is a complete re-write in Rust. In addition to changing the language, the diff --git a/Cargo.lock b/Cargo.lock index 9074a4b..c3eb080 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 3 +version = 4 [[package]] name = "aho-corasick" @@ -233,7 +233,7 @@ checksum = "ef8ae57c4978a2acd8b869ce6b9ca1dfe817bff704c220209fdef2c0b75a01b9" [[package]] name = "dual_threshold_optimization" -version = "2.0.0" +version = "2.0.1" dependencies = [ "bincode", "clap", diff --git a/Cargo.toml b/Cargo.toml index 7570276..ba42450 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,10 +9,9 @@ to determine the rank threshold for each list that minimizes the hypergeometric p-value of the overlap of features. It then calculates a permutation based empirical p-value and an FDR. 
Details can be found [in this paper](https://doi.org/10.1101/gr.259655.119) """ -version = "2.0.0" +version = "2.0.1" edition = "2021" readme = "README.md" -repository = "https://github.com/BrentLab/Dual_Threshold_Optimization" license = "GPL-3.0-or-later" exclude = [ @@ -42,8 +41,9 @@ crate-type = ["rlib"] [features] mpi = ["dep:mpi"] +# enable mpi features to be always documented [package.metadata.docs.rs] -rustdoc-args = [ "--html-in-header", "path-to-your-header-file.html" ] +features = ["mpi"] [[bin]] name = "dual_threshold_optimization" diff --git a/README.md b/README.md index 9d91e85..1615598 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,9 @@ # Dual Threshold Optimization [![Rust Tests](https://github.com/BrentLab/Dual_Threshold_Optimization/actions/workflows/tests.yml/badge.svg)](https://github.com/BrentLab/Dual_Threshold_Optimization/actions/workflows/tests.yml) [![Rust Linting and Formatting](https://github.com/BrentLab/Dual_Threshold_Optimization/actions/workflows/linting.yml/badge.svg)](https://github.com/BrentLab/Dual_Threshold_Optimization/actions/workflows/linting.yml) -[![Crates.io Version](https://img.shields.io/crates/v/dual_threshold_optimization)](https://crates.io/crates/dual_threshold_optimization) +[![Crates.io Version](https://img.shields.io/crates/v/dual_threshold_optimization?cacheBust=1)](https://crates.io/crates/dual_threshold_optimization) +[![Documentation](https://docs.rs/dual_threshold_optimization/badge.svg)](https://docs.rs/dual_threshold_optimization) + This library provides a comprehensive toolkit for performing [Dual Threshold Optimization](https://doi.org/10.1101/gr.259655.119) (DTO) @@ -53,14 +55,6 @@ toolchain and compile a binary. Alternatively, open an Issue and we will help. 
### Installation -If you have `cargo`, the Rust package manager, installed, you can do the following to -install `dual_threshold_optimization` in your `$PATH`: - -```bash -cargo install dual_threshold_optimization -``` - -Alternatively, you can download binaries from the [github release](https://github.com/BrentLab/Dual_Threshold_Optimization/releases). If you are on a Mac, for example, and you do not need MPI (most users), then you would download the binary called `dual_threshold_optimization-macos-latest-default` from the releases tab. There is also a windows executable, and both a default (non-mpi) and mpi @@ -86,11 +80,10 @@ to rename the executable to simply `dual_threshold_optimization`. ### Using the cmd line -With the correct binary, you can print the help message like so (omit the `./` if -`dual_threshold_optimization` is in your `$PATH`): +With the correct binary, you can print the help message like so: ```bash -./dual_threshold_optimization --help +dual_threshold_optimization --help ``` ```bash @@ -141,22 +134,21 @@ Options: ``` -You can run this with the following minimal test data: - -- input list examples: [list1](https://raw.githubusercontent.com/BrentLab/Dual_Threshold_Optimization/refs/heads/main/test_data/ranklist1.csv), [list2](https://raw.githubusercontent.com/BrentLab/Dual_Threshold_Optimization/refs/heads/main/test_data/ranklist2.csv) -- background example: [background](https://raw.githubusercontent.com/BrentLab/Dual_Threshold_Optimization/refs/heads/main/test_data/background.txt) +You can run this with the following minimal test data: + +- input list examples: [list1](https://github.com/cmatKhan/Dual_Threshold_Optimization/blob/rust_implementation/test_data/ranklist1.csv), [list2](https://github.com/cmatKhan/Dual_Threshold_Optimization/blob/rust_implementation/test_data/ranklist2.csv) +- background example: [background](https://github.com/cmatKhan/Dual_Threshold_Optimization/blob/rust_implementation/test_data/background.txt) like this ```bash 
# download list1 -wget https://raw.githubusercontent.com/BrentLab/Dual_Threshold_Optimization/refs/heads/main/test_data/ranklist1.csv +wget https://raw.githubusercontent.com/cmatKhan/Dual_Threshold_Optimization/refs/heads/rust_implementation/test_data/ranklist1.csv # download list2 -wget https://raw.githubusercontent.com/BrentLab/Dual_Threshold_Optimization/refs/heads/main/test_data/ranklist2.csv +wget https://raw.githubusercontent.com/cmatKhan/Dual_Threshold_Optimization/refs/heads/rust_implementation/test_data/ranklist2.csv -# run the binary. Note that the background is optional. If not provided, then ranklist1 and ranklist2 must have -# the same set of features -./dual_threshold_optimization -1 ranklist1.csv -2 ranklist2.csv -p 5 -t 1 +# run the binary +dual_threshold_optimization -1 ranklist1.csv -2 ranklist2.csv -p 5 -t 1 ``` This will output some run information to stderr, and a json to stdout. The json in the stdout is the output of the program. This is important because it means that you can @@ -213,7 +205,11 @@ Where the fields are the following: To use the library in your own Rust program, you can `cargo add dual_threshold_optimization` in your rust project. See the crates.io -documentation for more information about what is provided in each of the submodules. +documentation for more information about what is provided in each of the submodules. + +crates.io provides +[documentation](https://docs.rs/dual_threshold_optimization/latest/dual_threshold_optimization/) +of the various structs, types, etc provided by `dual_threshold_optimization`. ### Developer installation and usage @@ -289,24 +285,27 @@ The following provides details on the DTO algorithm, step by step. used to generate sets of features from each list to compare the overlap. The thresholds are calculated by the recurrence relation - $$ T_1 = 1 \\ Tn = Floor(T_{n-1} * 1.01 + 1) $$ +

+ T<sub>1</sub> = 1
+ T<sub>n</sub> = ⌊ T<sub>n-1</sub> * 1.01 + 1 ⌋ +

The stopping condition is when the threshold meets or exceeds the largest rank. The final threshold is always set to the max rank. This series provides finer spacing at higher ranks, allowing more granular selection among top-ranked genes. The effect of this equation is that for the first 100 ranks, the thresholds - increment at the same rate as the ranks, so we have $1, 2, 3, \dots$ . At $100$, the - resolution decreases by 2, eg $100, 102, 104, \dots$ . For every additional 100 + increment at the same rate as the ranks, so we have 1, 2, 3, ... . At 100, the + resolution decreases by 2, eg 100, 102, 104, ... . For every additional 100 ranks after this, the resolution decreases by 1, so for instance: - $200, 203, 206, \dots, 402, 407, \dots, 1705, 1723, 1741$ + 200, 203, 206, ..., 402, 407, ..., 1705, 1723, 1741 1. Conduct a brute force search of the threshold pairs to find an optimal overlap For each possible pair of thresholds, select the genes from each list with rank less than or equal to the respective threshold. Calculate the hypergeometric p-value by intersecting the feature sets. This is the core of the algorithm with - a complexity of $O(n^2)$ where $n$ is the length of the threshold lists. + a complexity of O(n^2) where n is the length of the threshold lists. 1. Report the optimal threshold pair diff --git a/src/dto/optimize_main.rs b/src/dto/optimize_main.rs index be498ac..55c8ef7 100644 --- a/src/dto/optimize_main.rs +++ b/src/dto/optimize_main.rs @@ -1,6 +1,55 @@ use crate::collections::RankedFeatureList; use crate::dto::{process_threshold_pairs, OptimizationResult}; +/// This is the main function of the dto module. It takes two ranked feature lists +/// and finds the optimal thresholds which produce the smallest hypergeometric p-value +/// for the intersection of the two subsets formed by taking the features with rank +/// less than or equal to the thresholds. +/// +/// # Arguments +/// - `ranked_feature_list1`: The first ranked feature list. 
+/// - `ranked_feature_list2`: The second ranked feature list. +/// - `permute`: Whether to permute the ranks of the second list. +/// - `population_size`: The size of the population for permutation testing. +/// - `debug`: Whether to return all results or just the best result. +/// +/// # Returns +/// An `OptimizationResult` enum containing either the best result or all results. +/// +/// # Example +/// +/// ``` +/// use dual_threshold_optimization::collections::{Feature, FeatureList, RankedFeatureList}; +/// +/// use dual_threshold_optimization::dto::{optimize, OptimizationResult}; +/// +/// // RankedFeatureList 1 +/// let genes1 = FeatureList::from(vec![ +/// Feature::from("gene1"), +/// Feature::from("gene2"), +/// Feature::from("gene3"), +/// ]); +/// +/// let ranks1 = vec![1, 2, 3]; +/// let ranked_feature_list1 = RankedFeatureList::from(genes1, ranks1).unwrap(); +/// +/// // RankedFeatureList 2 +/// let genes2 = FeatureList::from(vec![ +/// Feature::from("gene1"), +/// Feature::from("gene2"), +/// Feature::from("gene4"), +/// ]); +/// +/// +/// let ranks2 = vec![1, 2, 3]; +/// +/// let ranked_feature_list2 = RankedFeatureList::from(genes2, ranks2).unwrap(); +/// // note that the background would have had to be provided in this case, since the +/// // two ranked feature lists have different features. +/// let result = optimize(&ranked_feature_list1, &ranked_feature_list2, true, 4, false); +/// +/// // Ensure the optimization returned a valid best result +/// assert!(matches!(result, OptimizationResult::Best(_))); pub fn optimize( ranked_feature_list1: &RankedFeatureList, ranked_feature_list2: &RankedFeatureList,