From 9606fbecbb8e61d49b74e25a3541689a573cc4a8 Mon Sep 17 00:00:00 2001 From: Saiyam26 Date: Fri, 13 Sep 2024 18:16:59 +0530 Subject: [PATCH] README + config updated --- README.md | 42 +++++++++++++++++++-------------- config.yaml | 67 +++++++++++++++++++---------------------------------- 2 files changed, 48 insertions(+), 61 deletions(-) diff --git a/README.md b/README.md index 2c157c6..aac58c6 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,11 @@ # Single-cell analysis using Low Resource (scaLR) + +[![GitHub](https://img.shields.io/github/license/InFoCusp/scaLR)](https://github.com/infocusp/scaLR?tab=GPL-3.0-1-ov-file#) + +## 📖 Overview + scaLR is a comprehensive end-to-end pipeline that is equipped with a range of advanced features to streamline and enhance the analysis of scRNA-seq data. The major steps of the platform are: 1. Data Processing: Large datasets undergo preprocessing and normalization (if the user opts to) and are segmented into training, testing, and validation sets. @@ -40,8 +45,25 @@ pip install -r requirements.txt - `adata.obs`: contains any metadata regarding cells, including a column for `target` which will be used for classification. The index of `adata.obs` is cell_barcodes. - `adata.var`: contains all gene_names as Index. + +## How to run + +1. Modify the configuration file as per your requirements. Each stage of the pipeline is available inside the config folder in [config.yml] or [full_config.yml]. Simply omit/comment out stages of the pipeline you do not wish to run. +2. Refer to config.yml and its detailed config [README](config_README.md) file on how to use different parameters and files. +3. Then use the `pipeline.py` file to run the entire pipeline according to your configurations. This file takes as arguments the path to the config (`-c | --config`) and an optional flag to log all parts of the pipeline (`-l | --log`). +4. Run `python pipeline.py --config /path/to/config.yaml -l` to run scaLR. 
+ + +## Interactive tutorials +Detailed tutorials are available on how to use some functionalities of scaLR as a library. Find the links below. + +- [Normalization](tutorials/preprocessing/normalization.ipynb) +- [Batch correction](tutorials/preprocessing/batchc_correction.ipynb) +- [Gene recall curve](tutorials/analysis/gene_recall_curve/gene_recall_curve.ipynb) +- [Differential gene expression analysis](tutorials/analysis/differential_gene_expression/dge.ipynb) +- [SHAP analysis](tutorials/analysis/shap_analysis/shap_heatmap.ipynb) -## Output Structure +## Experiment Output Structure - **pipeline.py**: The main script that perform end to end run. - `exp_dir`: root experiment directory for the storage of all step outputs of the platform specified in the config. @@ -94,23 +116,7 @@ Performs evaluation of best model trained on user-defined metrics on the test se - `lmem_dge_result` - `lmem_DGE_celltype.csv`: contains LMEM DGE results between selected factor categories for a celltype. - `lmem_DGE_fixed_effect_factor_X.svg`: volcano plot of coefficient vs -log10(p-value) of genes. - -## How to run - -1. It is necessary that the user modify the configuration file and each stage of the pipeline is available inside the config folder [config.yml] or [full_config.yml] as per your requirements. Simply omit/comment out stages of the pipeline you do not wish to run. -2. Refer config.yml & it's detailed config [README](config_README.md) file on how to use different parameters and files. -3. Then use the `pipeline.py` file to run the entire pipeline according to your configurations. This file takes as argument the path to config (`-c | --config`), and an optional flag to log all parts of the pipelines (`-l | --log`). -4. `python pipeline.py --config /path/to/config -c config.yaml -l` to run the scaLR. - - -## Interactive tutorials -Detailed tutorials have been made on how to use some functionalities as a scaLR library. Find the links below. 
- -- Normalization - `tutorials/preprocessing/normalization.ipynb` -- Batch correction - `tutorials/preprocessing/batchc_correction.ipynb` -- Gene recall curve - `tutorials/analysis/gene_recall_curve/gene_recall_curve.ipynb` -- Differential gene expression analysis - `tutorials/analysis/differential_gene_expression/dge.ipynb` -- SHAP analysis - `tutorials/analysis/shap_analysis/shap_heatmap.ipynb` +
scaLR © 2024 Infocusp Innovations diff --git a/config.yaml b/config.yaml index 6958df6..c1a5c17 100644 --- a/config.yaml +++ b/config.yaml @@ -5,9 +5,9 @@ device: 'cuda' # EXPERIMENT. experiment: - dirpath: 'revamped_scalr_experiments' - exp_name: 'final' - exp_run: 1 + dirpath: 'scalr_experiments' + exp_name: 'final_5000_6' + exp_run: 0 # DATA CONFIG. @@ -40,13 +40,13 @@ data: # FEATURE SELECTION. feature_selection: - # scores: '/path/to/matrix' - feature_subsetsize: 6000 + # score_matrix: '/path/to/matrix' + feature_subsetsize: 5000 model: name: SequentialModel params: - layers: [6000, 6] + layers: [5000, 6] weights_init_zero: True model_train_config: @@ -56,7 +56,7 @@ feature_selection: name: SimpleDataLoader params: batch_size: 25000 - padding: 6000 + padding: 5000 optimizer: name: SGD @@ -139,39 +139,20 @@ analysis: params: k: 100 - downstream_analysis: - - name: GeneRecallCurve - params: - reference_genes_path: '/path/to/reference_genes.csv' - top_K: 300 - plots_per_row: 3 - features_selector: - name: ClasswiseAbs - params: {} - - name: Heatmap - params: - top_n_genes: 100 - - name: RocAucCurve - params: {} - - name: DgePseudoBulk - params: - celltype_column: 'cell_type' - design_factor: 'disease' - factor_categories: ['Alzheimer disease', 'normal'] - sum_column: 'donor_id' - cell_subsets: ['excitatory neuron', 'inhibitory interneuron', 'oligodendrocyte'] - min_cell_threshold: 1 - fold_change: 1.5 - p_val: 0.05 - save_plot: True - - name: DgeLMEM - params: - fixed_effect_column: 'disease' - fixed_effect_factors: ['Alzheimer disease', 'normal'] - group: 'donor_id' - min_cell_threshold: 10 - n_cpu: 4 - gene_batch_size: 1000 - coef_threshold: 0 - p_val: 0.05 - save_plot: True \ No newline at end of file + # downstream_analysis: + # - name: GeneRecallCurve + # params: + # reference_genes_path: '/path/to/reference_genes.csv' + # top_K: 300 + # plots_per_row: 3 + # features_selector: + # name: ClasswiseAbs + # params: {} + # - name: Heatmap + # params: **kwargs + # 
- name: RocAucCurve + # params: **kwargs + # - name: DgePseudoBulk + # params: **kwargs + # - name: DgeLMEM + # params: **kwargs \ No newline at end of file