README + config updated

infocusp · Sep 13, 2024 · 9606fbe · 9606fbe
1 parent 1390a7d
commit 9606fbe
Show file tree

Hide file tree

Showing 2 changed files with 48 additions and 61 deletions.
diff --git a/README.md b/README.md
@@ -2,6 +2,11 @@
 
 # Single-cell analysis using Low Resource (scaLR) 
 
+<!-- [![Paper](https://img.shields.io/badge/Paper-insert_paper_id_here-white)]() -->
+[![GitHub](https://img.shields.io/github/license/InFoCusp/scaLR)](https://github.com/infocusp/scaLR?tab=GPL-3.0-1-ov-file#)
+
+## 📖 Overview 
+
 <b>scaLR</b> is a comprehensive end-to-end pipeline that is equipped with a range of advanced features to streamline and enhance the analysis of scRNA-seq data. The major steps of the platform are:
 
 1. <b>Data Processing</b>: Large datasets undergo preprocessing and normalization (if the user opts to) and are segmented into training, testing, and validation sets.
@@ -40,8 +45,25 @@ pip install -r requirements.txt
 - `adata.obs`: contains any metadata regarding cells, including a column for `target` which will be used for classification. The index of `adata.obs` is cell_barcodes.
 - `adata.var`: contains all gene_names as Index.
 
+             
+## How to run
+
+1. Modify the configuration file as per your requirements. Each stage of the pipeline is available in the config folder ([config.yml] or [full_config.yml]). Simply omit/comment out the stages of the pipeline you do not wish to run.
+2. Refer to config.yml & its detailed config [README](config_README.md) file for how to use the different parameters and files.
+3. Then use the `pipeline.py` file to run the entire pipeline according to your configuration. This file takes as arguments the path to the config (`-c | --config`) and an optional flag to log all parts of the pipeline (`-l | --log`).
+4. Use `python pipeline.py --config /path/to/config.yaml -l` to run scaLR.
+
+
+## Interactive tutorials
+Detailed tutorials are available on how to use some of scaLR's functionalities as a library. Find the links below.
+
+- [Normalization](tutorials/preprocessing/normalization.ipynb)
+- [Batch correction](tutorials/preprocessing/batchc_correction.ipynb)
+- [Gene recall curve](tutorials/analysis/gene_recall_curve/gene_recall_curve.ipynb)
+- [Differential gene expression analysis](tutorials/analysis/differential_gene_expression/dge.ipynb)
+- [SHAP analysis](tutorials/analysis/shap_analysis/shap_heatmap.ipynb)
 
-## Output Structure
+## Experiment Output Structure
 - **pipeline.py**:
 The main script that performs an end-to-end run.
     - `exp_dir`: root experiment directory for the storage of all step outputs of the platform specified in the config.
@@ -94,23 +116,7 @@ Performs evaluation of best model trained on user-defined metrics on the test se
             - `lmem_dge_result`
                 - `lmem_DGE_celltype.csv`: contains LMEM DGE results between selected factor categories for a celltype.
                 - `lmem_DGE_fixed_effect_factor_X.svg`: volcano plot of coefficient vs -log10(p-value) of genes.
-
-## How to run
-
-1. It is necessary that the user modify the configuration file and each stage of the pipeline is available inside the config folder [config.yml] or [full_config.yml] as per your requirements. Simply omit/comment out stages of the pipeline you do not wish to run.
-2. Refer config.yml & it's detailed config [README](config_README.md) file on how to use different parameters and files.
-3. Then use the `pipeline.py` file to run the entire pipeline according to your configurations. This file takes as argument the path to config (`-c | --config`), and an optional flag to log all parts of the pipelines (`-l | --log`).
-4. `python pipeline.py --config /path/to/config -c config.yaml -l` to run the scaLR.
-
-
-## Interactive tutorials
-Detailed tutorials have been made on how to use some functionalities as a scaLR library. Find the links below.
-
-- Normalization - `tutorials/preprocessing/normalization.ipynb`
-- Batch correction - `tutorials/preprocessing/batchc_correction.ipynb`
-- Gene recall curve - `tutorials/analysis/gene_recall_curve/gene_recall_curve.ipynb`
-- Differential gene expression analysis - `tutorials/analysis/differential_gene_expression/dge.ipynb`
-- SHAP analysis - `tutorials/analysis/shap_analysis/shap_heatmap.ipynb`
+
 
 <center >
   <b>scaLR © 2024 Infocusp Innovations</b>

diff --git a/config.yaml b/config.yaml
@@ -5,9 +5,9 @@ device: 'cuda'
 
 # EXPERIMENT.
 experiment:
-    dirpath: 'revamped_scalr_experiments'
-    exp_name: 'final'
-    exp_run: 1
+    dirpath: 'scalr_experiments'
+    exp_name: 'final_5000_6'
+    exp_run: 0
 
 
 # DATA CONFIG.
@@ -40,13 +40,13 @@ data:
 # FEATURE SELECTION.
 feature_selection:
 
-    # scores: '/path/to/matrix'
-    feature_subsetsize: 6000
+    # score_matrix: '/path/to/matrix'
+    feature_subsetsize: 5000
 
     model:
         name: SequentialModel
         params:
-            layers: [6000, 6]
+            layers: [5000, 6]
             weights_init_zero: True
 
     model_train_config:
@@ -56,7 +56,7 @@ feature_selection:
             name: SimpleDataLoader
             params:
                 batch_size: 25000
-                padding: 6000
+                padding: 5000
 
         optimizer:
             name: SGD
@@ -139,39 +139,20 @@ analysis:
             params:
                 k: 100
 
-    downstream_analysis:
-        - name: GeneRecallCurve
-          params:
-            reference_genes_path: '/path/to/reference_genes.csv'
-            top_K: 300
-            plots_per_row: 3
-            features_selector:
-                name: ClasswiseAbs
-                params: {}
-        - name: Heatmap
-          params:
-            top_n_genes: 100
-        - name: RocAucCurve
-          params: {}
-        - name: DgePseudoBulk
-          params:
-              celltype_column: 'cell_type'
-              design_factor: 'disease'
-              factor_categories: ['Alzheimer disease', 'normal']
-              sum_column: 'donor_id'
-              cell_subsets: ['excitatory neuron', 'inhibitory interneuron', 'oligodendrocyte']
-              min_cell_threshold: 1
-              fold_change: 1.5
-              p_val: 0.05
-              save_plot: True
-        - name: DgeLMEM
-          params:
-             fixed_effect_column: 'disease'
-             fixed_effect_factors: ['Alzheimer disease', 'normal']
-             group: 'donor_id'
-             min_cell_threshold: 10
-             n_cpu: 4
-             gene_batch_size: 1000
-             coef_threshold: 0
-             p_val: 0.05
-             save_plot: True    
+    # downstream_analysis:
+    #     - name: GeneRecallCurve
+    #       params:
+    #         reference_genes_path: '/path/to/reference_genes.csv'
+    #         top_K: 300
+    #         plots_per_row: 3
+    #         features_selector:
+    #             name: ClasswiseAbs
+    #             params: {}
+    #     - name: Heatmap
+    #       params: **kwargs
+    #     - name: RocAucCurve
+    #       params: **kwargs
+    #     - name: DgePseudoBulk
+    #       params: **kwargs
+    #     - name: DgeLMEM
+    #       params: **kwargs