From ea9e6de68743da087165c51dff01c620ece74f02 Mon Sep 17 00:00:00 2001
From: Chris Flerin <ccflerin@gmail.com>
Date: Tue, 24 Nov 2020 11:09:57 +0100
Subject: [PATCH] Add CLI option add_cor - Take expression and adjacencies
 input, calculate correlations for each TF-gene interaction, and output to a
 new file.

---
 README.rst                   |  5 ++++
 src/pyscenic/cli/pyscenic.py | 53 ++++++++++++++++++++++++++++++++++--
 2 files changed, 56 insertions(+), 2 deletions(-)

diff --git a/README.rst b/README.rst
index b1ecedd..e206e06 100644
--- a/README.rst
+++ b/README.rst
@@ -18,6 +18,11 @@ in no time. The latter is achieved via the dask_ framework for distributed compu
 News and releases
 -----------------
 
+0.10.4 | 2020-11-24
+^^^^^^^^^^^^^^^^^^^
+
+* Included new CLI option to add correlation information to the GRN adjacencies file. This can be called with ``pyscenic add_cor``.
+
 0.10.3 | 2020-07-15
 ^^^^^^^^^^^^^^^^^^^
 
diff --git a/src/pyscenic/cli/pyscenic.py b/src/pyscenic/cli/pyscenic.py
index 43a0103..5cfcbf4 100644
--- a/src/pyscenic/cli/pyscenic.py
+++ b/src/pyscenic/cli/pyscenic.py
@@ -16,7 +16,7 @@
 from arboreto.algo import grnboost2, genie3
 from arboreto.utils import load_tf_names
 
-from pyscenic.utils import modules_from_adjacencies
+from pyscenic.utils import modules_from_adjacencies, add_correlation
 from pyscenic.rnkdb import opendb, RankingDatabase
 from pyscenic.prune import prune2df, find_features, _prepare_client
 from pyscenic.aucell import aucell
@@ -109,6 +109,34 @@ def adjacencies2modules(args):
                                     keep_only_activating=(args.all_modules != "yes"))
 
 
+def addCorrelations(args):
+    try:
+        adjacencies = load_adjacencies(args.adjacencies.name)
+    except ValueError as e:
+        LOGGER.error(e)
+        sys.exit(1)
+
+    LOGGER.info("Loading expression matrix.")
+    try:
+        ex_mtx = load_exp_matrix(args.expression_mtx_fname.name,
+                                 (args.transpose == 'yes'),
+                                 False, # sparse loading is disabled here for now
+                                 args.cell_id_attribute,
+                                 args.gene_attribute)
+    except ValueError as e:
+        LOGGER.error(e)
+        sys.exit(1)
+
+    LOGGER.info("Calculating correlations.")
+    adjacencies_wCor = add_correlation(adjacencies, ex_mtx,
+            rho_threshold=0.03, mask_dropouts=args.mask_dropouts)
+
+    LOGGER.info("Writing results to file.")
+
+    extension = PurePath(args.output.name).suffixes
+    adjacencies_wCor.to_csv(args.output.name, index=False, sep=suffixes_to_separator(extension))
+
+
 def _load_dbs(fnames: Sequence[str]) -> Sequence[Type[RankingDatabase]]:
     def get_name(fname):
         return os.path.splitext(os.path.basename(fname))[0]
@@ -276,7 +304,7 @@ def add_computation_parameters(parser):
     group = parser.add_argument_group('computation arguments')
     group.add_argument('--num_workers',
                        type=int, default=cpu_count(),
-                       help='The number of workers to use. Only valid of using dask_multiprocessing, custom_multiprocessing or local as mode. (default: {}).'.format(cpu_count()))
+                       help='The number of workers to use. Only valid if using dask_multiprocessing, custom_multiprocessing or local as mode. (default: {}).'.format(cpu_count()))
     group.add_argument('--client_or_address',
                        type=str, default='local',
                        help='The client or the IP address of the dask scheduler to use.'
@@ -334,6 +362,27 @@ def create_argument_parser():
     add_loom_parameters(parser_grn)
     parser_grn.set_defaults(func=find_adjacencies_command)
 
+    #-----------------------------------------
+    # create the parser for the "add_cor" command
+    #-----------------------------------------
+    parser_add_cor = subparsers.add_parser('add_cor',
+                            help='[Optional] Add Pearson correlations based on TF-gene expression to the network adjacencies output from the GRN step, and output these to a new adjacencies file. This will normally be done during the "ctx" step.')
+    parser_add_cor.add_argument('adjacencies',
+                            type=argparse.FileType('r'),
+                            help='The name of the file that contains the GRN adjacencies (output from the GRN step).')
+    parser_add_cor.add_argument('expression_mtx_fname',
+                            type=argparse.FileType('r'),
+                            help='The name of the file that contains the expression matrix for the single cell experiment.'
+                                 ' Two file formats are supported: csv (rows=cells x columns=genes) or loom (rows=genes x columns=cells).')
+    parser_add_cor.add_argument('-o', '--output',
+                            type=argparse.FileType('w'), default=sys.stdout,
+                            help='Output file/stream, i.e. the adjacencies table with correlations (csv, tsv).')
+    parser_add_cor.add_argument('-t', '--transpose', action='store_const', const = 'yes',
+                            help='Transpose the expression matrix (rows=genes x columns=cells).')
+    add_loom_parameters(parser_add_cor)
+    add_module_parameters(parser_add_cor)
+    parser_add_cor.set_defaults(func=addCorrelations)
+
     #-----------------------------------------
     # create the parser for the "ctx" command
     #-----------------------------------------