From 35846420ce21be324e9a4a1f843b49052f32a61d Mon Sep 17 00:00:00 2001
From: DriesSchaumont <5946712+DriesSchaumont@users.noreply.github.com>
Date: Mon, 8 Jan 2024 12:06:35 +0100
Subject: [PATCH 1/2] `filter_with_hvg`: enable storing the number of highly
 variable genes found.

---
 CHANGELOG.md                               | 2 ++
 src/filter/filter_with_hvg/config.vsh.yaml | 7 +++++++
 src/filter/filter_with_hvg/script.py       | 7 +++++++
 3 files changed, 16 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 91b27b2c3f7..102eb41ce7c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,6 +16,8 @@
 
 ## NEW FUNCTIONALITY
 
+* `filter_with_hvg`: add `output_obs_num_highly_variable_genes` argument in order to store the number of highly variable genes that were discovered.
+
 * `rna_multisample` workflow: added `--modality` argument (PR #607).
 
 * Added `filter/intersect_obs` component which removes observations that are not shared between modalities (PR #589).
diff --git a/src/filter/filter_with_hvg/config.vsh.yaml b/src/filter/filter_with_hvg/config.vsh.yaml
index 63006139424..fa96df5daf6 100644
--- a/src/filter/filter_with_hvg/config.vsh.yaml
+++ b/src/filter/filter_with_hvg/config.vsh.yaml
@@ -112,6 +112,13 @@ functionality:
         For all flavors, genes are first sorted by how many batches they are a HVG. For dispersion-based flavors 
         ties are broken by normalized dispersion. If flavor = 'seurat_v3', ties are broken by the median (across
         batches) rank based on within-batch normalized variance.
+
+    - name: "--output_obs_num_highly_variable_genes"
+      type: string
+      description: |
+        When specified, store the number of genes that were found. The value from this argument is used as the 
+        column name.
+
   resources:
     - type: python_script
       path: script.py
diff --git a/src/filter/filter_with_hvg/script.py b/src/filter/filter_with_hvg/script.py
index d063ca09673..b58e4febd93 100644
--- a/src/filter/filter_with_hvg/script.py
+++ b/src/filter/filter_with_hvg/script.py
@@ -23,6 +23,10 @@
   'layer': 'log_transformed'
 }
 
+meta = {
+    'resources_dir': "."
+}
+
 mu_in = mu.read_h5mu('resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu')
 rna_in = mu_in.mod["rna"]
 assert "filter_with_hvg" not in rna_in.var.columns
@@ -142,6 +146,9 @@ def setup_logger():
     # drop mean_bin as mudata/anndata doesn't support tuples
     data.varm[par["varm_name"]] = out.drop("mean_bin", axis=1)
 
+if par['output_obs_num_highly_variable_genes']:
+    data.obs[par['output_obs_num_highly_variable_genes']] = out["highly_variable"].sum()
+
 if par["do_subset"]:
     keep_feats = np.ravel(data.var[par["var_name_filter"]])
     mdata.mod[mod] = data[:,keep_feats]

From 069dd0e70c34b10cfe07b634da8c5ab82ee76e54 Mon Sep 17 00:00:00 2001
From: DriesSchaumont <5946712+DriesSchaumont@users.noreply.github.com>
Date: Tue, 9 Jan 2024 10:43:21 +0100
Subject: [PATCH 2/2] Implement test

---
 src/filter/filter_with_hvg/test.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/src/filter/filter_with_hvg/test.py b/src/filter/filter_with_hvg/test.py
index 096ba57cc59..2d0a16a4f79 100644
--- a/src/filter/filter_with_hvg/test.py
+++ b/src/filter/filter_with_hvg/test.py
@@ -152,6 +152,22 @@ def test_filter_with_hvg_cell_ranger_unfiltered_data_change_error_message(run_co
                      r"returned by scanpy \(see above\) could be the "
                      r"result from trying to use this component on unfiltered data\.",
                     err.value.stdout.decode('utf-8'))
+    
+def test_add_number_of_highly_variable_genes_selected(run_component, lognormed_test_data_path):
+    run_component([
+        "--flavor", "seurat",
+        "--input", lognormed_test_data_path,
+        "--output", "output.h5mu",
+        "--layer", "log_transformed",
+        "--output_obs_num_highly_variable_genes", "foo",
+        "--output_compression", "gzip"])
+    assert os.path.exists("output.h5mu")
+    data = mu.read_h5mu("output.h5mu")
+    assert "filter_with_hvg" in data.mod["rna"].var.columns
+    assert "foo" in data.mod["rna"].obs.columns
+    result_col = data.mod["rna"].obs["foo"]
+    assert (result_col <= len(data.mod["rna"].var_names)).all()
+    assert (result_col > 0).any() 
 
 
 if __name__ == '__main__':