From 35846420ce21be324e9a4a1f843b49052f32a61d Mon Sep 17 00:00:00 2001 From: DriesSchaumont <5946712+DriesSchaumont@users.noreply.github.com> Date: Mon, 8 Jan 2024 12:06:35 +0100 Subject: [PATCH 1/2] `filter_with_hvg`: enable storing the number of highly variable genes found. --- CHANGELOG.md | 2 ++ src/filter/filter_with_hvg/config.vsh.yaml | 7 +++++++ src/filter/filter_with_hvg/script.py | 7 +++++++ 3 files changed, 16 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 91b27b2c3f7..102eb41ce7c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,8 @@ ## NEW FUNCTIONALITY +* `filter_with_hvg`: add `output_obs_num_highly_variable_genes` argument in order to store the number of highly variable genes that were discovered. + * `rna_multisample` workflow: added `--modality` argument (PR #607). * Added `filter/intersect_obs` component which removes observations that are not shared between modalities (PR #589). diff --git a/src/filter/filter_with_hvg/config.vsh.yaml b/src/filter/filter_with_hvg/config.vsh.yaml index 63006139424..fa96df5daf6 100644 --- a/src/filter/filter_with_hvg/config.vsh.yaml +++ b/src/filter/filter_with_hvg/config.vsh.yaml @@ -112,6 +112,13 @@ functionality: For all flavors, genes are first sorted by how many batches they are a HVG. For dispersion-based flavors ties are broken by normalized dispersion. If flavor = 'seurat_v3', ties are broken by the median (across batches) rank based on within-batch normalized variance. + + - name: "--output_obs_num_highly_variable_genes" + type: string + description: | + When specified, store the number of genes that were found. The value from this argument is used as the + column name. + resources: - type: python_script path: script.py diff --git a/src/filter/filter_with_hvg/script.py b/src/filter/filter_with_hvg/script.py index d063ca09673..b58e4febd93 100644 --- a/src/filter/filter_with_hvg/script.py +++ b/src/filter/filter_with_hvg/script.py @@ -23,6 +23,10 @@ 'layer': 'log_transformed' } +meta = { + 'resources_dir': "." +} + mu_in = mu.read_h5mu('resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu') rna_in = mu_in.mod["rna"] assert "filter_with_hvg" not in rna_in.var.columns @@ -142,6 +146,9 @@ def setup_logger(): # drop mean_bin as mudata/anndata doesn't support tuples data.varm[par["varm_name"]] = out.drop("mean_bin", axis=1) +if par['output_obs_num_highly_variable_genes']: + data.obs[par['output_obs_num_highly_variable_genes']] = out["highly_variable"].sum() + if par["do_subset"]: keep_feats = np.ravel(data.var[par["var_name_filter"]]) mdata.mod[mod] = data[:,keep_feats] From 069dd0e70c34b10cfe07b634da8c5ab82ee76e54 Mon Sep 17 00:00:00 2001 From: DriesSchaumont <5946712+DriesSchaumont@users.noreply.github.com> Date: Tue, 9 Jan 2024 10:43:21 +0100 Subject: [PATCH 2/2] Implement test --- src/filter/filter_with_hvg/test.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/filter/filter_with_hvg/test.py b/src/filter/filter_with_hvg/test.py index 096ba57cc59..2d0a16a4f79 100644 --- a/src/filter/filter_with_hvg/test.py +++ b/src/filter/filter_with_hvg/test.py @@ -152,6 +152,22 @@ def test_filter_with_hvg_cell_ranger_unfiltered_data_change_error_message(run_co r"returned by scanpy \(see above\) could be the " r"result from trying to use this component on unfiltered data\.", err.value.stdout.decode('utf-8')) + +def test_add_number_of_highly_variable_genes_selected(run_component, lognormed_test_data_path): + run_component([ + "--flavor", "seurat", + "--input", lognormed_test_data_path, + "--output", "output.h5mu", + "--layer", "log_transformed", + "--output_obs_num_highly_variable_genes", "foo", + "--output_compression", "gzip"]) + assert os.path.exists("output.h5mu") + data = mu.read_h5mu("output.h5mu") + assert "filter_with_hvg" in data.mod["rna"].var.columns + assert "foo" in data.mod["rna"].obs.columns + result_col = data.mod["rna"].obs["foo"] + assert (result_col <= len(data.mod["rna"].var_names)).all() + assert (result_col > 0).any() if __name__ == '__main__':