diff --git a/CHANGELOG.md b/CHANGELOG.md index 08cb836bcc1..0dc24209a9a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,8 @@ ## NEW FUNCTIONALITY +* `filter_with_hvg`: add `output_obs_num_highly_variable_genes` argument in order to store the number of highly variable genes that were discovered. + * `cluster/leiden`: Allow calculating multiple resolutions in parallel (PR #645). * `rna_multisample` workflow: added `--modality` argument (PR #607). diff --git a/src/filter/filter_with_hvg/config.vsh.yaml b/src/filter/filter_with_hvg/config.vsh.yaml index 63006139424..fa96df5daf6 100644 --- a/src/filter/filter_with_hvg/config.vsh.yaml +++ b/src/filter/filter_with_hvg/config.vsh.yaml @@ -112,6 +112,13 @@ functionality: For all flavors, genes are first sorted by how many batches they are a HVG. For dispersion-based flavors ties are broken by normalized dispersion. If flavor = 'seurat_v3', ties are broken by the median (across batches) rank based on within-batch normalized variance. + + - name: "--output_obs_num_highly_variable_genes" + type: string + description: | + When specified, store the number of genes that were found. The value from this argument is used as the + column name. + resources: - type: python_script path: script.py diff --git a/src/filter/filter_with_hvg/script.py b/src/filter/filter_with_hvg/script.py index d063ca09673..b58e4febd93 100644 --- a/src/filter/filter_with_hvg/script.py +++ b/src/filter/filter_with_hvg/script.py @@ -23,6 +23,10 @@ 'layer': 'log_transformed' } +meta = { + 'resources_dir': "." +} + mu_in = mu.read_h5mu('resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu') rna_in = mu_in.mod["rna"] assert "filter_with_hvg" not in rna_in.var.columns @@ -142,6 +146,9 @@ def setup_logger(): # drop mean_bin as mudata/anndata doesn't support tuples data.varm[par["varm_name"]] = out.drop("mean_bin", axis=1) +if par['output_obs_num_highly_variable_genes']: + data.obs[par['output_obs_num_highly_variable_genes']] = out["highly_variable"].sum() + if par["do_subset"]: keep_feats = np.ravel(data.var[par["var_name_filter"]]) mdata.mod[mod] = data[:,keep_feats] diff --git a/src/filter/filter_with_hvg/test.py b/src/filter/filter_with_hvg/test.py index 096ba57cc59..2d0a16a4f79 100644 --- a/src/filter/filter_with_hvg/test.py +++ b/src/filter/filter_with_hvg/test.py @@ -152,6 +152,22 @@ def test_filter_with_hvg_cell_ranger_unfiltered_data_change_error_message(run_co r"returned by scanpy \(see above\) could be the " r"result from trying to use this component on unfiltered data\.", err.value.stdout.decode('utf-8')) + +def test_add_number_of_highly_variable_genes_selected(run_component, lognormed_test_data_path): + run_component([ + "--flavor", "seurat", + "--input", lognormed_test_data_path, + "--output", "output.h5mu", + "--layer", "log_transformed", + "--output_obs_num_highly_variable_genes", "foo", + "--output_compression", "gzip"]) + assert os.path.exists("output.h5mu") + data = mu.read_h5mu("output.h5mu") + assert "filter_with_hvg" in data.mod["rna"].var.columns + assert "foo" in data.mod["rna"].obs.columns + result_col = data.mod["rna"].obs["foo"] + assert (result_col <= len(data.mod["rna"].var_names)).all() + assert (result_col > 0).any() if __name__ == '__main__':