diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index f67ce99d1..114743487 100644 --- a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -1013,7 +1013,9 @@ def _validate_seurat_convertibility(self): def _validate_obsm(self): """ Validates the embedding dictionary -- it checks that all values of adata.obsm are numpy arrays with the correct - dimension. Adds errors to self.errors if any. Checks that the keys start with "X_" + dimension. Adds errors to self.errors if any. Checks that the keys start with "X_", have no whitespace, and have + a suffix at least 1 character long. For keys that don't start with "X_", we will run them through the same + validation checks, but raise warnings instead of errors. :rtype none """ @@ -1027,38 +1029,41 @@ def _validate_obsm(self): if " " in key: self.errors.append(f"Embedding key {key} has whitespace in it, please remove it.") - if not isinstance(value, np.ndarray): - self.errors.append( - f"All embeddings have to be of 'numpy.ndarray' type, " f"'adata.obsm['{key}']' is {type(value)}')." - ) - continue - - # Embeddings to be shown in cellxgene explorer + issue_list = self.errors if key.startswith("X_"): obsm_with_x_prefix += 1 - if len(key) <= 3: self.errors.append( - f"Embedding key in 'adata.obsm' {key} must have a suffix at least one character long." - ) - if len(value.shape) < 2 or value.shape[0] != self.adata.n_obs or value.shape[1] < 2: - self.errors.append( - f"All embeddings must have as many rows as cells, and at least two columns." - f"'adata.obsm['{key}']' has shape of '{value.shape}'." + f"Embedding key in 'adata.obsm' {key} must start with X_ and have a suffix at least one character long." ) - if not (np.issubdtype(value.dtype, np.integer) or np.issubdtype(value.dtype, np.floating)): - self.errors.append( - f"adata.obsm['{key}'] has an invalid data type. It should be " - "float, integer, or unsigned integer of any precision (8, 16, 32, or 64 bits)." - ) - else: - # Check for inf/NaN values only if the dtype is numeric - if np.isinf(value).any(): - self.errors.append( - f"adata.obsm['{key}'] contains positive infinity or negative infinity values." - ) - if np.all(np.isnan(value)): - self.errors.append(f"adata.obsm['{key}'] contains all NaN values.") + else: + self.warnings.append(f"Embedding key in 'adata.obsm' {key} does not start with X_") + issue_list = self.warnings + + # For all subsequent checks, we want to raise an error if it's an X_ embedding key, and a warning otherwise + if not isinstance(value, np.ndarray): + issue_list.append( + f"All embeddings have to be of 'numpy.ndarray' type, " f"'adata.obsm['{key}']' is {type(value)}')." + ) + # Skip over the subsequent checks that require the value to be an array + continue + + if len(value.shape) < 2 or value.shape[0] != self.adata.n_obs or value.shape[1] < 2: + issue_list.append( + f"All embeddings must have as many rows as cells, and at least two columns." + f" 'adata.obsm['{key}']' has shape of '{value.shape}'." + ) + if not (np.issubdtype(value.dtype, np.integer) or np.issubdtype(value.dtype, np.floating)): + issue_list.append( + f"adata.obsm['{key}'] has an invalid data type. It should be " + "float, integer, or unsigned integer of any precision (8, 16, 32, or 64 bits)." + ) + else: + # Check for inf/NaN values only if the dtype is numeric + if np.isinf(value).any(): + issue_list.append(f"adata.obsm['{key}'] contains positive infinity or negative infinity values.") + if np.all(np.isnan(value)): + issue_list.append(f"adata.obsm['{key}'] contains all NaN values.") if obsm_with_x_prefix == 0: self.errors.append("At least one embedding in 'obsm' has to have a key with an 'X_' prefix.") diff --git a/cellxgene_schema_cli/tests/test_schema_compliance.py b/cellxgene_schema_cli/tests/test_schema_compliance.py index a3cf5bf64..c358f050e 100644 --- a/cellxgene_schema_cli/tests/test_schema_compliance.py +++ b/cellxgene_schema_cli/tests/test_schema_compliance.py @@ -1646,27 +1646,39 @@ def test_obsm_values_nan(self, validator_with_adata): assert validator.errors == ["ERROR: adata.obsm['X_umap'] contains all NaN values."] def test_obsm_values_at_least_one_X(self, validator_with_adata): - """ - At least one key for the embedding MUST be prefixed with "X_" - """ validator = validator_with_adata - obsm = validator.adata.obsm - obsm["umap"] = obsm["X_umap"] - validator.adata.uns["default_embedding"] = "umap" - del obsm["X_umap"] + validator.adata.obsm["harmony"] = validator.adata.obsm["X_umap"] + validator.adata.uns["default_embedding"] = "harmony" + del validator.adata.obsm["X_umap"] + validator.validate_adata() + assert validator.errors == [ + "ERROR: At least one embedding in 'obsm' has to have a key with an 'X_' prefix.", + ] + assert validator.warnings == [ + "WARNING: Dataframe 'var' only has 4 rows. Features SHOULD NOT be filtered from expression matrix.", + "WARNING: Embedding key in 'adata.obsm' harmony does not start with X_", + "WARNING: Validation of raw layer was not performed due to current errors, try again after fixing current errors.", + ] + + def test_obsm_values_warn_start_with_X(self, validator_with_adata): + validator = validator_with_adata + validator.adata.obsm["harmony"] = pd.DataFrame(validator.adata.obsm["X_umap"], index=validator.adata.obs_names) validator.validate_adata() - assert validator.errors == ["ERROR: At least one embedding in 'obsm' has to have a " "key with an 'X_' prefix."] + assert validator.warnings == [ + "WARNING: Dataframe 'var' only has 4 rows. Features SHOULD NOT be filtered from expression matrix.", + "WARNING: Embedding key in 'adata.obsm' harmony does not start with X_", + "WARNING: All embeddings have to be of 'numpy.ndarray' type, 'adata.obsm['harmony']' is ').", + ] def test_obsm_suffix_name_valid(self, validator_with_adata): """ Suffix after X_ must be at least 1 character long """ validator = validator_with_adata - obsm = validator.adata.obsm - obsm["X_"] = obsm["X_umap"] + validator.adata.obsm["X_"] = validator.adata.obsm["X_umap"] validator.validate_adata() assert validator.errors == [ - "ERROR: Embedding key in 'adata.obsm' X_ must have a suffix at least one character long." + "ERROR: Embedding key in 'adata.obsm' X_ must start with X_ and have a suffix at least one character long." ] def test_obsm_key_name_valid(self, validator_with_adata): @@ -1683,14 +1695,13 @@ def test_obsm_shape_one_column(self, validator_with_adata): """ Curators MUST annotate one or more two-dimensional (m >= 2) embeddings """ - validator = validator_with_adata - obsm = validator.adata.obsm # Makes 1 column array - obsm["X_umap"] = numpy.delete(obsm["X_umap"], 0, 1) + validator = validator_with_adata + validator.adata.obsm["X_umap"] = numpy.delete(validator.adata.obsm["X_umap"], 0, 1) validator.validate_adata() assert validator.errors == [ "ERROR: All embeddings must have as many rows as cells, and " - "at least two columns.'adata.obsm['X_umap']' has shape " + "at least two columns. 'adata.obsm['X_umap']' has shape " "of '(2, 1)'." ] @@ -1719,7 +1730,7 @@ def test_obsm_size_zero(self, validator_with_adata): validator.adata = save_and_read_adata(adata) validator.validate_adata() assert validator.errors == [ - "ERROR: The size of the ndarray stored for a 'adata.obsm['badsize']' MUST NOT be zero." + "ERROR: The size of the ndarray stored for a 'adata.obsm['badsize']' MUST NOT be zero.", ]