Skip to content

Commit

Permalink
Merge pull request #654 from chanzuckerberg/joyce/x-suffix-fix
Browse files Browse the repository at this point in the history
feat: verify all obsm keys start with X_ suffix
  • Loading branch information
joyceyan authored Oct 4, 2023
2 parents f20b5b9 + 034b2ff commit c8b2ad5
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 44 deletions.
61 changes: 33 additions & 28 deletions cellxgene_schema_cli/cellxgene_schema/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -1013,7 +1013,9 @@ def _validate_seurat_convertibility(self):
def _validate_obsm(self):
"""
Validates the embedding dictionary -- it checks that all values of adata.obsm are numpy arrays with the correct
dimension. Adds errors to self.errors if any. Checks that the keys start with "X_"
dimension. Adds errors to self.errors if any. Checks that the keys start with "X_", have no whitespace, and have
a suffix at least 1 character long. For keys that don't start with "X_", we will run them through the same
validation checks, but raise warnings instead of errors.
:rtype none
"""
Expand All @@ -1027,38 +1029,41 @@ def _validate_obsm(self):
if " " in key:
self.errors.append(f"Embedding key {key} has whitespace in it, please remove it.")

if not isinstance(value, np.ndarray):
self.errors.append(
f"All embeddings have to be of 'numpy.ndarray' type, " f"'adata.obsm['{key}']' is {type(value)}')."
)
continue

# Embeddings to be shown in cellxgene explorer
issue_list = self.errors
if key.startswith("X_"):
obsm_with_x_prefix += 1

if len(key) <= 3:
self.errors.append(
f"Embedding key in 'adata.obsm' {key} must have a suffix at least one character long."
)
if len(value.shape) < 2 or value.shape[0] != self.adata.n_obs or value.shape[1] < 2:
self.errors.append(
f"All embeddings must have as many rows as cells, and at least two columns."
f"'adata.obsm['{key}']' has shape of '{value.shape}'."
f"Embedding key in 'adata.obsm' {key} must start with X_ and have a suffix at least one character long."
)
if not (np.issubdtype(value.dtype, np.integer) or np.issubdtype(value.dtype, np.floating)):
self.errors.append(
f"adata.obsm['{key}'] has an invalid data type. It should be "
"float, integer, or unsigned integer of any precision (8, 16, 32, or 64 bits)."
)
else:
# Check for inf/NaN values only if the dtype is numeric
if np.isinf(value).any():
self.errors.append(
f"adata.obsm['{key}'] contains positive infinity or negative infinity values."
)
if np.all(np.isnan(value)):
self.errors.append(f"adata.obsm['{key}'] contains all NaN values.")
else:
self.warnings.append(f"Embedding key in 'adata.obsm' {key} does not start with X_")
issue_list = self.warnings

# For all subsequent checks, we want to raise an error if it's an X_ embedding key, and a warning otherwise
if not isinstance(value, np.ndarray):
issue_list.append(
f"All embeddings have to be of 'numpy.ndarray' type, " f"'adata.obsm['{key}']' is {type(value)}')."
)
# Skip over the subsequent checks that require the value to be an array
continue

if len(value.shape) < 2 or value.shape[0] != self.adata.n_obs or value.shape[1] < 2:
issue_list.append(
f"All embeddings must have as many rows as cells, and at least two columns."
f" 'adata.obsm['{key}']' has shape of '{value.shape}'."
)
if not (np.issubdtype(value.dtype, np.integer) or np.issubdtype(value.dtype, np.floating)):
issue_list.append(
f"adata.obsm['{key}'] has an invalid data type. It should be "
"float, integer, or unsigned integer of any precision (8, 16, 32, or 64 bits)."
)
else:
# Check for inf/NaN values only if the dtype is numeric
if np.isinf(value).any():
issue_list.append(f"adata.obsm['{key}'] contains positive infinity or negative infinity values.")
if np.all(np.isnan(value)):
issue_list.append(f"adata.obsm['{key}'] contains all NaN values.")

if obsm_with_x_prefix == 0:
self.errors.append("At least one embedding in 'obsm' has to have a key with an 'X_' prefix.")
Expand Down
43 changes: 27 additions & 16 deletions cellxgene_schema_cli/tests/test_schema_compliance.py
Original file line number Diff line number Diff line change
Expand Up @@ -1646,27 +1646,39 @@ def test_obsm_values_nan(self, validator_with_adata):
assert validator.errors == ["ERROR: adata.obsm['X_umap'] contains all NaN values."]

def test_obsm_values_at_least_one_X(self, validator_with_adata):
"""
At least one key for the embedding MUST be prefixed with "X_"
"""
validator = validator_with_adata
obsm = validator.adata.obsm
obsm["umap"] = obsm["X_umap"]
validator.adata.uns["default_embedding"] = "umap"
del obsm["X_umap"]
validator.adata.obsm["harmony"] = validator.adata.obsm["X_umap"]
validator.adata.uns["default_embedding"] = "harmony"
del validator.adata.obsm["X_umap"]
validator.validate_adata()
assert validator.errors == [
"ERROR: At least one embedding in 'obsm' has to have a key with an 'X_' prefix.",
]
assert validator.warnings == [
"WARNING: Dataframe 'var' only has 4 rows. Features SHOULD NOT be filtered from expression matrix.",
"WARNING: Embedding key in 'adata.obsm' harmony does not start with X_",
"WARNING: Validation of raw layer was not performed due to current errors, try again after fixing current errors.",
]

def test_obsm_values_warn_start_with_X(self, validator_with_adata):
validator = validator_with_adata
validator.adata.obsm["harmony"] = pd.DataFrame(validator.adata.obsm["X_umap"], index=validator.adata.obs_names)
validator.validate_adata()
assert validator.errors == ["ERROR: At least one embedding in 'obsm' has to have a " "key with an 'X_' prefix."]
assert validator.warnings == [
"WARNING: Dataframe 'var' only has 4 rows. Features SHOULD NOT be filtered from expression matrix.",
"WARNING: Embedding key in 'adata.obsm' harmony does not start with X_",
"WARNING: All embeddings have to be of 'numpy.ndarray' type, 'adata.obsm['harmony']' is <class 'pandas.core.frame.DataFrame'>').",
]

def test_obsm_suffix_name_valid(self, validator_with_adata):
"""
Suffix after X_ must be at least 1 character long
"""
validator = validator_with_adata
obsm = validator.adata.obsm
obsm["X_"] = obsm["X_umap"]
validator.adata.obsm["X_"] = validator.adata.obsm["X_umap"]
validator.validate_adata()
assert validator.errors == [
"ERROR: Embedding key in 'adata.obsm' X_ must have a suffix at least one character long."
"ERROR: Embedding key in 'adata.obsm' X_ must start with X_ and have a suffix at least one character long."
]

def test_obsm_key_name_valid(self, validator_with_adata):
Expand All @@ -1683,14 +1695,13 @@ def test_obsm_shape_one_column(self, validator_with_adata):
"""
Curators MUST annotate one or more two-dimensional (m >= 2) embeddings
"""
validator = validator_with_adata
obsm = validator.adata.obsm
# Makes 1 column array
obsm["X_umap"] = numpy.delete(obsm["X_umap"], 0, 1)
validator = validator_with_adata
validator.adata.obsm["X_umap"] = numpy.delete(validator.adata.obsm["X_umap"], 0, 1)
validator.validate_adata()
assert validator.errors == [
"ERROR: All embeddings must have as many rows as cells, and "
"at least two columns.'adata.obsm['X_umap']' has shape "
"at least two columns. 'adata.obsm['X_umap']' has shape "
"of '(2, 1)'."
]

Expand Down Expand Up @@ -1719,7 +1730,7 @@ def test_obsm_size_zero(self, validator_with_adata):
validator.adata = save_and_read_adata(adata)
validator.validate_adata()
assert validator.errors == [
"ERROR: The size of the ndarray stored for a 'adata.obsm['badsize']' MUST NOT be zero."
"ERROR: The size of the ndarray stored for a 'adata.obsm['badsize']' MUST NOT be zero.",
]


Expand Down

0 comments on commit c8b2ad5

Please sign in to comment.