Merge pull request #654 from chanzuckerberg/joyce/x-suffix-fix

feat: verify all obsm keys start with X_ suffix
chanzuckerberg · Oct 4, 2023 · c8b2ad5 · c8b2ad5
2 parents f20b5b9 + 034b2ff
commit c8b2ad5
Show file tree

Hide file tree

Showing 2 changed files with 60 additions and 44 deletions.
diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py
@@ -1013,7 +1013,9 @@ def _validate_seurat_convertibility(self):
     def _validate_obsm(self):
         """
         Validates the embedding dictionary -- it checks that all values of adata.obsm are numpy arrays with the correct
-        dimension. Adds errors to self.errors if any. Checks that the keys start with "X_"
+        dimension. Adds errors to self.errors if any. Checks that the keys start with "X_", have no whitespace, and have
+        a suffix at least 1 character long. For keys that don't start with "X_", we will run them through the same
+        validation checks, but raise warnings instead of errors.
 
         :rtype none
         """
@@ -1027,38 +1029,41 @@ def _validate_obsm(self):
             if " " in key:
                 self.errors.append(f"Embedding key {key} has whitespace in it, please remove it.")
 
-            if not isinstance(value, np.ndarray):
-                self.errors.append(
-                    f"All embeddings have to be of 'numpy.ndarray' type, " f"'adata.obsm['{key}']' is {type(value)}')."
-                )
-                continue
-
-            # Embeddings to be shown in cellxgene explorer
+            issue_list = self.errors
             if key.startswith("X_"):
                 obsm_with_x_prefix += 1
-
                 if len(key) <= 3:
                     self.errors.append(
-                        f"Embedding key in 'adata.obsm' {key} must have a suffix at least one character long."
-                    )
-                if len(value.shape) < 2 or value.shape[0] != self.adata.n_obs or value.shape[1] < 2:
-                    self.errors.append(
-                        f"All embeddings must have as many rows as cells, and at least two columns."
-                        f"'adata.obsm['{key}']' has shape of '{value.shape}'."
+                        f"Embedding key in 'adata.obsm' {key} must start with X_ and have a suffix at least one character long."
                     )
-                if not (np.issubdtype(value.dtype, np.integer) or np.issubdtype(value.dtype, np.floating)):
-                    self.errors.append(
-                        f"adata.obsm['{key}'] has an invalid data type. It should be "
-                        "float, integer, or unsigned integer of any precision (8, 16, 32, or 64 bits)."
-                    )
-                else:
-                    # Check for inf/NaN values only if the dtype is numeric
-                    if np.isinf(value).any():
-                        self.errors.append(
-                            f"adata.obsm['{key}'] contains positive infinity or negative infinity values."
-                        )
-                    if np.all(np.isnan(value)):
-                        self.errors.append(f"adata.obsm['{key}'] contains all NaN values.")
+            else:
+                self.warnings.append(f"Embedding key in 'adata.obsm' {key} does not start with X_")
+                issue_list = self.warnings
+
+            # For all subsequent checks, we want to raise an error if it's an X_ embedding key, and a warning otherwise
+            if not isinstance(value, np.ndarray):
+                issue_list.append(
+                    f"All embeddings have to be of 'numpy.ndarray' type, " f"'adata.obsm['{key}']' is {type(value)}')."
+                )
+                # Skip over the subsequent checks that require the value to be an array
+                continue
+
+            if len(value.shape) < 2 or value.shape[0] != self.adata.n_obs or value.shape[1] < 2:
+                issue_list.append(
+                    f"All embeddings must have as many rows as cells, and at least two columns."
+                    f" 'adata.obsm['{key}']' has shape of '{value.shape}'."
+                )
+            if not (np.issubdtype(value.dtype, np.integer) or np.issubdtype(value.dtype, np.floating)):
+                issue_list.append(
+                    f"adata.obsm['{key}'] has an invalid data type. It should be "
+                    "float, integer, or unsigned integer of any precision (8, 16, 32, or 64 bits)."
+                )
+            else:
+                # Check for inf/NaN values only if the dtype is numeric
+                if np.isinf(value).any():
+                    issue_list.append(f"adata.obsm['{key}'] contains positive infinity or negative infinity values.")
+                if np.all(np.isnan(value)):
+                    issue_list.append(f"adata.obsm['{key}'] contains all NaN values.")
 
         if obsm_with_x_prefix == 0:
             self.errors.append("At least one embedding in 'obsm' has to have a key with an 'X_' prefix.")

diff --git a/cellxgene_schema_cli/tests/test_schema_compliance.py b/cellxgene_schema_cli/tests/test_schema_compliance.py
@@ -1646,27 +1646,39 @@ def test_obsm_values_nan(self, validator_with_adata):
         assert validator.errors == ["ERROR: adata.obsm['X_umap'] contains all NaN values."]
 
     def test_obsm_values_at_least_one_X(self, validator_with_adata):
-        """
-        At least one key for the embedding MUST be prefixed with "X_"
-        """
         validator = validator_with_adata
-        obsm = validator.adata.obsm
-        obsm["umap"] = obsm["X_umap"]
-        validator.adata.uns["default_embedding"] = "umap"
-        del obsm["X_umap"]
+        validator.adata.obsm["harmony"] = validator.adata.obsm["X_umap"]
+        validator.adata.uns["default_embedding"] = "harmony"
+        del validator.adata.obsm["X_umap"]
+        validator.validate_adata()
+        assert validator.errors == [
+            "ERROR: At least one embedding in 'obsm' has to have a key with an 'X_' prefix.",
+        ]
+        assert validator.warnings == [
+            "WARNING: Dataframe 'var' only has 4 rows. Features SHOULD NOT be filtered from expression matrix.",
+            "WARNING: Embedding key in 'adata.obsm' harmony does not start with X_",
+            "WARNING: Validation of raw layer was not performed due to current errors, try again after fixing current errors.",
+        ]
+
+    def test_obsm_values_warn_start_with_X(self, validator_with_adata):
+        validator = validator_with_adata
+        validator.adata.obsm["harmony"] = pd.DataFrame(validator.adata.obsm["X_umap"], index=validator.adata.obs_names)
         validator.validate_adata()
-        assert validator.errors == ["ERROR: At least one embedding in 'obsm' has to have a " "key with an 'X_' prefix."]
+        assert validator.warnings == [
+            "WARNING: Dataframe 'var' only has 4 rows. Features SHOULD NOT be filtered from expression matrix.",
+            "WARNING: Embedding key in 'adata.obsm' harmony does not start with X_",
+            "WARNING: All embeddings have to be of 'numpy.ndarray' type, 'adata.obsm['harmony']' is <class 'pandas.core.frame.DataFrame'>').",
+        ]
 
     def test_obsm_suffix_name_valid(self, validator_with_adata):
         """
         Suffix after X_ must be at least 1 character long
         """
         validator = validator_with_adata
-        obsm = validator.adata.obsm
-        obsm["X_"] = obsm["X_umap"]
+        validator.adata.obsm["X_"] = validator.adata.obsm["X_umap"]
         validator.validate_adata()
         assert validator.errors == [
-            "ERROR: Embedding key in 'adata.obsm' X_ must have a suffix at least one character long."
+            "ERROR: Embedding key in 'adata.obsm' X_ must start with X_ and have a suffix at least one character long."
         ]
 
     def test_obsm_key_name_valid(self, validator_with_adata):
@@ -1683,14 +1695,13 @@ def test_obsm_shape_one_column(self, validator_with_adata):
         """
         Curators MUST annotate one or more two-dimensional (m >= 2) embeddings
         """
-        validator = validator_with_adata
-        obsm = validator.adata.obsm
         # Makes 1 column array
-        obsm["X_umap"] = numpy.delete(obsm["X_umap"], 0, 1)
+        validator = validator_with_adata
+        validator.adata.obsm["X_umap"] = numpy.delete(validator.adata.obsm["X_umap"], 0, 1)
         validator.validate_adata()
         assert validator.errors == [
             "ERROR: All embeddings must have as many rows as cells, and "
-            "at least two columns.'adata.obsm['X_umap']' has shape "
+            "at least two columns. 'adata.obsm['X_umap']' has shape "
             "of '(2, 1)'."
         ]
 
@@ -1719,7 +1730,7 @@ def test_obsm_size_zero(self, validator_with_adata):
         validator.adata = save_and_read_adata(adata)
         validator.validate_adata()
         assert validator.errors == [
-            "ERROR: The size of the ndarray stored for a 'adata.obsm['badsize']' MUST NOT be zero."
+            "ERROR: The size of the ndarray stored for a 'adata.obsm['badsize']' MUST NOT be zero.",
         ]