diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index 08a490360..100004594 100644 --- a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -750,11 +750,13 @@ def _validate_colors_in_uns_dict(self, uns_dict: dict) -> None: for key, value in uns_dict.items(): if key.endswith("_colors"): # 1. Verify that the corresponding categorical field exists in obs - obs_unique_values = category_mapping.get(key.replace("_colors", "")) + column_name = key.replace("_colors", "") + obs_unique_values = category_mapping.get(column_name) if not obs_unique_values: - self.errors.append( - f"Colors field uns[{key}] does not have a corresponding categorical field in obs" - ) + error_message = f"Colors field uns[{key}] does not have a corresponding categorical field in obs" + if column_name in df.columns: + error_message += f". {column_name} is present but is dtype {df[column_name].dtype.name}" + self.errors.append(error_message) continue # 2. Verify that the value is a numpy array if value is None or not isinstance(value, np.ndarray): @@ -764,8 +766,11 @@ def _validate_colors_in_uns_dict(self, uns_dict: dict) -> None: # Skip over all subsequent validations which expect a numpy array continue # 3. Verify that we have strings in the array - if not np.issubdtype(value.dtype, np.character): - self.errors.append(f"Colors in uns[{key}] must be strings. Found: {value}") + all_strings = all(isinstance(color, str) for color in value) + if not all_strings: + self.errors.append( + f"Colors in uns[{key}] must be strings. Found: {value} which are {value.dtype.name}" + ) continue # 4. Verify that we have at least as many unique colors as unique values in the corresponding categorical field value = np.unique(value) diff --git a/cellxgene_schema_cli/tests/fixtures/examples_validate.py b/cellxgene_schema_cli/tests/fixtures/examples_validate.py index 79e141cda..33b36dfa5 100644 --- a/cellxgene_schema_cli/tests/fixtures/examples_validate.py +++ b/cellxgene_schema_cli/tests/fixtures/examples_validate.py @@ -180,6 +180,16 @@ "batch_condition": ["is_primary_data"], } +good_uns_with_colors = { + "title": "A title", + "default_embedding": "X_umap", + "X_approximate_distribution": "normal", + "batch_condition": ["is_primary_data"], + "suspension_type_colors": numpy.array(["red", "blue"]), + "donor_id_colors": numpy.array(["#000000", "#ffffff"]), + "tissue_type_colors": numpy.array(["black", "pink"]), +} + # --- # 4. Creating expression matrix, # X has integer values and non_raw_X has real values @@ -234,6 +244,10 @@ obsm=good_obsm, ) +# Expected anndata with colors for categorical obs fields +adata_with_colors = anndata.AnnData( + X=sparse.csr_matrix(X), obs=good_obs, uns=good_uns_with_colors, obsm=good_obsm, var=good_var +) # anndata for testing migration umigrated_obs = pd.DataFrame( diff --git a/cellxgene_schema_cli/tests/fixtures/h5ads/example_valid_modified.h5ad b/cellxgene_schema_cli/tests/fixtures/h5ads/example_valid_modified.h5ad new file mode 100644 index 000000000..e72fed468 Binary files /dev/null and b/cellxgene_schema_cli/tests/fixtures/h5ads/example_valid_modified.h5ad differ diff --git a/cellxgene_schema_cli/tests/test_schema_compliance.py b/cellxgene_schema_cli/tests/test_schema_compliance.py index fdfa5fa67..ed6d11dde 100644 --- a/cellxgene_schema_cli/tests/test_schema_compliance.py +++ b/cellxgene_schema_cli/tests/test_schema_compliance.py @@ -294,7 +294,7 @@ def test_column_presence(self, validator_with_adata, column): validator.adata.uns.pop("batch_condition") validator.validate_adata() - assert validator.errors == [f"ERROR: Dataframe 'obs' is missing " f"column '{column}'."] + assert f"ERROR: Dataframe 'obs' is missing " f"column '{column}'." in validator.errors def test_column_presence_organism(self, validator_with_adata): """ @@ -1569,9 +1569,15 @@ def test_deprecated_fields(self, validator_with_adata): ] def test_colors_happy_path(self, validator_with_adata): + validator = validator_with_adata + validator.adata = examples.adata_with_colors.copy() + assert validator.validate_adata() + + def test_colors_happy_path_no_column_def(self, validator_with_adata): """ Creates a new column in `obs[test_column]` where the dtype is categorical, but its column definition - is not in the schema. Having valid colors in `uns[test_column_colors]` should not raise an error. + is not in the schema. Having valid colors in `uns[test_column_colors]` should not raise an error, since + we should be able to find the corresponding `obs` column with the `category` dtype. """ validator = validator_with_adata validator.adata.obs["test_column"] = ["one", "two"] @@ -1594,7 +1600,7 @@ def test_colors_bool_obs_counterpart(self, validator_with_adata): validator.adata.uns["is_primary_data_colors"] = numpy.array(["green", "purple"]) validator.validate_adata() assert validator.errors == [ - "ERROR: Colors field uns[is_primary_data_colors] does not have a corresponding categorical field in obs" + "ERROR: Colors field uns[is_primary_data_colors] does not have a corresponding categorical field in obs. is_primary_data is present but is dtype bool" ] def test_colors_without_obs_counterpart(self, validator_with_adata): @@ -1609,7 +1615,9 @@ def test_empty_color_array(self, validator_with_adata): validator = validator_with_adata validator.adata.uns["suspension_type_colors"] = numpy.array([]) validator.validate_adata() - assert validator.errors == ["ERROR: Colors in uns[suspension_type_colors] must be strings. Found: []"] + assert validator.errors == [ + "ERROR: Annotated categorical field suspension_type must have at least 2 color options in uns[suspension_type_colors]. Found: []" + ] def test_not_enough_color_options(self, validator_with_adata): validator = validator_with_adata @@ -1663,21 +1671,25 @@ def test_invalid_named_color_option_integer(self, validator_with_adata): validator = validator_with_adata validator.adata.uns["suspension_type_colors"] = numpy.array([3, 4]) validator.validate_adata() - assert validator.errors == ["ERROR: Colors in uns[suspension_type_colors] must be strings. Found: [3 4]"] + assert validator.errors == [ + "ERROR: Colors in uns[suspension_type_colors] must be strings. Found: [3 4] which are int64" + ] def test_invalid_named_color_option_none(self, validator_with_adata): validator = validator_with_adata validator.adata.uns["suspension_type_colors"] = numpy.array(["green", None]) validator.validate_adata() assert validator.errors == [ - "ERROR: Colors in uns[suspension_type_colors] must be strings. Found: ['green' None]" + "ERROR: Colors in uns[suspension_type_colors] must be strings. Found: ['green' None] which are object" ] def test_invalid_named_color_option_nan(self, validator_with_adata): validator = validator_with_adata validator.adata.uns["suspension_type_colors"] = numpy.array([numpy.nan, numpy.nan]) validator.validate_adata() - assert validator.errors == ["ERROR: Colors in uns[suspension_type_colors] must be strings. Found: [nan nan]"] + assert validator.errors == [ + "ERROR: Colors in uns[suspension_type_colors] must be strings. Found: [nan nan] which are float64" + ] class TestObsm: