Skip to content

Commit

Permalink
Merge pull request #668 from chanzuckerberg/joyce/colors-string
Browse files Browse the repository at this point in the history
fix: change how we detect strings in uns[{column}_colors]
  • Loading branch information
joyceyan authored Oct 12, 2023
2 parents 2721d03 + c7b0f62 commit 4b3d5b0
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 13 deletions.
17 changes: 11 additions & 6 deletions cellxgene_schema_cli/cellxgene_schema/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -750,11 +750,13 @@ def _validate_colors_in_uns_dict(self, uns_dict: dict) -> None:
for key, value in uns_dict.items():
if key.endswith("_colors"):
# 1. Verify that the corresponding categorical field exists in obs
obs_unique_values = category_mapping.get(key.replace("_colors", ""))
column_name = key.replace("_colors", "")
obs_unique_values = category_mapping.get(column_name)
if not obs_unique_values:
self.errors.append(
f"Colors field uns[{key}] does not have a corresponding categorical field in obs"
)
error_message = f"Colors field uns[{key}] does not have a corresponding categorical field in obs"
if column_name in df.columns:
error_message += f". {column_name} is present but is dtype {df[column_name].dtype.name}"
self.errors.append(error_message)
continue
# 2. Verify that the value is a numpy array
if value is None or not isinstance(value, np.ndarray):
Expand All @@ -764,8 +766,11 @@ def _validate_colors_in_uns_dict(self, uns_dict: dict) -> None:
# Skip over all subsequent validations which expect a numpy array
continue
# 3. Verify that we have strings in the array
if not np.issubdtype(value.dtype, np.character):
self.errors.append(f"Colors in uns[{key}] must be strings. Found: {value}")
all_strings = all(isinstance(color, str) for color in value)
if not all_strings:
self.errors.append(
f"Colors in uns[{key}] must be strings. Found: {value} which are {value.dtype.name}"
)
continue
# 4. Verify that we have at least as many unique colors as unique values in the corresponding categorical field
value = np.unique(value)
Expand Down
14 changes: 14 additions & 0 deletions cellxgene_schema_cli/tests/fixtures/examples_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,16 @@
"batch_condition": ["is_primary_data"],
}

good_uns_with_colors = {
"title": "A title",
"default_embedding": "X_umap",
"X_approximate_distribution": "normal",
"batch_condition": ["is_primary_data"],
"suspension_type_colors": numpy.array(["red", "blue"]),
"donor_id_colors": numpy.array(["#000000", "#ffffff"]),
"tissue_type_colors": numpy.array(["black", "pink"]),
}

# ---
# 4. Creating expression matrix,
# X has integer values and non_raw_X has real values
Expand Down Expand Up @@ -234,6 +244,10 @@
obsm=good_obsm,
)

# Expected anndata with colors for categorical obs fields
adata_with_colors = anndata.AnnData(
X=sparse.csr_matrix(X), obs=good_obs, uns=good_uns_with_colors, obsm=good_obsm, var=good_var
)

# anndata for testing migration
umigrated_obs = pd.DataFrame(
Expand Down
Binary file not shown.
26 changes: 19 additions & 7 deletions cellxgene_schema_cli/tests/test_schema_compliance.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,7 +294,7 @@ def test_column_presence(self, validator_with_adata, column):
validator.adata.uns.pop("batch_condition")

validator.validate_adata()
assert validator.errors == [f"ERROR: Dataframe 'obs' is missing " f"column '{column}'."]
assert f"ERROR: Dataframe 'obs' is missing " f"column '{column}'." in validator.errors

def test_column_presence_organism(self, validator_with_adata):
"""
Expand Down Expand Up @@ -1569,9 +1569,15 @@ def test_deprecated_fields(self, validator_with_adata):
]

def test_colors_happy_path(self, validator_with_adata):
validator = validator_with_adata
validator.adata = examples.adata_with_colors.copy()
assert validator.validate_adata()

def test_colors_happy_path_no_column_def(self, validator_with_adata):
"""
Creates a new column in `obs[test_column]` where the dtype is categorical, but its column definition
is not in the schema. Having valid colors in `uns[test_column_colors]` should not raise an error.
is not in the schema. Having valid colors in `uns[test_column_colors]` should not raise an error, since
we should be able to find the corresponding `obs` column with the `category` dtype.
"""
validator = validator_with_adata
validator.adata.obs["test_column"] = ["one", "two"]
Expand All @@ -1594,7 +1600,7 @@ def test_colors_bool_obs_counterpart(self, validator_with_adata):
validator.adata.uns["is_primary_data_colors"] = numpy.array(["green", "purple"])
validator.validate_adata()
assert validator.errors == [
"ERROR: Colors field uns[is_primary_data_colors] does not have a corresponding categorical field in obs"
"ERROR: Colors field uns[is_primary_data_colors] does not have a corresponding categorical field in obs. is_primary_data is present but is dtype bool"
]

def test_colors_without_obs_counterpart(self, validator_with_adata):
Expand All @@ -1609,7 +1615,9 @@ def test_empty_color_array(self, validator_with_adata):
validator = validator_with_adata
validator.adata.uns["suspension_type_colors"] = numpy.array([])
validator.validate_adata()
assert validator.errors == ["ERROR: Colors in uns[suspension_type_colors] must be strings. Found: []"]
assert validator.errors == [
"ERROR: Annotated categorical field suspension_type must have at least 2 color options in uns[suspension_type_colors]. Found: []"
]

def test_not_enough_color_options(self, validator_with_adata):
validator = validator_with_adata
Expand Down Expand Up @@ -1663,21 +1671,25 @@ def test_invalid_named_color_option_integer(self, validator_with_adata):
validator = validator_with_adata
validator.adata.uns["suspension_type_colors"] = numpy.array([3, 4])
validator.validate_adata()
assert validator.errors == ["ERROR: Colors in uns[suspension_type_colors] must be strings. Found: [3 4]"]
assert validator.errors == [
"ERROR: Colors in uns[suspension_type_colors] must be strings. Found: [3 4] which are int64"
]

def test_invalid_named_color_option_none(self, validator_with_adata):
validator = validator_with_adata
validator.adata.uns["suspension_type_colors"] = numpy.array(["green", None])
validator.validate_adata()
assert validator.errors == [
"ERROR: Colors in uns[suspension_type_colors] must be strings. Found: ['green' None]"
"ERROR: Colors in uns[suspension_type_colors] must be strings. Found: ['green' None] which are object"
]

def test_invalid_named_color_option_nan(self, validator_with_adata):
validator = validator_with_adata
validator.adata.uns["suspension_type_colors"] = numpy.array([numpy.nan, numpy.nan])
validator.validate_adata()
assert validator.errors == ["ERROR: Colors in uns[suspension_type_colors] must be strings. Found: [nan nan]"]
assert validator.errors == [
"ERROR: Colors in uns[suspension_type_colors] must be strings. Found: [nan nan] which are float64"
]


class TestObsm:
Expand Down

0 comments on commit 4b3d5b0

Please sign in to comment.