# Validators for the six obs `genetic_ancestry_*` columns. The accompanying
# schema_definition.yaml change declares each of them (African, East_Asian,
# European, Indigenous_American, Oceanian, South_Asian) with
# `type: genetic_ancestry_value`.
def _validate_genetic_ancestry(self):
    """
    Performs row-based validation of the genetic_ancestry_* columns of ``self.adata.obs``.

    A row is valid when either:
      - every genetic ancestry value is NaN (accepted regardless of organism), or
      - organism_ontology_term_id is 'NCBITaxon:9606', no value is NaN, every value is a
        float or int, and the values sum to 1.0 within an absolute tolerance of 1e-6.

    Additionally, every row must carry the same genetic ancestry values as the first row
    seen with the same donor_id; a later row that differs is reported as invalid.

    Nothing is checked when obs has no rows, or when any of the ancestry, organism or
    donor_id columns is absent (the original author notes that missing columns are reported
    at a different point).

    :return: None. On failure one message listing the offending obs index labels is
        appended to ``self.errors``; an identical message is never appended twice.
    """
    ancestry_columns = [
        "genetic_ancestry_African",
        "genetic_ancestry_East_Asian",
        "genetic_ancestry_European",
        "genetic_ancestry_Indigenous_American",
        "genetic_ancestry_Oceanian",
        "genetic_ancestry_South_Asian",
    ]
    organism_column = "organism_ontology_term_id"
    donor_id_column = "donor_id"

    obs = self.adata.obs

    # Skip any additional validation if the genetic ancestry, organism or donor columns are not present
    required_columns = ancestry_columns + [organism_column, donor_id_column]
    if any(column not in obs.columns for column in required_columns):
        return

    # DataFrame.apply(axis=1) does not yield a boolean Series for a frame without rows, so the
    # negation below would fail; there is nothing to validate in that case anyway.
    if obs.empty:
        return

    # First ancestry values seen for each donor; later rows of that donor are compared to them
    donor_id_to_ancestry_values = {}

    def is_valid_row(row):
        ancestry_values = row[ancestry_columns]

        # If ancestry values are different for the same donor id, then this row is invalid.
        # Series.equals treats NaN in matching positions as equal.
        donor_id = row[donor_id_column]
        if donor_id in donor_id_to_ancestry_values:
            if not donor_id_to_ancestry_values[donor_id].equals(ancestry_values):
                return False
        else:
            donor_id_to_ancestry_values[donor_id] = ancestry_values

        missing = ancestry_values.isna()

        # All values are NaN. This is always valid, regardless of organism
        if missing.all():
            return True

        # A mix of NaN and non-NaN values is never valid
        if missing.any():
            return False

        # Only Homo sapiens rows may carry actual ancestry values
        if row[organism_column] != "NCBITaxon:9606":
            return False

        # The values must be numeric and sum to approximately 1.0. The type check comes first so
        # that the sum is never computed over non-numeric values.
        all_numeric = ancestry_values.apply(lambda x: isinstance(x, (float, int))).all()
        return bool(all_numeric and abs(ancestry_values.sum() - 1.0) <= 1e-6)

    valid_rows = obs.apply(is_valid_row, axis=1).astype(bool)
    invalid_rows = ~valid_rows

    if invalid_rows.any():
        invalid_indices = obs.index[invalid_rows].tolist()
        message = (
            f"obs rows with indices {invalid_indices} have invalid genetic_ancestry_* values. All "
            f"observations with the same donor_id must contain the same genetic_ancestry_* values. If "
            f"organism_ontology_term_id is NOT 'NCBITaxon:9606' for Homo sapiens, then all genetic "
            f"ancestry values MUST be float('nan'). If organism_ontology_term_id is 'NCBITaxon:9606' "
            f"for Homo sapiens, then the value MUST be a float('nan') if unavailable; otherwise, the "
            f"sum of all genetic_ancestry_* fields must be equal to 1.0"
        )
        # NOTE(review): the call site added in this change appears to sit inside the per-column
        # loop of _validate_dataframe, so this method may run several times per validation.
        # Report the problem once rather than once per call.
        if message not in self.errors:
            self.errors.append(message)


def _validate_individual_genetic_ancestry_value(self, column: pd.Series, column_name: str):
    """
    Validates a single obs column whose schema type is genetic_ancestry_value.

    The column dtype must compare equal to ``float``; each value must then be either
    a number between 0 and 1 (inclusive) or NaN.

    :param column: the obs column to validate
    :param column_name: name of the column, used in the error messages

    :return: None. Problems are appended to ``self.errors``: one message if the dtype is not
        float (values are then not inspected), otherwise one message listing every invalid value.
    """
    if column.dtype != float:
        self.errors.append(f"Column '{column_name}' in obs must be float, not '{column.dtype.name}'.")
        return

    # The column is known to be float here, so None or strings cannot occur; a value is
    # acceptable exactly when it is NaN or lies within [0, 1]. `between` is inclusive on both
    # ends and evaluates to False for NaN and +/-inf.
    acceptable = column.isna() | column.between(0, 1)
    invalid_values = column[~acceptable]

    if not invalid_values.empty:
        self.errors.append(
            f"Column '{column_name}' in obs contains invalid values: {invalid_values.to_list()}. "
            f"Valid values are floats between 0 and 1 or float('nan')."
        )
This column must be bool, and for genes that are set to @@ -505,6 +608,9 @@ def _validate_column(self, column: pd.Series, column_name: str, df_name: str, co if column_def.get("type") == "feature_is_filtered": self._validate_column_feature_is_filtered(column, column_name, df_name) + if column_def.get("type") == "genetic_ancestry_value": + self._validate_individual_genetic_ancestry_value(column, column_name) + if "enum" in column_def: bad_enums = [v for v in column.drop_duplicates() if v not in column_def["enum"]] if bad_enums: @@ -781,6 +887,7 @@ def _validate_dataframe(self, df_name: str): f"Column '{column_name}' in dataframe '{df_name}' contains a category '{category}' with " f"zero observations. These categories will be removed when `--add-labels` flag is present." ) + self._validate_genetic_ancestry() categorical_types = {type(x) for x in column.dtype.categories.values} # Check for columns that have illegal categories, which are not supported by anndata 0.8.0 # TODO: check if this can be removed after upgading to anndata 0.10.0 diff --git a/cellxgene_schema_cli/tests/fixtures/examples_validate.py b/cellxgene_schema_cli/tests/fixtures/examples_validate.py index 470c165ce..accbecfcd 100644 --- a/cellxgene_schema_cli/tests/fixtures/examples_validate.py +++ b/cellxgene_schema_cli/tests/fixtures/examples_validate.py @@ -48,6 +48,12 @@ "HsapDv:0000003", "donor_1", "nucleus", + float("nan"), + float("nan"), + float("nan"), + float("nan"), + float("nan"), + float("nan"), ], [ "CL:0000192", @@ -62,6 +68,12 @@ "MmusDv:0000003", "donor_2", "na", + float("nan"), + float("nan"), + float("nan"), + float("nan"), + float("nan"), + float("nan"), ], ], index=["X", "Y"], @@ -78,6 +90,12 @@ "development_stage_ontology_term_id", "donor_id", "suspension_type", + "genetic_ancestry_African", + "genetic_ancestry_East_Asian", + "genetic_ancestry_European", + "genetic_ancestry_Indigenous_American", + "genetic_ancestry_Oceanian", + "genetic_ancestry_South_Asian", ], ) @@ -144,6 +162,12 @@ 
"donor_1", "na", 0, + float("nan"), + float("nan"), + float("nan"), + float("nan"), + float("nan"), + float("nan"), ], [ 2, @@ -161,6 +185,12 @@ "donor_2", "na", 1, + float("nan"), + float("nan"), + float("nan"), + float("nan"), + float("nan"), + float("nan"), ], ], index=["X", "Y"], @@ -180,6 +210,12 @@ "donor_id", "suspension_type", "in_tissue", + "genetic_ancestry_African", + "genetic_ancestry_East_Asian", + "genetic_ancestry_European", + "genetic_ancestry_Indigenous_American", + "genetic_ancestry_Oceanian", + "genetic_ancestry_South_Asian", ], ) @@ -203,6 +239,12 @@ "HsapDv:0000003", "donor_1", "na", + float("nan"), + float("nan"), + float("nan"), + float("nan"), + float("nan"), + float("nan"), ], [ "CL:0000192", @@ -217,6 +259,12 @@ "MmusDv:0000003", "donor_2", "na", + float("nan"), + float("nan"), + float("nan"), + float("nan"), + float("nan"), + float("nan"), ], ], index=["X", "Y"], @@ -233,6 +281,12 @@ "development_stage_ontology_term_id", "donor_id", "suspension_type", + "genetic_ancestry_African", + "genetic_ancestry_East_Asian", + "genetic_ancestry_European", + "genetic_ancestry_Indigenous_American", + "genetic_ancestry_Oceanian", + "genetic_ancestry_South_Asian", ], ) @@ -255,6 +309,12 @@ "HsapDv:0000003", "donor_1", "na", + float("nan"), + float("nan"), + float("nan"), + float("nan"), + float("nan"), + float("nan"), ], [ "CL:0000192", @@ -269,6 +329,12 @@ "MmusDv:0000003", "donor_2", "na", + float("nan"), + float("nan"), + float("nan"), + float("nan"), + float("nan"), + float("nan"), ], ], index=["X", "Y"], @@ -285,6 +351,12 @@ "development_stage_ontology_term_id", "donor_id", "suspension_type", + "genetic_ancestry_African", + "genetic_ancestry_East_Asian", + "genetic_ancestry_European", + "genetic_ancestry_Indigenous_American", + "genetic_ancestry_Oceanian", + "genetic_ancestry_South_Asian", ], ) @@ -493,6 +565,12 @@ "tissue:1", "sre:1", "development_stage:1", + float("nan"), + float("nan"), + float("nan"), + float("nan"), + float("nan"), + 
# Tests for the obs genetic_ancestry_* columns. The shared fixtures in
# tests/fixtures/examples_validate.py are extended by the same change so that
# every fixture row carries float("nan") in all six genetic_ancestry_* columns.
@pytest.mark.parametrize(
    "genetic_ancestry_African, genetic_ancestry_East_Asian, genetic_ancestry_European, "
    "genetic_ancestry_Indigenous_American, genetic_ancestry_Oceanian, genetic_ancestry_South_Asian",
    [
        (0.0, 0.0, 0.0, 0.0, 0.0, 1.0),
        (0.5, 0.5, 0.0, 0.0, 0.0, 0.0),
        (0.0, 0.25, 0.25, 0.25, 0.25, 0.0),
        (float("nan"), float("nan"), float("nan"), float("nan"), float("nan"), float("nan")),
        (numpy.nan, numpy.nan, numpy.nan, numpy.nan, numpy.nan, numpy.nan),
    ],
)
def test_genetic_ancestry__OK(
    self,
    validator_with_adata,
    genetic_ancestry_African,
    genetic_ancestry_East_Asian,
    genetic_ancestry_European,
    genetic_ancestry_Indigenous_American,
    genetic_ancestry_Oceanian,
    genetic_ancestry_South_Asian,
):
    """
    A first row whose genetic_ancestry_* values are floats between 0 and 1 summing to 1,
    or are all NaN, must validate without errors.
    """
    validator = validator_with_adata
    first_row_values = {
        "genetic_ancestry_African": genetic_ancestry_African,
        "genetic_ancestry_East_Asian": genetic_ancestry_East_Asian,
        "genetic_ancestry_European": genetic_ancestry_European,
        "genetic_ancestry_Indigenous_American": genetic_ancestry_Indigenous_American,
        "genetic_ancestry_Oceanian": genetic_ancestry_Oceanian,
        "genetic_ancestry_South_Asian": genetic_ancestry_South_Asian,
    }
    # The second observation in the fixture is not Homo sapiens, so it always gets NaN
    for column_name, first_row_value in first_row_values.items():
        validator.adata.obs[column_name] = [first_row_value, float("nan")]
    validator.validate_adata()
    assert validator.errors == []


@pytest.mark.parametrize(
    "genetic_ancestry_African, genetic_ancestry_East_Asian, genetic_ancestry_European, "
    "genetic_ancestry_Indigenous_American, genetic_ancestry_Oceanian, genetic_ancestry_South_Asian",
    [
        # A string instead of a float
        (0.0, 0.0, 0.0, 1.0, 0.0, "random string"),
        # A Python bool instead of a float
        (0.0, 0.0, 0.0, 1.0, 0.0, True),
        # None instead of a float
        (0.0, 0.0, 0.0, 1.0, 0.0, None),
        # A numpy bool instead of a float
        (0.0, 0.0, 0.0, 1.0, 0.0, numpy.True_),
        # numpy.nan mixed with numeric values (NaN is only accepted when every value is NaN)
        (0.0, numpy.nan, 0.0, 1.0, 0.0, 0.0),
        # One value is > 1
        (0.0, 0.0, 1.1, 0.0, 0.0, 0.0),
        # One value is < 0.0
        (0.0, 0.0, -0.25, 1.0, 0.25, 0.0),
        # Sum is > 1.0
        (0.0, 0.1, 1.0, 0.0, 0.0, 0.0),
        # Sum is < 1.0
        (0.0, 0.25, 0.25, 0.25, 0.0, 0.0),
        # float("nan") mixed with numeric values
        (float("nan"), 0.0, 0.0, 0.0, 0.0, 0.0),
        # numpy.nan mixed with numeric values
        (numpy.nan, 0.0, 0.0, 0.0, 0.0, 0.0),
    ],
)
def test_genetic_ancestry__invalid(
    self,
    validator_with_adata,
    genetic_ancestry_African,
    genetic_ancestry_East_Asian,
    genetic_ancestry_European,
    genetic_ancestry_Indigenous_American,
    genetic_ancestry_Oceanian,
    genetic_ancestry_South_Asian,
):
    """
    A first row with a non-float value, an out-of-range value, a sum different from 1,
    or a mix of NaN and numeric values must produce at least one validation error.
    """
    validator = validator_with_adata
    first_row_values = {
        "genetic_ancestry_African": genetic_ancestry_African,
        "genetic_ancestry_East_Asian": genetic_ancestry_East_Asian,
        "genetic_ancestry_European": genetic_ancestry_European,
        "genetic_ancestry_Indigenous_American": genetic_ancestry_Indigenous_American,
        "genetic_ancestry_Oceanian": genetic_ancestry_Oceanian,
        "genetic_ancestry_South_Asian": genetic_ancestry_South_Asian,
    }
    # The second observation in the fixture is not Homo sapiens, so it always gets NaN
    for column_name, first_row_value in first_row_values.items():
        validator.adata.obs[column_name] = [first_row_value, float("nan")]
    validator.validate_adata()
    assert len(validator.errors) > 0


def test_genetic_ancestry_same_donor_id(self, validator_with_adata):
    """
    Observations sharing a donor_id must share their genetic_ancestry_* values.
    """
    validator = validator_with_adata
    donor_ids_before = validator.adata.obs["donor_id"].copy()

    # Copy the first observation over the second one: same donor id and same genetic
    # ancestry values, so validation is expected to pass
    validator.adata.obs.iloc[1] = validator.adata.obs.iloc[0].values
    validator.validate_adata()
    assert validator.errors == []

    # Give the two observations (still sharing one donor id) different genetic ancestry
    # values. This is expected to fail validation
    differing_values = {
        "genetic_ancestry_African": [1.0, 0.0],
        "genetic_ancestry_East_Asian": [0.0, 1.0],
        "genetic_ancestry_European": [0.0, 0.0],
        "genetic_ancestry_Indigenous_American": [0.0, 0.0],
        "genetic_ancestry_Oceanian": [0.0, 0.0],
        "genetic_ancestry_South_Asian": [0.0, 0.0],
    }
    for column_name, per_row_values in differing_values.items():
        validator.adata.obs[column_name] = per_row_values
    validator.validate_adata()
    assert len(validator.errors) > 0

    # Restore the two distinct donor ids. The differing values are then expected to pass
    validator.adata.obs["donor_id"] = donor_ids_before
    validator.validate_adata()
    assert validator.errors == []