Skip to content

Commit

Permalink
feat: add genetic ancestry fields for schema 5.3 (#1132)
Browse files Browse the repository at this point in the history
  • Loading branch information
joyceyan authored Nov 27, 2024
1 parent 478648e commit 7f840ce
Show file tree
Hide file tree
Showing 5 changed files with 327 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -582,3 +582,15 @@ components:
- "cell culture"
- "organoid"
- "tissue"
genetic_ancestry_African:
type: genetic_ancestry_value
genetic_ancestry_East_Asian:
type: genetic_ancestry_value
genetic_ancestry_European:
type: genetic_ancestry_value
genetic_ancestry_Indigenous_American:
type: genetic_ancestry_value
genetic_ancestry_Oceanian:
type: genetic_ancestry_value
genetic_ancestry_South_Asian:
type: genetic_ancestry_value
107 changes: 107 additions & 0 deletions cellxgene_schema_cli/cellxgene_schema/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,109 @@ def _count_matrix_nonzero(self, matrix_name: str, matrix: Union[np.ndarray, spar
self.number_non_zero[matrix_name] = nnz
return nnz

def _validate_genetic_ancestry(self):
    """
    Perform row-based validation of the genetic_ancestry_* columns in ``self.adata.obs``.

    A row is considered valid when either:
      - every genetic_ancestry_* value is NaN (accepted for any organism), or
      - the organism is 'NCBITaxon:9606', no value is NaN, every value is a float/int and
        the values sum to 1.0 (within an absolute tolerance of 1e-6).

    Additionally, every row sharing a donor_id must carry the same genetic_ancestry_* values as
    the first row seen for that donor; later rows that differ are reported as invalid.

    If any of the ancestry, organism or donor_id columns is missing, or obs has no rows, nothing is
    checked here. On failure a single message listing the offending obs indices is appended to
    ``self.errors``. Returns None.
    """
    ancestry_columns = [
        "genetic_ancestry_African",
        "genetic_ancestry_East_Asian",
        "genetic_ancestry_European",
        "genetic_ancestry_Indigenous_American",
        "genetic_ancestry_Oceanian",
        "genetic_ancestry_South_Asian",
    ]

    organism_column = "organism_ontology_term_id"
    donor_id_column = "donor_id"

    obs = self.adata.obs

    # Skip any additional validation if the genetic ancestry, organism or donor_id columns are not
    # present. An error for missing columns will be raised at a different point.
    required_columns = ancestry_columns + [organism_column, donor_id_column]
    if any(column not in obs.columns for column in required_columns):
        return

    # Nothing to check without rows. Applying a row function to an empty frame does not yield a
    # boolean Series, so bail out before the mask is built and inverted below.
    if len(obs) == 0:
        return

    # Ancestry values of the first row seen for each donor_id; later rows are compared against it.
    donor_id_to_ancestry_values = dict()

    def is_valid_row(row):
        ancestry_values = row[ancestry_columns]

        # If ancestry values are different for the same donor id, then this row is invalid
        donor_id = row[donor_id_column]
        if donor_id in donor_id_to_ancestry_values:
            if not donor_id_to_ancestry_values[donor_id].equals(ancestry_values):
                return False
        else:
            donor_id_to_ancestry_values[donor_id] = ancestry_values

        missing = ancestry_values.isna()

        # All values are NaN. This is always valid, regardless of organism
        if missing.all():
            return True

        # A mix of NaN and non-NaN values is never valid
        if missing.any():
            return False

        # Only Homo sapiens rows may carry actual (non-NaN) ancestry values
        if row[organism_column] != "NCBITaxon:9606":
            return False

        # Only sum once every value is known to be numeric; the sum of the genetic ancestry
        # values should be approximately 1.0
        if not ancestry_values.apply(lambda x: isinstance(x, (float, int))).all():
            return False
        return bool(abs(ancestry_values.sum() - 1.0) <= 1e-6)

    invalid_rows = ~obs.apply(is_valid_row, axis=1)

    if invalid_rows.any():
        invalid_indices = obs.index[invalid_rows].tolist()
        self.errors.append(
            f"obs rows with indices {invalid_indices} have invalid genetic_ancestry_* values. All "
            f"observations with the same donor_id must contain the same genetic_ancestry_* values. If "
            f"organism_ontology_term_id is NOT 'NCBITaxon:9606' for Homo sapiens, then all genetic "
            f"ancestry values MUST be float('nan'). If organism_ontology_term_id is 'NCBITaxon:9606' "
            f"for Homo sapiens, then the value MUST be a float('nan') if unavailable; otherwise, the "
            f"sum of all genetic_ancestry_* fields must be equal to 1.0"
        )

def _validate_individual_genetic_ancestry_value(self, column: pd.Series, column_name: str):
    """
    Check a single genetic_ancestry_value column of obs, recording problems in ``self.errors``.

    The column must have float dtype. Each entry must then be either:
      - a number between 0 and 1 (inclusive)
      - float('nan')

    :param column: the obs column to check
    :param column_name: name of the column, used in the error messages
    """
    # A non-float column is reported once; its individual values are not inspected.
    if column.dtype != float:
        self.errors.append(f"Column '{column_name}' in obs must be float, not '{column.dtype.name}'.")
        return

    def is_acceptable(entry):
        # Anything that is not a number (None, str, ...) is rejected outright
        if not isinstance(entry, (float, int)):
            return False
        if 0 <= entry <= 1:
            return True
        # Outside [0, 1]: only float('nan') / numpy.nan is still allowed
        return isinstance(entry, float) and pd.isna(entry)

    acceptable_mask = column.map(is_acceptable)
    rejected = column[~acceptable_mask]

    if rejected.empty:
        return

    self.errors.append(
        f"Column '{column_name}' in obs contains invalid values: {rejected.to_list()}. "
        f"Valid values are floats between 0 and 1 or float('nan')."
    )

def _validate_column_feature_is_filtered(self, column: pd.Series, column_name: str, df_name: str):
"""
Validates the "is_feature_filtered" in adata.var. This column must be bool, and for genes that are set to
Expand Down Expand Up @@ -505,6 +608,9 @@ def _validate_column(self, column: pd.Series, column_name: str, df_name: str, co
if column_def.get("type") == "feature_is_filtered":
self._validate_column_feature_is_filtered(column, column_name, df_name)

if column_def.get("type") == "genetic_ancestry_value":
self._validate_individual_genetic_ancestry_value(column, column_name)

if "enum" in column_def:
bad_enums = [v for v in column.drop_duplicates() if v not in column_def["enum"]]
if bad_enums:
Expand Down Expand Up @@ -781,6 +887,7 @@ def _validate_dataframe(self, df_name: str):
f"Column '{column_name}' in dataframe '{df_name}' contains a category '{category}' with "
f"zero observations. These categories will be removed when `--add-labels` flag is present."
)
self._validate_genetic_ancestry()
categorical_types = {type(x) for x in column.dtype.categories.values}
# Check for columns that have illegal categories, which are not supported by anndata 0.8.0
# TODO: check if this can be removed after upgrading to anndata 0.10.0
Expand Down
90 changes: 90 additions & 0 deletions cellxgene_schema_cli/tests/fixtures/examples_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,12 @@
"HsapDv:0000003",
"donor_1",
"nucleus",
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
],
[
"CL:0000192",
Expand All @@ -62,6 +68,12 @@
"MmusDv:0000003",
"donor_2",
"na",
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
],
],
index=["X", "Y"],
Expand All @@ -78,6 +90,12 @@
"development_stage_ontology_term_id",
"donor_id",
"suspension_type",
"genetic_ancestry_African",
"genetic_ancestry_East_Asian",
"genetic_ancestry_European",
"genetic_ancestry_Indigenous_American",
"genetic_ancestry_Oceanian",
"genetic_ancestry_South_Asian",
],
)

Expand Down Expand Up @@ -144,6 +162,12 @@
"donor_1",
"na",
0,
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
],
[
2,
Expand All @@ -161,6 +185,12 @@
"donor_2",
"na",
1,
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
],
],
index=["X", "Y"],
Expand All @@ -180,6 +210,12 @@
"donor_id",
"suspension_type",
"in_tissue",
"genetic_ancestry_African",
"genetic_ancestry_East_Asian",
"genetic_ancestry_European",
"genetic_ancestry_Indigenous_American",
"genetic_ancestry_Oceanian",
"genetic_ancestry_South_Asian",
],
)

Expand All @@ -203,6 +239,12 @@
"HsapDv:0000003",
"donor_1",
"na",
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
],
[
"CL:0000192",
Expand All @@ -217,6 +259,12 @@
"MmusDv:0000003",
"donor_2",
"na",
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
],
],
index=["X", "Y"],
Expand All @@ -233,6 +281,12 @@
"development_stage_ontology_term_id",
"donor_id",
"suspension_type",
"genetic_ancestry_African",
"genetic_ancestry_East_Asian",
"genetic_ancestry_European",
"genetic_ancestry_Indigenous_American",
"genetic_ancestry_Oceanian",
"genetic_ancestry_South_Asian",
],
)

Expand All @@ -255,6 +309,12 @@
"HsapDv:0000003",
"donor_1",
"na",
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
],
[
"CL:0000192",
Expand All @@ -269,6 +329,12 @@
"MmusDv:0000003",
"donor_2",
"na",
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
],
],
index=["X", "Y"],
Expand All @@ -285,6 +351,12 @@
"development_stage_ontology_term_id",
"donor_id",
"suspension_type",
"genetic_ancestry_African",
"genetic_ancestry_East_Asian",
"genetic_ancestry_European",
"genetic_ancestry_Indigenous_American",
"genetic_ancestry_Oceanian",
"genetic_ancestry_South_Asian",
],
)

Expand Down Expand Up @@ -493,6 +565,12 @@
"tissue:1",
"sre:1",
"development_stage:1",
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
],
[
"cell_type:1",
Expand All @@ -503,6 +581,12 @@
"tissue:1",
"sre:1",
"development_stage:1",
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
float("nan"),
],
],
index=["X", "Y"],
Expand All @@ -515,6 +599,12 @@
"tissue_ontology_term_id",
"self_reported_ethnicity_ontology_term_id",
"development_stage_ontology_term_id",
"genetic_ancestry_African",
"genetic_ancestry_East_Asian",
"genetic_ancestry_European",
"genetic_ancestry_Indigenous_American",
"genetic_ancestry_Oceanian",
"genetic_ancestry_South_Asian",
],
)

Expand Down
Binary file modified cellxgene_schema_cli/tests/fixtures/h5ads/example_valid.h5ad
Binary file not shown.
Loading

0 comments on commit 7f840ce

Please sign in to comment.