diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fb5755e32..32c1ba6f9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -82,10 +82,14 @@ repos: exclude: ^tools/(cellxgene_census_builder|census_contrib) args: ["--config", "./tools/pyproject.toml"] additional_dependencies: + - attrs - numpy - pandas-stubs - typing_extensions - types-PyYAML + - pytest + - types-click + - repo: https://github.com/igorshubovych/markdownlint-cli rev: v0.39.0 diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 000000000..ce3542c95 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,72 @@ +{ + "version": "0.2.0", + "configurations": [ + { + "name": "Memento (tiny)", + "type": "python", + "request": "launch", + "cwd": "${workspaceFolder}/api/python/cellxgene_census/src/", + "module": "cellxgene_census.experimental.diffexp.memento.diff_expr", + "justMyCode": true, + "args": [ + "tissue_general_ontology_term_id in ['UBERON:0002405']", + "sex_ontology_term_id", + "/mnt/census/estimators-cube-70a7705/", + "1", + "1000"], + "subProcess": true + }, + { + "name": "Memento (small)", + "type": "python", + "request": "launch", + "cwd": "${workspaceFolder}/api/python/cellxgene_census/src/", + "module": "cellxgene_census.experimental.diffexp.memento.diff_expr", + "justMyCode": true, + "args": [ + "tissue_general_ontology_term_id in ['UBERON:0000030', 'UBERON:0000992']", + "tissue_general_ontology_term_id", + "/mnt/census/estimators-cube-70a7705/", + "1", + "1000"], + "subProcess": true + }, + { + "name": "Memento (medium)", + "type": "python", + "request": "launch", + "cwd": "${workspaceFolder}/api/python/cellxgene_census/src/", + "module": "cellxgene_census.experimental.diffexp.memento.diff_expr", + "justMyCode": true, + "args": [ + "tissue_general_ontology_term_id in ['UBERON:0000948', 'UBERON:0001004']", + "tissue_general_ontology_term_id", + "/mnt/census/estimators-cube-70a7705/", + "1", + "5000"], + "subProcess": true + }, + { + 
"name": "Memento (large)", + "type": "python", + "request": "launch", + "cwd": "${workspaceFolder}/api/python/cellxgene_census/src/", + "module": "cellxgene_census.experimental.diffexp.memento.diff_expr", + "justMyCode": true, + "args": [ + "tissue_general_ontology_term_id in ['UBERON:0000948', 'UBERON:0001004']", + "tissue_general_ontology_term_id", + "/mnt/census/estimators-cube-70a7705/", + "1", + ""], + "subProcess": true + }, + { + "name": "Python: File", + "type": "python", + "request": "launch", + "program": "${file}", + "justMyCode": true + } + ] +} \ No newline at end of file diff --git a/api/python/cellxgene_census/pyproject.toml b/api/python/cellxgene_census/pyproject.toml index 2da2fb56f..898c253f6 100644 --- a/api/python/cellxgene_census/pyproject.toml +++ b/api/python/cellxgene_census/pyproject.toml @@ -45,6 +45,7 @@ experimental = [ "torchdata~=0.7", "scikit-learn~=1.0", "scikit-misc>=0.2", # scikit-misc 0.3 dropped Python 3.8 support + "polars==0.20.4", "psutil~=5.0", "datasets~=2.0", "tdigest~=0.5", diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/diffexp/__init__.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/diffexp/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/diffexp/memento/README.md b/api/python/cellxgene_census/src/cellxgene_census/experimental/diffexp/memento/README.md new file mode 100644 index 000000000..a010e4751 --- /dev/null +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/diffexp/memento/README.md @@ -0,0 +1,9 @@ +# Differential Expression using memento + +This directory contains code for a Census-integrated version of the `memento` method for differential expression +analysis, including differential variability and co-expression. The underlying method is described in +the [memento pre-print](https://www.biorxiv.org/content/10.1101/2022.11.09.515836v1). 
+ +This implementation relies upon a database of pre-computed estimators that are derived from a given Census data release. +The database is a TileDB array, structured as a multi-dimensional cube. It is built by +the `tools/models/memento/src/estimators_cube_builder/cube_builder.py` script. diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/diffexp/memento/__init__.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/diffexp/memento/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/diffexp/memento/diff_expr.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/diffexp/memento/diff_expr.py new file mode 100644 index 000000000..f6d3c5a16 --- /dev/null +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/diffexp/memento/diff_expr.py @@ -0,0 +1,335 @@ +#!/usr/bin/env python +import itertools +import json +import logging +import os +import pstats +import sys +from concurrent.futures import ProcessPoolExecutor +from functools import partial, reduce +from typing import List, Optional, Tuple, cast + +import numpy as np +import numpy.typing as npt +import pandas as pd +import polars as pl +import scipy.stats as stats +import tiledb +from sklearn.linear_model import LinearRegression + +OBS_GROUPS_ARRAY = "obs_groups" +ESTIMATORS_ARRAY = "estimators" +FEATURE_IDS_FILE = "feature_ids.json" + +CUBE_LOGICAL_DIMS_OBS = [ + "cell_type_ontology_term_id", + "dataset_id", + "tissue_general_ontology_term_id", + "assay_ontology_term_id", + "donor_id", + "disease_ontology_term_id", + "sex_ontology_term_id", + "development_stage_ontology_term_id", + "self_reported_ethnicity_ontology_term_id", + "suspension_type", +] + + +def compute_memento_estimators_from_precomputed_stats(estimators_df: pl.DataFrame) -> pl.DataFrame: + """ + Computes the mean and standard error of the mean (SEM) for each feature in the estimators DataFrame. 
+ + This function takes a DataFrame containing precomputed statistics for each feature, including the number of observations, + sum, sum of squares, and size factor. It calculates the mean and SEM for each feature based on these statistics. + + Parameters: + estimators_df (pl.DataFrame): A DataFrame containing the precomputed statistics for each feature. Must include + columns 'n_obs', 'sum', 'sumsq', and 'size_factor'. + + Returns: + pl.DataFrame: A DataFrame with the original columns from `estimators_df` plus two new columns: + 'mean' - the mean expression level for each feature. + 'sem' - the standard error of the mean for each feature. + """ + n_obs = estimators_df["n_obs"].to_numpy() + expr_sum = estimators_df["sum"].to_numpy() + expr_sumsq = estimators_df["sumsq"].to_numpy() + size_factors = estimators_df["size_factor"].to_numpy() + mean = (expr_sum + 1) / (size_factors + 1) + var = expr_sumsq / n_obs - (expr_sum / n_obs) ** 2 + var[var < 0] = 0 # ensure variances are non-negative + sem = np.sqrt(var) * np.sqrt(n_obs) / size_factors + + estimators_df = estimators_df.with_columns([pl.Series("mean", mean), pl.Series("sem", sem)]) + return estimators_df + + +# @timeit +def query_estimators( + cube_path: str, + obs_groups_df: pd.DataFrame, + features: List[str], +) -> pl.DataFrame: + tiledb_config = { + "py.init_buffer_bytes": 2**31, + } + with tiledb.open(os.path.join(cube_path, ESTIMATORS_ARRAY), "r", config=tiledb_config) as estimators_array: + estimators_df = estimators_array.df[features, obs_groups_df.obs_group_joinid.values] + estimators_df = ( + estimators_df.merge( + obs_groups_df[["obs_group_joinid", "selected_vars_group_joinid"]], on="obs_group_joinid" + ) + .groupby(["feature_id", "selected_vars_group_joinid"]) + .sum() + .reset_index() + ) + estimators_df["obs_group_joinid"] = estimators_df["selected_vars_group_joinid"].astype("uint32") + del estimators_df["selected_vars_group_joinid"] + estimators_df = pl.DataFrame(estimators_df) + + estimators_df = 
compute_memento_estimators_from_precomputed_stats(estimators_df) + # TODO: Determine whether it's reasonable to drop these values, or if we should revisit how they're being + # computed in the first place. If reasonable, this filtering should be done by the cube builder, not here. + # This filtering ensures that we will not take of logs of non-positive values, or end up with selm values of 0 + estimators_df = drop_invalid_data(estimators_df) + + return cast(pl.DataFrame, estimators_df) + + +# @timeit +def drop_invalid_data(estimators_df: pl.DataFrame) -> pl.DataFrame: + drop_mask = (estimators_df["sem"] <= 0) | (estimators_df["sem"] >= estimators_df["mean"]) + if drop_mask.any(): + logging.warning(f"dropping {drop_mask.sum()} rows with invalid values ({drop_mask.sum() / len(drop_mask):.2%})") + estimators_df = estimators_df.filter(~drop_mask) + return estimators_df + + +def compute_all( + cube_path: str, + query_filter: str, + treatment: str, + n_processes: int, + covariates_str: Optional[str] = None, +) -> Tuple[pd.DataFrame, pstats.Stats]: + default_covariates = CUBE_LOGICAL_DIMS_OBS + + if covariates_str is None: + covariates = default_covariates + else: + covariates = covariates_str.split(",") + with tiledb.open(os.path.join(cube_path, OBS_GROUPS_ARRAY), "r") as obs_groups_array: + obs_groups_df = obs_groups_array.query(cond=query_filter or None).df[:] + if covariates != default_covariates: + obs_groups_df = obs_groups_df[covariates + [treatment, "obs_group_joinid", "n_obs"]] + + distinct_treatment_values = obs_groups_df[treatment].nunique() + assert distinct_treatment_values == 2, "treatment must have exactly 2 distinct values" + + features = get_features(cube_path, None) + + # compute each feature group in parallel + n_feature_groups = min(len(features), n_processes) + feature_groups = [features.tolist() for features in np.array_split(np.array(features), n_feature_groups)] + logging.debug( + f"computing for {len(obs_groups_df)} obs groups 
({obs_groups_df.n_obs.sum()} cells) and {len(features)} features using {n_feature_groups} processes, {len(features) // n_feature_groups} features/process" + ) + + # make treatment variable be in the first column of the design matrix + variables = [treatment] + [covariate for covariate in covariates if covariate != treatment] + selected_vars_groups_groupby = obs_groups_df.groupby(variables, observed=True) + + agg_dict = {i: "first" for i in variables} + agg_dict["n_obs"] = "sum" + selected_vars_groups_df = selected_vars_groups_groupby.agg(agg_dict) + + obs_groups_df["selected_vars_group_joinid"] = selected_vars_groups_groupby.ngroup().astype("uint32") + selected_vars_groups_df["obs_group_joinid"] = np.arange(len(selected_vars_groups_df), dtype="uint32") + + design = pd.get_dummies(selected_vars_groups_df[variables].astype(str), drop_first=True, dtype=int) + assert design.shape[1] == selected_vars_groups_df[variables].nunique().sum() - len(variables) + + result_groups = ProcessPoolExecutor(max_workers=n_processes).map( + partial( + compute_for_features, + cube_path, + design, + obs_groups_df[["obs_group_joinid", "selected_vars_group_joinid", "n_obs"]], + selected_vars_groups_df[["obs_group_joinid", "n_obs"]], + ), + feature_groups, + range(len(feature_groups)), + ) + + results = list(result_groups) + assert len(results) + + # HACK: handle tuple-typed rests when @cprofile decorator is used on compute_for_features() + if isinstance(results[0], tuple): # type:ignore + # flatten results + data = itertools.chain.from_iterable([r[0] for r in results]) # type: ignore[unreachable] + stats = reduce(lambda s1, s2: s1.add(s2), [pstats.Stats(r[1]) if r[1] else pstats.Stats() for r in results]) + else: + data = itertools.chain.from_iterable(results) # flatten results + stats = pstats.Stats() + + return ( + pd.DataFrame(data, columns=["feature_id", "coef", "z", "pval"], copy=False) + .set_index("feature_id") + .sort_values("z", ascending=False, inplace=False), + stats, + ) + + 
+def get_features(cube_path: str, n_features: Optional[int] = None) -> List[str]: + with open(os.path.join(cube_path, FEATURE_IDS_FILE)) as f: + feature_ids = json.load(f) + + if n_features is not None: + # for testing purposes, useful to limit the number of features + rng = np.random.default_rng(1024) + feature_ids = rng.choice(feature_ids, size=n_features, replace=False) + + return cast(List[str], feature_ids) + + +# @cprofile +# @timeit_report +def compute_for_features( + cube_path: str, + design: pd.DataFrame, + obs_groups_df: pd.DataFrame, + selected_vars_groups_df: pd.DataFrame, + features: List[str], + feature_group_key: int, +) -> List[Tuple[str, np.float32, np.float32, np.float32]]: + logging.debug( + f"computing for feature group {feature_group_key}, n={len(features)}, {features[0]}..{features[-1]}..." + ) + estimators = query_estimators(cube_path, obs_groups_df, features) + cell_counts = selected_vars_groups_df["n_obs"].values + obs_group_joinids = selected_vars_groups_df[["obs_group_joinid"]] + + result = [ + (feature_id, *compute_for_feature(cell_counts, design, feature_estimators, obs_group_joinids)) # type:ignore + for feature_id, feature_estimators in estimators.group_by(["feature_id"]) + ] + + logging.debug(f"computed for feature group {feature_group_key}, {features[0]}..{features[-1]}") + + return result + + +# @timeit +def compute_for_feature( + cell_counts: npt.NDArray[np.float32], + design: pd.DataFrame, + estimators: pd.DataFrame, + obs_group_joinids: pd.DataFrame, +) -> Tuple[np.float32, np.float32, np.float32]: + # ensure estimators are available for all obs groups (for when feature had no expression data for some obs groups) + estimators = fill_missing_data(obs_group_joinids, estimators) + + assert len(estimators) == len(design) + + # Transform to log space (alternatively can resample in log space) + lm, selm = transform_to_log_space(estimators["mean"].to_numpy(), estimators["sem"].to_numpy()) + + return de_wls(X=design.values, y=lm, 
n=cell_counts, v=selm**2) + + +# @timeit +def transform_to_log_space( + m: npt.NDArray[np.float32], sem: npt.NDArray[np.float32] +) -> Tuple[npt.NDArray[np.float32], npt.NDArray[np.float32]]: + lm = np.log(m) + selm = (np.log(m + sem) - np.log(m - sem)) / 2 + assert (selm > 0).all() + return lm, selm + + +# @timeit +def fill_missing_data(obs_group_joinids: pd.DataFrame, feature_estimators: pl.DataFrame) -> pl.DataFrame: + feature_estimators = pl.DataFrame(obs_group_joinids).join( + feature_estimators[["obs_group_joinid", "mean", "sem"]], on="obs_group_joinid", how="left" + ) + + return feature_estimators.with_columns( + feature_estimators["mean"].fill_null(1e-3), feature_estimators["sem"].fill_null(1e-4) + ) + + +# @timeit +def de_wls_fit(X: npt.NDArray[np.float32], y: npt.NDArray[np.float32], n: npt.NDArray[np.float32]) -> np.float32: + # fit WLS using sample_weights + WLS = LinearRegression() + WLS.fit(X, y, sample_weight=n) + + # note: we have all the other coefficients (i.e. effect size) for the other covariates here as well, but we only + # want the treatment effect for now + return cast(np.float32, WLS.coef_[0]) + + +# @timeit +def de_wls_stats( + X: npt.NDArray[np.float32], v: npt.NDArray[np.float32], coef: np.float32 +) -> Tuple[np.float32, np.float32]: + W = de_wls_stats_W(v) + m = de_wls_stats_matmul(W, X) + pinv = de_wls_stats_pinv(m) + beta_var_hat = np.diag(pinv) + se = np.sqrt(beta_var_hat[0]) + + z = coef / se + pv = stats.norm.sf(np.abs(z)) * 2 + + return z, pv + + +# @timeit +def de_wls_stats_pinv(m: npt.NDArray[np.float32]) -> npt.NDArray[np.float32]: + return np.linalg.pinv(m) + + +# @timeit +def de_wls_stats_matmul(W: npt.NDArray[np.float32], X: npt.NDArray[np.float32]) -> npt.NDArray[np.float32]: + return (X.T * W) @ X + + +# @timeit +def de_wls_stats_W(v: npt.NDArray[np.float32]) -> npt.NDArray[np.float32]: + return 1 / v + + +# @timeit +def de_wls( + X: npt.NDArray[np.float32], + y: npt.NDArray[np.float32], + n: npt.NDArray[np.float32], + v: 
npt.NDArray[np.float32], +) -> Tuple[np.float32, np.float32, np.float32]: + """ + Perform DE for each gene using Weighted Least Squares (i.e., a weighted Linear Regression model) + """ + coef = de_wls_fit(X, y, n) + z, pv = de_wls_stats(X, v, coef) + + return coef, z, pv + + +# Script entrypoint +if __name__ == "__main__": + if len(sys.argv) < 5: + print("Usage: python diff_expr.py ") + sys.exit(1) + + filter_arg, treatment_arg, cube_path_arg, n_processes, covariates = sys.argv[1:6] + + logging.getLogger().setLevel(logging.DEBUG) + + de_result = compute_all( + cube_path_arg, filter_arg, treatment_arg, int(n_processes), covariates if covariates else None + ) + + # Output DE result + print(de_result[0]) diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/diffexp/memento/profile.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/diffexp/memento/profile.py new file mode 100644 index 000000000..4ae7afeb2 --- /dev/null +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/diffexp/memento/profile.py @@ -0,0 +1,74 @@ +import cProfile +import os +import tempfile +import time +from collections import defaultdict +from functools import wraps +from typing import Dict + +fn_cum_time: Dict[str, float] = defaultdict(lambda: 0) +fn_calls: Dict[str, int] = defaultdict(lambda: 0) + + +def cprofile(func): # type: ignore[no-untyped-def] + """ + Wraps a function with cProfile. Returns the result of the function and the path a file containing a dump of cProfile Stats object file, as 2-tuple. + The stats are returned as a file to allow for aggregation of stats from multiple child processes. 
+ """ + + @wraps(func) + def cprofile_wrapper(*args, **kwargs): # type: ignore[no-untyped-def] + with cProfile.Profile() as prof: + result = func(*args, **kwargs) + + f = tempfile.mkstemp()[1] + prof.dump_stats(f) + + return result, f + + return cprofile_wrapper + + +def timeit_report(func): # type: ignore[no-untyped-def] + """ + Decorator to calls to all nested functions that are decorated with @timeit (including the function being decorated by this decorator, + if it is also decorated with @timeit). + Prints a report of the cumulative and average time spent in each function, along with the number of calls to each function. + The profiling is simpler than cProfile-based profiling, but prints immediate output to the command line that is easier to read. + """ + + @wraps(func) + def timeit_report_wrapper(*args, **kwargs): # type: ignore[no-untyped-def] + result = func(*args, **kwargs) + + sorted_fn_names = [k for k, _ in sorted(fn_cum_time.items(), key=lambda i: i[1], reverse=True)] + for fn_name in sorted_fn_names: + print( + f"[timing {os.getpid()}] {fn_name}: " + f"cum_time={fn_cum_time[fn_name]} sec; avg_time={(fn_cum_time[fn_name] / fn_calls[fn_name]):.3f}; " + f"calls={fn_calls[fn_name]}" + ) + + return result + + return timeit_report_wrapper + + +def timeit(func): # type: ignore[no-untyped-def] + @wraps(func) + def timeit_wrapper(*args, **kwargs): # type: ignore[no-untyped-def] + start_time = time.perf_counter() + result = func(*args, **kwargs) + end_time = time.perf_counter() + exec_time = end_time - start_time + + fn_name = func.__name__ + fn_cum_time[fn_name] += exec_time + fn_calls[fn_name] += 1 + # print(f'[timing] {fn_name}: exec time={exec_time:.3f} sec; ' + # f'cum_time={fn_cum_time[fn_name]} sec; avg_time={(fn_cum_time[fn_name] / fn_calls[fn_name]):.3f}; ' + # f'calls={fn_calls[fn_name]}') + + return result + + return timeit_wrapper diff --git a/api/python/cellxgene_census/tests/experimental/diffexp/__init__.py 
b/api/python/cellxgene_census/tests/experimental/diffexp/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/api/python/cellxgene_census/tests/experimental/diffexp/memento/__init__.py b/api/python/cellxgene_census/tests/experimental/diffexp/memento/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/api/python/cellxgene_census/tests/experimental/diffexp/memento/test_diff_expr.py b/api/python/cellxgene_census/tests/experimental/diffexp/memento/test_diff_expr.py new file mode 100644 index 000000000..ea83c597a --- /dev/null +++ b/api/python/cellxgene_census/tests/experimental/diffexp/memento/test_diff_expr.py @@ -0,0 +1,93 @@ +from os import path +from typing import Any, Dict + +import numpy as np +import pandas as pd +import pytest + +from cellxgene_census.experimental.diffexp.memento import diff_expr + + +class TestDiffExprRealDataset: + """ + This class contains regression tests that run on realistic datasets. + + Since this class is intended to be serve as a regression test suite + using real data, it is strongly recommended that the functions under + test are all PUBLIC functions. Private functions are best encapsulated + in separate class. + """ + + @pytest.fixture(scope="class", params=["test_case_1", "test_case_2"]) + def test_cases_for_compute_all_fn(self, request: Any) -> Dict[str, Any]: + """ + Fixture that generates test cases for function + calls to `compute_all` given a test case name. + + This fixture returns a tuple `(diff_exp_query, expected_result)` + such that `diff_exp_query` encapsulates the differential expression + query that will be executed by `compute_all()`. + + `expected_result` is a datastructure containing the pertinent parts of + the return value of `compute_all()`. 
+ """ + # TODO: Figure out a common location to store estimator cube fixtures so that it is + # explicitly clear that both the differential expression API and differential expression cube builder + # components use it for testing + pwd = path.dirname(__file__) + estimator_cube_path = path.join( + pwd, "../../../../../../../tools/models/memento/tests/fixtures/estimators-cube-expected/" + ) + + test_cases = { + "test_case_1": { + "diff_exp_query": { + "cube_path": estimator_cube_path, + "query_filter": "tissue_general_ontology_term_id in ['UBERON:0001723'] and sex_ontology_term_id in ['PATO:0000383', 'PATO:0000384']", + "treatment": "sex_ontology_term_id", + }, + "expected_diff_exp_result": [ + ("ENSG00000000419", -0.111612, -1.895204, 0.058065), + ("ENSG00000002330", 0.229054, 4.085651, 0.000044), + ], + }, + "test_case_2": { + "diff_exp_query": { + "cube_path": estimator_cube_path, + "query_filter": "tissue_general_ontology_term_id in ['UBERON:0001723'] and cell_type_ontology_term_id in ['CL:0000066', 'CL:0000057']", + "treatment": "cell_type_ontology_term_id", + }, + "expected_diff_exp_result": [ + ("ENSG00000000419", 0.868715, 6.048411, 1.462810e-09), + ("ENSG00000002330", 0.834346, 5.218739, 1.801458e-07), + ], + }, + } + + return test_cases[request.param] + + def test_diff_exp_query_basic(self, test_cases_for_compute_all_fn: Any) -> None: + # Arrange + estimator_cube_path = test_cases_for_compute_all_fn["diff_exp_query"]["cube_path"] + query_filter = test_cases_for_compute_all_fn["diff_exp_query"]["query_filter"] + treatment = test_cases_for_compute_all_fn["diff_exp_query"]["treatment"] + + # Act + observed_diff_exp_result_df, _ = diff_expr.compute_all( + cube_path=estimator_cube_path, + query_filter=query_filter, + treatment=treatment, + n_processes=1, + ) + + observed_diff_exp_result_df = observed_diff_exp_result_df.reset_index().set_index("feature_id").sort_index() + + expected_data = test_cases_for_compute_all_fn["expected_diff_exp_result"] + 
expected_diff_exp_result_df = ( + pd.DataFrame(expected_data, columns=["feature_id", "coef", "z", "pval"]) + .set_index("feature_id") + .sort_index() + ) + + # Assert + assert np.allclose(observed_diff_exp_result_df.values, expected_diff_exp_result_df.values, atol=1e-07) diff --git a/tools/models/__init__.py b/tools/models/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tools/models/memento/README.md b/tools/models/memento/README.md new file mode 100644 index 000000000..c9e2c19bc --- /dev/null +++ b/tools/models/memento/README.md @@ -0,0 +1,29 @@ +# Memento Pre-computed Estimator Cube Builder + +The `cell_census_summary_cube` script pre-computes the estimators that are used by Memento, using the CELLxGENE Census +single-cell data. The estimators are output to a TileDB array named `estimators_cube`. + +Usage instructions: + +1. It is recommended to run this script on an AWS EC2 `r6id.24xlarge` instance running `Ubuntu 22.04`, 1024GB root drive, in the `us-west-2` region. +2. While the builder has been tuned to run within the available memory of this instance type, it is safest to configure the instance with swap space to avoid OOM errors. Copy this [script](https://github.com/chanzuckerberg/cellxgene-census/blob/d9bd1eb4a3e14974a0e7d9c23fb8368e79b92c2d/tools/scripts/aws/swapon_instance_storage.sh) to the instance and run as root: `sudo swapon_instance_storage.sh 1`. Note the `1` will only utilize one SSD for swap space, which should be sufficient. +3. Install Python: `sudo apt install python3-venv` +4. `git clone git@github.com:chanzuckerberg/cellxgene-census.git` +5. Setup a virtualenv and `pip install -r tools/models/memento/requirements.txt`. +6. Download the Census to local filesystem: `sudo aws s3 --no-sign-request sync s3://cellxgene-data-public/cell-census//soma/ ` +7. To run: `python -O -m estimators_cube_builder --cube-uri / --experiment-uri /census_data/homo_sapiens --overwrite --validate --consolidate 2>&1 | tee build-cube.log`. 
+ +For better performance, the local Census path can be on a volume mounted on an SSD drive. E.g.: + +```sh +sudo mkfs.ext4 -L census /dev/nvme1n2 +sudo mkdir -p /mnt/census +sudo mount /dev/nvme1n2 /mnt/census +``` + +To inspect the results of the cube, see `estimators_cube_builder/cube-adhoc-query.ipynb`. + +Notes: + +- The script makes use of Python's multiprocessing to parallelize the estimator computations. The amount of memory used per sub-process and overall on the instance will be impacted by the constants `MIN_BATCH_SIZE`, `MAX_CELLS`, and `MAX_WORKERS`. The `MAX_CELLS` is the upper limit of cells that worker processes will be allowed to process at a given time (enforced by `ResourcePoolProcessExecutor`). This effectively controls the peak memory usage to avoid using swap space, which would negatively impact performance. However, if this causes the worker process count to be less than the CPU count, the CPUs will be underutilized. This can be rectified by decreasing `MIN_BATCH_SIZE`, which will reduce the memory used per process and allow more worker processes to run in parallel. +- The script takes ~17 hours to run in the default configuration on the `r6id.24xlarge` instance size. 
diff --git a/tools/models/memento/__init__.py b/tools/models/memento/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tools/models/memento/requirements.txt b/tools/models/memento/requirements.txt new file mode 100644 index 000000000..29b069fc0 --- /dev/null +++ b/tools/models/memento/requirements.txt @@ -0,0 +1,6 @@ +pandas +pyarrow +tiledbsoma~=1.6.0 +click +numpy +psutil diff --git a/tools/models/memento/src/__init__.py b/tools/models/memento/src/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tools/models/memento/src/estimators_cube_builder/__init__.py b/tools/models/memento/src/estimators_cube_builder/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tools/models/memento/src/estimators_cube_builder/__main__.py b/tools/models/memento/src/estimators_cube_builder/__main__.py new file mode 100644 index 000000000..29e3c2bfc --- /dev/null +++ b/tools/models/memento/src/estimators_cube_builder/__main__.py @@ -0,0 +1,4 @@ +from .cube_builder import build_cli + +if __name__ == "__main__": + build_cli() diff --git a/tools/models/memento/src/estimators_cube_builder/cube-adhoc-query.ipynb b/tools/models/memento/src/estimators_cube_builder/cube-adhoc-query.ipynb new file mode 100644 index 000000000..038354304 --- /dev/null +++ b/tools/models/memento/src/estimators_cube_builder/cube-adhoc-query.ipynb @@ -0,0 +1,458 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Query the estimators cube" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# load cube data\n", + "\n", + "import json\n", + "from os import path\n", + "import tiledb as tdb\n", + "\n", + "cube_dir = '/mnt/census/estimators-cube-70a7705-float32'\n", + "estimators_array = tdb.open(path.join(cube_dir, \"estimators\"), config={\"soma.init_buffer_bytes\": 2**32})\n", + "obs_groups_df = tdb.open(path.join(cube_dir, 
\"obs_groups\")).df[:].set_index('obs_group_joinid')\n", + "with open(path.join(cube_dir, \"feature_ids.json\"), 'r') as json_file:\n", + " features = json.load(json_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# filter to a single tissue and a few features, as an example\n", + "\n", + "obs_groups_df = obs_groups_df[obs_groups_df.tissue_general_ontology_term_id.isin(['UBERON:0000970'])]\n", + "\n", + "features = features[:3]" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + "
cell_type_ontology_term_iddataset_idtissue_general_ontology_term_idassay_ontology_term_iddonor_iddisease_ontology_term_idsex_ontology_term_iddevelopment_stage_ontology_term_idself_reported_ethnicity_ontology_term_idsuspension_typen_obsmeansem
feature_idobs_group_joinid
ENSG00000000003780CL:0000003d5c67a4e-a8d9-456d-a273-fa01adb1b308UBERON:0000970EFO:0009899Donor1MONDO:0005129PATO:0000383HsapDv:0000206unknowncell2360.3522600.053637
ENSG00000000419780CL:0000003d5c67a4e-a8d9-456d-a273-fa01adb1b308UBERON:0000970EFO:0009899Donor1MONDO:0005129PATO:0000383HsapDv:0000206unknowncell2361.0388700.101254
ENSG00000000003781CL:0000003d5c67a4e-a8d9-456d-a273-fa01adb1b308UBERON:0000970EFO:0009899Donor2PATO:0000461PATO:0000384HsapDv:0000136unknowncell26840.1298520.010113
ENSG00000000419781CL:0000003d5c67a4e-a8d9-456d-a273-fa01adb1b308UBERON:0000970EFO:0009899Donor2PATO:0000461PATO:0000384HsapDv:0000136unknowncell26840.8303120.033632
ENSG00000000003782CL:0000003d5c67a4e-a8d9-456d-a273-fa01adb1b308UBERON:0000970EFO:0009899Donor3PATO:0000461PATO:0000383HsapDv:0000147unknowncell7840.1155060.016165
.............................................
ENSG00000000005114533CL:40300285cdbb2ea-c622-466d-9ead-7884ad8cb99fUBERON:0000970EFO:0009899H5PATO:0000461PATO:0000384HsapDv:0000163HANCESTRO:0005cell7400.0038240.001914
ENSG00000000419114533CL:40300285cdbb2ea-c622-466d-9ead-7884ad8cb99fUBERON:0000970EFO:0009899H5PATO:0000461PATO:0000384HsapDv:0000163HANCESTRO:0005cell7400.1605980.017524
114534CL:40300285cdbb2ea-c622-466d-9ead-7884ad8cb99fUBERON:0000970EFO:0009899H9PATO:0000461PATO:0000383HsapDv:0000147HANCESTRO:0005cell310.2623590.093638
NaN114535CL:40300285cdbb2ea-c622-466d-9ead-7884ad8cb99fUBERON:0000970EFO:0009899sanes_Pt2PATO:0000461PATO:0000384HsapDv:0000172HANCESTRO:0005cell6NaNNaN
ENSG00000000419114536CL:40300285cdbb2ea-c622-466d-9ead-7884ad8cb99fUBERON:0000970EFO:0009922H11PATO:0000461PATO:0000384HsapDv:0000159HANCESTRO:0005cell420.0652960.033346
\n", + "

1444 rows × 13 columns

\n", + "
" + ], + "text/plain": [ + " cell_type_ontology_term_id \\\n", + "feature_id obs_group_joinid \n", + "ENSG00000000003 780 CL:0000003 \n", + "ENSG00000000419 780 CL:0000003 \n", + "ENSG00000000003 781 CL:0000003 \n", + "ENSG00000000419 781 CL:0000003 \n", + "ENSG00000000003 782 CL:0000003 \n", + "... ... \n", + "ENSG00000000005 114533 CL:4030028 \n", + "ENSG00000000419 114533 CL:4030028 \n", + " 114534 CL:4030028 \n", + "NaN 114535 CL:4030028 \n", + "ENSG00000000419 114536 CL:4030028 \n", + "\n", + " dataset_id \\\n", + "feature_id obs_group_joinid \n", + "ENSG00000000003 780 d5c67a4e-a8d9-456d-a273-fa01adb1b308 \n", + "ENSG00000000419 780 d5c67a4e-a8d9-456d-a273-fa01adb1b308 \n", + "ENSG00000000003 781 d5c67a4e-a8d9-456d-a273-fa01adb1b308 \n", + "ENSG00000000419 781 d5c67a4e-a8d9-456d-a273-fa01adb1b308 \n", + "ENSG00000000003 782 d5c67a4e-a8d9-456d-a273-fa01adb1b308 \n", + "... ... \n", + "ENSG00000000005 114533 5cdbb2ea-c622-466d-9ead-7884ad8cb99f \n", + "ENSG00000000419 114533 5cdbb2ea-c622-466d-9ead-7884ad8cb99f \n", + " 114534 5cdbb2ea-c622-466d-9ead-7884ad8cb99f \n", + "NaN 114535 5cdbb2ea-c622-466d-9ead-7884ad8cb99f \n", + "ENSG00000000419 114536 5cdbb2ea-c622-466d-9ead-7884ad8cb99f \n", + "\n", + " tissue_general_ontology_term_id \\\n", + "feature_id obs_group_joinid \n", + "ENSG00000000003 780 UBERON:0000970 \n", + "ENSG00000000419 780 UBERON:0000970 \n", + "ENSG00000000003 781 UBERON:0000970 \n", + "ENSG00000000419 781 UBERON:0000970 \n", + "ENSG00000000003 782 UBERON:0000970 \n", + "... ... 
\n", + "ENSG00000000005 114533 UBERON:0000970 \n", + "ENSG00000000419 114533 UBERON:0000970 \n", + " 114534 UBERON:0000970 \n", + "NaN 114535 UBERON:0000970 \n", + "ENSG00000000419 114536 UBERON:0000970 \n", + "\n", + " assay_ontology_term_id donor_id \\\n", + "feature_id obs_group_joinid \n", + "ENSG00000000003 780 EFO:0009899 Donor1 \n", + "ENSG00000000419 780 EFO:0009899 Donor1 \n", + "ENSG00000000003 781 EFO:0009899 Donor2 \n", + "ENSG00000000419 781 EFO:0009899 Donor2 \n", + "ENSG00000000003 782 EFO:0009899 Donor3 \n", + "... ... ... \n", + "ENSG00000000005 114533 EFO:0009899 H5 \n", + "ENSG00000000419 114533 EFO:0009899 H5 \n", + " 114534 EFO:0009899 H9 \n", + "NaN 114535 EFO:0009899 sanes_Pt2 \n", + "ENSG00000000419 114536 EFO:0009922 H11 \n", + "\n", + " disease_ontology_term_id \\\n", + "feature_id obs_group_joinid \n", + "ENSG00000000003 780 MONDO:0005129 \n", + "ENSG00000000419 780 MONDO:0005129 \n", + "ENSG00000000003 781 PATO:0000461 \n", + "ENSG00000000419 781 PATO:0000461 \n", + "ENSG00000000003 782 PATO:0000461 \n", + "... ... \n", + "ENSG00000000005 114533 PATO:0000461 \n", + "ENSG00000000419 114533 PATO:0000461 \n", + " 114534 PATO:0000461 \n", + "NaN 114535 PATO:0000461 \n", + "ENSG00000000419 114536 PATO:0000461 \n", + "\n", + " sex_ontology_term_id \\\n", + "feature_id obs_group_joinid \n", + "ENSG00000000003 780 PATO:0000383 \n", + "ENSG00000000419 780 PATO:0000383 \n", + "ENSG00000000003 781 PATO:0000384 \n", + "ENSG00000000419 781 PATO:0000384 \n", + "ENSG00000000003 782 PATO:0000383 \n", + "... ... 
\n", + "ENSG00000000005 114533 PATO:0000384 \n", + "ENSG00000000419 114533 PATO:0000384 \n", + " 114534 PATO:0000383 \n", + "NaN 114535 PATO:0000384 \n", + "ENSG00000000419 114536 PATO:0000384 \n", + "\n", + " development_stage_ontology_term_id \\\n", + "feature_id obs_group_joinid \n", + "ENSG00000000003 780 HsapDv:0000206 \n", + "ENSG00000000419 780 HsapDv:0000206 \n", + "ENSG00000000003 781 HsapDv:0000136 \n", + "ENSG00000000419 781 HsapDv:0000136 \n", + "ENSG00000000003 782 HsapDv:0000147 \n", + "... ... \n", + "ENSG00000000005 114533 HsapDv:0000163 \n", + "ENSG00000000419 114533 HsapDv:0000163 \n", + " 114534 HsapDv:0000147 \n", + "NaN 114535 HsapDv:0000172 \n", + "ENSG00000000419 114536 HsapDv:0000159 \n", + "\n", + " self_reported_ethnicity_ontology_term_id \\\n", + "feature_id obs_group_joinid \n", + "ENSG00000000003 780 unknown \n", + "ENSG00000000419 780 unknown \n", + "ENSG00000000003 781 unknown \n", + "ENSG00000000419 781 unknown \n", + "ENSG00000000003 782 unknown \n", + "... ... \n", + "ENSG00000000005 114533 HANCESTRO:0005 \n", + "ENSG00000000419 114533 HANCESTRO:0005 \n", + " 114534 HANCESTRO:0005 \n", + "NaN 114535 HANCESTRO:0005 \n", + "ENSG00000000419 114536 HANCESTRO:0005 \n", + "\n", + " suspension_type n_obs mean sem \n", + "feature_id obs_group_joinid \n", + "ENSG00000000003 780 cell 236 0.352260 0.053637 \n", + "ENSG00000000419 780 cell 236 1.038870 0.101254 \n", + "ENSG00000000003 781 cell 2684 0.129852 0.010113 \n", + "ENSG00000000419 781 cell 2684 0.830312 0.033632 \n", + "ENSG00000000003 782 cell 784 0.115506 0.016165 \n", + "... ... ... ... ... 
\n", + "ENSG00000000005 114533 cell 740 0.003824 0.001914 \n", + "ENSG00000000419 114533 cell 740 0.160598 0.017524 \n", + " 114534 cell 31 0.262359 0.093638 \n", + "NaN 114535 cell 6 NaN NaN \n", + "ENSG00000000419 114536 cell 42 0.065296 0.033346 \n", + "\n", + "[1444 rows x 13 columns]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# query for matching estimators and create the cube from the obs dimensions and estimator values\n", + "\n", + "estimators = estimators_array.df[features, obs_groups_df.index.values].set_index('obs_group_joinid')\n", + "cube = obs_groups_df.join(estimators).reset_index().set_index(['feature_id', 'obs_group_joinid'])\n", + "cube" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv-api", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tools/models/memento/src/estimators_cube_builder/cube_builder.py b/tools/models/memento/src/estimators_cube_builder/cube_builder.py new file mode 100644 index 000000000..bd4d25187 --- /dev/null +++ b/tools/models/memento/src/estimators_cube_builder/cube_builder.py @@ -0,0 +1,475 @@ +from __future__ import annotations + +import gc +import json +import logging +import multiprocessing +import os +import shutil +import warnings +from concurrent import futures +from datetime import datetime +from typing import Any, Dict, List, Tuple, cast + +import click +import numpy as np +import pandas as pd +import pyarrow as pa +import tiledb +import tiledbsoma as soma +from somacore import AxisQuery, ExperimentAxisQuery +from tiledbsoma import SOMATileDBContext + +from .cube_schema import ( + ESTIMATOR_NAMES, + ESTIMATORS_ARRAY, 
+ FEATURE_IDS_FILE, + OBS_GROUPS_ARRAY, + OBS_LOGICAL_DIMS, + build_estimators_schema, + build_obs_categorical_values, + build_obs_groups_schema, +) +from .cube_validator import validate_cube +from .estimators import bin_size_factor, gen_multinomial +from .mp import create_resource_pool_executor + +PROFILE_MODE = bool(os.getenv("PROFILE_MODE", False)) # Run Step 3 in single-process mode with profiling output + +# TODO: parameterize constants below + +OBS_SIZE_FACTORS_ARRAY = "size_factors" + +TILEDB_SOMA_BUFFER_BYTES = 2**31 + +# The minimum number of cells that should be processed at a time by each child process. +MIN_BATCH_SIZE = 2**13 + +Q = 0.1 # RNA capture efficiency depending on technology + +MAX_WORKERS = None # None means use multiprocessing's dynamic default + +# The maximum number of cells values to be processed at any given time ("X nnz per batch" would be a better metric +# due to differences in X sparsity across cells, but it is not efficient to compute). The multiprocessing logic will +# not submit new jobs while this value is exceeded, thereby keeping memory usage bounded. This is needed since job +# sizes vary considerably in their memory usage, due to the high cell count of some batches (if batch sizes were not +# highly variable, we could just limit by process/worker count). +MAX_CELLS = 512_000 + +OBS_VALUE_FILTER = "is_primary_data == True" + + +logging.basicConfig( + format="%(asctime)s %(process)-7s %(levelname)-8s %(message)s", + level=logging.INFO, + datefmt="%Y-%m-%d %H:%M:%S", +) +logging.captureWarnings(True) + +# Suppress warnings from Pandas and NumPy +# TODO: Make this more specific! 
We just want to ignore "runtimewarning...degrees of freedom" errors +warnings.filterwarnings("ignore") + + +# pd.options.display.max_columns = None +# pd.options.display.width = 1024 +# pd.options.display.min_rows = 40 + + +def compute_all_estimators_for_obs_group(obs_group_rows: pd.DataFrame, obs_df: pd.DataFrame) -> pd.Series[float]: + """Computes all estimators for a given obs group's expression values""" + obs_group_name = cast(Tuple[str, ...], obs_group_rows.name) + + # Filter obs to the rows of the current group, and retrieve the "size factors" data for those rows. The full set + # of rows in the group is needed to create "dense" arrays used to compute estimators each gene in + # compute_all_estimators_for_gene(), where the full set of obs rows can no longer be determined from the "sparse" + # set of expression values for a given gene. + size_factors_for_obs_group = obs_df[["approx_size_factor"]].loc[obs_group_rows[[]].index.drop_duplicates()] + + gene_groups = obs_group_rows.groupby(["feature_id"], observed=True) + estimators = gene_groups.apply( + lambda gene_group_rows: compute_all_estimators_for_gene( + obs_group_name, gene_group_rows, size_factors_for_obs_group + ) + ) + + assert ( + estimators.index.nunique() == obs_group_rows["feature_id"].nunique() + ), f"estimators count incorrect in group {obs_group_name}" + return estimators # type: ignore + + +def compute_all_estimators_for_gene( + gene_group_name: Tuple[str, ...], gene_group_rows: pd.DataFrame, size_factors_for_obs_group: pd.DataFrame +) -> pd.Series[float]: + """Computes all estimators for a given {, ..., , gene} group of expression values""" + + estimators: Dict[str, Any] = {} + estimators["sum"] = gene_group_rows.soma_data.sum() + estimators["sumsq"] = (gene_group_rows.soma_data**2).sum() + estimators["size_factor"] = size_factors_for_obs_group.approx_size_factor.sum() + estimators["n_obs"] = size_factors_for_obs_group.shape[0] + + # order matters for estimators + return 
pd.Series(data=[estimators[n] for n in ESTIMATOR_NAMES], dtype=np.float64) + + +def compute_all_estimators_for_batch_tdb( + soma_dim_0: List[int], obs_df: pd.DataFrame, var_df: pd.DataFrame, X_uri: str, batch: int, estimators_uri: str +) -> int: + """Compute estimators for each gene""" + + with soma.SparseNDArray.open( + X_uri, + context=soma.SOMATileDBContext().replace( + tiledb_config={ + "soma.init_buffer_bytes": TILEDB_SOMA_BUFFER_BYTES, + "vfs.s3.region": "us-west-2", + "vfs.s3.no_sign_request": True, + } + ), + ) as X: + X_df = X.read(coords=(soma_dim_0, var_df.index.values)).tables().concat().to_pandas() + logging.info(f"Step 3: Start X batch {batch}, cells={len(soma_dim_0)}, nnz={len(X_df)}") + result = compute_all_estimators_for_batch_pd(X_df, obs_df, var_df) + if len(result) == 0: + logging.warning(f"Step 3: Batch {batch} had empty result, cells={len(soma_dim_0)}, nnz={len(X_df)}") + logging.info(f"Step 3: End X batch {batch}, cells={len(soma_dim_0)}, nnz={len(X_df)}") + + assert all(result.index.value_counts() <= 1), "tiledb batch has repeated cube rows" + + write_estimators_batch(result, estimators_uri) + + gc.collect() + + return len(soma_dim_0) + + +def compute_all_estimators_for_batch_pd(X_df: pd.DataFrame, obs_df: pd.DataFrame, var_df: pd.DataFrame) -> pd.DataFrame: + result = ( + X_df.set_index("soma_dim_1") + .join(var_df[["feature_id"]]) + .set_index("soma_dim_0") + .join( + obs_df[["obs_group_joinid"]] + ) # TODO: If we do a left join here, we end up with the "dense" array needed for compute_all_estimators_for_gene(); might be more efficient to dense first and make sparse later + .groupby("obs_group_joinid", sort=False) + .apply(lambda obs_group: compute_all_estimators_for_obs_group(obs_group, obs_df)) + .rename(mapper=dict(enumerate(ESTIMATOR_NAMES)), axis=1) + ) + return result + + +# TODO: replace this with obs.raw_sum +def sum_gene_expression_levels_by_cell(X_tbl: pa.Table, batch: int) -> pd.Series[float]: + logging.info(f"Pass 1: Computing 
X batch {batch}, nnz={X_tbl.shape[0]}") + + # TODO: use PyArrow API only; avoid Pandas conversion + result = X_tbl.to_pandas()[["soma_dim_0", "soma_data"]].groupby("soma_dim_0", sort=False).sum()["soma_data"] + + logging.info(f"Pass 1: Computing X batch {batch}, nnz={X_tbl.shape[0]}: done") + + return result # type: ignore + + +def step_2_compute_size_factors(query: ExperimentAxisQuery) -> pd.DataFrame: + obs_df = ( + query.obs(column_names=["soma_joinid", "raw_sum"] + OBS_LOGICAL_DIMS) + .concat() + .to_pandas() + .set_index("soma_joinid") + ) + + # Convert size factors to relative - prevents small floats for variance + global_n_umi = obs_df["raw_sum"].values.mean() + obs_df["size_factor"] = obs_df["raw_sum"].values / global_n_umi + + # Bin all sums to have fewer unique values, to speed up bootstrap computation + obs_df["approx_size_factor"] = bin_size_factor(obs_df["size_factor"].values) + + return cast(pd.DataFrame, obs_df[OBS_LOGICAL_DIMS + ["approx_size_factor"]]) + + +def step_3_compute_estimators( + cube_uri: str, query: ExperimentAxisQuery, size_factors: pd.DataFrame, /, measurement_name: str, layer: str +) -> None: + var_df = query.var().concat().to_pandas().set_index("soma_joinid") + obs_df = query.obs(column_names=["soma_joinid"] + OBS_LOGICAL_DIMS).concat().to_pandas().set_index("soma_joinid") + + # Process X by obs groups (i.e. cube rows). This ensures that estimators are computed + # for all X data contributing to a given obs group aggregation. 
+ logging.info("Step 3: Computing obs groups") + obs_grouped = obs_df[OBS_LOGICAL_DIMS].groupby(OBS_LOGICAL_DIMS, observed=True) + obs_df["obs_group_joinid"] = obs_grouped.ngroup() + # obs_df["n_obs"] = obs_grouped.size() + obs_groups_soma_joinids = obs_grouped.groups + + obs_groups_uri = os.path.join(cube_uri, OBS_GROUPS_ARRAY) + estimators_uri = os.path.join(cube_uri, ESTIMATORS_ARRAY) + + if tiledb.array_exists(estimators_uri): + logging.info("Step 3: Resuming from existing estimators cube") + with tiledb.open(obs_groups_uri, mode="r") as estimators_cube_array: + existing_obs_group_joinids = ( + estimators_cube_array.query(attrs=[], dims=["obs_group_joinid"]).df[:].index.drop_duplicates() + ) + else: + logging.info("Step 3: Creating new estimators cube") + existing_obs_group_joinids = None + obs_groups_df = ( + obs_df.groupby(["obs_group_joinid"] + OBS_LOGICAL_DIMS) + .size() + .reset_index(OBS_LOGICAL_DIMS) + .rename(columns={0: "n_obs"}) + ) + obs_categorical_values = build_obs_categorical_values(obs_groups_df) + tiledb.Array.create( + uri=obs_groups_uri, schema=build_obs_groups_schema(len(obs_grouped), obs_categorical_values) + ) + tiledb.Array.create(uri=estimators_uri, schema=build_estimators_schema(len(obs_grouped))) + # TODO: Can remove once https://github.com/TileDB-Inc/TileDB-Py/issues/1879 fix is available + # Ensure Pandas categorical columns must have the same underlying dictionaries as the TileDB Array + # schema's enumeration columns + for col in OBS_LOGICAL_DIMS: + obs_groups_df[col] = pd.Categorical(obs_groups_df[col], categories=obs_categorical_values[col]) + tiledb.from_pandas(obs_groups_uri, obs_groups_df, mode="append") + + logging.info("Step 3: Starting estimators computation") + + obs_df = obs_df.join(size_factors[["approx_size_factor"]]) + + soma_dim_0_batch: List[int] = [] + batch_futures = [] + n_batches_submitted = n_cells_submitted = 0 + + executor = create_resource_pool_executor(max_workers=MAX_WORKERS, max_resources=MAX_CELLS) + + 
n_total_cells = query.n_obs + + # For testing/debugging: Run Step 3 without multiprocessing + if PROFILE_MODE: + # force numba jit compilation outside of profiling + gen_multinomial(np.array([1, 1, 1]), 3, 1) + + import cProfile + + def process_batch() -> None: + nonlocal n_batches_submitted + n_batches_submitted += 1 + compute_all_estimators_for_batch_tdb( + soma_dim_0_batch, + obs_df, + var_df, + query.experiment.ms[measurement_name].X[layer].uri, + n_batches_submitted, + estimators_uri, + ) + + with cProfile.Profile() as pr: + for obs_group_soma_joinids in obs_groups_soma_joinids.values(): + soma_dim_0_batch.extend(obs_group_soma_joinids) + if len(soma_dim_0_batch) < MIN_BATCH_SIZE: + continue + + process_batch() + soma_dim_0_batch = [] + + if len(soma_dim_0_batch) > 0: + process_batch() + + pr.dump_stats(f"/tmp/pass_2_compute_estimators_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.prof") + + else: # use multiprocessing + + def submit_batch(soma_dim_0_batch_: List[int]) -> None: + nonlocal n_batches_submitted, n_cells_submitted + n_batches_submitted += 1 + n_cells_submitted += len(soma_dim_0_batch_) + + X_uri = query.experiment.ms[measurement_name].X[layer].uri + + logging.info( + f"Step 3: Submitting cells batch {n_batches_submitted}, cells={len(soma_dim_0_batch_)}, " + f"{100 * n_cells_submitted / n_total_cells:0.1f}%" + ) + + batch_futures.append( + executor.submit( + len(soma_dim_0_batch_), + compute_all_estimators_for_batch_tdb, + soma_dim_0_batch_, + obs_df, + var_df, + X_uri, + n_batches_submitted, + estimators_uri, + ) + ) + + start_time = datetime.now() + + for group_id, obs_group_soma_joinids in obs_groups_soma_joinids.items(): + if existing_obs_group_joinids is None or group_id not in existing_obs_group_joinids: + soma_dim_0_batch.extend(obs_group_soma_joinids) + else: + logging.info(f"Step 3: Group {group_id} already computed. 
Skipping computation.") + continue + + # Fetch data for multiple cube rows at once, to reduce X.read() call count + if len(soma_dim_0_batch) < MIN_BATCH_SIZE: + continue + + submit_batch(soma_dim_0_batch) + soma_dim_0_batch = [] + + # Process final batch + if len(soma_dim_0_batch) > 0: + submit_batch(soma_dim_0_batch) + + # Accumulate results + + n_cells_processed = 0 + for n_batches_submitted, future in enumerate(futures.as_completed(batch_futures), start=1): + n = future.result() + n_cells_processed += n + + current_time = datetime.now() + elapsed_time = current_time - start_time + pct_complete = n_cells_processed / n_total_cells + est_total_time = elapsed_time / pct_complete + logging.info( + f"Step 3: Completed {n_batches_submitted} of {len(batch_futures)} batches, " + f"batches={100 * n_batches_submitted / len(batch_futures):0.1f}%, " + f"cells={100 * n_cells_processed / n_total_cells:0.1f}%, " + f"elapsed={elapsed_time}, " + f"est. total time={est_total_time}, " + f"est. remaining time={est_total_time - elapsed_time}" + ) + gc.collect() + + logging.info("Step 3: Completed") + + +def write_estimators_batch(batch_result: pd.DataFrame, estimators_uri: str) -> None: + if len(batch_result) > 0: + batch_result = batch_result.reset_index() + + logging.info("Step 3: Writing to estimator cube.") + + tiledb.from_pandas(estimators_uri, batch_result, mode="append") + + else: + logging.warning("Step 3: Batch had empty result") + + +def build( + cube_uri: str, + experiment_uri: str, + measurement_name: str = "RNA", + layer: str = "raw", + validate: bool = True, + consolidate: bool = True, +) -> bool: + # init multiprocessing + if multiprocessing.get_start_method(True) != "spawn": + multiprocessing.set_start_method("spawn", True) + + soma_ctx = SOMATileDBContext( + tiledb_config={ + "vfs.s3.region": os.getenv("AWS_REGION", "us-west-2"), + "vfs.s3.no_sign_request": True, + } + ) + + os.makedirs(cube_uri, exist_ok=True) + + with soma.Experiment.open(uri=experiment_uri, 
context=soma_ctx) as exp: + query = exp.axis_query( + measurement_name=measurement_name, + obs_query=AxisQuery(value_filter=OBS_VALUE_FILTER), + ) + logging.info(f"Processing {query.n_obs} cells and {query.n_vars} genes") + + logging.info("Step 1: Store Features") + with open(os.path.join(cube_uri, FEATURE_IDS_FILE), "w") as f: + feature_ids = query.var(column_names=["feature_id"]).concat().to_pandas()["feature_id"].tolist() + json.dump(feature_ids, f) + logging.info(f"Stored {len(feature_ids)} features in '{FEATURE_IDS_FILE}'") + + obs_size_factors_uri = os.path.join(cube_uri, OBS_SIZE_FACTORS_ARRAY) + if not tiledb.array_exists(obs_size_factors_uri): + logging.info("Step 2: Compute Approx Size Factors") + size_factors = step_2_compute_size_factors(query) + + size_factors = size_factors.astype({col: "category" for col in OBS_LOGICAL_DIMS}) + tiledb.from_pandas(obs_size_factors_uri, size_factors.reset_index(), index_col=[0]) + logging.info("Saved `obs_with_size_factor` TileDB Array") + else: + # TODO: Can remove caching of size factors; computing this is now fast + logging.info("Step 2: Compute Approx Size Factors (loading from stored data)") + size_factors = tiledb.open(obs_size_factors_uri).df[:].set_index("soma_joinid") + + logging.info("Step 3: Compute Estimators") + + query = exp.axis_query( + measurement_name=measurement_name, + obs_query=AxisQuery(value_filter=OBS_VALUE_FILTER), + ) + logging.info(f"Step 3: Processing {query.n_obs} cells and {query.n_vars} genes") + + step_3_compute_estimators(cube_uri, query, size_factors, measurement_name=measurement_name, layer=layer) + + if validate: + logging.info("Validating estimators cube") + validate_cube(cube_uri, experiment_uri) # raises exception if invalid + logging.info("Validation complete") + + if consolidate: + logging.info("Consolidating and vacuuming estimators array") + tiledb.consolidate(os.path.join(cube_uri, ESTIMATORS_ARRAY)) + tiledb.vacuum(os.path.join(cube_uri, ESTIMATORS_ARRAY)) + + 
logging.info("Done building estimators cube") + + return True + + +@click.command() +@click.option("--cube-uri") +@click.option("--experiment-uri") +@click.option("--measurement_name", default="RNA") +@click.option("--layer", default="raw") +@click.option("--validate/--no-validate", is_flag=True, default=True) +@click.option("--consolidate/--no-consolidate", is_flag=True, default=True) +@click.option("--resume", is_flag=True, default=False) +@click.option("--overwrite", is_flag=True, default=False) +def build_cli( + cube_uri: str, + experiment_uri: str, + measurement_name: str, + layer: str, + validate: bool, + resume: bool, + overwrite: bool, + consolidate: bool, +) -> None: + if resume and overwrite: + raise ValueError("Cannot specify both --resume and --overwrite") + + if os.path.exists(cube_uri): + if resume: + logging.info(f"Resuming from existing estimators cube at {cube_uri}.") + elif overwrite: + logging.info(f"Overwriting existing estimators cube at {cube_uri}.") + shutil.rmtree(cube_uri) + else: + logging.error( + "Estimators cube already exists and neither --resume or --overwrite options specified. Exiting." 
+ ) + exit(1) + + build(cube_uri, experiment_uri, measurement_name, layer, validate, consolidate) + + +if __name__ == "__main__": + build_cli() diff --git a/tools/models/memento/src/estimators_cube_builder/cube_schema.py b/tools/models/memento/src/estimators_cube_builder/cube_schema.py new file mode 100644 index 000000000..b902dcaab --- /dev/null +++ b/tools/models/memento/src/estimators_cube_builder/cube_schema.py @@ -0,0 +1,113 @@ +from __future__ import annotations + +from typing import Dict + +import numpy as np +import pandas as pd +from tiledb import ( + ArraySchema, + Attr, + ByteShuffleFilter, + DictionaryFilter, + Dim, + Domain, + DoubleDeltaFilter, + Enumeration, + FilterList, + ZstdFilter, +) + +OBS_TILEDB_DIMS = ["obs_group_joinid"] + +OBS_LOGICAL_DIMS = [ + "cell_type_ontology_term_id", + "dataset_id", + "tissue_general_ontology_term_id", + "assay_ontology_term_id", + "donor_id", + "disease_ontology_term_id", + "sex_ontology_term_id", + "development_stage_ontology_term_id", + "self_reported_ethnicity_ontology_term_id", + "suspension_type", +] + +CUBE_LOGICAL_DIMS = ["feature_id"] + OBS_LOGICAL_DIMS + +ESTIMATORS_TILEDB_DIMS = ["feature_id", "obs_group_joinid"] + +ESTIMATOR_NAMES = ["sum", "sumsq", "size_factor", "n_obs"] + + +def build_obs_categorical_values(obs_groups: pd.DataFrame) -> Dict[str, Enumeration]: + return {dim_name: obs_groups[dim_name].unique().astype(str) for dim_name in OBS_LOGICAL_DIMS} + + +def build_obs_groups_schema(n_obs_groups: int, obs_categorical_values: Dict[str, Enumeration]) -> ArraySchema: + domain = Domain( + Dim( + name="obs_group_joinid", + dtype=np.uint32, + domain=(0, n_obs_groups), + filters=FilterList([ZstdFilter(level=19)]), + ) + ) + assert set(OBS_TILEDB_DIMS) == set([dim.name for dim in domain]) + return ArraySchema( + enums=[ + Enumeration(name=dim_name, ordered=False, values=categories) + for (dim_name, categories) in obs_categorical_values.items() + ], + domain=domain, + # TODO: Not all attrs need to be int32 
+ attrs=[ + Attr( + name=attr_name, + dtype=np.int32, + enum_label=attr_name, + nullable=False, + filters=FilterList([ZstdFilter(level=19)]), + ) + for attr_name in OBS_LOGICAL_DIMS + ] + + [ + Attr(name="n_obs", dtype=np.int32, nullable=False, filters=FilterList([ZstdFilter(level=19)])), + ], + offsets_filters=FilterList([DoubleDeltaFilter(), ZstdFilter(level=19)]), + cell_order="row-major", + tile_order="row-major", + capacity=10000, + sparse=True, # TODO: Dense would work + allows_duplicates=True, + ) + + +def build_estimators_schema(n_groups: int) -> ArraySchema: + domain = Domain( + Dim(name="feature_id", dtype="ascii", filters=FilterList([DictionaryFilter(), ZstdFilter(level=19)])), + Dim(name="obs_group_joinid", dtype=np.uint32, domain=(0, n_groups), filters=FilterList([ZstdFilter(level=19)])), + ) + assert ESTIMATORS_TILEDB_DIMS == [dim.name for dim in domain] + return ArraySchema( + domain=domain, + attrs=[ + Attr( + name=estimator_name, + dtype="float32", + var=False, + nullable=False, + filters=FilterList([ByteShuffleFilter(), ZstdFilter(level=5)]), + ) + for estimator_name in ESTIMATOR_NAMES + ], + cell_order="row-major", + tile_order="row-major", + capacity=10000, + sparse=True, + allows_duplicates=True, + ) + + +OBS_GROUPS_ARRAY = "obs_groups" +ESTIMATORS_ARRAY = "estimators" +FEATURE_IDS_FILE = "feature_ids.json" diff --git a/tools/models/memento/src/estimators_cube_builder/cube_validator.py b/tools/models/memento/src/estimators_cube_builder/cube_validator.py new file mode 100644 index 000000000..8eb371bff --- /dev/null +++ b/tools/models/memento/src/estimators_cube_builder/cube_validator.py @@ -0,0 +1,57 @@ +import os +import sys + +import tiledb + +from .cube_schema import ( + OBS_GROUPS_ARRAY, + OBS_LOGICAL_DIMS, +) + + +def _validate_dim_group_uniqueness(obs_groups: tiledb.SparseArray) -> None: + # retrieve all logical dimension columns, and ignore the estimator columns + dimensions_df = obs_groups.df[:].reset_index()[OBS_LOGICAL_DIMS] + 
group_counts = dimensions_df.value_counts() + assert all(group_counts <= 1), "duplicate dimension groups found" + + +def _validate_all_obs_dims_groups_present(obs_groups: tiledb.SparseArray, source_obs: tiledb.SparseArray) -> None: + distinct_obs_dims_df = source_obs.df[:][OBS_LOGICAL_DIMS].set_index(OBS_LOGICAL_DIMS) + distinct_obs_groups_dims_df = obs_groups.df[:][OBS_LOGICAL_DIMS].set_index(OBS_LOGICAL_DIMS) + actual = set(distinct_obs_groups_dims_df.index) + expected = set(distinct_obs_dims_df.index) + missing = expected.difference(actual) + assert ( + actual == expected + ), f"not all obs dimensions groups are present in the cube; missing {len(missing)} groups: {missing}" + + +def _validate_n_obs_sum(obs_groups: tiledb.SparseArray, source_obs: tiledb.SparseArray) -> None: + cube_n_obs_sums = obs_groups.df[:][["n_obs"]].sum() + source_obs_len = source_obs.df[:].shape[0] + assert all(cube_n_obs_sums == source_obs_len) + + +def validate_cube(cube_uri: str, source_experiment_uri: str) -> bool: + """ + Validate that the cube at the given path is a valid memento estimators cube. 
+ """ + obs_groups_uri = os.path.join(cube_uri, OBS_GROUPS_ARRAY) + # estimators_uri = os.path.join(cube_uri, ESTIMATORS_ARRAY) + + with tiledb.open(os.path.join(source_experiment_uri, "obs")) as source_obs: + with tiledb.open(obs_groups_uri, "r") as obs_groups: + _validate_all_obs_dims_groups_present(obs_groups, source_obs) + _validate_dim_group_uniqueness(obs_groups) + _validate_n_obs_sum(obs_groups, source_obs) + + # TODO: Check that all 0 < sem < mean + # with tiledb.open(estimators_uri, "r") as estimators: + # pass + + return True + + +if __name__ == "__main__": + validate_cube(sys.argv[1], sys.argv[2]) diff --git a/tools/models/memento/src/estimators_cube_builder/estimators.py b/tools/models/memento/src/estimators_cube_builder/estimators.py new file mode 100644 index 000000000..64895bcd3 --- /dev/null +++ b/tools/models/memento/src/estimators_cube_builder/estimators.py @@ -0,0 +1,70 @@ +import logging +from typing import Tuple, cast + +import numpy as np +import numpy.typing as npt +import scipy.stats as stats +from numba import njit +from numpy import random +from scipy.sparse import csc_array + +RELIABILITY_THRESHOLD = 0.05 + + +def bin_size_factor(size_factor: npt.NDArray[np.float64], num_bins: int = 30) -> npt.NDArray[np.float64]: + """Bin the size factors to speed up bootstrap.""" + + binned_stat = stats.binned_statistic(size_factor, size_factor, bins=num_bins, statistic="mean") + bin_idx = np.clip(binned_stat[2], a_min=1, a_max=binned_stat[0].shape[0]) + approx_sf = binned_stat[0][bin_idx - 1] + max_sf = size_factor.max() + approx_sf[size_factor == max_sf] = max_sf + + return cast(npt.NDArray[np.float64], approx_sf) + + +def fill_invalid(val: npt.NDArray[np.float64], group_name: Tuple[str, ...]) -> npt.NDArray[np.float64]: + """Fill invalid entries by randomly selecting a valid entry.""" + + # negatives and nan values are invalid values for our purposes + invalid_mask = np.less_equal(val, 0.0, where=~np.isnan(val)) | np.isnan(val) + num_invalid = 
invalid_mask.sum() + + if num_invalid == val.shape[0]: + # if all values are invalid, there are no valid values to choose from, so return all nans + logging.debug(f"all bootstrap variances are invalid for group {group_name}") + return np.full(shape=val.shape, fill_value=np.nan) + + val[invalid_mask] = np.random.choice(val[~invalid_mask], num_invalid) + + return val + + +def unique_expr( + X: csc_array, size_factor: npt.NDArray[np.float64] +) -> Tuple[npt.NDArray[np.float64], npt.NDArray[np.float32], npt.NDArray[np.int64]]: + """ + Find (approximately) unique combinations of expression values and size factors. + The random component is for mapping (expr, size_factor) to a single number. + This can certainly be performed more efficiently using sparsity. + """ + + code = X.dot(np.random.random(X.shape[1])) + approx_sf = size_factor + + code += np.random.random() * approx_sf + + _, index, count = np.unique(code, return_index=True, return_counts=True) + + expr_to_return = X[index].toarray() + + return 1 / approx_sf[index].reshape(-1, 1), expr_to_return, count + + +@njit # type: ignore[misc] +def gen_multinomial(counts: npt.NDArray[np.int64], n_obs: int, num_boot: int) -> npt.NDArray[np.int64]: + # reset numpy random generator + # TODO: why is this necessary? 
+ np.random.seed(5) + + return random.multinomial(n_obs, counts / counts.sum(), size=num_boot).T diff --git a/tools/models/memento/src/estimators_cube_builder/migrate_schema.py b/tools/models/memento/src/estimators_cube_builder/migrate_schema.py new file mode 100644 index 000000000..1ddbcf7db --- /dev/null +++ b/tools/models/memento/src/estimators_cube_builder/migrate_schema.py @@ -0,0 +1,32 @@ +import sys + +import tiledb +from cube_schema import ESTIMATOR_NAMES, build_estimators_schema + +if __name__ == "__main__": + old_cube_uri = sys.argv[1] + new_cube_uri = sys.argv[2] + + tdb_config = tiledb.Config( + { + "py.init_buffer_bytes": 1 * 1024**3, + } + ) + + with tiledb.open(old_cube_uri, "r", config=tdb_config) as old_cube: + n_obs_groups = old_cube.schema.domain.dim(1).domain[1] + new_schema = build_estimators_schema(n_obs_groups) + tiledb.Array.create(new_cube_uri, new_schema, overwrite=False) + with tiledb.open(new_cube_uri, "w") as new_cube: + for i, old_chunk in enumerate( + old_cube.query(return_incomplete=True, use_arrow=True, return_arrow=True, attrs=ESTIMATOR_NAMES).df[:], + start=1, + ): + print(f"writing chunk {i}, shape={old_chunk.shape}") + coords = [old_chunk[dim.name].combine_chunks() for dim in new_cube.schema.domain] + data = {attr.name: old_chunk[attr.name].combine_chunks() for attr in new_schema} + new_cube[tuple(coords)] = data + + print("performing consolidate & vacuum...") + tiledb.consolidate(new_cube_uri) + tiledb.vacuum(new_cube_uri) diff --git a/tools/models/memento/src/estimators_cube_builder/mp.py b/tools/models/memento/src/estimators_cube_builder/mp.py new file mode 100644 index 000000000..e654acf1f --- /dev/null +++ b/tools/models/memento/src/estimators_cube_builder/mp.py @@ -0,0 +1,393 @@ +import contextlib +import logging +import multiprocessing +import os +import threading +import weakref +from collections import deque +from concurrent.futures import Executor, Future, ProcessPoolExecutor, ThreadPoolExecutor +from functools import 
partial +from multiprocessing.pool import Pool +from types import TracebackType +from typing import ( + Any, + Callable, + Generic, + Iterable, + Iterator, + Literal, + Mapping, + Optional, + Tuple, + TypeVar, + Union, + cast, +) + +import attrs +import psutil +from typing_extensions import ParamSpec + +# TODO: DRY with cellxgene_census_builder/../mp.py + + +def _mp_config_checks() -> bool: + # We rely on the pool configuration being correct. Failure to do this will + # lead to strange errors on some OS (eg., Linux defaults to fork). Rather + # than chase those bugs, assert correct configuration. + assert multiprocessing.get_start_method(True) == "spawn" + + return True + + +def _hard_process_cap(max_worker_processes: int, n_proc: int) -> int: + """ + Enforce the configured worker process limit. + + NOTE: logic below only enforces this limit in cases using the default worker count, + as there are special cases where we want higher limits, due to special knowledge that we + will not be subject to the default resource constraints (e.g., VM map usage by SOMA). + """ + return min(int(max_worker_processes), n_proc) + + +def _cpu_count() -> int: + """ + os.cpu_count() returns None if "undetermined" number of CPUs. + This function exists to always return a default of `1` when + os.cpu_count returns None. 
+ """ + cpu_count = os.cpu_count() + if os.cpu_count() is None: + return 1 + return cast(int, cpu_count) + + +def _default_worker_process_count(max_worker_processes: int) -> int: + """Return the default worker process count, subject to configured limit.""" + return _hard_process_cap(max_worker_processes, _cpu_count() + 2) + + +def n_workers_from_memory_budget( + per_worker_budget: int, + max_worker_processes: int, +) -> int: + """Trivial helper to estimate appropriate number of fixed-memory-budget workers from total memory available""" + memory_budget = psutil.virtual_memory().total + n_workers: int = int(memory_budget // per_worker_budget) + return min(n_workers, _default_worker_process_count(max_worker_processes)) + + +def create_process_pool_executor( + max_worker_processes: int, + max_workers: Optional[int] = None, + process_init: Optional[Callable[[], None]] = None, + init_args: Tuple[Any, ...] = (), +) -> ProcessPoolExecutor: + assert _mp_config_checks() + if max_workers is None: + max_workers = _default_worker_process_count(max_worker_processes) + logging.debug(f"create_process_pool_executor [max_workers={max_workers}]") + return ProcessPoolExecutor(max_workers=max_workers, initializer=process_init, initargs=init_args) + + +def create_thread_pool_executor(max_workers: Optional[int] = None) -> ThreadPoolExecutor: + assert _mp_config_checks() + logging.debug(f"create_thread_pool_executor [max_workers={max_workers}]") + return ThreadPoolExecutor(max_workers=max_workers) + + +def log_on_broken_process_pool(ppe: Union[ProcessPoolExecutor, "ResourcePoolProcessExecutor"]) -> None: + """ + There are a number of conditions where the Process Pool can be broken, + such that it will hang in a shutdown. This will cause the context __exit__ + to hang indefinitely, as it calls ProcessPoolExecutor.shutdown with + `wait=True`. + + An example condition which can cause a deadlock is an OOM, where a the + repear kills a process. 
+ + This routine is used to detect the condition and log the error, so a + human has a chance of detecting/diagnosing. + + Caution: uses ProcessPoolExecutor internal API, as this state is not + otherwise visible. + """ + + if ppe._broken: + logging.critical(f"Process pool broken and may fail or hang: {ppe._broken}") + + return + + +# TODO: when the builder is updated to cellxgene_census 1.3+, we can pull +# EagerIterator out of the experimental.util package. Until then, it is hard to keep +# it DRY. + +_T = TypeVar("_T") +_P = ParamSpec("_P") + + +class EagerIterator(Iterator[_T]): + def __init__( + self, + iterator: Iterator[_T], + pool: Optional[Executor] = None, + ): + super().__init__() + self.iterator = iterator + self._pool = pool or ThreadPoolExecutor() + self._own_pool = pool is None + self._future: Optional[Future[_T]] = None + self._fetch_next() + + def _fetch_next(self) -> None: + self._future = self._pool.submit(self.iterator.__next__) + logging.debug("EagerIterator: fetching next iterator element, eagerly") + + def __next__(self) -> _T: + try: + assert self._future + res = self._future.result() + self._fetch_next() + return res + except StopIteration: + self._cleanup() + raise + + def _cleanup(self) -> None: + logging.debug("EagerIterator: cleaning up eager iterator") + if self._own_pool: + self._pool.shutdown() + + def __del__(self) -> None: + # Ensure the threadpool is cleaned up in the case where the + # iterator is not exhausted. 
For more information on __del__: + # https://docs.python.org/3/reference/datamodel.html#object.__del__ + self._cleanup() + super_del = getattr(super(), "__del__", lambda: None) + super_del() + + +@attrs.define +class _WorkItem(Generic[_T]): + resources: int + future: Future[_T] + fn: Callable[..., _T] + args: Iterable[Any] + kwargs: Mapping[str, Any] + + +_MightBeWork = Tuple[bool, Optional[_WorkItem[Any]]] +_SchedulerMethod = Literal["best-fit", "first-fit"] + + +class _Scheduler(threading.Thread): + def __init__(self, executor: "ResourcePoolProcessExecutor", max_resources: int): + super().__init__(name="ResourcePoolProcessExecutor_scheduler") + + def _weakref_collected(_: weakref.ReferenceType[Any], scheduler: "_Scheduler" = self) -> None: + scheduler.shutdown() + + assert isinstance(executor.process_pool, Pool) + self.executor_ref = weakref.ref(executor, _weakref_collected) + self.max_resources: int = max_resources + + self.resources_in_use: int = 0 + self._pending_work: deque[_WorkItem[Any]] = deque() + + self.shutdown_requested: bool = False + + self._condition: threading.Condition = threading.Condition() + + def shutdown(self) -> None: + self.shutdown_requested = True + with self._condition: + while len(self._pending_work): + wi = self._pending_work.popleft() + wi.future.cancel() + self._condition.notify() + + def submit(self, wi: _WorkItem[_T]) -> Future[_T]: + f = Future[_T]() + with self._condition: + self._pending_work.append(wi) + self._condition.notify() + return f + + def _get_work(self) -> _MightBeWork: + """ + Get next work item to schedule. + + IMPORTANT: caller MUST own scheduler _condition lock to call this. + + Return value is a tuple, where: + * First item is a bool indicating that _either_ a shutdown request or work item + needs to be handled. If True, there is either work or a shutdown request. + * second element is an optional work item. Will be None in case of shutdown + request. 
+ """ + + def _get_next_work() -> Optional[int]: + """Return index of "best" work item to scheudle, or None if work is unavailable.""" + + # Best fit: return the largest resource consumer that fits in available space + max_available_resources = self.max_resources - self.resources_in_use + candidate_work = filter(lambda v: v[1].resources <= max_available_resources, enumerate(self._pending_work)) + return max(candidate_work, key=lambda v: v[1].resources, default=(None,))[0] + + if self.shutdown_requested: + return (True, None) + + if len(self._pending_work): + if (i := _get_next_work()) is not None: + # pop ith item from the deque + self._pending_work.rotate(-i) + wi = self._pending_work.popleft() + self._pending_work.rotate(i) + return (True, wi) + + if self.resources_in_use == 0: + # We always want at least one job, regardless of cost of the work item. + return (True, self._pending_work.popleft()) + + return (False, None) # no work for you + + def run(self) -> None: + while True: + with self._condition: + work: _MightBeWork + while not (work := self._get_work())[0]: + self._condition.wait() + + if self.shutdown_requested: + assert work[0] is True and work[1] is None + self._debug_msg("shutdown request received by scheduler") + return + + assert work[1] is not None + work_item: _WorkItem[Any] = work[1] + assert isinstance(work_item, _WorkItem) + self._schedule_work(work_item) + del work, work_item # don't hold onto references + + @classmethod + def _work_item_done( + cls, scheduler: "_Scheduler", wi: _WorkItem[_T], is_error_callback: bool, result: Union[_T, BaseException] + ) -> None: + """Callback when async_apply is complete.""" + if is_error_callback: + assert isinstance(result, BaseException) + wi.future.set_exception(result) + else: + assert not isinstance(result, BaseException) + wi.future.set_result(result) + scheduler._release_resouces(wi) + + def _schedule_work(self, work: _WorkItem[Any]) -> None: + """must hold lock""" + executor = self.executor_ref() + if 
executor is None: + # can happen if the ResourcePoolExecutor was collected + return + self._debug_msg(f"adding work to pool {id(work):#x} [resources={work.resources}]") + self.resources_in_use += work.resources + _work_item_done = partial(self._work_item_done, self, work, False) + _work_item_error = partial(self._work_item_done, self, work, True) + executor.process_pool.apply_async( + work.fn, work.args, work.kwargs, callback=_work_item_done, error_callback=_work_item_error + ) + + def _release_resouces(self, wi: _WorkItem[Any]) -> None: + with self._condition: + self.resources_in_use -= wi.resources + self._condition.notify() + + def _debug_msg(self, msg: str) -> None: + logging.debug( + f"ResourcePoolProcessExecutor: {msg} [" + f"free={self.max_resources-self.resources_in_use} " + f"in_use={self.resources_in_use} " + f"unsched={len(self._pending_work)}" + "]" + ) + + +class ResourcePoolProcessExecutor(contextlib.AbstractContextManager["ResourcePoolProcessExecutor"]): + """ + Provides a ProcessPoolExecutor-like API, scheduling based upon static "resource" reservation + requests. A "resource" is any shared capacity or resource, expressed as an integer + value. Class holds a queue of "work items", scheduling them into an actual ProcessPoolExecutor + when sufficient resources are available. + + Primarily use case is managing finite memory, by throttling submissions until memory is + available. 
+ """ + + def __init__(self, max_resources: int, *args: Any, **kwargs: Any): + _mp_config_checks() + + super().__init__() + logging.debug(f"ResourcePoolProcessExecutor: starting process pool with args ({args} {kwargs})") + + max_workers = kwargs.pop("max_workers", None) + initializer = kwargs.pop("initializer", None) + initargs = kwargs.pop("initargs", None) + max_tasks_per_child = kwargs.pop("max_tasks_per_child", None) + self.process_pool: multiprocessing.pool.Pool = multiprocessing.Pool( + processes=max_workers, initializer=initializer, initargs=initargs, maxtasksperchild=max_tasks_per_child + ) + + # create and start scheduler thread + self.scheduler = _Scheduler(self, max_resources) + self.scheduler.start() + + @property + def _broken(self) -> bool: + return self.process_pool._state not in ["RUN", "INIT", "CLOSE"] # type: ignore[attr-defined] + + def submit(self, resources: int, fn: Callable[_P, _T], *args: _P.args, **kwargs: _P.kwargs) -> Future[_T]: + f = Future[_T]() + work_item = _WorkItem[_T](resources=resources, future=f, fn=fn, args=args, kwargs=kwargs) + self.scheduler.submit(work_item) + return f + + def shutdown(self, wait: bool = True, *, cancel_futures: bool = False) -> None: + self.scheduler.shutdown() + self.process_pool.close() + + def __exit__( + self, exc_type: Optional[type[BaseException]], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType] + ) -> None: + self.shutdown(wait=True) + return None + + +def create_resource_pool_executor( + max_worker_processes: int = 96, + max_resources: Optional[int] = None, + max_workers: Optional[int] = None, + max_tasks_per_child: Optional[int] = None, + memory_budget: Optional[int] = None, + process_init: Optional[Callable[[], None]] = None, + process_init_args: Tuple[Any, ...] 
= (), +) -> ResourcePoolProcessExecutor: + assert _mp_config_checks() + + if max_resources is None: + max_resources = memory_budget + assert max_resources is not None, "must specify either max_resources or memory_budget" + if max_workers is None: + max_workers = _default_worker_process_count(max_worker_processes) + if max_tasks_per_child is None: + # not strictly necessary, but helps avoid leaks by turning over sub-processes + max_tasks_per_child = 10 + + logging.debug(f"create_resource_pool_executor [max_workers={max_workers}, max_resources={max_resources}]") + return ResourcePoolProcessExecutor( + max_resources=max_resources, + max_workers=max_workers, + max_tasks_per_child=max_tasks_per_child, + initializer=process_init, + initargs=process_init_args, + ) diff --git a/tools/models/memento/tests/__init__.py b/tools/models/memento/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tools/models/memento/tests/fixtures/__init__.py b/tools/models/memento/tests/fixtures/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/__group/__1707330782195_1707330782195_3cb88f5f7b9f4cf08e49a47419b0d220_2 b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/__group/__1707330782195_1707330782195_3cb88f5f7b9f4cf08e49a47419b0d220_2 new file mode 100644 index 000000000..8c06be7cd Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/__group/__1707330782195_1707330782195_3cb88f5f7b9f4cf08e49a47419b0d220_2 differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/__meta/__1707330782195_1707330782195_506e677230ed4e4ba05fe082ea3b8495 b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/__meta/__1707330782195_1707330782195_506e677230ed4e4ba05fe082ea3b8495 new file mode 100644 index 000000000..657fa56ee Binary files /dev/null and 
b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/__meta/__1707330782195_1707330782195_506e677230ed4e4ba05fe082ea3b8495 differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/__tiledb_group.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/__tiledb_group.tdb new file mode 100644 index 000000000..e69de29bb diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/X/__group/__1707330782195_1707330782195_686af0c2ff654556b7d2f52662c7eded_2 b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/X/__group/__1707330782195_1707330782195_686af0c2ff654556b7d2f52662c7eded_2 new file mode 100644 index 000000000..d6cfaaa10 Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/X/__group/__1707330782195_1707330782195_686af0c2ff654556b7d2f52662c7eded_2 differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/X/__meta/__1707330782195_1707330782195_23b27371d9e94ef19bf6438be0549e4e b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/X/__meta/__1707330782195_1707330782195_23b27371d9e94ef19bf6438be0549e4e new file mode 100644 index 000000000..f760cd7ff Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/X/__meta/__1707330782195_1707330782195_23b27371d9e94ef19bf6438be0549e4e differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/X/__tiledb_group.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/X/__tiledb_group.tdb new file mode 100644 index 000000000..e69de29bb diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/X/raw/__commits/__1707330782195_1707330782195_6208729b179a41e89d2bf9230a55230e_20.wrt b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/X/raw/__commits/__1707330782195_1707330782195_6208729b179a41e89d2bf9230a55230e_20.wrt new file 
mode 100644 index 000000000..e69de29bb diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/X/raw/__fragments/__1707330782195_1707330782195_6208729b179a41e89d2bf9230a55230e_20/__fragment_metadata.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/X/raw/__fragments/__1707330782195_1707330782195_6208729b179a41e89d2bf9230a55230e_20/__fragment_metadata.tdb new file mode 100644 index 000000000..cfb094fd3 Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/X/raw/__fragments/__1707330782195_1707330782195_6208729b179a41e89d2bf9230a55230e_20/__fragment_metadata.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/X/raw/__fragments/__1707330782195_1707330782195_6208729b179a41e89d2bf9230a55230e_20/a0.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/X/raw/__fragments/__1707330782195_1707330782195_6208729b179a41e89d2bf9230a55230e_20/a0.tdb new file mode 100644 index 000000000..25c8c863b Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/X/raw/__fragments/__1707330782195_1707330782195_6208729b179a41e89d2bf9230a55230e_20/a0.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/X/raw/__fragments/__1707330782195_1707330782195_6208729b179a41e89d2bf9230a55230e_20/d0.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/X/raw/__fragments/__1707330782195_1707330782195_6208729b179a41e89d2bf9230a55230e_20/d0.tdb new file mode 100644 index 000000000..64256627b Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/X/raw/__fragments/__1707330782195_1707330782195_6208729b179a41e89d2bf9230a55230e_20/d0.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/X/raw/__fragments/__1707330782195_1707330782195_6208729b179a41e89d2bf9230a55230e_20/d1.tdb 
b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/X/raw/__fragments/__1707330782195_1707330782195_6208729b179a41e89d2bf9230a55230e_20/d1.tdb new file mode 100644 index 000000000..129489285 Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/X/raw/__fragments/__1707330782195_1707330782195_6208729b179a41e89d2bf9230a55230e_20/d1.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/X/raw/__meta/__1707330782195_1707330782195_7f58da8c5b804d7fab05d6f9d8ece86f b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/X/raw/__meta/__1707330782195_1707330782195_7f58da8c5b804d7fab05d6f9d8ece86f new file mode 100644 index 000000000..5e787e56f Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/X/raw/__meta/__1707330782195_1707330782195_7f58da8c5b804d7fab05d6f9d8ece86f differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/X/raw/__meta/__1707330782195_1707330782195_d9b0c4d11dc147259d32a26ec7ef9aa7 b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/X/raw/__meta/__1707330782195_1707330782195_d9b0c4d11dc147259d32a26ec7ef9aa7 new file mode 100644 index 000000000..4168113f7 Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/X/raw/__meta/__1707330782195_1707330782195_d9b0c4d11dc147259d32a26ec7ef9aa7 differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/X/raw/__schema/__1707330801103_1707330801103_a1cfc3b5b1cc4e3c9faed77268d497a0 b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/X/raw/__schema/__1707330801103_1707330801103_a1cfc3b5b1cc4e3c9faed77268d497a0 new file mode 100644 index 000000000..022a72afa Binary files /dev/null and 
b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/X/raw/__schema/__1707330801103_1707330801103_a1cfc3b5b1cc4e3c9faed77268d497a0 differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/__group/__1707330782195_1707330782195_e1292e7985dc48f48bcbd98fb8970fdf_2 b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/__group/__1707330782195_1707330782195_e1292e7985dc48f48bcbd98fb8970fdf_2 new file mode 100644 index 000000000..9a4d9a9b7 Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/__group/__1707330782195_1707330782195_e1292e7985dc48f48bcbd98fb8970fdf_2 differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/__meta/__1707330782195_1707330782195_52caf60f7dad45579ee467c13f442a62 b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/__meta/__1707330782195_1707330782195_52caf60f7dad45579ee467c13f442a62 new file mode 100644 index 000000000..d8b74af58 Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/__meta/__1707330782195_1707330782195_52caf60f7dad45579ee467c13f442a62 differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/__tiledb_group.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/RNA/__tiledb_group.tdb new file mode 100644 index 000000000..e69de29bb diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/__group/__1707330782195_1707330782195_5d09cbdde26c4dceafc304246f58b982_2 b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/__group/__1707330782195_1707330782195_5d09cbdde26c4dceafc304246f58b982_2 new file mode 100644 index 000000000..4e767ccf2 Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/__group/__1707330782195_1707330782195_5d09cbdde26c4dceafc304246f58b982_2 differ diff --git 
a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/__meta/__1707330782195_1707330782195_58db7cfc16f848568297ea7dbdbe5bd3 b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/__meta/__1707330782195_1707330782195_58db7cfc16f848568297ea7dbdbe5bd3 new file mode 100644 index 000000000..f760cd7ff Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/__meta/__1707330782195_1707330782195_58db7cfc16f848568297ea7dbdbe5bd3 differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/__tiledb_group.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/ms/__tiledb_group.tdb new file mode 100644 index 000000000..e69de29bb diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__commits/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20.wrt b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__commits/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20.wrt new file mode 100644 index 000000000..e69de29bb diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/__fragment_metadata.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/__fragment_metadata.tdb new file mode 100644 index 000000000..2be06cdc0 Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/__fragment_metadata.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a0.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a0.tdb 
new file mode 100644 index 000000000..202dba715 Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a0.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a0_var.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a0_var.tdb new file mode 100644 index 000000000..5ba5f1a51 Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a0_var.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a1.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a1.tdb new file mode 100644 index 000000000..1faa5b061 Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a1.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a10.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a10.tdb new file mode 100644 index 000000000..550a3e00e Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a10.tdb differ diff --git 
a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a11.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a11.tdb new file mode 100644 index 000000000..f1cabbb50 Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a11.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a11_var.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a11_var.tdb new file mode 100644 index 000000000..b44ecaab9 Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a11_var.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a12.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a12.tdb new file mode 100644 index 000000000..538dacc95 Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a12.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a12_var.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a12_var.tdb new 
file mode 100644 index 000000000..6c52617ab Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a12_var.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a13.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a13.tdb new file mode 100644 index 000000000..90f14e8a9 Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a13.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a13_var.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a13_var.tdb new file mode 100644 index 000000000..9bd09d81c Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a13_var.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a14.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a14.tdb new file mode 100644 index 000000000..9af3dc551 Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a14.tdb differ diff --git 
a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a14_var.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a14_var.tdb new file mode 100644 index 000000000..c923f9651 Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a14_var.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a15.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a15.tdb new file mode 100644 index 000000000..74cedfcd1 Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a15.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a15_var.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a15_var.tdb new file mode 100644 index 000000000..e283979ba Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a15_var.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a16.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a16.tdb 
new file mode 100644 index 000000000..ac7996895 Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a16.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a16_var.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a16_var.tdb new file mode 100644 index 000000000..76aef7798 Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a16_var.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a17.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a17.tdb new file mode 100644 index 000000000..538dacc95 Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a17.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a17_var.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a17_var.tdb new file mode 100644 index 000000000..3cb5dbc89 Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a17_var.tdb differ diff --git 
a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a18.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a18.tdb new file mode 100644 index 000000000..8c3eca90e Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a18.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a18_var.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a18_var.tdb new file mode 100644 index 000000000..14fed3c79 Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a18_var.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a19.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a19.tdb new file mode 100644 index 000000000..538dacc95 Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a19.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a19_var.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a19_var.tdb new 
file mode 100644 index 000000000..a97197127 Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a19_var.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a1_var.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a1_var.tdb new file mode 100644 index 000000000..f4528228c Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a1_var.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a2.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a2.tdb new file mode 100644 index 000000000..93fd1865b Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a2.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a20.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a20.tdb new file mode 100644 index 000000000..a48a30a5d Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a20.tdb differ diff --git 
a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a21.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a21.tdb new file mode 100644 index 000000000..b230cd514 Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a21.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a22.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a22.tdb new file mode 100644 index 000000000..1cb0397f3 Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a22.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a23.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a23.tdb new file mode 100644 index 000000000..5e00eb44f Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a23.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a24.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a24.tdb new file mode 100644 index 
000000000..55ff19c0b Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a24.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a2_var.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a2_var.tdb new file mode 100644 index 000000000..c877937d4 Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a2_var.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a3.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a3.tdb new file mode 100644 index 000000000..9e1bf83dc Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a3.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a3_var.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a3_var.tdb new file mode 100644 index 000000000..b66a960df Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a3_var.tdb differ diff --git 
a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a4.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a4.tdb new file mode 100644 index 000000000..2da187cc4 Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a4.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a4_var.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a4_var.tdb new file mode 100644 index 000000000..8dbb7c6ef Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a4_var.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a5.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a5.tdb new file mode 100644 index 000000000..057206899 Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a5.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a5_var.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a5_var.tdb new file mode 
100644 index 000000000..3cb273fc9 Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a5_var.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a6.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a6.tdb new file mode 100644 index 000000000..538dacc95 Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a6.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a6_var.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a6_var.tdb new file mode 100644 index 000000000..6eca9f11a Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a6_var.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a7.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a7.tdb new file mode 100644 index 000000000..8c3eca90e Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a7.tdb differ diff --git 
a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a7_var.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a7_var.tdb new file mode 100644 index 000000000..94cc09f05 Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a7_var.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a8.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a8.tdb new file mode 100644 index 000000000..9af3dc551 Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a8.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a8_var.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a8_var.tdb new file mode 100644 index 000000000..114e6e776 Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a8_var.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a9.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a9.tdb new file mode 
100644 index 000000000..6782bc7a8 Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a9.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a9_var.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a9_var.tdb new file mode 100644 index 000000000..1c9382c8d Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/a9_var.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/d0.tdb b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/d0.tdb new file mode 100644 index 000000000..d446c7e2a Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__fragments/__1707330799826_1707330799826_00efba11bc2646deb777816d3406678f_20/d0.tdb differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__meta/__1707330799826_1707330799826_3e10b422039344158c8376bd87e80678 b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__meta/__1707330799826_1707330799826_3e10b422039344158c8376bd87e80678 new file mode 100644 index 000000000..cb20a44d9 Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__meta/__1707330799826_1707330799826_3e10b422039344158c8376bd87e80678 differ diff --git a/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__schema/__1707330799825_1707330799825_29905154d5444bec9e5d048083a428de 
b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__schema/__1707330799825_1707330799825_29905154d5444bec9e5d048083a428de new file mode 100644 index 000000000..338cb3191 Binary files /dev/null and b/tools/models/memento/tests/fixtures/census-homo-sapiens-small/obs/__schema/__1707330799825_1707330799825_29905154d5444bec9e5d048083a428de differ diff --git a/tools/models/memento/tests/fixtures/census_fixture.py b/tools/models/memento/tests/fixtures/census_fixture.py new file mode 100644 index 000000000..766632b7e --- /dev/null +++ b/tools/models/memento/tests/fixtures/census_fixture.py @@ -0,0 +1,59 @@ +import os +import sys + +import pyarrow as pa +import tiledb +import tiledbsoma as soma +from somacore import AxisQuery + + +def subset_census(query: soma.ExperimentAxisQuery, output_base_dir: str) -> None: + """ + Subset the census cube to the given query, returning a new cube. + """ + with soma.Experiment.create(uri=output_base_dir) as exp_subset: + x_data = query.X(layer_name="raw").tables().concat() + + obs_data = query.obs().concat() + # remove obs rows with no X data + x_soma_dim_0_unique = pa.Table.from_arrays([x_data["soma_dim_0"].unique()], names=["soma_dim_0"]) + obs_data = obs_data.join(x_soma_dim_0_unique, keys="soma_joinid", right_keys="soma_dim_0", join_type="inner") + obs = soma.DataFrame.create(os.path.join(output_base_dir, "obs"), schema=obs_data.schema) + obs.write(obs_data) + exp_subset.set("obs", obs) + + ms = exp_subset.add_new_collection("ms") + rna = ms.add_new_collection("RNA", soma.Measurement) + + var_data = query.var().concat() + var = rna.add_new_dataframe("var", schema=var_data.schema) + var.write(var_data) + + x_type = x_data.schema.field_by_name("soma_data").type + rna.add_new_collection("X") + rna["X"].add_new_sparse_ndarray("raw", type=x_type, shape=(None, None)) + rna.X["raw"].write(x_data) + + +if __name__ == "__main__": + experiment_uri, obs_value_filter, var_value_filter, output_cube_path = sys.argv[1:5] + + context 
= soma.SOMATileDBContext().replace( + tiledb_config={ + "soma.init_buffer_bytes": 128 * 1024**2, + "vfs.s3.region": "us-west-2", + "vfs.s3.no_sign_request": "false", + } + ) + with soma.Experiment.open(experiment_uri, context=context) as exp: + query = exp.axis_query( + measurement_name="RNA", + obs_query=AxisQuery(value_filter=obs_value_filter), + var_query=AxisQuery(value_filter=var_value_filter), + ) + subset_census(query, output_cube_path) + + for array_uri in ["obs", "ms/RNA/var", "ms/RNA/X/raw"]: + uri = os.path.join(output_cube_path, array_uri) + tiledb.consolidate(uri) + tiledb.vacuum(uri) diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/estimators/__commits/__1707330902002_1707330902008_bc7e3b43e89d4d7bb24f0e23239fa955_20.wrt b/tools/models/memento/tests/fixtures/estimators-cube-expected/estimators/__commits/__1707330902002_1707330902008_bc7e3b43e89d4d7bb24f0e23239fa955_20.wrt new file mode 100644 index 000000000..e69de29bb diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/estimators/__fragments/__1707330902002_1707330902008_bc7e3b43e89d4d7bb24f0e23239fa955_20/__fragment_metadata.tdb b/tools/models/memento/tests/fixtures/estimators-cube-expected/estimators/__fragments/__1707330902002_1707330902008_bc7e3b43e89d4d7bb24f0e23239fa955_20/__fragment_metadata.tdb new file mode 100644 index 000000000..e9f2020e3 Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/estimators/__fragments/__1707330902002_1707330902008_bc7e3b43e89d4d7bb24f0e23239fa955_20/__fragment_metadata.tdb differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/estimators/__fragments/__1707330902002_1707330902008_bc7e3b43e89d4d7bb24f0e23239fa955_20/a0.tdb b/tools/models/memento/tests/fixtures/estimators-cube-expected/estimators/__fragments/__1707330902002_1707330902008_bc7e3b43e89d4d7bb24f0e23239fa955_20/a0.tdb new file mode 100644 index 000000000..e2e5626b9 Binary files /dev/null and 
b/tools/models/memento/tests/fixtures/estimators-cube-expected/estimators/__fragments/__1707330902002_1707330902008_bc7e3b43e89d4d7bb24f0e23239fa955_20/a0.tdb differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/estimators/__fragments/__1707330902002_1707330902008_bc7e3b43e89d4d7bb24f0e23239fa955_20/a1.tdb b/tools/models/memento/tests/fixtures/estimators-cube-expected/estimators/__fragments/__1707330902002_1707330902008_bc7e3b43e89d4d7bb24f0e23239fa955_20/a1.tdb new file mode 100644 index 000000000..7fad609d8 Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/estimators/__fragments/__1707330902002_1707330902008_bc7e3b43e89d4d7bb24f0e23239fa955_20/a1.tdb differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/estimators/__fragments/__1707330902002_1707330902008_bc7e3b43e89d4d7bb24f0e23239fa955_20/a2.tdb b/tools/models/memento/tests/fixtures/estimators-cube-expected/estimators/__fragments/__1707330902002_1707330902008_bc7e3b43e89d4d7bb24f0e23239fa955_20/a2.tdb new file mode 100644 index 000000000..f639765f9 Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/estimators/__fragments/__1707330902002_1707330902008_bc7e3b43e89d4d7bb24f0e23239fa955_20/a2.tdb differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/estimators/__fragments/__1707330902002_1707330902008_bc7e3b43e89d4d7bb24f0e23239fa955_20/a3.tdb b/tools/models/memento/tests/fixtures/estimators-cube-expected/estimators/__fragments/__1707330902002_1707330902008_bc7e3b43e89d4d7bb24f0e23239fa955_20/a3.tdb new file mode 100644 index 000000000..c321ae80d Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/estimators/__fragments/__1707330902002_1707330902008_bc7e3b43e89d4d7bb24f0e23239fa955_20/a3.tdb differ diff --git 
a/tools/models/memento/tests/fixtures/estimators-cube-expected/estimators/__fragments/__1707330902002_1707330902008_bc7e3b43e89d4d7bb24f0e23239fa955_20/d0.tdb b/tools/models/memento/tests/fixtures/estimators-cube-expected/estimators/__fragments/__1707330902002_1707330902008_bc7e3b43e89d4d7bb24f0e23239fa955_20/d0.tdb new file mode 100644 index 000000000..1b1cb4d44 Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/estimators/__fragments/__1707330902002_1707330902008_bc7e3b43e89d4d7bb24f0e23239fa955_20/d0.tdb differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/estimators/__fragments/__1707330902002_1707330902008_bc7e3b43e89d4d7bb24f0e23239fa955_20/d0_var.tdb b/tools/models/memento/tests/fixtures/estimators-cube-expected/estimators/__fragments/__1707330902002_1707330902008_bc7e3b43e89d4d7bb24f0e23239fa955_20/d0_var.tdb new file mode 100644 index 000000000..ae31c7f15 Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/estimators/__fragments/__1707330902002_1707330902008_bc7e3b43e89d4d7bb24f0e23239fa955_20/d0_var.tdb differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/estimators/__fragments/__1707330902002_1707330902008_bc7e3b43e89d4d7bb24f0e23239fa955_20/d1.tdb b/tools/models/memento/tests/fixtures/estimators-cube-expected/estimators/__fragments/__1707330902002_1707330902008_bc7e3b43e89d4d7bb24f0e23239fa955_20/d1.tdb new file mode 100644 index 000000000..382031e73 Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/estimators/__fragments/__1707330902002_1707330902008_bc7e3b43e89d4d7bb24f0e23239fa955_20/d1.tdb differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/estimators/__fragments/__1707330902002_1707330902008_bc7e3b43e89d4d7bb24f0e23239fa955_20/t.tdb 
b/tools/models/memento/tests/fixtures/estimators-cube-expected/estimators/__fragments/__1707330902002_1707330902008_bc7e3b43e89d4d7bb24f0e23239fa955_20/t.tdb new file mode 100644 index 000000000..ff991348b Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/estimators/__fragments/__1707330902002_1707330902008_bc7e3b43e89d4d7bb24f0e23239fa955_20/t.tdb differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/estimators/__schema/__1707330898816_1707330898816_be97595b8bcf4934980106d892c2177d b/tools/models/memento/tests/fixtures/estimators-cube-expected/estimators/__schema/__1707330898816_1707330898816_be97595b8bcf4934980106d892c2177d new file mode 100644 index 000000000..365fce483 Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/estimators/__schema/__1707330898816_1707330898816_be97595b8bcf4934980106d892c2177d differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/feature_ids.json b/tools/models/memento/tests/fixtures/estimators-cube-expected/feature_ids.json new file mode 100644 index 000000000..d21433e89 --- /dev/null +++ b/tools/models/memento/tests/fixtures/estimators-cube-expected/feature_ids.json @@ -0,0 +1 @@ +["ENSG00000002330", "ENSG00000000419"] \ No newline at end of file diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__commits/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20.wrt b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__commits/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20.wrt new file mode 100644 index 000000000..e69de29bb diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/__fragment_metadata.tdb 
b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/__fragment_metadata.tdb new file mode 100644 index 000000000..b30d58f0b Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/__fragment_metadata.tdb differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/a0.tdb b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/a0.tdb new file mode 100644 index 000000000..cb73bca43 Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/a0.tdb differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/a1.tdb b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/a1.tdb new file mode 100644 index 000000000..b695581cd Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/a1.tdb differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/a10.tdb b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/a10.tdb new file mode 100644 index 000000000..618a49985 Binary files /dev/null and 
b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/a10.tdb differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/a2.tdb b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/a2.tdb new file mode 100644 index 000000000..b695581cd Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/a2.tdb differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/a3.tdb b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/a3.tdb new file mode 100644 index 000000000..3e7eaa894 Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/a3.tdb differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/a4.tdb b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/a4.tdb new file mode 100644 index 000000000..851a0cc92 Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/a4.tdb differ diff --git 
a/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/a5.tdb b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/a5.tdb new file mode 100644 index 000000000..b695581cd Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/a5.tdb differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/a6.tdb b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/a6.tdb new file mode 100644 index 000000000..54bc80e60 Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/a6.tdb differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/a7.tdb b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/a7.tdb new file mode 100644 index 000000000..851a0cc92 Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/a7.tdb differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/a8.tdb 
b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/a8.tdb new file mode 100644 index 000000000..c0378804a Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/a8.tdb differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/a9.tdb b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/a9.tdb new file mode 100644 index 000000000..b695581cd Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/a9.tdb differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/d0.tdb b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/d0.tdb new file mode 100644 index 000000000..e3a456c44 Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__fragments/__1707330898820_1707330898820_cb8b0baa2fad4e71b738d56c3386a6ae_20/d0.tdb differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__schema/__1707330898813_1707330898813_f948950c6dd2418b96a35d157eddab5f b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__schema/__1707330898813_1707330898813_f948950c6dd2418b96a35d157eddab5f new file mode 100644 index 000000000..463e841a6 Binary files /dev/null and 
b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__schema/__1707330898813_1707330898813_f948950c6dd2418b96a35d157eddab5f differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__schema/__enumerations/__0456554d582647109bfe3566e3e61021_0 b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__schema/__enumerations/__0456554d582647109bfe3566e3e61021_0 new file mode 100644 index 000000000..ea3d2175f Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__schema/__enumerations/__0456554d582647109bfe3566e3e61021_0 differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__schema/__enumerations/__08972e8102504dd9a89033a54ef17f26_0 b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__schema/__enumerations/__08972e8102504dd9a89033a54ef17f26_0 new file mode 100644 index 000000000..5df827c1d Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__schema/__enumerations/__08972e8102504dd9a89033a54ef17f26_0 differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__schema/__enumerations/__0c11190076314374b8cb871dcf9e0b7a_0 b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__schema/__enumerations/__0c11190076314374b8cb871dcf9e0b7a_0 new file mode 100644 index 000000000..6f204d0da Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__schema/__enumerations/__0c11190076314374b8cb871dcf9e0b7a_0 differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__schema/__enumerations/__1898ad338f6141a4b6bccd90b1ebaf35_0 b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__schema/__enumerations/__1898ad338f6141a4b6bccd90b1ebaf35_0 new file mode 100644 index 000000000..e8c46cbfb Binary files /dev/null 
and b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__schema/__enumerations/__1898ad338f6141a4b6bccd90b1ebaf35_0 differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__schema/__enumerations/__44c88d557ee4438286f1f97f449dc9ad_0 b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__schema/__enumerations/__44c88d557ee4438286f1f97f449dc9ad_0 new file mode 100644 index 000000000..c14bd21f9 Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__schema/__enumerations/__44c88d557ee4438286f1f97f449dc9ad_0 differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__schema/__enumerations/__5be1b19090164b4897918755654a93d7_0 b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__schema/__enumerations/__5be1b19090164b4897918755654a93d7_0 new file mode 100644 index 000000000..d9f67fc12 Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__schema/__enumerations/__5be1b19090164b4897918755654a93d7_0 differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__schema/__enumerations/__a39694c09c5045d7ad19eac6df9a8320_0 b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__schema/__enumerations/__a39694c09c5045d7ad19eac6df9a8320_0 new file mode 100644 index 000000000..347fdf2cf Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__schema/__enumerations/__a39694c09c5045d7ad19eac6df9a8320_0 differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__schema/__enumerations/__ea8c52e643a243e491b4bcd23e3c05a2_0 b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__schema/__enumerations/__ea8c52e643a243e491b4bcd23e3c05a2_0 new file mode 100644 index 000000000..bb1346c13 Binary files /dev/null and 
b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__schema/__enumerations/__ea8c52e643a243e491b4bcd23e3c05a2_0 differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__schema/__enumerations/__ee613b5dcc7a4ce8b5b704bbf8af6a2d_0 b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__schema/__enumerations/__ee613b5dcc7a4ce8b5b704bbf8af6a2d_0 new file mode 100644 index 000000000..be6545bee Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__schema/__enumerations/__ee613b5dcc7a4ce8b5b704bbf8af6a2d_0 differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__schema/__enumerations/__f74512220a784de4b94a638edd665945_0 b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__schema/__enumerations/__f74512220a784de4b94a638edd665945_0 new file mode 100644 index 000000000..a8ab5fb38 Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/obs_groups/__schema/__enumerations/__f74512220a784de4b94a638edd665945_0 differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__commits/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20.wrt b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__commits/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20.wrt new file mode 100644 index 000000000..e69de29bb diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/__fragment_metadata.tdb b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/__fragment_metadata.tdb new file mode 100644 index 000000000..75efa91d5 Binary files /dev/null and 
b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/__fragment_metadata.tdb differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/a0.tdb b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/a0.tdb new file mode 100644 index 000000000..d446c7e2a Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/a0.tdb differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/a1.tdb b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/a1.tdb new file mode 100644 index 000000000..9b81d9a53 Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/a1.tdb differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/a10.tdb b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/a10.tdb new file mode 100644 index 000000000..ba91b4f46 Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/a10.tdb differ diff --git 
a/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/a11.tdb b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/a11.tdb new file mode 100644 index 000000000..01a00cb23 Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/a11.tdb differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/a2.tdb b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/a2.tdb new file mode 100644 index 000000000..ba91b4f46 Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/a2.tdb differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/a3.tdb b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/a3.tdb new file mode 100644 index 000000000..ba91b4f46 Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/a3.tdb differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/a4.tdb 
b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/a4.tdb new file mode 100644 index 000000000..580fa0d2d Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/a4.tdb differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/a5.tdb b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/a5.tdb new file mode 100644 index 000000000..6032d7453 Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/a5.tdb differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/a6.tdb b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/a6.tdb new file mode 100644 index 000000000..ba91b4f46 Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/a6.tdb differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/a7.tdb b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/a7.tdb new file mode 100644 index 000000000..303815a5e Binary files /dev/null and 
b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/a7.tdb differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/a8.tdb b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/a8.tdb new file mode 100644 index 000000000..cb92a6715 Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/a8.tdb differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/a9.tdb b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/a9.tdb new file mode 100644 index 000000000..42817acb5 Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__fragments/__1707330898720_1707330898720_1825f3fc69064e928370b2cfa441fca8_20/a9.tdb differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__meta/__1707330898719_1707330898719_7969359375024a77811c20c0f3884d28 b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__meta/__1707330898719_1707330898719_7969359375024a77811c20c0f3884d28 new file mode 100644 index 000000000..43f3335c4 Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__meta/__1707330898719_1707330898719_7969359375024a77811c20c0f3884d28 differ diff --git 
a/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__schema/__1707330898716_1707330898716_9b9af0c5bb6b41888b93d261d1c0bfc7 b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__schema/__1707330898716_1707330898716_9b9af0c5bb6b41888b93d261d1c0bfc7 new file mode 100644 index 000000000..e64af638c Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__schema/__1707330898716_1707330898716_9b9af0c5bb6b41888b93d261d1c0bfc7 differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__schema/__enumerations/__182db3ff0c974359b0844d688e3ea136_0 b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__schema/__enumerations/__182db3ff0c974359b0844d688e3ea136_0 new file mode 100644 index 000000000..aa312c1cb Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__schema/__enumerations/__182db3ff0c974359b0844d688e3ea136_0 differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__schema/__enumerations/__4f045bd75412448f9cc4178a5664c6c8_0 b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__schema/__enumerations/__4f045bd75412448f9cc4178a5664c6c8_0 new file mode 100644 index 000000000..38e2c836f Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__schema/__enumerations/__4f045bd75412448f9cc4178a5664c6c8_0 differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__schema/__enumerations/__67826a592aee42d8904c0930aeeb7fa0_0 b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__schema/__enumerations/__67826a592aee42d8904c0930aeeb7fa0_0 new file mode 100644 index 000000000..ae4febda5 Binary files /dev/null and 
b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__schema/__enumerations/__67826a592aee42d8904c0930aeeb7fa0_0 differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__schema/__enumerations/__83727d4d1157460084dc84ecd4dabc53_0 b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__schema/__enumerations/__83727d4d1157460084dc84ecd4dabc53_0 new file mode 100644 index 000000000..5673fbafb Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__schema/__enumerations/__83727d4d1157460084dc84ecd4dabc53_0 differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__schema/__enumerations/__8913f3db6618477980f325f1c8c9d89c_0 b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__schema/__enumerations/__8913f3db6618477980f325f1c8c9d89c_0 new file mode 100644 index 000000000..51120decc Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__schema/__enumerations/__8913f3db6618477980f325f1c8c9d89c_0 differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__schema/__enumerations/__8b75f8f7d15e4c59a36701dbf963f725_0 b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__schema/__enumerations/__8b75f8f7d15e4c59a36701dbf963f725_0 new file mode 100644 index 000000000..92873af7a Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__schema/__enumerations/__8b75f8f7d15e4c59a36701dbf963f725_0 differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__schema/__enumerations/__91acc91178204d4fb8b5788afa4f5155_0 b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__schema/__enumerations/__91acc91178204d4fb8b5788afa4f5155_0 new file mode 100644 index 000000000..74c09c495 Binary 
files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__schema/__enumerations/__91acc91178204d4fb8b5788afa4f5155_0 differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__schema/__enumerations/__d386a419b4fd4f908a24d2f5367a3b48_0 b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__schema/__enumerations/__d386a419b4fd4f908a24d2f5367a3b48_0 new file mode 100644 index 000000000..f09437560 Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__schema/__enumerations/__d386a419b4fd4f908a24d2f5367a3b48_0 differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__schema/__enumerations/__eaada47eb8a547de9837ff47672b5862_0 b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__schema/__enumerations/__eaada47eb8a547de9837ff47672b5862_0 new file mode 100644 index 000000000..cc0eb9eaa Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__schema/__enumerations/__eaada47eb8a547de9837ff47672b5862_0 differ diff --git a/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__schema/__enumerations/__ec5c1be025664f71822df25b001573a3_0 b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__schema/__enumerations/__ec5c1be025664f71822df25b001573a3_0 new file mode 100644 index 000000000..7f7cda872 Binary files /dev/null and b/tools/models/memento/tests/fixtures/estimators-cube-expected/size_factors/__schema/__enumerations/__ec5c1be025664f71822df25b001573a3_0 differ diff --git a/tools/models/memento/tests/test_cube_builder.py b/tools/models/memento/tests/test_cube_builder.py new file mode 100644 index 000000000..7ff221a07 --- /dev/null +++ b/tools/models/memento/tests/test_cube_builder.py @@ -0,0 +1,45 @@ +from os import path +from tempfile import TemporaryDirectory + +import numpy 
as np +import tiledb + +from tools.models.memento.src.estimators_cube_builder.cube_builder import build + + +def test_cube_builder_regression() -> None: + """ This test is useful to ensure that the cube builder is producing the same cube as the original cube builder, after + any refactoring. Any substantive changes to the schema of the cube or the computation will necessarily cause this + test to fail. However, if manual verification of the cube is done, a new static fixture should be generated as follows: + + cd tools/models/memento/ + + python tests/fixtures/census_fixture.py \ + s3://cellxgene-data-public/cell-census/2023-10-30/soma/census_data/homo_sapiens \ + "is_primary_data == True and tissue_general in ['tongue']" \ + "feature_id in ['ENSG00000000419', 'ENSG00000002330']" \ + tests/fixtures/census-homo-sapiens-small + + python -m src.estimators_cube_builder.cube_builder \ + --experiment-uri tests/fixtures/census-homo-sapiens-small \ + --cube-uri tests/fixtures/estimators-cube-expected + """ + + pwd = path.dirname(__file__) + with TemporaryDirectory() as cube_dir: + build(cube_uri=cube_dir, experiment_uri=path.join(pwd, "fixtures", "census-homo-sapiens-small")) + + expected_cube_fixture_dir = path.join(pwd, "fixtures", "estimators-cube-expected") + + with tiledb.open(path.join(cube_dir, "obs_groups")) as actual_obs_groups: + with tiledb.open(path.join(expected_cube_fixture_dir, "obs_groups")) as expected_obs_groups: + assert actual_obs_groups.df[:].equals(expected_obs_groups.df[:]) + + with tiledb.open(path.join(cube_dir, "estimators")) as actual_estimators: + with tiledb.open(path.join(expected_cube_fixture_dir, "estimators")) as expected_estimators: + actual_estimators = actual_estimators.df[:] + expected_estimators = expected_estimators.df[:] + for col in ["mean", "sem"]: + assert np.allclose( + actual_estimators[col], expected_estimators[col] + ), f"estimators mismatch for '{col}'"