From 5463249e4cac3e62ea4b31a40af89c32883c40c7 Mon Sep 17 00:00:00 2001
From: psmyth94
Date: Fri, 22 Nov 2024 10:23:51 -0600
Subject: [PATCH] refactor(tests): remove unused omic dataset functions

Removed the `create_omic_dataset` function and related fixtures from
`tests/fixtures/files.py`. These functions were no longer in use and
cluttered the codebase.
---
 tests/fixtures/files.py | 329 ----------------------------------------
 1 file changed, 329 deletions(-)

diff --git a/tests/fixtures/files.py b/tests/fixtures/files.py
index e3022a8..8b1ce20 100644
--- a/tests/fixtures/files.py
+++ b/tests/fixtures/files.py
@@ -175,204 +175,6 @@ def _create_all_arrow_types_dataframe(num_rows=100, feature_type=None):
     return data, features


-def create_omic_dataset(
-    num_rows=100,
-    num_cols=None,
-    dtype="all",
-    sample="sample_id",
-    batch="batch",
-    label="label",
-    multi_class=False,
-    task="classification",
-    label_type="int",
-    metadata=True,
-    input_feature=None,
-    sparse=False,
-    missing_labels=False,
-):
-    """
-    Create a sample dataframe with predefined structure.
-    """
-    from datasets import Value
-
-    if input_feature is None:
-        input_feature = Value
-    data = {}
-    features = {}
-    enable_full_determinism(SEED)
-
-    if sample:
-        data[sample] = [str(i) for i in range(num_rows)]
-        features[sample] = get_feature("Sample")("string")
-    if batch:
-        data[batch] = [str(i) for i in range(num_rows)]
-        features[batch] = get_feature("Batch")("string")
-    metadata_value_options = {
-        "multi_classification_int": [i % 3 for i in range(num_rows)],
-        "multi_classification_str": [
-            ALPHANUMERIC[i % len(ALPHANUMERIC)] for i in range(num_rows)
-        ],
-        "bin_classification_bool": [
-            True if i % 2 == 0 else False for i in range(num_rows)
-        ],
-        "bin_classification_int": [i % 2 for i in range(num_rows)],
-        "bin_classification_str": [
-            "positive" if i > num_rows // 2 else "negative" for i in range(num_rows)
-        ],
-        "regression": np.random.randn(num_rows),
-    }
-    metadata_feature_options = {
-        "multi_classification_int": "int8",
-        "multi_classification_str": "string",
-        "bin_classification_bool": "bool",
-        "bin_classification_int": "int8",
-        "bin_classification_str": "string",
-        "regression": "float32",
-    }
-    if label:
-        if task == "classification":
-            if multi_class:
-                label_name = "multi"
-            else:
-                label_name = "bin"
-            label_name += f"_classification_{label_type}"
-        else:
-            label_name = "regression"
-
-        data[label] = metadata_value_options.pop(label_name)
-        if missing_labels:
-            # Randomly set 10% of labels to -1 if classification, else set to None
-            if task == "classification":
-                indices_to_replace = np.random.choice(
-                    num_rows, int(num_rows * 0.1), replace=False
-                )
-                data[label] = [
-                    -1 if i in indices_to_replace else lab
-                    for i, lab in enumerate(data[label])
-                ]
-            else:
-                indices_to_replace = np.random.choice(
-                    num_rows, int(num_rows * 0.1), replace=False
-                )
-                data[label] = [
-                    None if i in indices_to_replace else lab
-                    for i, lab in enumerate(data[label])
-                ]
-        label_dtype = metadata_feature_options.pop(label_name)
-        if label_name == "regression":
-            features[label] = get_feature("RegressionTarget")(label_dtype)
-        else:
-            names = list(set(data[label]))
-            if not isinstance(names[0], str):
-                names = [str(n) for n in names]
-            else:
-                name_map = {n: i for i, n in enumerate(names)}
-                data[label] = [name_map[n] for n in data[label]]
-
-            num_classes = len(names)
-
-            features[label] = get_feature("ClassLabel")(
-                num_classes=num_classes, names=names
-            )
-    if metadata:
-        if isinstance(metadata, str):
-            data.update(metadata_value_options)
-            features.update(
-                {
-                    k: get_feature("Metadata")(dtype=v)
-                    for k, v in metadata_feature_options.items()
-                }
-            )
-        else:
-            for label, v in metadata_value_options.items():
-                data[label] = v
-                features[label] = Value(metadata_feature_options[label])
-
-    if dtype == "all":
-        ext_data, ext_features = _create_all_arrow_types_dataframe(
-            num_rows=num_rows, feature_type=input_feature
-        )
-    else:
-        ext_data = {}
-        ext_features = {}
-        if sparse and isinstance(sparse, bool):
-            sparse = 0.8
-        if num_cols is None:
-            num_cols = 1
-
-        dtype_to_pa = {
-            "multi_bins": "int32",
-            "one_hot": "int32",
-        }
-
-        for i in range(num_cols):
-            arr: np.ndarray = PA_DATA[dtype](num_rows)
-            ext_data[f"{dtype}_{i}"] = arr.tolist()
-
-            ext_features[f"{dtype}_{i}"] = input_feature(
-                dtype=dtype_to_pa.get(dtype, dtype),
-                metadata={
-                    "my_metadata_str": ALPHANUMERIC[
-                        np.random.randint(0, len(ALPHANUMERIC))
-                    ],
-                    "my_metadata_int": np.random.randint(0, 100),
-                },
-            )
-        if sparse:
-            mat = np.array([ext_data[f"{dtype}_{i}"] for i in range(num_cols)]).T
-            for i in range(num_cols):
-                arr = np.array(ext_data[f"{dtype}_{i}"])
-                total_values = arr.size
-                if isinstance(sparse, list):
-                    _sparse = sparse[i]
-                else:
-                    _sparse = sparse
-                if isinstance(_sparse, bool):
-                    _sparse = np.random.uniform(0.1, 0.9)
-
-                num_to_replace = max(
-                    min(int(total_values * (_sparse)), total_values - 1), 0
-                )
-                indices_to_replace = np.random.choice(
-                    total_values, num_to_replace, replace=False
-                )
-                # check if replacing with 0 would make a row all 0s
-                for idx in indices_to_replace:
-                    if dtype in [
-                        "one_hot",
-                        "multi_bins",
-                        "uint8",
-                        "uint16",
-                        "uint32",
-                        "uint64",
-                    ]:
-                        if np.sum(mat[:idx] > 0) + np.sum(mat[idx + 1 :] > 0) == 0:
-                            indices_to_replace = np.delete(
-                                indices_to_replace, np.where(indices_to_replace == idx)
-                            )
-                        else:
-                            arr[idx] = 0
-                    else:
-                        if all(v is None for v in mat[:idx]) and all(
-                            v is None for v in mat[idx + 1 :]
-                        ):
-                            indices_to_replace = np.delete(
-                                indices_to_replace, np.where(indices_to_replace == idx)
-                            )
-                        else:
-                            arr[idx] = 0
-                ext_data[f"{dtype}_{i}"] = arr.tolist()
-
-    data.update(ext_data)
-    features.update(ext_features)
-    if is_biosets_available():
-        import biosets
-        import datasets
-
-        return biosets.Bioset.from_dict(data, features=datasets.Features(features))
-    return pd.DataFrame(data)
-
-
 def create_feature_dataframe(num_cols=100, feature_id="feature"):
     """
     Create a feature dataframe with predefined structure.
@@ -404,40 +206,6 @@ def directory_exists_with_files(path, expected_files):
     return True


-# def save_dataframes(dfs, data_dir, filenames):
-#     """
-#     Save a list of dataframes to CSV in the specified directory.
-# """ -# for df, filename in zip(dfs, filenames): -# file_ext = filename.split(".")[-1] -# if file_ext in ["parquet"]: -# tbl = pa.Table.from_pandas(df) if isinstance(df, pd.DataFrame) else df -# if "float16" in tbl.schema.names: -# tbl = tbl.drop(["float16"]) # not supported by parquet -# writer = ParquetWriter( -# path=os.path.join(data_dir, filename), schema=tbl.schema -# ) -# writer.write_table(tbl) -# elif file_ext in ["arrow"]: -# tbl = pa.Table.from_pandas(df) if isinstance(df, pd.DataFrame) else df -# writer = ArrowWriter( -# path=os.path.join(data_dir, filename), schema=tbl.schema -# ) -# writer.write_table(tbl) -# elif file_ext in ["csv"]: -# df.to_csv(os.path.join(data_dir, filename), index=False) -# elif file_ext in ["tsv", "txt"]: -# df.to_csv(os.path.join(data_dir, filename), sep="\t", index=False) - - -# def create_fake_data_dir(data, base_dir, overwrite=False): -# for name, filenames, dfs, _ in data: -# data_dir = f"{base_dir}/{name}" -# os.makedirs(data_dir, exist_ok=True) -# if not directory_exists_with_files(data_dir, filenames) or overwrite: -# save_dataframes(dfs, data_dir, filenames) - - def create_dataset_with_sklearn( path, experiment_type, @@ -723,11 +491,6 @@ def sample_metadata(): return create_sample_metadata(20) -@pytest.fixture(scope="session") -def biodataset(): - return create_omic_dataset(10, num_cols=3, dtype="float32", metadata="metadata") - - @pytest.fixture(scope="session") def snp_dataset_path(tmp_path_factory): set_seed(SEED) @@ -768,95 +531,6 @@ def maldi_dataset_path(tmp_path_factory): return path -@pytest.fixture(scope="session") -def snp_dataset(): - ds = create_omic_dataset( - num_rows=10, - num_cols=3, - dtype="multi_bins", - metadata="metadata", - input_feature=get_feature("GenomicVariant"), - sparse=0.8, - ) - ds.info.builder_name = "snp" - return ds - - -@pytest.fixture(scope="session") -def maldi_dataset(): - ds = create_omic_dataset( - num_rows=10, - num_cols=3, - dtype="multi_bins", - metadata="metadata", - input_feature=get_feature("PeakIntensity"), - sparse=0.8, - ) - ds.info.builder_name = "maldi" - return ds - - -# @pytest.fixture(scope="session") -# def camda_dataset(): -# camda_dir = "./tests/data/CAMDA" -# camda_metadata_files = os.path.join(camda_dir, "camda.pheno.csv") -# camda_feature_metadata_files = os.path.join(camda_dir, "camda.feature.csv") -# ds = load_dataset( -# "otu", -# data_dir=camda_dir, -# sample_metadata_files=camda_metadata_files, -# feature_metadata_files=camda_feature_metadata_files, -# label_column="City2", -# cache_dir="./.cache", -# ) -# ds.cleanup_cache_files() -# return ds -# - -# @pytest.fixture(scope="session") -# def camda_dataset_files_only(): -# camda_dir = "./tests/data/CAMDA" -# data_files = os.path.join(camda_dir, "*matrix*.csv") -# return load_dataset( -# dataset_type="otu", -# name="camda", -# data_files=data_files, -# label_column="City2", -# ) - - -# @pytest.fixture(scope="session") -# def camda_dataset_no_polars(): -# camda_dir = "./tests/data/CAMDA" -# camda_metadata_files = os.path.join(camda_dir, "camda.pheno.csv") -# camda_feature_metadata_files = os.path.join(camda_dir, "camda.feature.csv") -# return load_dataset( -# "otu", -# data_dir=camda_dir, -# sample_metadata_files=camda_metadata_files, -# feature_metadata_files=camda_feature_metadata_files, -# label_column="City2", -# cache_dir="./.cache", -# use_polars=False, -# ) -# - -# @pytest.fixture(scope="session") -# def tb_dataset(): -# tb_dir = "./tests/data/genomics_TB" -# dataset = load_dataset( -# "snp", -# "TB", -# data_dir=tb_dir, -# 
label_column="Isoniazid", -# keep_in_memory=False, -# cache_dir="./.cache", -# ) -# dataset.cleanup_cache_files() -# return dataset -# - - @pytest.fixture(scope="session") def arrow_file(tmp_path_factory, dataset): filename = str(tmp_path_factory.mktemp("data") / "file.arrow") @@ -864,9 +538,6 @@ def arrow_file(tmp_path_factory, dataset): return filename -# FILE_CONTENT + files - - FILE_CONTENT = """\ Text data. Second line of data."""