refactor(tests): remove unused omic dataset functions
Removed the `create_omic_dataset` helper and the fixtures built on it
(`biodataset`, `snp_dataset`, `maldi_dataset`) from `tests/fixtures/files.py`,
along with several blocks of commented-out fixture code. None of it was still
in use, and it cluttered the codebase.
psmyth94 committed Nov 22, 2024
1 parent 4583bb5 commit 5463249
Showing 1 changed file with 0 additions and 329 deletions.
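For context, the removed fixtures were session-scoped pytest fixtures built on top of `create_omic_dataset`. A test consuming them would have looked roughly like the sketch below; this is a hypothetical illustration, not code from the repository, and the test names and assertions are assumptions based only on the fixture arguments visible in this diff.

# Hypothetical usage sketch (not part of this commit): how the deleted
# session-scoped fixtures were typically consumed in tests. Assumes the
# returned Bioset exposes `num_rows` and `info` like a datasets.Dataset.
def test_snp_dataset_builder_name(snp_dataset):
    # The fixture built 10 rows of 3 sparse "multi_bins" columns and tagged
    # the dataset with builder_name = "snp".
    assert snp_dataset.num_rows == 10
    assert snp_dataset.info.builder_name == "snp"


def test_maldi_dataset_builder_name(maldi_dataset):
    assert maldi_dataset.info.builder_name == "maldi"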
329 changes: 0 additions & 329 deletions tests/fixtures/files.py
@@ -175,204 +175,6 @@ def _create_all_arrow_types_dataframe(num_rows=100, feature_type=None):
    return data, features


def create_omic_dataset(
    num_rows=100,
    num_cols=None,
    dtype="all",
    sample="sample_id",
    batch="batch",
    label="label",
    multi_class=False,
    task="classification",
    label_type="int",
    metadata=True,
    input_feature=None,
    sparse=False,
    missing_labels=False,
):
    """
    Create a sample dataframe with predefined structure.
    """
    from datasets import Value

    if input_feature is None:
        input_feature = Value
    data = {}
    features = {}
    enable_full_determinism(SEED)

    if sample:
        data[sample] = [str(i) for i in range(num_rows)]
        features[sample] = get_feature("Sample")("string")
    if batch:
        data[batch] = [str(i) for i in range(num_rows)]
        features[batch] = get_feature("Batch")("string")
    metadata_value_options = {
        "multi_classification_int": [i % 3 for i in range(num_rows)],
        "multi_classification_str": [
            ALPHANUMERIC[i % len(ALPHANUMERIC)] for i in range(num_rows)
        ],
        "bin_classification_bool": [
            True if i % 2 == 0 else False for i in range(num_rows)
        ],
        "bin_classification_int": [i % 2 for i in range(num_rows)],
        "bin_classification_str": [
            "positive" if i > num_rows // 2 else "negative" for i in range(num_rows)
        ],
        "regression": np.random.randn(num_rows),
    }
    metadata_feature_options = {
        "multi_classification_int": "int8",
        "multi_classification_str": "string",
        "bin_classification_bool": "bool",
        "bin_classification_int": "int8",
        "bin_classification_str": "string",
        "regression": "float32",
    }
    if label:
        if task == "classification":
            if multi_class:
                label_name = "multi"
            else:
                label_name = "bin"
            label_name += f"_classification_{label_type}"
        else:
            label_name = "regression"

        data[label] = metadata_value_options.pop(label_name)
        if missing_labels:
            # Randomly set 10% of labels to -1 if classification, else set to None
            if task == "classification":
                indices_to_replace = np.random.choice(
                    num_rows, int(num_rows * 0.1), replace=False
                )
                data[label] = [
                    -1 if i in indices_to_replace else lab
                    for i, lab in enumerate(data[label])
                ]
            else:
                indices_to_replace = np.random.choice(
                    num_rows, int(num_rows * 0.1), replace=False
                )
                data[label] = [
                    None if i in indices_to_replace else lab
                    for i, lab in enumerate(data[label])
                ]
        label_dtype = metadata_feature_options.pop(label_name)
        if label_name == "regression":
            features[label] = get_feature("RegressionTarget")(label_dtype)
        else:
            names = list(set(data[label]))
            if not isinstance(names[0], str):
                names = [str(n) for n in names]
            else:
                name_map = {n: i for i, n in enumerate(names)}
                data[label] = [name_map[n] for n in data[label]]

            num_classes = len(names)

            features[label] = get_feature("ClassLabel")(
                num_classes=num_classes, names=names
            )
    if metadata:
        if isinstance(metadata, str):
            data.update(metadata_value_options)
            features.update(
                {
                    k: get_feature("Metadata")(dtype=v)
                    for k, v in metadata_feature_options.items()
                }
            )
        else:
            for label, v in metadata_value_options.items():
                data[label] = v
                features[label] = Value(metadata_feature_options[label])

    if dtype == "all":
        ext_data, ext_features = _create_all_arrow_types_dataframe(
            num_rows=num_rows, feature_type=input_feature
        )
    else:
        ext_data = {}
        ext_features = {}
        if sparse and isinstance(sparse, bool):
            sparse = 0.8
        if num_cols is None:
            num_cols = 1

        dtype_to_pa = {
            "multi_bins": "int32",
            "one_hot": "int32",
        }

        for i in range(num_cols):
            arr: np.ndarray = PA_DATA[dtype](num_rows)
            ext_data[f"{dtype}_{i}"] = arr.tolist()

            ext_features[f"{dtype}_{i}"] = input_feature(
                dtype=dtype_to_pa.get(dtype, dtype),
                metadata={
                    "my_metadata_str": ALPHANUMERIC[
                        np.random.randint(0, len(ALPHANUMERIC))
                    ],
                    "my_metadata_int": np.random.randint(0, 100),
                },
            )
        if sparse:
            mat = np.array([ext_data[f"{dtype}_{i}"] for i in range(num_cols)]).T
            for i in range(num_cols):
                arr = np.array(ext_data[f"{dtype}_{i}"])
                total_values = arr.size
                if isinstance(sparse, list):
                    _sparse = sparse[i]
                else:
                    _sparse = sparse
                if isinstance(_sparse, bool):
                    _sparse = np.random.uniform(0.1, 0.9)

                num_to_replace = max(
                    min(int(total_values * (_sparse)), total_values - 1), 0
                )
                indices_to_replace = np.random.choice(
                    total_values, num_to_replace, replace=False
                )
                # check if replacing with 0 would make a row all 0s
                for idx in indices_to_replace:
                    if dtype in [
                        "one_hot",
                        "multi_bins",
                        "uint8",
                        "uint16",
                        "uint32",
                        "uint64",
                    ]:
                        if np.sum(mat[:idx] > 0) + np.sum(mat[idx + 1 :] > 0) == 0:
                            indices_to_replace = np.delete(
                                indices_to_replace, np.where(indices_to_replace == idx)
                            )
                        else:
                            arr[idx] = 0
                    else:
                        if all(v is None for v in mat[:idx]) and all(
                            v is None for v in mat[idx + 1 :]
                        ):
                            indices_to_replace = np.delete(
                                indices_to_replace, np.where(indices_to_replace == idx)
                            )
                        else:
                            arr[idx] = 0
                ext_data[f"{dtype}_{i}"] = arr.tolist()

    data.update(ext_data)
    features.update(ext_features)
    if is_biosets_available():
        import biosets
        import datasets

        return biosets.Bioset.from_dict(data, features=datasets.Features(features))
    return pd.DataFrame(data)


def create_feature_dataframe(num_cols=100, feature_id="feature"):
"""
Create a feature dataframe with predefined structure.
@@ -404,40 +206,6 @@ def directory_exists_with_files(path, expected_files):
    return True


# def save_dataframes(dfs, data_dir, filenames):
#     """
#     Save a list of dataframes to CSV in the specified directory.
#     """
#     for df, filename in zip(dfs, filenames):
#         file_ext = filename.split(".")[-1]
#         if file_ext in ["parquet"]:
#             tbl = pa.Table.from_pandas(df) if isinstance(df, pd.DataFrame) else df
#             if "float16" in tbl.schema.names:
#                 tbl = tbl.drop(["float16"])  # not supported by parquet
#             writer = ParquetWriter(
#                 path=os.path.join(data_dir, filename), schema=tbl.schema
#             )
#             writer.write_table(tbl)
#         elif file_ext in ["arrow"]:
#             tbl = pa.Table.from_pandas(df) if isinstance(df, pd.DataFrame) else df
#             writer = ArrowWriter(
#                 path=os.path.join(data_dir, filename), schema=tbl.schema
#             )
#             writer.write_table(tbl)
#         elif file_ext in ["csv"]:
#             df.to_csv(os.path.join(data_dir, filename), index=False)
#         elif file_ext in ["tsv", "txt"]:
#             df.to_csv(os.path.join(data_dir, filename), sep="\t", index=False)


# def create_fake_data_dir(data, base_dir, overwrite=False):
#     for name, filenames, dfs, _ in data:
#         data_dir = f"{base_dir}/{name}"
#         os.makedirs(data_dir, exist_ok=True)
#         if not directory_exists_with_files(data_dir, filenames) or overwrite:
#             save_dataframes(dfs, data_dir, filenames)


def create_dataset_with_sklearn(
    path,
    experiment_type,
@@ -723,11 +491,6 @@ def sample_metadata():
    return create_sample_metadata(20)


@pytest.fixture(scope="session")
def biodataset():
    return create_omic_dataset(10, num_cols=3, dtype="float32", metadata="metadata")


@pytest.fixture(scope="session")
def snp_dataset_path(tmp_path_factory):
    set_seed(SEED)
@@ -768,105 +531,13 @@ def maldi_dataset_path(tmp_path_factory):
    return path


@pytest.fixture(scope="session")
def snp_dataset():
    ds = create_omic_dataset(
        num_rows=10,
        num_cols=3,
        dtype="multi_bins",
        metadata="metadata",
        input_feature=get_feature("GenomicVariant"),
        sparse=0.8,
    )
    ds.info.builder_name = "snp"
    return ds


@pytest.fixture(scope="session")
def maldi_dataset():
    ds = create_omic_dataset(
        num_rows=10,
        num_cols=3,
        dtype="multi_bins",
        metadata="metadata",
        input_feature=get_feature("PeakIntensity"),
        sparse=0.8,
    )
    ds.info.builder_name = "maldi"
    return ds


# @pytest.fixture(scope="session")
# def camda_dataset():
#     camda_dir = "./tests/data/CAMDA"
#     camda_metadata_files = os.path.join(camda_dir, "camda.pheno.csv")
#     camda_feature_metadata_files = os.path.join(camda_dir, "camda.feature.csv")
#     ds = load_dataset(
#         "otu",
#         data_dir=camda_dir,
#         sample_metadata_files=camda_metadata_files,
#         feature_metadata_files=camda_feature_metadata_files,
#         label_column="City2",
#         cache_dir="./.cache",
#     )
#     ds.cleanup_cache_files()
#     return ds
#

# @pytest.fixture(scope="session")
# def camda_dataset_files_only():
#     camda_dir = "./tests/data/CAMDA"
#     data_files = os.path.join(camda_dir, "*matrix*.csv")
#     return load_dataset(
#         dataset_type="otu",
#         name="camda",
#         data_files=data_files,
#         label_column="City2",
#     )


# @pytest.fixture(scope="session")
# def camda_dataset_no_polars():
#     camda_dir = "./tests/data/CAMDA"
#     camda_metadata_files = os.path.join(camda_dir, "camda.pheno.csv")
#     camda_feature_metadata_files = os.path.join(camda_dir, "camda.feature.csv")
#     return load_dataset(
#         "otu",
#         data_dir=camda_dir,
#         sample_metadata_files=camda_metadata_files,
#         feature_metadata_files=camda_feature_metadata_files,
#         label_column="City2",
#         cache_dir="./.cache",
#         use_polars=False,
#     )
#

# @pytest.fixture(scope="session")
# def tb_dataset():
#     tb_dir = "./tests/data/genomics_TB"
#     dataset = load_dataset(
#         "snp",
#         "TB",
#         data_dir=tb_dir,
#         label_column="Isoniazid",
#         keep_in_memory=False,
#         cache_dir="./.cache",
#     )
#     dataset.cleanup_cache_files()
#     return dataset
#


@pytest.fixture(scope="session")
def arrow_file(tmp_path_factory, dataset):
    filename = str(tmp_path_factory.mktemp("data") / "file.arrow")
    dataset.map(cache_file_name=filename)
    return filename


# FILE_CONTENT + files


FILE_CONTENT = """\
Text data.
Second line of data."""
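If a test later needs a small omic-style table again, it can be rebuilt directly with the Hugging Face `datasets` API instead of restoring the removed helper. The following is a minimal sketch, not part of this commit: the function name, column names, and dtypes are illustrative assumptions, and it covers only the simplest path of what `create_omic_dataset` used to do (string sample ids, a binary label, and a few float feature columns).

import numpy as np
from datasets import ClassLabel, Dataset, Features, Value


def make_tiny_omic_dataset(num_rows=10, num_cols=3, seed=42):
    # Minimal, hypothetical stand-in for the removed create_omic_dataset helper.
    rng = np.random.default_rng(seed)
    data = {
        "sample_id": [str(i) for i in range(num_rows)],
        "label": [i % 2 for i in range(num_rows)],
    }
    features = {
        "sample_id": Value("string"),
        "label": ClassLabel(names=["negative", "positive"]),
    }
    for j in range(num_cols):
        # Deterministic pseudo-random float columns, analogous to the old
        # dtype="float32" path of the removed helper.
        data[f"float32_{j}"] = rng.standard_normal(num_rows).astype("float32").tolist()
        features[f"float32_{j}"] = Value("float32")
    return Dataset.from_dict(data, features=Features(features))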
