Merge pull request #70 from vluzko/mock-data
Mock data
vluzko authored Aug 18, 2022
2 parents d0bb6ee + 62ca6ff commit 658dadf
Showing 13 changed files with 40,227 additions and 72 deletions.
25 changes: 13 additions & 12 deletions .circleci/config.yml
@@ -1,26 +1,27 @@
 # Use the latest 2.1 version of CircleCI pipeline process engine.
 # See: https://circleci.com/docs/2.0/configuration-reference
 version: 2.1
 
+orbs:
+  python: circleci/[email protected]
 # Define a job to be invoked later in a workflow.
 # See: https://circleci.com/docs/2.0/configuration-reference/#jobs
 jobs:
-  say-hello:
-    # Specify the execution environment. You can specify an image from Dockerhub or use one of our Convenience Images from CircleCI's Developer Hub.
-    # See: https://circleci.com/docs/2.0/configuration-reference/#docker-machine-macos-windows-executor
-    docker:
-      - image: cimg/base:stable
-    # Add steps to the job
-    # See: https://circleci.com/docs/2.0/configuration-reference/#steps
+  build_and_test:
+    executor: python/default
     steps:
      - checkout
      - run:
-          name: "Say hello"
-          command: "echo Hello, World!"
+          name: Install
+          command: pip install .[tests]
+      # - python/install-packages:
+      #     pkg-manager: pip
+      - run:
+          name: Run tests
+          command: python -m pytest
 
 # Invoke jobs via workflows
 # See: https://circleci.com/docs/2.0/configuration-reference/#workflows
 workflows:
-  say-hello-workflow:
+  build_and_test-workflow:
     jobs:
-      - say-hello
+      - build_and_test
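The rewritten job installs the package with its test extras and runs the suite. A minimal local sketch of the same two steps, assuming pip and pytest are available on the PATH (this helper script is hypothetical, not part of the repository):

    import subprocess

    # Mirror the CI steps: install the package with its test extras, then run pytest.
    subprocess.run(["pip", "install", ".[tests]"], check=True)
    subprocess.run(["python", "-m", "pytest"], check=True)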
2 changes: 1 addition & 1 deletion .gitignore
@@ -8,4 +8,4 @@ dist/
 .idea/
 
 data/
-tmp.py
+tests/mocked/medium.txt
3 changes: 1 addition & 2 deletions aukpy/queries.py
@@ -13,14 +13,13 @@
     Tuple,
     Any,
     Literal,
-    TypeGuard,
 )
 
 
 Distance = Literal["km", "miles"]
 
 
-def check_simple_type(value) -> TypeGuard[Union[str, int, float, bool]]:
+def check_simple_type(value) -> bool:
     return (
         isinstance(value, str)
         or isinstance(value, float)
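Dropping TypeGuard removes the only Python 3.10+ typing dependency here; the function now simply returns bool. For reference, a sketch of how the narrowing annotation could be kept on older interpreters, assuming a typing_extensions dependency were acceptable (a hypothetical alternative, not what this commit does):

    from typing import Union
    from typing_extensions import TypeGuard  # backport of typing.TypeGuard (stdlib in 3.10+)

    def check_simple_type(value) -> TypeGuard[Union[str, int, float, bool]]:
        # Same predicate as above, collapsed into a single isinstance call
        return isinstance(value, (str, int, float, bool))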
43 changes: 43 additions & 0 deletions tests/__init__.py
@@ -1,12 +1,55 @@
 from pathlib import Path
 
 
 TEST_DATA = Path(__file__).parent / "data"
+MOCK_DATA = Path(__file__).parent / "mocked"
 SMALL = TEST_DATA / "small" / "observations.txt"
 MEDIUM = TEST_DATA / "medium" / "observations.txt"
 LARGE = TEST_DATA / "large" / "observations.txt"
 # A clean dataframe with non empty atlas codes
 WITH_ATLAS = TEST_DATA / "with_atlas.csv"
+
+# We skip any unmocked tests if the file doesn't exist
+SKIP_NON_MOCKED = {
+    "condition": not SMALL.exists() or not MEDIUM.exists(),
+    "reason": "Real datasets not available",
+}
+# SKIP_NON_MOCKED = (True, "")
+
 SMALL_DB = TEST_DATA / "small" / "observations.sqlite"
 MEDIUM_DB = TEST_DATA / "medium" / "observations.sqlite"
 LARGE_DB = TEST_DATA / "large" / "observations.sqlite"
+
+M_SMALL = MOCK_DATA / "small.txt"
+M_MEDIUM = MOCK_DATA / "medium.txt"
+M_SMALL1 = MOCK_DATA / "small1.txt"
+M_SMALL2 = MOCK_DATA / "small2.txt"
+M_SMALL3 = MOCK_DATA / "small3.txt"
+
+SMALL_MOCKED = (M_SMALL1, M_SMALL2, M_SMALL3)
+
+
+def generate_mock_data(obs_path: Path, out_path: Path):
+    """Convert the real medium sized dataset to a fake dataset"""
+    from aukpy import db
+    from tests import gen_mock_data
+
+    df = db.read_clean(obs_path)
+    new_df = gen_mock_data.scramble_observations(df)
+    new_df.to_csv(out_path, index=False, sep="\t")
+
+
+def generate_subsampled(obs_path: Path, out_path: Path, num_rows: int = 10000):
+    from aukpy import db
+    from tests import gen_mock_data
+
+    df = db.read_clean(obs_path)
+    subsampled = gen_mock_data.subsample(df, num_rows=num_rows)
+    subsampled.to_csv(out_path, index=False, sep="\t")
+
+
+# generate_mock_data(SMALL, M_SMALL)
+# generate_mock_data(MEDIUM, M_MEDIUM)
+# generate_subsampled(M_MEDIUM, M_SMALL1)
+# generate_subsampled(M_MEDIUM, M_SMALL2)
+# generate_subsampled(M_MEDIUM, M_SMALL3)
15 changes: 12 additions & 3 deletions tests/db_test.py
@@ -3,15 +3,16 @@
 from pathlib import Path
 from aukpy import db as auk_db
 
-
-from tests import SMALL, MEDIUM, LARGE, SMALL_DB
+from tests import SMALL, MEDIUM, LARGE, SMALL_MOCKED, SKIP_NON_MOCKED
 
 
+@pytest.mark.skipif(**SKIP_NON_MOCKED)  # type: ignore
 def test_build_small():
     with NamedTemporaryFile() as output:
         db = auk_db.build_db_pandas(SMALL, Path(output.name))
 
 
+@pytest.mark.skipif(**SKIP_NON_MOCKED)  # type: ignore
 def test_build_medium():
     with NamedTemporaryFile() as output:
         db = auk_db.build_db_pandas(MEDIUM, Path(output.name))
@@ -20,6 +21,7 @@ def test_build_medium():
         assert len(res) == 999999
 
 
+@pytest.mark.skipif(**SKIP_NON_MOCKED)  # type: ignore
 def test_build_incremental_small():
     with NamedTemporaryFile() as output:
         db = auk_db.build_db_incremental(SMALL, Path(output.name), max_size=1000)
@@ -28,6 +30,7 @@ def test_build_incremental_small():
         assert len(res) == 10000
 
 
+@pytest.mark.skipif(**SKIP_NON_MOCKED)  # type: ignore
 def test_build_incremental():
     with NamedTemporaryFile() as output:
         db = auk_db.build_db_incremental(MEDIUM, Path(output.name))
@@ -39,4 +42,10 @@ def test_build_incremental():
 @pytest.mark.skip
 def test_build_large():
     with NamedTemporaryFile() as output:
-        db = auk_db.build_db(LARGE, Path(output.name))
+        db = auk_db.build_db_pandas(LARGE, Path(output.name))
+
+
+def test_build_small_mocked():
+    for p in SMALL_MOCKED:
+        with NamedTemporaryFile() as output:
+            auk_db.build_db_pandas(p, Path(output.name))
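With the real datasets gated behind SKIP_NON_MOCKED, test_build_small_mocked can run anywhere. A hypothetical invocation that selects just the mocked test by keyword:

    import pytest

    # -k filters tests by name; an exit code of 0 means everything selected passed.
    exit_code = pytest.main(["tests/db_test.py", "-k", "mocked"])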
65 changes: 65 additions & 0 deletions tests/gen_mock_data.py
@@ -0,0 +1,65 @@
+from typing import List
+import numpy as np
+import pandas as pd
+from aukpy import db
+
+
+def scramble_observations(df: pd.DataFrame) -> pd.DataFrame:
+    """Scramble and anonymize a real set of observations"""
+
+    # sampling_events = df.groupby('sampling_event_identifier')
+    s_id_to_index = (
+        df[["sampling_event_identifier"]]
+        .reset_index()
+        .set_index("sampling_event_identifier")
+    )
+    # Shuffle locations
+    just_first = df.drop_duplicates("sampling_event_identifier").set_index(
+        "sampling_event_identifier"
+    )
+    location_shuffle = np.random.permutation(just_first.index)
+    just_loc = just_first.loc[location_shuffle, list(db.LocationWrapper.columns)]
+
+    merged = just_loc.join(s_id_to_index).reset_index().set_index("index").loc[df.index]
+    assert len(merged) == len(df)
+    df[list(db.LocationWrapper.columns)] = merged[list(db.LocationWrapper.columns)]
+
+    # Shuffle sampling events
+    to_scramble = list(db.SamplingWrapper.columns)
+    del to_scramble[0]
+
+    obs_shuffle = np.random.permutation(just_first.index)
+    just_obs = just_first.loc[obs_shuffle, list(to_scramble)]
+
+    # Randomize observer ids
+    just_obs["observer_id"] = just_obs["observer_id"].str[4:].astype(int)
+    r = just_obs["observer_id"]
+    observers = r.unique()
+    new_ids = np.random.randint(r.min(), r.max(), len(observers))
+    new_observers = {o: n for o, n in zip(observers, new_ids)}
+    just_obs["observer_id"] = "obsr" + r.map(new_observers).astype(str)
+
+    merged = just_obs.join(s_id_to_index).reset_index().set_index("index").loc[df.index]
+    assert len(merged) == len(df)
+    df[to_scramble] = merged[to_scramble]
+
+    # Scramble species
+    # Obviously this does not preserve the geographic distribution of species
+    species_shuffle = np.random.permutation(df.index)
+
+    for col in db.SpeciesWrapper.columns:
+        df[col] = df.loc[species_shuffle, col].values
+
+    # Remove comments
+    for col in ("trip_comments", "species_comments"):
+        not_empty = ~df[col].isna()
+        df.loc[not_empty, col] = ""
+
+    return df
+
+
+def subsample(df: pd.DataFrame, num_rows: int = 100000) -> pd.DataFrame:
+    """Extract a random set of rows from the dataframe"""
+    assert len(df) >= num_rows
+    shuffle = np.random.permutation(df.index)
+    return df.loc[shuffle].iloc[:num_rows]
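The commented-out calls at the bottom of tests/__init__.py record how these helpers produced the checked-in fixtures. A sketch of a one-off regeneration, assuming the real datasets are still present under tests/data/:

    from tests import MEDIUM, M_MEDIUM, M_SMALL1, generate_mock_data, generate_subsampled

    # Scramble the real medium dataset, then draw a 10,000-row subsample from it.
    generate_mock_data(MEDIUM, M_MEDIUM)
    generate_subsampled(M_MEDIUM, M_SMALL1)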
[Diffs for the remaining changed files, including the large mock-data fixtures under tests/mocked/, are not rendered here.]