-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #70 from vluzko/mock-data
Mock data
- Loading branch information
Showing 13 changed files with 40,227 additions and 72 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,26 +1,27 @@ | ||
# Use the latest 2.1 version of CircleCI pipeline process engine.
# See: https://circleci.com/docs/2.0/configuration-reference
version: 2.1

orbs:
  python: circleci/[email protected]

# Define a job to be invoked later in a workflow.
# See: https://circleci.com/docs/2.0/configuration-reference/#jobs
jobs:
  build_and_test:
    executor: python/default
    steps:
      - checkout
      - run:
          name: Install
          command: pip install .[tests]
      # - python/install-packages:
      #     pkg-manager: pip
      - run:
          name: Run tests
          command: python -m pytest

# Invoke jobs via workflows
# See: https://circleci.com/docs/2.0/configuration-reference/#workflows
workflows:
  build_and_test-workflow:
    jobs:
      - build_and_test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,4 +8,4 @@ dist/ | |
.idea/ | ||
|
||
data/ | ||
tmp.py | ||
tests/mocked/medium.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,55 @@ | ||
from pathlib import Path | ||
|
||
|
||
# Directories holding the real (non-distributable) and mocked test datasets.
TEST_DATA = Path(__file__).parent / "data"
MOCK_DATA = Path(__file__).parent / "mocked"
# Real observation files of increasing size (not checked in to the repo).
SMALL = TEST_DATA / "small" / "observations.txt"
MEDIUM = TEST_DATA / "medium" / "observations.txt"
LARGE = TEST_DATA / "large" / "observations.txt"
# A clean dataframe with non-empty atlas codes
WITH_ATLAS = TEST_DATA / "with_atlas.csv"

# We skip any unmocked tests if the file doesn't exist
# (keyword args, presumably for pytest.mark.skipif -- confirm at call sites).
SKIP_NON_MOCKED = {
    "condition": not SMALL.exists() or not MEDIUM.exists(),
    "reason": "Real datasets not available",
}
# SKIP_NON_MOCKED = (True, "")

# Pre-built sqlite databases derived from the real datasets.
SMALL_DB = TEST_DATA / "small" / "observations.sqlite"
MEDIUM_DB = TEST_DATA / "medium" / "observations.sqlite"
LARGE_DB = TEST_DATA / "large" / "observations.sqlite"

# Mocked (scrambled/anonymized) datasets that are safe to commit.
M_SMALL = MOCK_DATA / "small.txt"
M_MEDIUM = MOCK_DATA / "medium.txt"
M_SMALL1 = MOCK_DATA / "small1.txt"
M_SMALL2 = MOCK_DATA / "small2.txt"
M_SMALL3 = MOCK_DATA / "small3.txt"

SMALL_MOCKED = (M_SMALL1, M_SMALL2, M_SMALL3)
|
||
|
||
def generate_mock_data(obs_path: Path, out_path: Path):
    """Read the real dataset at ``obs_path``, anonymize it, and write the
    scrambled copy to ``out_path`` as a tab-separated file."""
    # Imported lazily so importing this module never pulls in the heavy deps.
    from aukpy import db
    from tests import gen_mock_data

    observations = db.read_clean(obs_path)
    scrambled = gen_mock_data.scramble_observations(observations)
    scrambled.to_csv(out_path, index=False, sep="\t")
|
||
|
||
def generate_subsampled(obs_path: Path, out_path: Path, num_rows: int = 10000):
    """Read the dataset at ``obs_path``, draw ``num_rows`` random rows, and
    write them to ``out_path`` as a tab-separated file."""
    # Imported lazily so importing this module never pulls in the heavy deps.
    from aukpy import db
    from tests import gen_mock_data

    cleaned = db.read_clean(obs_path)
    sample = gen_mock_data.subsample(cleaned, num_rows=num_rows)
    sample.to_csv(out_path, index=False, sep="\t")
|
||
|
||
# generate_mock_data(SMALL, M_SMALL) | ||
# generate_mock_data(MEDIUM, M_MEDIUM) | ||
# generate_subsampled(M_MEDIUM, M_SMALL1) | ||
# generate_subsampled(M_MEDIUM, M_SMALL2) | ||
# generate_subsampled(M_MEDIUM, M_SMALL3) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
from typing import List | ||
import numpy as np | ||
import pandas as pd | ||
from aukpy import db | ||
|
||
|
||
def scramble_observations(df: pd.DataFrame) -> pd.DataFrame:
    """Scramble and anonymize a real set of observations.

    Permutes location and sampling-event fields between sampling events,
    replaces observer ids with random ones, shuffles species columns across
    rows, and blanks out free-text comments. Mutates ``df`` in place and
    returns the same dataframe.
    """

    # sampling_events = df.groupby('sampling_event_identifier')
    # Maps each sampling_event_identifier back to the row indices it covers,
    # so per-event shuffles can be broadcast onto the full dataframe below.
    s_id_to_index = (
        df[["sampling_event_identifier"]]
        .reset_index()
        .set_index("sampling_event_identifier")
    )
    # Shuffle locations: take one representative row per sampling event, then
    # permute the location columns between events.
    just_first = df.drop_duplicates("sampling_event_identifier").set_index(
        "sampling_event_identifier"
    )
    location_shuffle = np.random.permutation(just_first.index)
    # NOTE(review): db.LocationWrapper.columns presumably names the
    # location-related columns — confirm against aukpy.db.
    just_loc = just_first.loc[location_shuffle, list(db.LocationWrapper.columns)]

    # Broadcast the per-event shuffled locations back to every row of each event.
    merged = just_loc.join(s_id_to_index).reset_index().set_index("index").loc[df.index]
    assert len(merged) == len(df)
    df[list(db.LocationWrapper.columns)] = merged[list(db.LocationWrapper.columns)]

    # Shuffle sampling events. The first column is dropped from the shuffle —
    # assumed to be the sampling_event_identifier itself; TODO confirm.
    to_scramble = list(db.SamplingWrapper.columns)
    del to_scramble[0]

    obs_shuffle = np.random.permutation(just_first.index)
    just_obs = just_first.loc[obs_shuffle, list(to_scramble)]

    # Randomize observer ids: strip the 4-char prefix (e.g. "obsr"), remap each
    # distinct numeric id to a random value in the observed range, re-prefix.
    just_obs["observer_id"] = just_obs["observer_id"].str[4:].astype(int)
    r = just_obs["observer_id"]
    observers = r.unique()
    # NOTE(review): randint can map two observers to the same new id;
    # presumably acceptable for mock data.
    new_ids = np.random.randint(r.min(), r.max(), len(observers))
    new_observers = {o: n for o, n in zip(observers, new_ids)}
    just_obs["observer_id"] = "obsr" + r.map(new_observers).astype(str)

    # Broadcast the shuffled sampling-event fields back onto all rows.
    merged = just_obs.join(s_id_to_index).reset_index().set_index("index").loc[df.index]
    assert len(merged) == len(df)
    df[to_scramble] = merged[to_scramble]

    # Scramble species columns across individual rows (not per event).
    # Obviously this does not preserve the geographic distribution of species
    species_shuffle = np.random.permutation(df.index)

    for col in db.SpeciesWrapper.columns:
        df[col] = df.loc[species_shuffle, col].values

    # Remove comments: blank every non-null free-text comment.
    for col in ("trip_comments", "species_comments"):
        not_empty = ~df[col].isna()
        df.loc[not_empty, col] = ""

    return df
|
||
|
||
def subsample(df: pd.DataFrame, num_rows: int = 100000) -> pd.DataFrame:
    """Extract a random set of rows from the dataframe.

    Args:
        df: The dataframe to sample from. Must have at least ``num_rows`` rows.
        num_rows: The number of rows to extract.

    Returns:
        A new dataframe of ``num_rows`` distinct rows of ``df`` in random
        order, with their original index labels preserved.

    Raises:
        ValueError: If ``df`` has fewer than ``num_rows`` rows.
    """
    # `assert` is stripped under `python -O`, so validate explicitly.
    if len(df) < num_rows:
        raise ValueError(
            f"Cannot subsample {num_rows} rows from a dataframe of {len(df)} rows"
        )
    # Slicing the permutation first avoids materializing a full-size
    # reindexed frame (the original did df.loc[shuffle].iloc[:num_rows]).
    shuffled = np.random.permutation(df.index)
    return df.loc[shuffled[:num_rows]]
Oops, something went wrong.