-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #70 from vluzko/mock-data
Mock data
- Loading branch information
Showing 13 changed files with 40,227 additions and 72 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,26 +1,27 @@ | ||
# Use the latest 2.1 version of CircleCI pipeline process engine.
# See: https://circleci.com/docs/2.0/configuration-reference
version: 2.1

orbs:
  python: circleci/[email protected]

# Define a job to be invoked later in a workflow.
# See: https://circleci.com/docs/2.0/configuration-reference/#jobs
jobs:
  build_and_test:
    executor: python/default
    steps:
      - checkout
      - run:
          name: Install
          command: pip install .[tests]
      # - python/install-packages:
      #     pkg-manager: pip
      - run:
          name: Run tests
          command: python -m pytest

# Invoke jobs via workflows
# See: https://circleci.com/docs/2.0/configuration-reference/#workflows
workflows:
  build_and_test-workflow:
    jobs:
      - build_and_test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,4 +8,4 @@ dist/ | |
.idea/ | ||
|
||
data/ | ||
tmp.py | ||
tests/mocked/medium.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,55 @@ | ||
from pathlib import Path | ||
|
||
|
||
# Directories holding the real (non-distributable) and mocked test datasets.
TEST_DATA = Path(__file__).parent / "data"
MOCK_DATA = Path(__file__).parent / "mocked"
# Real observation files of increasing size (not checked in to the repo).
SMALL = TEST_DATA / "small" / "observations.txt"
MEDIUM = TEST_DATA / "medium" / "observations.txt"
LARGE = TEST_DATA / "large" / "observations.txt"
# A clean dataframe with non-empty atlas codes
WITH_ATLAS = TEST_DATA / "with_atlas.csv"

# We skip any unmocked tests if the file doesn't exist
# (keyword args, presumably for pytest.mark.skipif -- confirm at call sites).
SKIP_NON_MOCKED = {
    "condition": not SMALL.exists() or not MEDIUM.exists(),
    "reason": "Real datasets not available",
}
# SKIP_NON_MOCKED = (True, "")

# Pre-built sqlite databases derived from the real datasets.
SMALL_DB = TEST_DATA / "small" / "observations.sqlite"
MEDIUM_DB = TEST_DATA / "medium" / "observations.sqlite"
LARGE_DB = TEST_DATA / "large" / "observations.sqlite"

# Mocked (scrambled/anonymized) datasets that are safe to commit.
M_SMALL = MOCK_DATA / "small.txt"
M_MEDIUM = MOCK_DATA / "medium.txt"
M_SMALL1 = MOCK_DATA / "small1.txt"
M_SMALL2 = MOCK_DATA / "small2.txt"
M_SMALL3 = MOCK_DATA / "small3.txt"

SMALL_MOCKED = (M_SMALL1, M_SMALL2, M_SMALL3)
|
||
|
||
def generate_mock_data(obs_path: Path, out_path: Path):
    """Read the real dataset at ``obs_path``, anonymize it, and write the
    scrambled copy to ``out_path`` as a tab-separated file."""
    # Imported lazily so importing this module never pulls in the heavy deps.
    from aukpy import db
    from tests import gen_mock_data

    observations = db.read_clean(obs_path)
    scrambled = gen_mock_data.scramble_observations(observations)
    scrambled.to_csv(out_path, index=False, sep="\t")
|
||
|
||
def generate_subsampled(obs_path: Path, out_path: Path, num_rows: int = 10000):
    """Read the dataset at ``obs_path``, draw ``num_rows`` random rows, and
    write them to ``out_path`` as a tab-separated file."""
    # Imported lazily so importing this module never pulls in the heavy deps.
    from aukpy import db
    from tests import gen_mock_data

    cleaned = db.read_clean(obs_path)
    sample = gen_mock_data.subsample(cleaned, num_rows=num_rows)
    sample.to_csv(out_path, index=False, sep="\t")
|
||
|
||
# generate_mock_data(SMALL, M_SMALL) | ||
# generate_mock_data(MEDIUM, M_MEDIUM) | ||
# generate_subsampled(M_MEDIUM, M_SMALL1) | ||
# generate_subsampled(M_MEDIUM, M_SMALL2) | ||
# generate_subsampled(M_MEDIUM, M_SMALL3) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
from typing import List | ||
import numpy as np | ||
import pandas as pd | ||
from aukpy import db | ||
|
||
|
||
def scramble_observations(df: pd.DataFrame) -> pd.DataFrame:
    """Scramble and anonymize a real set of observations.

    Permutes location and sampling-event fields between sampling events,
    replaces observer ids with random ones, shuffles species columns across
    rows, and blanks out free-text comments. Mutates ``df`` in place and
    returns the same dataframe.
    """

    # sampling_events = df.groupby('sampling_event_identifier')
    # Maps each sampling_event_identifier back to the row indices it covers,
    # so per-event shuffles can be broadcast onto the full dataframe below.
    s_id_to_index = (
        df[["sampling_event_identifier"]]
        .reset_index()
        .set_index("sampling_event_identifier")
    )
    # Shuffle locations: take one representative row per sampling event, then
    # permute the location columns between events.
    just_first = df.drop_duplicates("sampling_event_identifier").set_index(
        "sampling_event_identifier"
    )
    location_shuffle = np.random.permutation(just_first.index)
    # NOTE(review): db.LocationWrapper.columns presumably names the
    # location-related columns — confirm against aukpy.db.
    just_loc = just_first.loc[location_shuffle, list(db.LocationWrapper.columns)]

    # Broadcast the per-event shuffled locations back to every row of each event.
    merged = just_loc.join(s_id_to_index).reset_index().set_index("index").loc[df.index]
    assert len(merged) == len(df)
    df[list(db.LocationWrapper.columns)] = merged[list(db.LocationWrapper.columns)]

    # Shuffle sampling events. The first column is dropped from the shuffle —
    # assumed to be the sampling_event_identifier itself; TODO confirm.
    to_scramble = list(db.SamplingWrapper.columns)
    del to_scramble[0]

    obs_shuffle = np.random.permutation(just_first.index)
    just_obs = just_first.loc[obs_shuffle, list(to_scramble)]

    # Randomize observer ids: strip the 4-char prefix (e.g. "obsr"), remap each
    # distinct numeric id to a random value in the observed range, re-prefix.
    just_obs["observer_id"] = just_obs["observer_id"].str[4:].astype(int)
    r = just_obs["observer_id"]
    observers = r.unique()
    # NOTE(review): randint can map two observers to the same new id;
    # presumably acceptable for mock data.
    new_ids = np.random.randint(r.min(), r.max(), len(observers))
    new_observers = {o: n for o, n in zip(observers, new_ids)}
    just_obs["observer_id"] = "obsr" + r.map(new_observers).astype(str)

    # Broadcast the shuffled sampling-event fields back onto all rows.
    merged = just_obs.join(s_id_to_index).reset_index().set_index("index").loc[df.index]
    assert len(merged) == len(df)
    df[to_scramble] = merged[to_scramble]

    # Scramble species columns across individual rows (not per event).
    # Obviously this does not preserve the geographic distribution of species
    species_shuffle = np.random.permutation(df.index)

    for col in db.SpeciesWrapper.columns:
        df[col] = df.loc[species_shuffle, col].values

    # Remove comments: blank every non-null free-text comment.
    for col in ("trip_comments", "species_comments"):
        not_empty = ~df[col].isna()
        df.loc[not_empty, col] = ""

    return df
|
||
|
||
def subsample(df: pd.DataFrame, num_rows: int = 100000) -> pd.DataFrame:
    """Extract a random set of rows from the dataframe.

    Args:
        df: The dataframe to sample from. Must have at least ``num_rows`` rows.
        num_rows: The number of rows to extract.

    Returns:
        A new dataframe of ``num_rows`` distinct rows of ``df`` in random
        order, with their original index labels preserved.

    Raises:
        ValueError: If ``df`` has fewer than ``num_rows`` rows.
    """
    # `assert` is stripped under `python -O`, so validate explicitly.
    if len(df) < num_rows:
        raise ValueError(
            f"Cannot subsample {num_rows} rows from a dataframe of {len(df)} rows"
        )
    # Slicing the permutation first avoids materializing a full-size
    # reindexed frame (the original did df.loc[shuffle].iloc[:num_rows]).
    shuffled = np.random.permutation(df.index)
    return df.loc[shuffled[:num_rows]]
Oops, something went wrong.