diff --git a/README.md b/README.md index b67d93a..ea34ef7 100644 --- a/README.md +++ b/README.md @@ -73,3 +73,34 @@ Name: greek, dtype: int64 'description': 'A column with numbers.'}], 'facets': ['letter', 'greek']} ``` + + +## Create an API with `datasetter` and `fastapi` + +Datasetter comes with a wrapper to simply create an API based on your datasets. +Based on the code above, just add : + +```python +>>> from fastapi import FastAPI +>>> from datasetter.api import add_dataset + +>>> app = FastAPI() +>>> add_dataset(app, 'random-letters', dataset) +``` + +Then you have a `fastAPI` application that you can run with `uvicorn` : + +```bash +$ uvicorn main:app --host 0.0.0.0 --port 8000 +``` + +or + +```python +>>> import uvicorn +>>> uvicorn.run( +>>> app, +>>> host="0.0.0.0", +>>> port=8000, +>>> ) +``` \ No newline at end of file diff --git a/datasetter/api.py b/datasetter/api.py new file mode 100644 index 0000000..afaf4ec --- /dev/null +++ b/datasetter/api.py @@ -0,0 +1,84 @@ +import forge +from fastapi import HTTPException +from typing import Optional +import pandas as pd +import json +from datasetter.dataset import FacetUnavailableError + + +def as_json(doc): + return json.loads( + pd.Series([doc]) + .to_json(orient='records', date_format='iso') + )[0] + + +def add_dataset(fast_api, uri, dataset): + """Create FastAPI endpoints to serve the dataset. + + Parameters + ---------- + fast_api : fastapi.applications.FastAPI + A FastAPI application object. A new endpoint will be added in this application. + uri : str + The relative uri where the endpoint will be added. + dataset : datasetter.dataset.Dataset + The dataset object to be served. + + Returns + ------- + Nothing : The endpoint is created as a side effect (inplace) in the `fast_api` application. 
+ """ + uri = '/' + uri.strip('/') + facet = forge.kwarg('facet', type=str) + rows = forge.kwarg('rows', default=10, type=Optional[int]) + skip = forge.kwarg('skip', default=0, type=Optional[int]) + kwargs = [forge.kwarg(facet, default=None, type=Optional[str]) + for facet in dataset.facets] + + @fast_api.get(uri + "/") + def get_metadata(): + return as_json(dataset.metadata()) + + @fast_api.get(uri + "/count") + @forge.sign(*kwargs) + def count(**kwargs): + filters = {key: val for key, val in kwargs.items() if val is not None} + count = dataset.count(**filters) + return as_json({ + "count": int(count), + "filters": filters, + }) + + @fast_api.get(uri + "/count-by/{facet}") + @forge.sign(facet, rows, skip, *kwargs) + def count_by(facet, rows=10, skip=0, **kwargs): + filters = {key: val for key, val in kwargs.items() if val is not None} + try: + result = dataset.count_by(facet, rows=rows, skip=skip, **filters) + except FacetUnavailableError: + raise HTTPException(status_code=404, + detail="FacetUnavailableError: no facet {}".format(facet)) + return as_json({ + "facet": facet, + # "count": len(result), # TODO : add "nunique" feature in count_by schema + "rows": len(result), + "skip": skip, + "filters": filters, + "data": [{"value": str(key), "count": int(val)} for key, val in result.items()], + }) + + @fast_api.get(uri + "/sample") + @forge.sign(rows, skip, *kwargs) + def sample(rows=10, skip=0, **kwargs): + filters = {key: val for key, val in kwargs.items() if val is not None} + result = dataset.sample(rows=rows, skip=skip, **filters) + count = dataset.count(**filters) + return as_json({ + # "facet": facet, + "count": count, + "rows": len(result), + "skip": skip, + "filters": filters, + "data": result.to_dict(orient='records'), + }) diff --git a/datasetter/dataset.py b/datasetter/dataset.py index cab5750..2a0d14c 100644 --- a/datasetter/dataset.py +++ b/datasetter/dataset.py @@ -1,16 +1,4 @@ # Dataset : base class for datasetter - -import pandas as pd -import json - 
- -def as_json(doc): - return json.loads( - pd.Series([doc]) - .to_json(orient='records', date_format='iso') - )[0] - - class FacetUnavailableError(Exception): """This class is used to raise exceptions due to unavailable facets.""" pass diff --git a/datasetter/utils.py b/datasetter/utils.py new file mode 100644 index 0000000..518fc12 --- /dev/null +++ b/datasetter/utils.py @@ -0,0 +1,16 @@ +def doc_equal(doc1, doc2): + """Test if two JSON-like documents are equal.""" + try: + if isinstance(doc1, dict): + assert isinstance(doc2, dict) + for key in doc1.keys(): + assert key in doc2 and doc_equal(doc1[key], doc2[key]) + elif isinstance(doc1, list): + assert isinstance(doc2, list) and len(doc1) == len(doc2) + for x, y in zip(doc1, doc2): + assert doc_equal(x, y) + else: + assert doc1 == doc2 + return True + except AssertionError: + return False diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/test/test_api.py b/test/test_api.py new file mode 100644 index 0000000..46b33b2 --- /dev/null +++ b/test/test_api.py @@ -0,0 +1,160 @@ +# import pytest +import numpy as np +import pandas as pd +from fastapi import FastAPI +from fastapi.testclient import TestClient + +from datasetter.utils import doc_equal +from datasetter.api import as_json, add_dataset +from datasetter.pandas_dataset import PandasDataset + + +df = pd.DataFrame([ + ['A', 'alpha', 1], + ['A', 'beta', 13], + ['A', 'gamma', 8], + ['B', 'alpha', 1], + ['B', 'beta', 31], + ['C', 'gamma', 9], + ['C', 'alpha', 2], + ['D', 'beta', 21], + ['D', 'gamma', 0], + ], columns=['letter', 'greek', 'number']) + +metadata = { + "description": "A simple dataset to make tests.", + "facets": ['letter', 'greek'], + "name": "Random letters", + "columns": [ + {"name": "letter", "type": "string", "description": "A column with letters."}, + {"name": "greek", "type": "string", "description": "A column with greek letters."}, + {"name": "number", "type": "integer", "description": "A 
column with numbers."}, + ]} + +dataset = PandasDataset(df, **metadata) + +app = FastAPI() +add_dataset(app, 'pandas-dataset', dataset) +# Launch app from parent directory with: +# > uvicorn test.test_api:app --host 0.0.0.0 --port 8000 --reload + +client = TestClient(app) + + +def test_as_json(): + assert as_json(None) is None + assert as_json(pd.NaT) is None + assert as_json(np.NaN) is None + assert as_json({}) == {} + assert isinstance(as_json(pd.Timestamp.utcnow()), str) + + +def test_metadata(): + r = client.get('/pandas-dataset/') + assert r.status_code == 200 + assert doc_equal(r.json(), metadata) + + +def test_count(): + r = client.get('/pandas-dataset/count') + assert r.status_code == 200 + assert doc_equal( + r.json(), + {'count': 9, 'filters': {}}) + + r = client.get('/pandas-dataset/count?letter=A') + assert r.status_code == 200 + assert doc_equal( + r.json(), + {'count': 3, 'filters': {'letter': 'A'}}) + + r = client.get('/pandas-dataset/count', + params=dict(letter='A', greek='alpha')) + assert r.status_code == 200 + assert doc_equal( + r.json(), + {'count': 1, 'filters': {'letter': 'A', 'greek': 'alpha'}}) + + +def test_count_by(): + r = client.get('/pandas-dataset/count-by/letter') + assert r.status_code == 200 + assert doc_equal( + r.json(), + {'facet': 'letter', + 'rows': 4, + 'skip': 0, + 'filters': {}, + 'data': [ + {'value': 'A', 'count': 3}, + {'value': 'B', 'count': 2}, + {'value': 'C', 'count': 2}, + {'value': 'D', 'count': 2}]}) + + r = client.get('/pandas-dataset/count-by/greek') + assert r.status_code == 200 + assert doc_equal( + r.json(), + {'facet': 'greek', + 'rows': 3, + 'skip': 0, + 'filters': {}, + 'data': [ + {'value': 'alpha', 'count': 3}, + {'value': 'beta', 'count': 3}, + {'value': 'gamma', 'count': 3}]}) + + r = client.get('/pandas-dataset/count-by/foo') + assert r.status_code == 404 + assert doc_equal( + r.json(), + {'detail': 'FacetUnavailableError: no facet foo'}) + + r = client.get('/pandas-dataset/count-by/number') + assert 
r.status_code == 404 + assert doc_equal( + r.json(), + {'detail': 'FacetUnavailableError: no facet number'}) + + +def test_sample(): + r = client.get('/pandas-dataset/sample') + assert r.status_code == 200 + assert doc_equal( + r.json(), + {'count': min(10, dataset.count()), + 'rows': min(10, dataset.count()), + 'skip': 0, + 'filters': {}, + 'data': df.to_dict(orient='records')}) + + r = client.get('/pandas-dataset/sample?rows=3') + assert r.status_code == 200 + assert doc_equal( + r.json(), + {'count': min(10, dataset.count()), + 'rows': min(3, dataset.count()), + 'skip': 0, + 'filters': {}, + 'data': df.head(3).to_dict(orient='records')}) + + r = client.get('/pandas-dataset/sample?letter=A') + assert r.status_code == 200 + assert doc_equal( + r.json(), + {'count': min(10, dataset.count(letter="A")), + 'rows': min(3, dataset.count(letter="A")), + 'skip': 0, + 'filters': {'letter': 'A'}, + 'data': dataset.sample(letter='A').to_dict(orient='records')}) + + r = client.get('/pandas-dataset/sample?letter=A&rows=2') + assert r.status_code == 200 + assert doc_equal( + r.json(), + {'count': dataset.count(letter="A"), + 'rows': min(2, dataset.count(letter="A")), + 'skip': 0, + 'filters': {'letter': 'A'}, + 'data': (dataset.sample(letter='A').head(2) + .to_dict(orient='records'))}) diff --git a/test/test_dataset.py b/test/test_dataset.py index 9a83317..eac53fb 100644 --- a/test/test_dataset.py +++ b/test/test_dataset.py @@ -1,16 +1,6 @@ import pytest -import numpy as np -import pandas as pd -from datasetter.dataset import Dataset, as_json - - -def test_as_json(): - assert as_json(None) is None - assert as_json(pd.NaT) is None - assert as_json(np.NaN) is None - assert as_json({}) == {} - assert isinstance(as_json(pd.Timestamp.utcnow()), str) +from datasetter.dataset import Dataset def test_dataframe(): diff --git a/test/test_pandas_dataset.py b/test/test_pandas_dataset.py index bbf2394..50b591d 100644 --- a/test/test_pandas_dataset.py +++ b/test/test_pandas_dataset.py @@ 
-67,4 +67,3 @@ def test_sample(): with pytest.raises(FacetUnavailableError): dataset.sample(number=12) - assert True diff --git a/tox.ini b/tox.ini index b96937e..2d35aa4 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,7 @@ [flake8] max-line-length = 100 +exclude = .git,__pycache__,datasetter.egg-info,.eggs,.tox,.ipynb_checkpoints + [tox] envlist = py38,py39