diff --git a/nesta_ds_utils/loading_saving/S3.py b/nesta_ds_utils/loading_saving/S3.py
index 41f120d..36b5563 100644
--- a/nesta_ds_utils/loading_saving/S3.py
+++ b/nesta_ds_utils/loading_saving/S3.py
@@ -1,9 +1,9 @@
 import io
 from typing import List
-from xmlrpc.client import Boolean
 import boto3
 from fnmatch import fnmatch
 import pandas as pd
+import geopandas as gpd
 import numpy as np
 import pyarrow as pa
 import pyarrow.parquet as pq
@@ -46,9 +46,34 @@ def _df_to_fileobj(df_data: pd.DataFrame, path_to: str, **kwargs) -> io.BytesIO:
         df_data.to_csv(buffer, **kwargs)
     elif fnmatch(path_to, "*.parquet"):
         df_data.to_parquet(buffer, **kwargs)
+    elif fnmatch(path_to, "*.xlsx"):
+        df_data.to_excel(buffer, **kwargs)
+    elif fnmatch(path_to, "*.xlsm"):
+        df_data.to_excel(buffer, **kwargs)
     else:
-        raise Exception(
-            "Uploading dataframe currently supported only for 'csv' and 'parquet'."
+        raise NotImplementedError(
+            "Uploading dataframe currently supported only for 'csv', 'parquet', 'xlsx' and 'xlsm'."
         )
     buffer.seek(0)
     return buffer
+
+
+def _gdf_to_fileobj(df_data: gpd.GeoDataFrame, path_to: str, **kwargs) -> io.BytesIO:
+    """Convert GeoDataFrame into bytes file object.
+
+    Args:
+        df_data (gpd.GeoDataFrame): GeoDataFrame to convert.
+        path_to (str): Saving file name.
+
+    Returns:
+        io.BytesIO: Bytes file object.
+    """
+    buffer = io.BytesIO()
+    if fnmatch(path_to, "*.geojson"):
+        df_data.to_file(buffer, driver="GeoJSON", **kwargs)
+    else:
+        raise NotImplementedError(
+            "Uploading geodataframe currently supported only for 'geojson'."
+        )
+    buffer.seek(0)
+    return buffer
@@ -67,8 +92,30 @@ def _dict_to_fileobj(dict_data: dict, path_to: str, **kwargs) -> io.BytesIO:
     buffer = io.BytesIO()
     if fnmatch(path_to, "*.json"):
         buffer.write(json.dumps(dict_data, **kwargs).encode())
+    elif fnmatch(path_to, "*.geojson"):
+        if dict_data.get("type") in [
+            "Point",
+            "MultiPoint",
+            "LineString",
+            "MultiLineString",
+            "Polygon",
+            "MultiPolygon",
+            "GeometryCollection",
+            "Feature",
+            "FeatureCollection",
+        ]:
+            buffer.write(json.dumps(dict_data, **kwargs).encode())
+        else:
+            raise AttributeError(
+                "GeoJSONs must have a member named 'type' whose value is one of: "
+                "'Point', 'MultiPoint', 'LineString', 'MultiLineString', 'Polygon', "
+                "'MultiPolygon', 'GeometryCollection', 'Feature' or 'FeatureCollection'."
+            )
     else:
-        raise Exception("Uploading dictionary currently supported only for 'json'.")
+        raise NotImplementedError(
+            "Uploading dictionary currently supported only for 'json' and 'geojson'."
+        )
     buffer.seek(0)
     return buffer
@@ -92,7 +139,7 @@ def _list_to_fileobj(list_data: list, path_to: str, **kwargs) -> io.BytesIO:
     elif fnmatch(path_to, "*.json"):
         buffer.write(json.dumps(list_data, **kwargs).encode())
     else:
-        raise Exception(
+        raise NotImplementedError(
             "Uploading list currently supported only for 'csv', 'txt' and 'json'."
         )
     buffer.seek(0)
     return buffer
@@ -112,7 +159,9 @@ def _str_to_fileobj(str_data: str, path_to: str, **kwargs) -> io.BytesIO:
     if fnmatch(path_to, "*.txt"):
         buffer = io.BytesIO(bytes(str_data.encode("utf-8")))
     else:
-        raise Exception("Uploading string currently supported only for 'txt'.")
+        raise NotImplementedError(
+            "Uploading string currently supported only for 'txt'."
+        )
     buffer.seek(0)
     return buffer
@@ -135,7 +184,7 @@ def _np_array_to_fileobj(
     elif fnmatch(path_to, "*.parquet"):
         pq.write_table(pa.table({"data": np_array_data}), buffer, **kwargs)
     else:
-        raise Exception(
+        raise NotImplementedError(
             "Uploading numpy array currently supported only for 'csv' and 'parquet."
         )
     buffer.seek(0)
     return buffer
@@ -156,7 +205,7 @@ def _unsupp_data_to_fileobj(data: any, path_to: str, **kwargs) -> io.BytesIO:
     if fnmatch(path_to, "*.pkl"):
         pickle.dump(data, buffer, **kwargs)
     else:
-        raise Exception(
+        raise NotImplementedError(
            "This file type is not supported for this data. Use 'pkl' instead."
         )
     buffer.seek(0)
     return buffer
@@ -180,7 +229,9 @@ def upload_obj(
         kwargs_writing (dict, optional): Dictionary of kwargs for writing data.
     """
 
-    if isinstance(obj, pd.DataFrame):
+    if isinstance(obj, gpd.base.GeoPandasBase):
+        obj = _gdf_to_fileobj(obj, path_to, **kwargs_writing)
+    elif isinstance(obj, pd.DataFrame):
         obj = _df_to_fileobj(obj, path_to, **kwargs_writing)
     elif isinstance(obj, dict):
         obj = _dict_to_fileobj(obj, path_to, **kwargs_writing)
@@ -194,7 +245,7 @@ def upload_obj(
         obj = _unsupp_data_to_fileobj(obj, path_to, **kwargs_writing)
         warnings.warn(
             "Data uploaded as pickle. Please consider other accessible "
-            "file types among the suppoted ones."
+            "file types among the supported ones."
         )
 
     s3 = boto3.client("s3")
@@ -215,6 +266,26 @@ def _fileobj_to_df(fileobj: io.BytesIO, path_from: str, **kwargs) -> pd.DataFrame:
         return pd.read_csv(fileobj, **kwargs)
     elif fnmatch(path_from, "*.parquet"):
         return pd.read_parquet(fileobj, **kwargs)
+    elif fnmatch(path_from, "*.xlsx"):
+        return pd.read_excel(fileobj, **kwargs)
+    elif fnmatch(path_from, "*.xlsm"):
+        return pd.read_excel(fileobj, **kwargs)
+
+
+def _fileobj_to_gdf(fileobj: io.BytesIO, path_from: str, **kwargs) -> gpd.GeoDataFrame:
+    """Convert bytes file object into geodataframe.
+
+    Args:
+        fileobj (io.BytesIO): Bytes file object.
+        path_from (str): Path of loaded data.
+
+    Returns:
+        gpd.GeoDataFrame: Data as geodataframe.
+    """
+    if fnmatch(path_from, "*.geojson"):
+        return gpd.GeoDataFrame.from_features(
+            json.loads(fileobj.getvalue().decode())["features"]
+        )
 
 
 def _fileobj_to_dict(fileobj: io.BytesIO, path_from: str, **kwargs) -> dict:
@@ -294,41 +365,65 @@ def download_obj(
         bucket (str): Bucket's name.
         path_from (str): Path to data in S3.
         download_as (str, optional): Type of object downloading. Choose between
-            ('dataframe', 'dict', 'list', 'str', 'np.array'). Not needed for 'pkl files'.
+            ('dataframe', 'geodf', 'dict', 'list', 'str', 'np.array'). Not needed for 'pkl files'.
         kwargs_boto (dict, optional): Dictionary of kwargs for boto3 function
             'download_fileobj'.
         kwargs_reading (dict, optional): Dictionary of kwargs for reading data.
 
     Returns:
-        any: Donwloaded data.
+        any: Downloaded data.
     """
-    if not path_from.endswith(tuple([".csv", ".parquet", ".json", ".txt", ".pkl"])):
-        raise Exception(
+    if not path_from.endswith(
+        tuple(
+            [".csv", ".parquet", ".json", ".txt", ".pkl", ".geojson", ".xlsx", ".xlsm"]
+        )
+    ):
+        raise NotImplementedError(
             "This file type is not currently supported for download in memory."
         )
     s3 = boto3.client("s3")
     fileobj = io.BytesIO()
     s3.download_fileobj(bucket, path_from, fileobj, **kwargs_boto)
     fileobj.seek(0)
-    if download_as == "dataframe":
-        if path_from.endswith(tuple([".csv", ".parquet"])):
+    if not download_as:
+        if path_from.endswith(tuple([".pkl"])):
+            return pickle.load(fileobj, **kwargs_reading)
+        else:
+            raise ValueError("'download_as' is required for this file type.")
+    elif download_as == "dataframe":
+        if path_from.endswith(tuple([".csv", ".parquet", ".xlsx", ".xlsm"])):
             return _fileobj_to_df(fileobj, path_from, **kwargs_reading)
         else:
-            raise Exception(
+            raise NotImplementedError(
                 "Download as dataframe currently supported only "
-                "for 'csv' and 'parquet'."
+                "for 'csv', 'parquet', 'xlsx' and 'xlsm'."
+            )
+    elif download_as == "geodf":
+        if path_from.endswith(tuple([".geojson"])):
+            return _fileobj_to_gdf(fileobj, path_from, **kwargs_reading)
+        else:
+            raise NotImplementedError(
+                "Download as geodataframe currently supported only for 'geojson'."
             )
     elif download_as == "dict":
         if path_from.endswith(tuple([".json"])):
             return _fileobj_to_dict(fileobj, path_from, **kwargs_reading)
+        elif path_from.endswith(tuple([".geojson"])):
+            warnings.warn(
+                "Please check the geojson has a member named 'type' whose value is "
+                "one of: 'Point', 'MultiPoint', 'LineString', 'MultiLineString', "
+                "'Polygon', 'MultiPolygon', 'GeometryCollection', 'Feature' or "
+                "'FeatureCollection'. Otherwise the downloaded dictionary will not "
+                "be valid geojson."
+            )
+            return _fileobj_to_dict(fileobj, path_from, **kwargs_reading)
         else:
-            raise Exception(
-                "Download as dictionary currently supported only " "for 'json'."
+            raise NotImplementedError(
+                "Download as dictionary currently supported only "
+                "for 'json' and 'geojson'."
             )
     elif download_as == "list":
         if path_from.endswith(tuple([".csv", ".txt", ".json"])):
             return _fileobj_to_list(fileobj, path_from, **kwargs_reading)
         else:
-            raise Exception(
+            raise NotImplementedError(
                 "Download as list currently supported only "
                 "for 'csv', 'txt' and 'json'."
             )
@@ -336,23 +431,20 @@ def download_obj(
     if path_from.endswith(tuple([".txt"])):
         return _fileobj_to_str(fileobj)
     else:
-        raise Exception("Download as string currently supported only " "for 'txt'.")
+        raise NotImplementedError(
+            "Download as string currently supported only for 'txt'."
+        )
     elif download_as == "np.array":
         if path_from.endswith(tuple([".csv", ".parquet"])):
             return _fileobj_to_np_array(fileobj, path_from, **kwargs_reading)
         else:
-            raise Exception(
+            raise NotImplementedError(
                 "Download as numpy array currently supported only "
                 "for 'csv' and 'parquet'."
             )
-    elif not download_as:
-        if path_from.endswith(tuple([".pkl"])):
-            return pickle.load(fileobj, **kwargs_reading)
-        else:
-            raise Exception("'download_as' is required for this file type.")
     else:
-        raise Exception(
-            "'download_as' not provided. Choose between ('dataframe', "
+        raise ValueError(
+            "'download_as' not provided. Choose between ('dataframe', 'geodf', "
             "'dict', 'list', 'str', 'np.array'). Not needed for 'pkl files'.'"
         )
diff --git a/nesta_ds_utils/loading_saving/file_ops.py b/nesta_ds_utils/loading_saving/file_ops.py
index aa85f6f..c8e64bd 100644
--- a/nesta_ds_utils/loading_saving/file_ops.py
+++ b/nesta_ds_utils/loading_saving/file_ops.py
@@ -1,9 +1,7 @@
 from typing import Union
 from pathlib import Path
-from xmlrpc.client import Boolean
 import zipfile
 import os
-from fnmatch import fnmatch
 
 
 def _convert_str_to_pathlib_path(path: Union[Path, str]) -> Path:
@@ -32,7 +30,7 @@ def make_path_if_not_exist(path: Union[Path, str]):
 def extractall(
     zip_path: Union[Path, str],
     out_path: Union[Path, str] = None,
-    delete_zip: Boolean = True,
+    delete_zip: bool = True,
 ):
     """Takes path to zipped file and extracts it to specified output path.
 
diff --git a/setup.cfg b/setup.cfg
index a836cdd..f6ed202 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -20,6 +20,10 @@ install_requires =
 [options.extras_require]
 s3 =
     boto3==1.24.93
+gis =
+    geopandas==0.13.2
+io_extras =
+    openpyxl==3.0.9
 viz =
     altair==4.2.0
     altair-saver==0.5.0
@@ -34,6 +38,8 @@ test =
     pytest==7.1.3
     moto[s3]==4.0.7
     %(s3)s
+    %(gis)s
+    %(io_extras)s
     %(viz)s
     %(networks)s
     %(nlp)s
@@ -51,11 +57,15 @@ dev =
     pre-commit-hooks==4.3.0
     black==22.10.0
     %(s3)s
+    %(gis)s
+    %(io_extras)s
     %(viz)s
     %(networks)s
     %(nlp)s
 all =
     %(s3)s
+    %(gis)s
+    %(io_extras)s
     %(viz)s
     %(networks)s
     %(nlp)s
diff --git a/tests/artifacts/dummy_dataframe.geojson b/tests/artifacts/dummy_dataframe.geojson
new file mode 100644
index 0000000..090e651
--- /dev/null
+++ b/tests/artifacts/dummy_dataframe.geojson
@@ -0,0 +1,7 @@
+{
+"type": "FeatureCollection",
+"features": [
+{ "type": "Feature", "properties": { "col1": "name1" }, "geometry": { "type": "Point", "coordinates": [ 1.0, 2.0 ] } },
+{ "type": "Feature", "properties": { "col1": "name2" }, "geometry": { "type": "Point", "coordinates": [ 2.0, 1.0 ] } }
+]
+}
diff --git a/tests/artifacts/dummy_dataframe.xlsm b/tests/artifacts/dummy_dataframe.xlsm
new file mode 100644
index 0000000..eddc527
Binary files /dev/null and b/tests/artifacts/dummy_dataframe.xlsm differ
diff --git a/tests/artifacts/dummy_dataframe.xlsx b/tests/artifacts/dummy_dataframe.xlsx
new file mode 100644
index 0000000..07eb141
Binary files /dev/null and b/tests/artifacts/dummy_dataframe.xlsx differ
diff --git a/tests/artifacts/dummy_dict.geojson b/tests/artifacts/dummy_dict.geojson
new file mode 100644
index 0000000..3f06398
--- /dev/null
+++ b/tests/artifacts/dummy_dict.geojson
@@ -0,0 +1 @@
+{"type": "FeatureCollection", "features": [{"id": "0", "type": "Feature", "properties": {"test": "name1"}, "geometry": {"type": "Point", "coordinates": [0, 0]}}], "crs": {"type": "name", "properties": {"name": "urn:ogc:def:crs:EPSG::3857"}}}
\ No newline at end of file
diff --git a/tests/loading_saving/test_S3.py b/tests/loading_saving/test_S3.py
index fc80a5d..90b4c6a 100644
--- a/tests/loading_saving/test_S3.py
+++ b/tests/loading_saving/test_S3.py
@@ -8,8 +8,17 @@ from moto import mock_s3
 import io
 
 from nesta_ds_utils.loading_saving import S3
+from shapely.geometry import Point
+import geopandas as gpd
 
+TEST_GEODATAFRAME = gpd.GeoDataFrame(
+    {"col1": ["name1", "name2"], "geometry": [Point(1, 2), Point(2, 1)]}
+)
 TEST_DATAFRAME = pd.DataFrame({"test": [0, 0]})
+TEST_DICT_GEO = {
+    "type": "FeatureCollection",
+    "features": [
+        {
+            "id": "0",
+            "type": "Feature",
+            "properties": {"test": "name1"},
+            "geometry": {"type": "Point", "coordinates": [0, 0]},
+        }
+    ],
+    "crs": {"type": "name", "properties": {"name": "urn:ogc:def:crs:EPSG::3857"}},
+}
 TEST_DICT = {"test": [0, 0]}
 TEST_LIST = [0, "test"]
"test"] TEST_STR = "test" @@ -36,6 +45,35 @@ def test_upload_obj_dataframe_csv(): mock_client = boto3.client("s3") S3.upload_obj(TEST_DATAFRAME, "test-bucket", "dummy.csv") +@mock_s3 +def test_upload_obj_dataframe_xlsx(): + """Tests that upload_obj does not return an exeption + uploading dataframe as xlsx. + """ + conn = boto3.resource("s3", region_name="us-east-1") + conn.create_bucket(Bucket="test-bucket") + mock_client = boto3.client("s3") + S3.upload_obj(TEST_DATAFRAME, "test-bucket", "dummy.xlsx") + +@mock_s3 +def test_upload_obj_dataframe_xlsm(): + """Tests that upload_obj does not return an exeption + uploading dataframe as xlsm. + """ + conn = boto3.resource("s3", region_name="us-east-1") + conn.create_bucket(Bucket="test-bucket") + mock_client = boto3.client("s3") + S3.upload_obj(TEST_DATAFRAME, "test-bucket", "dummy.xlsm") + +@mock_s3 +def test_upload_obj_dataframe_geojson(): + """Tests that upload_obj does not return an exeption + uploading dataframe as geojson. + """ + conn = boto3.resource("s3", region_name="us-east-1") + conn.create_bucket(Bucket="test-bucket") + mock_client = boto3.client("s3") + S3.upload_obj(TEST_GEODATAFRAME, "test-bucket", "dummy.geojson") @mock_s3 def test_upload_obj_dataframe_parquet(): @@ -56,6 +94,14 @@ def test_upload_obj_dict_json(): mock_client = boto3.client("s3") S3.upload_obj(TEST_DICT, "test-bucket", "dummy.json") +@mock_s3 +def test_upload_obj_dict_json(): + """Tests that upload_obj does not return an exeption.""" + conn = boto3.resource("s3", region_name="us-east-1") + conn.create_bucket(Bucket="test-bucket") + mock_client = boto3.client("s3") + S3.upload_obj(TEST_DICT_GEO, "test-bucket", "dummy.geojson") + @mock_s3 def test_upload_obj_list_csv(): @@ -135,7 +181,7 @@ def test_upload_obj_unsup_data(): @mock_s3 -def test_dowload_obj_dataframe_csv(): +def test_download_obj_dataframe_csv(): """Tests that download_obj returns the correct dataframe from csv file. """ @@ -150,9 +196,56 @@ def test_dowload_obj_dataframe_csv(): == 0 ) +@mock_s3 +def test_download_obj_dataframe_xlsx(): + """Tests that download_obj returns the correct dataframe + from xlsx file. + """ + conn = boto3.resource("s3", region_name="us-east-1") + conn.create_bucket(Bucket="test-bucket") + mock_client = boto3.client("s3") + mock_client.upload_file( + "tests/artifacts/dummy_dataframe.xlsx", "test-bucket", "dummy.xlsx" + ) + assert ( + S3.download_obj("test-bucket", "dummy.xlsx", download_as="dataframe").test[0] + == 0 + ) + +@mock_s3 +def test_download_obj_dataframe_xlsm(): + """Tests that download_obj returns the correct dataframe + from xlsm file. + """ + conn = boto3.resource("s3", region_name="us-east-1") + conn.create_bucket(Bucket="test-bucket") + mock_client = boto3.client("s3") + mock_client.upload_file( + "tests/artifacts/dummy_dataframe.xlsm", "test-bucket", "dummy.xlsm" + ) + assert ( + S3.download_obj("test-bucket", "dummy.xlsm", download_as="dataframe").test[0] + == 0 + ) + +@mock_s3 +def test_download_obj_dataframe_geojson(): + """Tests that download_obj returns the correct dataframe + from geojson file. 
+ """ + conn = boto3.resource("s3", region_name="us-east-1") + conn.create_bucket(Bucket="test-bucket") + mock_client = boto3.client("s3") + mock_client.upload_file( + "tests/artifacts/dummy_dataframe.geojson", "test-bucket", "dummy.geojson" + ) + assert ( + S3.download_obj("test-bucket", "dummy.geojson", download_as="geodf").geometry[0] + == Point(1, 2) + ) @mock_s3 -def test_dowload_obj_dataframe_parquet(): +def test_download_obj_dataframe_parquet(): """Tests that download_obj returns the correct dataframe from parquet file. """ @@ -169,7 +262,7 @@ def test_dowload_obj_dataframe_parquet(): @mock_s3 -def test_dowload_obj_dict_json(): +def test_download_obj_dict_json(): """Tests that download_obj returns the correct dictionary from json file. """ @@ -183,9 +276,23 @@ def test_dowload_obj_dict_json(): S3.download_obj("test-bucket", "dummy.json", download_as="dict")["test"][0] == 0 ) +@mock_s3 +def test_download_obj_dict_geojson(): + """Tests that download_obj returns the correct dictionary + from json file. + """ + conn = boto3.resource("s3", region_name="us-east-1") + conn.create_bucket(Bucket="test-bucket") + mock_client = boto3.client("s3") + mock_client.upload_file( + "tests/artifacts/dummy_dict.geojson", "test-bucket", "dummy.geojson" + ) + assert ( + S3.download_obj("test-bucket", "dummy.geojson", download_as="dict")["type"]=="FeatureCollection" + ) @mock_s3 -def test_dowload_obj_list_csv(): +def test_download_obj_list_csv(): """Tests that download_obj returns the correct list from csv file. """ @@ -199,7 +306,7 @@ def test_dowload_obj_list_csv(): @mock_s3 -def test_dowload_obj_list_txt(): +def test_download_obj_list_txt(): """Tests that download_obj returns the correct list from txt file. """ @@ -213,7 +320,7 @@ def test_dowload_obj_list_txt(): @mock_s3 -def test_dowload_obj_list_json(): +def test_download_obj_list_json(): """Tests that download_obj returns the correct dataframe from json file. """ @@ -230,7 +337,7 @@ def test_dowload_obj_list_json(): @mock_s3 -def test_dowload_obj_str_txt(): +def test_download_obj_str_txt(): """Tests that download_obj returns the correct string from txt file. """ @@ -242,7 +349,7 @@ def test_dowload_obj_str_txt(): @mock_s3 -def test_dowload_obj_array_csv(): +def test_download_obj_array_csv(): """Tests that download_obj returns the correct numpy array from csv file. """ @@ -256,7 +363,7 @@ def test_dowload_obj_array_csv(): @mock_s3 -def test_dowload_obj_array_parquet(): +def test_download_obj_array_parquet(): """Tests that download_obj returns the correct numpy array from parquet file. """ @@ -272,7 +379,7 @@ def test_dowload_obj_array_parquet(): @mock_s3 -def test_dowload_obj_unsup_data(): +def test_download_obj_unsup_data(): """Tests that download_obj returns the correct integer from pkl file. """ @@ -286,7 +393,7 @@ def test_dowload_obj_unsup_data(): @mock_s3 -def test_dowload_obj_exeption(): +def test_download_obj_exeption(): """Tests that download_obj returns an exception for unsupported file type.""" conn = boto3.resource("s3", region_name="us-east-1") conn.create_bucket(Bucket="test-bucket")