diff --git a/operations/data/.coveragerc b/operations/data/.coveragerc
deleted file mode 100644
index e86d09d1a2..0000000000
--- a/operations/data/.coveragerc
+++ /dev/null
@@ -1,13 +0,0 @@
-[run]
-source =
-    dffml_operations_data
-    tests
-branch = True
-
-[report]
-exclude_lines =
-    no cov
-    no qa
-    noqa
-    pragma: no cover
-    if __name__ == .__main__.:
diff --git a/operations/data/.gitignore b/operations/data/.gitignore
deleted file mode 100644
index 070ee81c83..0000000000
--- a/operations/data/.gitignore
+++ /dev/null
@@ -1,20 +0,0 @@
-*.log
-*.pyc
-.cache/
-.coverage
-.idea/
-.vscode/
-*.egg-info/
-build/
-dist/
-docs/build/
-venv/
-wheelhouse/
-*.egss
-.mypy_cache/
-*.swp
-.venv/
-.eggs/
-*.modeldir
-*.db
-htmlcov/
diff --git a/operations/data/Dockerfile b/operations/data/Dockerfile
deleted file mode 100644
index b7e990ac99..0000000000
--- a/operations/data/Dockerfile
+++ /dev/null
@@ -1,28 +0,0 @@
-# Usage
-# docker build -t gitpod/dffml_operations_data .
-# docker run --rm -ti -p 80:8080 gitpod/dffml_operations_data -insecure -log debug
-#
-# curl -v http://127.0.0.1:80/list/sources
-FROM ubuntu:22.04@sha256:6042500cf4b44023ea1894effe7890666b0c5c7871ed83a97c36c76ae560bb9b
-
-RUN apt-get update && \
-    apt-get install -y \
-        gcc \
-        python3-dev \
-        python3-pip \
-        python3 \
-        ca-certificates && \
-    python3 -m pip install -U pip && \
-    python3 -m pip install dffml-service-http && \
-    apt-get purge -y \
-        gcc \
-        python3-dev && \
-    rm -rf /var/lib/apt/lists/*
-
-WORKDIR /usr/src/app
-COPY . /usr/src/app
-
-RUN python3 -m pip install -e .
-
-ENTRYPOINT ["python3", "-m", "dffml", "service", "http", "server", "-addr", "0.0.0.0"]
-CMD ["-mc-config", "dffml_operations_data/deploy"]
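The Dockerfile header exercises the deployed service with `curl`. For reference, the same smoke test can be done from Python; this is a minimal sketch assuming the container is running with port 80 published as in the comments above (the `/list/sources` route comes from those comments, not from anything else in this diff):

```python
# Hedged sketch: query the dffml-service-http container started as shown
# in the Dockerfile header. Assumes the service is reachable on port 80.
import json
import urllib.request

with urllib.request.urlopen("http://127.0.0.1:80/list/sources") as response:
    # The HTTP service answers with JSON describing what it has registered
    print(json.loads(response.read()))
```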
diff --git a/operations/data/LICENSE b/operations/data/LICENSE
deleted file mode 100644
index 91bb615394..0000000000
--- a/operations/data/LICENSE
+++ /dev/null
@@ -1,21 +0,0 @@
-Copyright (c) 2019 Intel, Sudhanshu
-
-MIT License
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
diff --git a/operations/data/MANIFEST.in b/operations/data/MANIFEST.in
deleted file mode 100644
index 19f3196490..0000000000
--- a/operations/data/MANIFEST.in
+++ /dev/null
@@ -1,3 +0,0 @@
-include README.md
-include LICENSE
-include setup_common.py
diff --git a/operations/data/README.md b/operations/data/README.md
deleted file mode 100644
index f90edac6a5..0000000000
--- a/operations/data/README.md
+++ /dev/null
@@ -1,14 +0,0 @@
-# DFFML dffml-operations-data Operations
-
-Data preprocessing operations for DFFML: decomposition, imputation, encoding and scaling.
-
-## Usage
-
-Example usage
-
-```console
-```
-
-## License
-
-DFFML dffml-operations-data is distributed under the [MIT License](LICENSE).
diff --git a/operations/data/dffml_operations_data/__init__.py b/operations/data/dffml_operations_data/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/operations/data/dffml_operations_data/definitions.py b/operations/data/dffml_operations_data/definitions.py
deleted file mode 100644
index 2d32697f93..0000000000
--- a/operations/data/dffml_operations_data/definitions.py
+++ /dev/null
@@ -1,16 +0,0 @@
-import sys
-from dffml.df.types import Definition
-
-definitions = [
-    Definition(name="input_data", primitive="List[List[int]]"),
-    Definition(name="output_data", primitive="List[List[int]]"),
-    Definition(name="n_components", primitive="int"),
-    Definition(name="n_iter", primitive="int"),
-    Definition(name="random_state", primitive="int"),
-    Definition(name="missing_values", primitive="Any"),
-    Definition(name="strategy", primitive="str"),
-    Definition(name="categories", primitive="List[List[Any]]"),
-]
-
-for definition in definitions:
-    setattr(sys.modules[__name__], definition.name, definition)
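The `setattr` loop at the bottom of definitions.py publishes every `Definition` in the list as a module-level attribute, which is what lets operations.py write `from .definitions import input_data, ...` without declaring each name twice. A minimal sketch of the effect, assuming the package is installed:

```python
# Each Definition in definitions.py becomes an importable module attribute
# via setattr(sys.modules[__name__], ...), so operations can refer to the
# shared definitions by name instead of re-declaring them.
from dffml_operations_data.definitions import input_data, n_components

print(input_data.name)         # input_data
print(input_data.primitive)    # List[List[int]]
print(n_components.primitive)  # int
```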
diff --git a/operations/data/dffml_operations_data/operations.py b/operations/data/dffml_operations_data/operations.py
deleted file mode 100644
index 1d0cede0d5..0000000000
--- a/operations/data/dffml_operations_data/operations.py
+++ /dev/null
@@ -1,213 +0,0 @@
-import numpy as np
-from sklearn.decomposition import PCA, TruncatedSVD
-from sklearn.preprocessing import OneHotEncoder, StandardScaler
-from sklearn.impute import SimpleImputer
-
-from dffml.df.base import op
-
-from .definitions import (
-    n_iter,
-    strategy,
-    input_data,
-    categories,
-    output_data,
-    random_state,
-    n_components,
-    missing_values,
-)
-
-
-@op(
-    inputs={"data": input_data, "n_components": n_components},
-    outputs={"result": output_data},
-)
-async def principal_component_analysis(
-    data, n_components=None,
-):
-    """
-    Decomposes the data into (n_samples, n_components)
-    using the PCA method.
-
-    Parameters
-    ----------
-    data : List[List[int]]
-        data to be decomposed.
-
-    n_components : int
-        number of columns the data should have after decomposition.
-
-    Returns
-    -------
-    result: Data having dimensions (n_samples, n_components)
-    """
-    pca = PCA(n_components=n_components)
-    new_data = pca.fit_transform(data)
-    return {"result": new_data}
-
-
-@op(
-    inputs={
-        "data": input_data,
-        "n_components": n_components,
-        "n_iter": n_iter,
-        "random_state": random_state,
-    },
-    outputs={"result": output_data},
-)
-async def singular_value_decomposition(
-    data, n_components=2, n_iter=5, random_state=None,
-):
-    """
-    Decomposes the data into (n_samples, n_components)
-    using the truncated SVD method.
-
-    Parameters
-    ----------
-    data : List[List[int]]
-        data to be decomposed.
-
-    n_components : int
-        number of columns the data should have after decomposition.
-
-    Returns
-    -------
-    result: Data having dimensions (n_samples, n_components)
-    """
-    svd = TruncatedSVD(
-        n_components=n_components, n_iter=n_iter, random_state=random_state
-    )
-    new_data = svd.fit_transform(data)
-    return {"result": new_data}
-
-
-@op(
-    inputs={
-        "data": input_data,
-        "missing_values": missing_values,
-        "strategy": strategy,
-    },
-    outputs={"result": output_data},
-)
-async def simple_imputer(data, missing_values=np.nan, strategy="mean"):
-    """
-    Imputation method for missing values.
-
-    Parameters
-    ----------
-    data : List[List[int]]
-        data in which missing values are present
-
-    missing_values : str, int, float or None, default = np.nan
-        The placeholder for values treated as missing
-
-    strategy : str, one of "mean", "median", "constant", "most_frequent", default = "mean"
-        The imputation strategy to apply
-
-    Returns
-    -------
-    result: Dataset having missing values imputed with the strategy
-    """
-    if missing_values is not None and not isinstance(missing_values, (str, int, float)):
-        raise Exception(
-            f"missing_values should be one of: str, float, int, None, np.nan, got {missing_values}"
-        )
-
-    if strategy not in ("mean", "median", "constant", "most_frequent"):
-        raise Exception(
-            f"strategy should be one of: mean, median, constant, most_frequent, got {strategy}"
-        )
-
-    imp = SimpleImputer(missing_values=missing_values, strategy=strategy)
-    new_data = imp.fit_transform(data)
-    return {"result": new_data}
-
-
-@op(
-    inputs={"data": input_data, "categories": categories},
-    outputs={"result": output_data},
-)
-async def one_hot_encoder(data, categories):
-    """
-    One hot encoding for categorical data columns.
-
-    Parameters
-    ----------
-    data : List[List[int]]
-        data to be encoded.
-
-    categories : List[List[str]]
-        Categorical values which need to be encoded
-
-    Returns
-    -------
-    result: Encoded data for categorical values
-    """
-    enc = OneHotEncoder(categories=categories)
-    enc.fit(data)
-    new_data = enc.transform(data).toarray()
-    return {"result": new_data}
-
-
-@op(inputs={"data": input_data}, outputs={"result": output_data})
-async def standard_scaler(data):
-    """
-    Standardize features by removing the mean and
-    scaling to unit variance.
-
-    Parameters
-    ----------
-    data: List[List[int]]
-        data that needs to be standardized
-
-    Returns
-    -------
-    result: Standardized data
-    """
-    scaler = StandardScaler()
-    new_data = scaler.fit_transform(data)
-    return {"result": new_data.tolist()}
-
-
-@op(
-    inputs={"data": input_data}, outputs={"result": output_data},
-)
-async def remove_whitespaces(data):
-    """
-    Removes leading and trailing whitespace from entries in the dataset.
-
-    Parameters
-    ----------
-    data : List[List[int]]
-        dataset.
-
-    Returns
-    -------
-    result: dataset having whitespace removed
-    """
-    new_data = np.char.strip(data)
-    return {"result": new_data}
-
-
-@op(
-    inputs={"data": input_data}, outputs={"result": output_data},
-)
-async def ordinal_encoder(data):
-    """
-    One hot encoding for categorical data columns.
-    Despite its name, this operation applies OneHotEncoder
-    with the categories inferred from the data, rather than
-    passed in as with one_hot_encoder.
-
-    Parameters
-    ----------
-    data : List[List[int]]
-        data to be encoded.
-
-    Returns
-    -------
-    result: Encoded data for categorical values
-    """
-    enc = OneHotEncoder()
-    enc.fit(data)
-    new_data = enc.transform(data).toarray()
-    return {"result": new_data}
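These operations are normally wired into a DataFlow (see the tests below), but since `@op` attaches metadata to the decorated function rather than hiding it, each one should also be awaitable directly. A minimal sketch, assuming a direct call behaves like the underlying coroutine; the values mirror the `test_simple_imputer` case further down:

```python
# Hedged sketch: call simple_imputer directly as a coroutine, outside of
# any orchestrator, to check its behavior in isolation.
import asyncio

import numpy as np

from dffml_operations_data.operations import simple_imputer


async def main():
    result = await simple_imputer(
        data=[[np.nan, 2], [6, np.nan], [7, 6]],
        missing_values=np.nan,
        strategy="mean",
    )
    # Column means fill the gaps: [[6.5, 2], [6, 4], [7, 6]]
    print(result["result"])


asyncio.run(main())
```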
diff --git a/operations/data/dffml_operations_data/version.py b/operations/data/dffml_operations_data/version.py
deleted file mode 100644
index 901e5110b2..0000000000
--- a/operations/data/dffml_operations_data/version.py
+++ /dev/null
@@ -1 +0,0 @@
-VERSION = "0.0.1"
diff --git a/operations/data/entry_points.txt b/operations/data/entry_points.txt
deleted file mode 100644
index 1027d8d62f..0000000000
--- a/operations/data/entry_points.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-[dffml.operation]
-principal_component_analysis = dffml_operations_data.operations:principal_component_analysis
-singular_value_decomposition = dffml_operations_data.operations:singular_value_decomposition
-simple_imputer = dffml_operations_data.operations:simple_imputer
-one_hot_encoder = dffml_operations_data.operations:one_hot_encoder
-standard_scaler = dffml_operations_data.operations:standard_scaler
-remove_whitespaces = dffml_operations_data.operations:remove_whitespaces
-ordinal_encoder = dffml_operations_data.operations:ordinal_encoder
\ No newline at end of file
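entry_points.txt registers each operation under the `dffml.operation` entry point group (it is wired into the package by `entry_points = file: entry_points.txt` in setup.cfg below). A minimal sketch of discovering the registered operations with the standard library, assuming the Python 3.10+ `importlib.metadata` keyword API:

```python
# List and load everything registered under the dffml.operation group.
from importlib.metadata import entry_points

for ep in entry_points(group="dffml.operation"):
    print(ep.name, "->", ep.value)
    operation = ep.load()  # the decorated function, e.g. standard_scaler
```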
diff --git a/operations/data/pyproject.toml b/operations/data/pyproject.toml
deleted file mode 100644
index 2630964232..0000000000
--- a/operations/data/pyproject.toml
+++ /dev/null
@@ -1,24 +0,0 @@
-[build-system]
-requires = ["setuptools", "wheel"]
-build-backend = "setuptools.build_meta"
-
-[tool.black]
-line-length = 79
-target-version = ['py37']
-
-exclude = '''
-(
-  /(
-      \.eggs  # exclude a few common directories in the
-    | \.git   # root of the project
-    | \.hg
-    | \.mypy_cache
-    | \.tox
-    | \.venv
-    | _build
-    | buck-out
-    | build
-    | dist
-  )
-)
-'''
diff --git a/operations/data/setup.cfg b/operations/data/setup.cfg
deleted file mode 100644
index 7981bcb7e3..0000000000
--- a/operations/data/setup.cfg
+++ /dev/null
@@ -1,33 +0,0 @@
-[metadata]
-name = dffml-operations-data
-version = attr: dffml_operations_data.version.VERSION
-description = Data preprocessing operations for DFFML
-long_description = file: README.md
-long_description_content_type = text/markdown
-author = Sudhanshu kumar
-author_email = sudhanshukumar5459@gmail.com
-maintainer = Sudhanshu kumar
-maintainer_email = sudhanshukumar5459@gmail.com
-url = https://github.com/gitpod/dffml-operations-data
-license = MIT
-keywords = dffml
-classifiers =
-    Development Status :: 3 - Alpha
-    Intended Audience :: Developers
-    License :: OSI Approved :: MIT License
-    Natural Language :: English
-    Operating System :: OS Independent
-    Programming Language :: Python :: 3 :: Only
-    Programming Language :: Python :: 3.7
-    Programming Language :: Python :: Implementation :: CPython
-    Programming Language :: Python :: Implementation :: PyPy
-
-[options]
-zip_safe = False
-include_package_data = True
-packages = find:
-entry_points = file: entry_points.txt
-install_requires =
-    dffml>=0.4.0
-    scikit-learn>=0.21.2
-    numpy>=1.19.2
diff --git a/operations/data/setup.py b/operations/data/setup.py
deleted file mode 100644
index 17542f4d0e..0000000000
--- a/operations/data/setup.py
+++ /dev/null
@@ -1,8 +0,0 @@
-import sys
-import site
-import setuptools
-
-# See https://github.com/pypa/pip/issues/7953
-site.ENABLE_USER_SITE = "--user" in sys.argv[1:]
-
-setuptools.setup()
diff --git a/operations/data/tests/__init__.py b/operations/data/tests/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/operations/data/tests/test_operations.py b/operations/data/tests/test_operations.py
deleted file mode 100644
index 5db5fbbd9a..0000000000
--- a/operations/data/tests/test_operations.py
+++ /dev/null
@@ -1,227 +0,0 @@
-import numpy as np
-from sklearn.datasets import make_classification
-
-from dffml.df.types import Input, DataFlow
-from dffml.operation.output import GetSingle
-from dffml.df.memory import MemoryOrchestrator
-from dffml.util.asynctestcase import AsyncTestCase
-
-from dffml_operations_data.operations import *
-
-
-class TestOperations(AsyncTestCase):
-    async def test_principal_component_analysis(self):
-        input_data, _ = make_classification(
-            n_samples=10,
-            n_features=10,
-            n_informative=8,
-            n_redundant=2,
-            random_state=7,
-        )
-        async for ctx, results in MemoryOrchestrator.run(
-            DataFlow.auto(principal_component_analysis, GetSingle),
-            [
-                Input(
-                    value=[
-                        principal_component_analysis.op.outputs["result"].name
-                    ],
-                    definition=GetSingle.op.inputs["spec"],
-                ),
-                Input(
-                    value=input_data,
-                    definition=principal_component_analysis.op.inputs["data"],
-                ),
-                Input(
-                    value=8,
-                    definition=principal_component_analysis.op.inputs[
-                        "n_components"
-                    ],
-                ),
-            ],
-        ):
-            self.assertTrue(
-                (10, 8)
-                == results[
-                    principal_component_analysis.op.outputs["result"].name
-                ].shape
-            )
-
-    async def test_singular_value_decomposition(self):
-        input_data, _ = make_classification(
-            n_samples=10,
-            n_features=10,
-            n_informative=8,
-            n_redundant=2,
-            random_state=7,
-        )
-        async for ctx, results in MemoryOrchestrator.run(
-            DataFlow.auto(singular_value_decomposition, GetSingle),
-            [
-                Input(
-                    value=[
-                        singular_value_decomposition.op.outputs["result"].name
-                    ],
-                    definition=GetSingle.op.inputs["spec"],
-                ),
-                Input(
-                    value=input_data,
-                    definition=singular_value_decomposition.op.inputs["data"],
-                ),
-                Input(
-                    value=8,
-                    definition=singular_value_decomposition.op.inputs[
-                        "n_components"
-                    ],
-                ),
-                Input(
-                    value=1,
-                    definition=singular_value_decomposition.op.inputs[
-                        "n_iter"
-                    ],
-                ),
-                Input(
-                    value=7,
-                    definition=singular_value_decomposition.op.inputs[
-                        "random_state"
-                    ],
-                ),
-            ],
-        ):
-            self.assertTrue(
-                (10, 8)
-                == results[
-                    singular_value_decomposition.op.outputs["result"].name
-                ].shape,
-            )
-
-    async def test_simple_imputer(self):
-        input_data = [[np.nan, 2], [6, np.nan], [7, 6]]
-        output_data = [[6.5, 2], [6, 4], [7, 6]]
-        async for ctx, results in MemoryOrchestrator.run(
-            DataFlow.auto(simple_imputer, GetSingle),
-            [
-                Input(
-                    value=[simple_imputer.op.outputs["result"].name],
-                    definition=GetSingle.op.inputs["spec"],
-                ),
-                Input(
-                    value=input_data,
-                    definition=simple_imputer.op.inputs["data"],
-                ),
-                Input(
-                    value=np.nan,
-                    definition=simple_imputer.op.inputs["missing_values"],
-                ),
-                Input(
-                    value="mean",
-                    definition=simple_imputer.op.inputs["strategy"],
-                ),
-            ],
-        ):
-            self.assertTrue(
-                (
-                    results[simple_imputer.op.outputs["result"].name]
-                    == output_data
-                ).all()
-            )
-
-    async def test_one_hot_encoder(self):
-        categories = [["Male", "Female"], [1, 2, 3]]
-        input_data = [["Female", 1], ["Male", 3]]
-        output_data = [[0.0, 1.0, 1.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0, 1.0]]
-        async for ctx, results in MemoryOrchestrator.run(
-            DataFlow.auto(one_hot_encoder, GetSingle),
-            [
-                Input(
-                    value=[one_hot_encoder.op.outputs["result"].name],
-                    definition=GetSingle.op.inputs["spec"],
-                ),
-                Input(
-                    value=input_data,
-                    definition=one_hot_encoder.op.inputs["data"],
-                ),
-                Input(
-                    value=categories,
-                    definition=one_hot_encoder.op.inputs["categories"],
-                ),
-            ],
-        ):
-            self.assertTrue(
-                (
-                    results[one_hot_encoder.op.outputs["result"].name]
-                    == output_data
-                ).all()
-            )
-
-    async def test_standard_scaler(self):
-        input_data = [[0, 0], [0, 0], [1, 1], [1, 1]]
-        output_data = [[-1, -1], [-1, -1], [1, 1], [1, 1]]
-        async for ctx, results in MemoryOrchestrator.run(
-            DataFlow.auto(standard_scaler, GetSingle),
-            [
-                Input(
-                    value=[standard_scaler.op.outputs["result"].name],
-                    definition=GetSingle.op.inputs["spec"],
-                ),
-                Input(
-                    value=input_data,
-                    definition=standard_scaler.op.inputs["data"],
-                ),
-            ],
-        ):
-            self.assertTrue(
-                (
-                    results[standard_scaler.op.outputs["result"].name]
-                    == output_data
-                )
-            )
-
-    async def test_remove_whitespaces(self):
-        input_data = [[" ABC ", "XYD "], [" ABC", " XYD "]]
-        output_data = [["ABC", "XYD"], ["ABC", "XYD"]]
-        async for ctx, results in MemoryOrchestrator.run(
-            DataFlow.auto(remove_whitespaces, GetSingle),
-            [
-                Input(
-                    value=[remove_whitespaces.op.outputs["result"].name],
-                    definition=GetSingle.op.inputs["spec"],
-                ),
-                Input(
-                    value=input_data,
-                    definition=remove_whitespaces.op.inputs["data"],
-                ),
-            ],
-        ):
-            self.assertTrue(
-                (
                    results[remove_whitespaces.op.outputs["result"].name]
-                    == output_data
-                ).all()
-            )
-
-    async def test_ordinal_encoder(self):
-        input_data = [["x", "a"], ["x", "b"], ["y", "a"]]
-        output_data = [
-            [1.0, 0.0, 1.0, 0.0],
-            [1.0, 0.0, 0.0, 1.0],
-            [0.0, 1.0, 1.0, 0.0],
-        ]
-        async for ctx, results in MemoryOrchestrator.run(
-            DataFlow.auto(ordinal_encoder, GetSingle),
-            [
-                Input(
-                    value=[ordinal_encoder.op.outputs["result"].name],
-                    definition=GetSingle.op.inputs["spec"],
-                ),
-                Input(
-                    value=input_data,
-                    definition=ordinal_encoder.op.inputs["data"],
-                ),
-            ],
-        ):
-            self.assertTrue(
-                (
-                    results[ordinal_encoder.op.outputs["result"].name]
-                    == output_data
-                ).all()
-            )
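One thing the last test makes visible: `test_ordinal_encoder` asserts one-hot output (four columns for two features), confirming that the operation wraps `OneHotEncoder` rather than scikit-learn's `OrdinalEncoder`, as noted in the docstring fix above. A short comparison of the two encoders on the same test data:

```python
# Compare the two scikit-learn encoders on the test_ordinal_encoder input.
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

data = [["x", "a"], ["x", "b"], ["y", "a"]]

# What the ordinal_encoder operation actually produces (one column per category):
print(OneHotEncoder().fit_transform(data).toarray())
# [[1. 0. 1. 0.]
#  [1. 0. 0. 1.]
#  [0. 1. 1. 0.]]

# What a true ordinal encoding would look like (one column per feature):
print(OrdinalEncoder().fit_transform(data))
# [[0. 0.]
#  [0. 1.]
#  [1. 0.]]
```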