From 60ae5559877a6a1a004df9e702bd3854ca4a1f9c Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev <41479552+pvk-developer@users.noreply.github.com> Date: Fri, 3 May 2024 00:02:33 +0200 Subject: [PATCH] Add ExcelHandler (#1962) --- pyproject.toml | 4 +- sdv/io/local/__init__.py | 5 +- sdv/io/local/local.py | 98 +++++++++- tests/integration/io/local/test_local.py | 73 +++++++- tests/unit/io/local/test_local.py | 219 ++++++++++++++++++++++- 5 files changed, 389 insertions(+), 10 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index af250ec3b..3e89f63ff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ dependencies = [ 'deepecho>=0.6.0', 'rdt>=1.12.0', 'sdmetrics>=0.14.0', - 'platformdirs>=4.0' + 'platformdirs>=4.0', ] [project.urls] @@ -51,7 +51,9 @@ dependencies = [ sdv = { main = 'sdv.cli.__main__:main' } [project.optional-dependencies] +excel = ['pandas[excel]'] test = [ + 'sdv[excel]', 'pytest>=3.4.2', 'pytest-cov>=2.6.0', 'pytest-rerunfailures>=10.3,<15', diff --git a/sdv/io/local/__init__.py b/sdv/io/local/__init__.py index a233b25be..bd3c2ba5b 100644 --- a/sdv/io/local/__init__.py +++ b/sdv/io/local/__init__.py @@ -1,8 +1,9 @@ """Local I/O module.""" -from sdv.io.local.local import BaseLocalHandler, CSVHandler +from sdv.io.local.local import BaseLocalHandler, CSVHandler, ExcelHandler __all__ = ( 'BaseLocalHandler', - 'CSVHandler' + 'CSVHandler', + 'ExcelHandler' ) diff --git a/sdv/io/local/local.py b/sdv/io/local/local.py index 0d81ab634..1ceaba195 100644 --- a/sdv/io/local/local.py +++ b/sdv/io/local/local.py @@ -33,7 +33,7 @@ def _infer_metadata(self, data): return metadata def read(self): - """Read data from files and returns it along with metadata. + """Read data from files and return it along with metadata. This method must be implemented by subclasses. 
class ExcelHandler(BaseLocalHandler):
    """A class for handling Excel files.

    Every sheet of a workbook is treated as one table: reading returns a dictionary
    mapping sheet names to ``pandas.DataFrame`` objects, and writing stores one sheet
    per table.

    The methods read ``self.decimal`` and ``self.float_format``; these are presumably
    set by ``BaseLocalHandler.__init__`` (not visible in this class).
    """

    def _read_excel(self, file_path, sheet_names=None):
        """Read data from an Excel file and return just the data as a dictionary.

        Args:
            file_path (str):
                The path to the Excel file to read.
            sheet_names (list of str, optional):
                The names of the sheets to read. If None, all sheets are read.

        Returns:
            dict:
                A dictionary mapping each sheet name to a ``pandas.DataFrame``.
        """
        if sheet_names is None:
            # The workbook is opened only to list its sheets, so release the file
            # handle as soon as the names are known instead of leaking it.
            xl_file = pd.ExcelFile(file_path)
            try:
                sheet_names = xl_file.sheet_names
            finally:
                xl_file.close()

        return {
            sheet_name: pd.read_excel(
                file_path,
                sheet_name=sheet_name,
                parse_dates=False,
                decimal=self.decimal,
                index_col=None
            )
            for sheet_name in sheet_names
        }

    def read(self, file_path, sheet_names=None):
        """Read data from Excel files and return it along with metadata.

        Args:
            file_path (str):
                The path to the Excel file to read.
            sheet_names (list of str, optional):
                The names of sheets to read. If None, all sheets are read.

        Returns:
            tuple:
                A tuple containing the data as a dictionary and metadata. The dictionary maps
                table names to pandas DataFrames. The metadata is the object returned by
                ``_infer_metadata`` for that data.

        Raises:
            ValueError:
                If ``sheet_names`` is neither None nor a list.
        """
        if sheet_names is not None and not isinstance(sheet_names, list):
            raise ValueError("'sheet_names' must be None or a list of strings.")

        data = self._read_excel(file_path, sheet_names)
        metadata = self._infer_metadata(data)
        return data, metadata

    def write(self, synthetic_data, file_name, sheet_name_suffix=None, mode='w'):
        """Write synthetic data to an Excel file.

        Args:
            synthetic_data (dict):
                A dictionary mapping table names to pandas DataFrames containing synthetic data.
            file_name (str):
                The name of the Excel file to write.
            sheet_name_suffix (str, optional):
                A suffix to add to the sheet name of every table in ``synthetic_data``.
                Sheets that already exist in the file keep their names.
            mode (str, optional):
                The mode of writing to use. Defaults to 'w'.
                'w': Write sheets to a new file, clearing any existing file that may exist.
                'a': Keep the sheets of the existing file and add the new ones. When a
                    target sheet name already exists, the new rows are concatenated below
                    the existing rows of that sheet.
        """
        # In append mode start from the current content of the workbook, because the
        # whole file is rewritten below.
        sheets = self._read_excel(file_name) if mode == 'a' else {}

        for table_name, table in synthetic_data.items():
            sheet_name = f'{table_name}{sheet_name_suffix}' if sheet_name_suffix else table_name
            if sheet_name in sheets:
                # Concatenate the table being iterated: looking it up in
                # ``synthetic_data`` by the suffixed sheet name would raise a KeyError.
                sheets[sheet_name] = pd.concat(
                    [sheets[sheet_name], table],
                    ignore_index=True
                )
            else:
                sheets[sheet_name] = table

        writer = pd.ExcelWriter(file_name)
        try:
            for sheet_name, sheet_data in sheets.items():
                sheet_data.to_excel(
                    writer,
                    sheet_name=sheet_name,
                    float_format=self.float_format,
                    index=False
                )
        finally:
            # Always close the writer so the file handle is released even if a sheet
            # fails to serialize.
            writer.close()
class TestExcelHandler:
    """End-to-end tests for ``ExcelHandler`` that write and read real files."""

    def test_integration_write_and_read(self, tmpdir):
        """Test end to end the write and read methods of ``ExcelHandler``."""
        # Prepare synthetic data
        synthetic_data = {
            'table1': pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', 'b', 'c']}),
            'table2': pd.DataFrame({'col3': [4, 5, 6], 'col4': ['d', 'e', 'f']})
        }
        file_path = tmpdir / 'excel.xlsx'

        # Write synthetic data to the xlsx file
        handler = ExcelHandler()
        handler.write(synthetic_data, file_path)

        # Read data back from the xlsx file
        data, metadata = handler.read(file_path)

        # Check if data was read correctly
        assert len(data) == 2
        assert 'table1' in data
        assert 'table2' in data
        assert isinstance(metadata, MultiTableMetadata)

        # Check if the dataframes match the original synthetic data
        pd.testing.assert_frame_equal(data['table1'], synthetic_data['table1'])
        pd.testing.assert_frame_equal(data['table2'], synthetic_data['table2'])

    def test_integration_write_and_read_append_mode(self, tmpdir):
        """Test that writing twice with ``mode='a'`` appends rows to the existing sheets."""
        # Prepare synthetic data
        synthetic_data = {
            'table1': pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', 'b', 'c']}),
            'table2': pd.DataFrame({'col3': [4, 5, 6], 'col4': ['d', 'e', 'f']})
        }
        file_path = tmpdir / 'excel.xlsx'

        # Write synthetic data to the xlsx file, then write it again in append mode
        handler = ExcelHandler()
        handler.write(synthetic_data, file_path)
        handler.write(synthetic_data, file_path, mode='a')

        # Read data back from the xlsx file
        data, metadata = handler.read(file_path)

        # Check if data was read correctly
        assert len(data) == 2
        assert 'table1' in data
        assert 'table2' in data
        assert isinstance(metadata, MultiTableMetadata)

        # Every sheet must now hold the original rows followed by the appended rows
        expected_table_one = pd.concat(
            [synthetic_data['table1'], synthetic_data['table1']],
            ignore_index=True
        )
        expected_table_two = pd.concat(
            [synthetic_data['table2'], synthetic_data['table2']],
            ignore_index=True
        )
        pd.testing.assert_frame_equal(data['table1'], expected_table_one)
        pd.testing.assert_frame_equal(data['table2'], expected_table_two)
class TestExcelHandler:
    """Unit tests for ``ExcelHandler`` with pandas mocked wherever a file would be touched."""

    @staticmethod
    def _expected_read_call(sheet_name):
        """Build the ``pd.read_excel`` call expected for one sheet of ``test_file.xlsx``."""
        return call(
            'test_file.xlsx',
            sheet_name=sheet_name,
            parse_dates=False,
            decimal='.',
            index_col=None
        )

    @staticmethod
    def _assert_sheet_written(sheet, writer, sheet_name):
        """Assert that ``sheet`` was written exactly once to ``writer`` under ``sheet_name``."""
        sheet.to_excel.assert_called_once_with(
            writer,
            sheet_name=sheet_name,
            float_format=None,
            index=False
        )

    def test___init__(self):
        """Test that the default parameters are stored on the instance."""
        # Run
        handler = ExcelHandler()

        # Assert
        assert handler.decimal == '.'
        assert handler.float_format is None

    def test___init___custom(self):
        """Test that custom parameters are stored on the instance."""
        # Run
        handler = ExcelHandler(decimal=',', float_format='%.2f')

        # Assert
        assert handler.decimal == ','
        assert handler.float_format == '%.2f'

    @patch('sdv.io.local.local.pd')
    def test_read(self, mock_pd):
        """Test that ``read`` loads every sheet when no sheet names are given."""
        # Setup
        mock_pd.ExcelFile.return_value = Mock(sheet_names=['Sheet1', 'Sheet2'])
        mock_pd.read_excel.side_effect = [
            pd.DataFrame({'A': [1, 2], 'B': [3, 4]}),
            pd.DataFrame({'C': [5, 6], 'D': [7, 8]})
        ]
        handler = ExcelHandler()

        # Run
        data, metadata = handler.read('test_file.xlsx')

        # Assert
        expected_frames = {
            'Sheet1': pd.DataFrame({'A': [1, 2], 'B': [3, 4]}),
            'Sheet2': pd.DataFrame({'C': [5, 6], 'D': [7, 8]}),
        }
        for sheet_name, expected_frame in expected_frames.items():
            pd.testing.assert_frame_equal(data[sheet_name], expected_frame)

        assert isinstance(metadata, MultiTableMetadata)
        assert mock_pd.read_excel.call_args_list == [
            self._expected_read_call('Sheet1'),
            self._expected_read_call('Sheet2'),
        ]

    @patch('sdv.io.local.local.pd')
    def test_read_sheet_names(self, mock_pd):
        """Test that ``read`` loads only the requested sheets."""
        # Setup
        mock_pd.ExcelFile.return_value = Mock(sheet_names=['Sheet1', 'Sheet2'])
        mock_pd.read_excel.side_effect = [
            pd.DataFrame({'A': [1, 2], 'B': [3, 4]}),
            pd.DataFrame({'C': [5, 6], 'D': [7, 8]})
        ]
        handler = ExcelHandler()

        # Run
        data, metadata = handler.read('test_file.xlsx', ['Sheet1'])

        # Assert
        assert list(data) == ['Sheet1']
        pd.testing.assert_frame_equal(
            data['Sheet1'],
            pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
        )
        assert isinstance(metadata, MultiTableMetadata)
        assert mock_pd.read_excel.call_args_list == [self._expected_read_call('Sheet1')]

    def test_read_sheet_names_string(self):
        """Test that ``read`` rejects a plain string passed as ``sheet_names``."""
        # Setup
        handler = ExcelHandler()
        expected_message = "'sheet_names' must be None or a list of strings."

        # Run and Assert
        with pytest.raises(ValueError, match=expected_message):
            handler.read('test_file.xlsx', 'Sheet1')

    @patch('sdv.io.local.local.pd')
    def test_write(self, mock_pd):
        """Test that ``write`` stores every table under its suffixed sheet name."""
        # Setup
        first_table = Mock()
        second_table = Mock()
        handler = ExcelHandler()

        # Run
        handler.write(
            {'Sheet1': first_table, 'Sheet2': second_table},
            'output_file.xlsx',
            '_synthetic'
        )

        # Assert
        writer = mock_pd.ExcelWriter.return_value
        self._assert_sheet_written(first_table, writer, 'Sheet1_synthetic')
        self._assert_sheet_written(second_table, writer, 'Sheet2_synthetic')
        writer.close.assert_called_once_with()

    @patch('sdv.io.local.local.pd')
    def test_write_mode_append(self, mock_pd):
        """Test ``write`` with mode ``a``: existing sheets are kept, suffixed ones are added."""
        # Setup
        existing_one = Mock()
        existing_two = Mock()
        new_one = Mock()
        new_two = Mock()
        handler = ExcelHandler()
        handler._read_excel = Mock(
            return_value={'Sheet1': existing_one, 'Sheet2': existing_two}
        )

        # Run
        handler.write(
            {'Sheet1': new_one, 'Sheet2': new_two},
            'output_file.xlsx',
            '_synthetic',
            mode='a'
        )

        # Assert
        writer = mock_pd.ExcelWriter.return_value
        expected_sheets = (
            (existing_one, 'Sheet1'),
            (existing_two, 'Sheet2'),
            (new_one, 'Sheet1_synthetic'),
            (new_two, 'Sheet2_synthetic'),
        )
        for sheet, sheet_name in expected_sheets:
            self._assert_sheet_written(sheet, writer, sheet_name)

        writer.close.assert_called_once_with()

    @patch('sdv.io.local.local.pd')
    def test_write_mode_append_no_suffix(self, mock_pd):
        """Test ``write`` with mode ``a`` and no suffix: rows are appended to the same sheet."""
        # Setup
        existing_one = Mock()
        existing_two = Mock()
        new_one = Mock()
        handler = ExcelHandler()
        handler._read_excel = Mock(
            return_value={'Sheet1': existing_one, 'Sheet2': existing_two}
        )

        # Run
        handler.write({'Sheet1': new_one}, 'output_file.xlsx', mode='a')

        # Assert
        writer = mock_pd.ExcelWriter.return_value
        mock_pd.concat.assert_called_once_with([existing_one, new_one], ignore_index=True)
        self._assert_sheet_written(mock_pd.concat.return_value, writer, 'Sheet1')
        self._assert_sheet_written(existing_two, writer, 'Sheet2')
        writer.close.assert_called_once_with()