Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FEATURE] issue-44: Add support for file_encoding in sources #46

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion ingen/data_source/file_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def fetch_data(self, reader):
"""
returns a DataFrame of data fetched from input FileSource.
"""
- return reader.read(self._src)
+ return reader.read(self._src, encoding=self._src['file_encoding'])
ChillarAnand marked this conversation as resolved.
Show resolved Hide resolved

def fetch_validations(self):
"""
Expand All @@ -64,4 +64,6 @@ def format_file_path(self, source, params_map):
path_parser = PathParser(run_date)
if 'file_path' in source:
source['file_path'] = path_parser.parse(source['file_path'])
if not source.get('file_encoding'):
ChillarAnand marked this conversation as resolved.
Show resolved Hide resolved
source['file_encoding'] = 'utf-8'
return source
18 changes: 9 additions & 9 deletions ingen/reader/file_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,8 @@ def read(self, src):


def get_config(src):
- header_size = src.get('skip_header_size' , 0)
- trailer_size = src.get('skip_trailer_size' , 0)
+ header_size = src.get('skip_header_size', 0)
+ trailer_size = src.get('skip_trailer_size', 0)
all_cols = src.get('columns')
return {
"header_size": header_size,
Expand All @@ -115,13 +115,13 @@ class ReaderFactory:

@classmethod
def get_reader(cls, src):
- factory_types = {'delimited_file': CSVFileReader,
- 'excel': ExcelFileReader,
- 'xml': XMLFileReader,
- 'json': JSONFileReader,
- "fixed_width": FixedWidthFileReader
- }
+ factory_types = {
+ 'delimited_file': CSVFileReader,
+ 'excel': ExcelFileReader,
+ 'xml': XMLFileReader,
+ 'json': JSONFileReader,
+ 'fixed_width': FixedWidthFileReader
+ }
reader_cls = factory_types.get(src.get('file_type'))
if reader_cls:
return reader_cls()

2 changes: 1 addition & 1 deletion ingen/reader/xml_file_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
class XMLFileReader:

def read(self, src):
- xml_file = open(src['file_path'], 'r')
+ xml_file = open(src['file_path'], 'r', encoding=src['file_encoding'])
try:
data = xmltodict.parse(xml_file.read())
tree = et.parse(src['file_path'])
Expand Down
3 changes: 1 addition & 2 deletions test/data_source/test_file_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,7 @@ def setUp(self):
'file_path': 'test',
'skip_header_size': 1,
'skip_trailer_size': 1,
- 'columns': ['col1', 'col2']
-
+ 'columns': ['col1', 'col2'],
}
self.params_map = {'query_params': {'table_name': 'positions'}, 'infile': {}}
self.source = FileSource(self._src, self.params_map)
Expand Down
Binary file added test/input/test_utf_16.xml
Binary file not shown.
16 changes: 15 additions & 1 deletion test/metadata/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def test_output_type(self):
output = self.metadata.output
self.assertEqual("file", output["type"])

- def test_output_type(self):
+ def test_split_output_type(self):
self.setUp({"use_split_file_config": True})
output = self.metadata.output
self.assertEqual("splitted_file", output["type"])
Expand Down Expand Up @@ -239,6 +239,20 @@ def test_sources_with_dynamic_data(self):
self.assertIsInstance(metadata.sources[1], JsonSource)
self.assertEqual("json_payload", metadata.sources[1]._id)

def test_default_file_encoding(self):
metadata = MetaData(
self.test_md_name, self.test_md_configurations, self.params_map
)
self.assertEqual("utf-8", metadata.sources[0]._src["file_encoding"])

def test_custom_file_encoding(self):
metadata_config = self.test_md_configurations.copy()
metadata_config["sources"][0]["file_encoding"] = "utf-16"
metadata = MetaData(
self.test_md_name, metadata_config, self.params_map
)
self.assertEqual("utf-16", metadata.sources[0]._src["file_encoding"])


if __name__ == "__main__":
unittest.main()
23 changes: 23 additions & 0 deletions test/reader/test_xml_file_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,19 @@ def setUp(self):
],
'root_tag': 'ORDER'
}
self.xml_src_utf_16_encoding = {
'id': 'order_xml',
'type': 'file',
'file_type': 'xml',
'file_encoding': 'utf-16',
'file_path': f'{THIS_DIR.parent}/input/test_utf_16.xml',
'columns': [
'ORDER_ID',
'ORD_DETAIL_set.ORD_DETAIL.ITEM_NAME',
'ORD_DETAIL_set.ORD_DETAIL.QUANTITY'
],
'root_tag': 'ORDER'
}

def test_xml(self):
source = self.xml_src
Expand Down Expand Up @@ -133,6 +146,16 @@ def test_empty_xml_file(self, mock_logging):
error_msg = "XML file is empty"
mock_logging.error.assert_called_with(error_msg)

def test_xml_utf_16_encoding(self):
source = self.xml_src_utf_16_encoding
reader = XMLFileReader()
data = reader.read(source)
keys = {}
expected_data = pd.DataFrame(
keys,
columns=source['columns'])
pd.testing.assert_frame_equal(expected_data, data)


if __name__ == '__main__':
unittest.main()