Skip to content

Commit

Permalink
[FEATURE] Add support for file_encoding in sources
Browse files (browse the repository at this point in the history)
  • Loading branch information
ChillarAnand committed May 28, 2024
1 parent 924dfa4 commit 277d1d6
Show file tree
Hide file tree
Showing 7 changed files with 52 additions and 14 deletions.
4 changes: 3 additions & 1 deletion ingen/data_source/file_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def fetch_data(self, reader):
"""
returns a DataFrame of data fetched from input FileSource.
"""
return reader.read(self._src)
return reader.read(self._src, encoding=self._src['file_encoding'])

def fetch_validations(self):
"""
Expand All @@ -64,4 +64,6 @@ def format_file_path(self, source, params_map):
path_parser = PathParser(run_date)
if 'file_path' in source:
source['file_path'] = path_parser.parse(source['file_path'])
if not source.get('file_encoding'):
source['file_encoding'] = 'utf-8'
return source
18 changes: 9 additions & 9 deletions ingen/reader/file_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,8 @@ def read(self, src):


def get_config(src):
header_size = src.get('skip_header_size' , 0)
trailer_size = src.get('skip_trailer_size' , 0)
header_size = src.get('skip_header_size', 0)
trailer_size = src.get('skip_trailer_size', 0)
all_cols = src.get('columns')
return {
"header_size": header_size,
Expand All @@ -115,13 +115,13 @@ class ReaderFactory:

@classmethod
def get_reader(cls, src):
factory_types = {'delimited_file': CSVFileReader,
'excel': ExcelFileReader,
'xml': XMLFileReader,
'json': JSONFileReader,
"fixed_width": FixedWidthFileReader
}
factory_types = {
'delimited_file': CSVFileReader,
'excel': ExcelFileReader,
'xml': XMLFileReader,
'json': JSONFileReader,
'fixed_width': FixedWidthFileReader
}
reader_cls = factory_types.get(src.get('file_type'))
if reader_cls:
return reader_cls()

2 changes: 1 addition & 1 deletion ingen/reader/xml_file_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
class XMLFileReader:

def read(self, src):
xml_file = open(src['file_path'], 'r')
xml_file = open(src['file_path'], 'r', encoding=src['file_encoding'])
try:
data = xmltodict.parse(xml_file.read())
tree = et.parse(src['file_path'])
Expand Down
3 changes: 1 addition & 2 deletions test/data_source/test_file_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,7 @@ def setUp(self):
'file_path': 'test',
'skip_header_size': 1,
'skip_trailer_size': 1,
'columns': ['col1', 'col2']

'columns': ['col1', 'col2'],
}
self.params_map = {'query_params': {'table_name': 'positions'}, 'infile': {}}
self.source = FileSource(self._src, self.params_map)
Expand Down
Binary file added test/input/test_utf_16.xml
Binary file not shown.
16 changes: 15 additions & 1 deletion test/metadata/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def test_output_type(self):
output = self.metadata.output
self.assertEqual("file", output["type"])

def test_output_type(self):
def test_split_output_type(self):
self.setUp({"use_split_file_config": True})
output = self.metadata.output
self.assertEqual("splitted_file", output["type"])
Expand Down Expand Up @@ -239,6 +239,20 @@ def test_sources_with_dynamic_data(self):
self.assertIsInstance(metadata.sources[1], JsonSource)
self.assertEqual("json_payload", metadata.sources[1]._id)

def test_default_file_encoding(self):
    """Check that the first source ends up with ``file_encoding`` of ``utf-8``.

    Presumably the fixture's first source does not set ``file_encoding``
    itself, so this exercises the default -- verify against the fixture.
    """
    md = MetaData(self.test_md_name, self.test_md_configurations, self.params_map)
    first_source = md.sources[0]
    self.assertEqual("utf-8", first_source._src["file_encoding"])

def test_custom_file_encoding(self):
    """Check that an explicit ``file_encoding`` on a source is preserved.

    The first source of the test configuration is overridden with
    ``utf-16`` and the resulting source must report that same value.
    """
    # Function-scope import keeps this method self-contained.
    import copy

    # dict.copy() is shallow: the nested "sources" entries would still be
    # the fixture's own objects, so the override below would leak into
    # self.test_md_configurations. Deep-copy to keep the change local.
    metadata_config = copy.deepcopy(self.test_md_configurations)
    metadata_config["sources"][0]["file_encoding"] = "utf-16"
    metadata = MetaData(
        self.test_md_name, metadata_config, self.params_map
    )
    self.assertEqual("utf-16", metadata.sources[0]._src["file_encoding"])


if __name__ == "__main__":
unittest.main()
23 changes: 23 additions & 0 deletions test/reader/test_xml_file_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,19 @@ def setUp(self):
],
'root_tag': 'ORDER'
}
self.xml_src_utf_16_encoding = {
'id': 'order_xml',
'type': 'file',
'file_type': 'xml',
'file_encoding': 'utf-16',
'file_path': f'{THIS_DIR.parent}/input/test_utf_16.xml',
'columns': [
'ORDER_ID',
'ORD_DETAIL_set.ORD_DETAIL.ITEM_NAME',
'ORD_DETAIL_set.ORD_DETAIL.QUANTITY'
],
'root_tag': 'ORDER'
}

def test_xml(self):
source = self.xml_src
Expand Down Expand Up @@ -133,6 +146,16 @@ def test_empty_xml_file(self, mock_logging):
error_msg = "XML file is empty"
mock_logging.error.assert_called_with(error_msg)

def test_xml_utf_16_encoding(self):
    """Read the UTF-16 XML source and compare the result to the expected frame.

    The expected frame is built from an empty mapping, so it carries only
    the configured column labels and no rows.
    """
    src = self.xml_src_utf_16_encoding
    actual = XMLFileReader().read(src)
    # NOTE(review): the expectation is a frame with no rows -- confirm the
    # UTF-16 fixture is really meant to yield no records.
    expected = pd.DataFrame({}, columns=src['columns'])
    pd.testing.assert_frame_equal(expected, actual)


if __name__ == '__main__':
unittest.main()

0 comments on commit 277d1d6

Please sign in to comment.