Add a configurable 'file_encoding' option for file sources.

FileSource defaults 'file_encoding' to 'utf-8' when the source config does
not set it. The delimited-file reader passes the encoding to pd.read_csv()
and the XML reader passes it to open(). Both readers fall back to 'utf-8'
on their own, so a source dict that never went through FileSource does not
raise KeyError.

Also renames the second of two test methods that were both called
test_output_type, and adds tests for the default encoding, a custom
encoding, and a UTF-16 XML input file.

diff --git a/ingen/data_source/file_source.py b/ingen/data_source/file_source.py
index f25841e..0e8c9e2 100644
--- a/ingen/data_source/file_source.py
+++ b/ingen/data_source/file_source.py
@@ -31,6 +31,8 @@ def __init__(self, source, params_map):
             self._src = source
         else:
             self._src = self.format_file_path(source, params_map)
+        if not source.get('file_encoding'):
+            source['file_encoding'] = 'utf-8'
 
     def fetch(self):
         """
diff --git a/ingen/reader/file_reader.py b/ingen/reader/file_reader.py
index df8524b..f3a9e79 100644
--- a/ingen/reader/file_reader.py
+++ b/ingen/reader/file_reader.py
@@ -24,13 +24,16 @@ def read(self, src):
         config = get_config(src)
         dtype = src.get('dtype')
         try:
-            result = pd.read_csv(src['file_path'],
-                                 sep=src.get('delimiter'),
-                                 index_col=False,
-                                 skiprows=config['header_size'],
-                                 skipfooter=config['trailer_size'],
-                                 names=config['all_cols'],
-                                 dtype=dtype)
+            result = pd.read_csv(
+                src['file_path'],
+                sep=src.get('delimiter'),
+                index_col=False,
+                skiprows=config['header_size'],
+                skipfooter=config['trailer_size'],
+                names=config['all_cols'],
+                dtype=dtype,
+                encoding=src.get('file_encoding', 'utf-8'),
+            )
         except TypeError:
             logging.error(self.DTYPE_LOG_MSG)
             raise
@@ -101,8 +104,8 @@ def read(self, src):
 
 
 def get_config(src):
-    header_size = src.get('skip_header_size' , 0)
-    trailer_size = src.get('skip_trailer_size' , 0)
+    header_size = src.get('skip_header_size', 0)
+    trailer_size = src.get('skip_trailer_size', 0)
     all_cols = src.get('columns')
     return {
         "header_size": header_size,
@@ -115,13 +118,13 @@ class ReaderFactory:
 
     @classmethod
     def get_reader(cls, src):
-        factory_types = {'delimited_file': CSVFileReader,
-                         'excel': ExcelFileReader,
-                         'xml': XMLFileReader,
-                         'json': JSONFileReader,
-                         "fixed_width": FixedWidthFileReader
-                         }
+        factory_types = {
+            'delimited_file': CSVFileReader,
+            'excel': ExcelFileReader,
+            'xml': XMLFileReader,
+            'json': JSONFileReader,
+            'fixed_width': FixedWidthFileReader
+        }
         reader_cls = factory_types.get(src.get('file_type'))
         if reader_cls:
             return reader_cls()
-
diff --git a/ingen/reader/xml_file_reader.py b/ingen/reader/xml_file_reader.py
index b3869ec..99536ec 100644
--- a/ingen/reader/xml_file_reader.py
+++ b/ingen/reader/xml_file_reader.py
@@ -13,7 +13,7 @@ class XMLFileReader:
 
 
     def read(self, src):
-        xml_file = open(src['file_path'], 'r')
+        xml_file = open(src['file_path'], 'r', encoding=src.get('file_encoding', 'utf-8'))
         try:
             data = xmltodict.parse(xml_file.read())
             tree = et.parse(src['file_path'])
diff --git a/test/data_source/test_file_source.py b/test/data_source/test_file_source.py
index c5aaa24..b37045b 100644
--- a/test/data_source/test_file_source.py
+++ b/test/data_source/test_file_source.py
@@ -20,8 +20,7 @@ def setUp(self):
             'file_path': 'test',
             'skip_header_size': 1,
             'skip_trailer_size': 1,
-            'columns': ['col1', 'col2']
-
+            'columns': ['col1', 'col2'],
         }
         self.params_map = {'query_params': {'table_name': 'positions'}, 'infile': {}}
         self.source = FileSource(self._src, self.params_map)
diff --git a/test/input/test_utf_16.xml b/test/input/test_utf_16.xml
new file mode 100644
index 0000000..c65dea7
Binary files /dev/null and b/test/input/test_utf_16.xml differ
diff --git a/test/metadata/test_metadata.py b/test/metadata/test_metadata.py
index 7d7f1c1..b08cd00 100644
--- a/test/metadata/test_metadata.py
+++ b/test/metadata/test_metadata.py
@@ -131,7 +131,7 @@ def test_output_type(self):
         output = self.metadata.output
         self.assertEqual("file", output["type"])
 
-    def test_output_type(self):
+    def test_split_output_type(self):
         self.setUp({"use_split_file_config": True})
         output = self.metadata.output
         self.assertEqual("splitted_file", output["type"])
@@ -239,6 +239,21 @@ def test_sources_with_dynamic_data(self):
         self.assertIsInstance(metadata.sources[1], JsonSource)
         self.assertEqual("json_payload", metadata.sources[1]._id)
 
+    def test_default_file_encoding(self):
+        metadata = MetaData(
+            self.test_md_name, self.test_md_configurations, self.params_map
+        )
+        self.assertEqual("utf-8", metadata.sources[0]._src["file_encoding"])
+
+    def test_custom_file_encoding(self):
+        import copy
+        metadata_config = copy.deepcopy(self.test_md_configurations)
+        metadata_config["sources"][0]["file_encoding"] = "utf-16"
+        metadata = MetaData(
+            self.test_md_name, metadata_config, self.params_map
+        )
+        self.assertEqual("utf-16", metadata.sources[0]._src["file_encoding"])
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/test/reader/test_xml_file_reader.py b/test/reader/test_xml_file_reader.py
index 592f37c..f670ab0 100644
--- a/test/reader/test_xml_file_reader.py
+++ b/test/reader/test_xml_file_reader.py
@@ -77,6 +77,19 @@ def setUp(self):
             ],
             'root_tag': 'ORDER'
         }
+        self.xml_src_utf_16_encoding = {
+            'id': 'order_xml',
+            'type': 'file',
+            'file_type': 'xml',
+            'file_encoding': 'utf-16',
+            'file_path': f'{THIS_DIR.parent}/input/test_utf_16.xml',
+            'columns': [
+                'ORDER_ID',
+                'ORD_DETAIL_set.ORD_DETAIL.ITEM_NAME',
+                'ORD_DETAIL_set.ORD_DETAIL.QUANTITY'
+            ],
+            'root_tag': 'ORDER'
+        }
 
     def test_xml(self):
         source = self.xml_src
@@ -133,6 +146,16 @@ def test_empty_xml_file(self, mock_logging):
         error_msg = "XML file is empty"
         mock_logging.error.assert_called_with(error_msg)
 
+    def test_xml_utf_16_encoding(self):
+        source = self.xml_src_utf_16_encoding
+        reader = XMLFileReader()
+        data = reader.read(source)
+        keys = {}
+        expected_data = pd.DataFrame(
+            keys,
+            columns=source['columns'])
+        pd.testing.assert_frame_equal(expected_data, data)
+
 
 if __name__ == '__main__':
     unittest.main()