Add a configurable 'file_encoding' to file sources (default: utf-8).

Review notes on this revision of the patch:
  * FileSource.fetch_data keeps calling reader.read(self._src). The readers'
    read(self, src) signatures are not changed by this patch, so passing an
    extra encoding= keyword would raise TypeError; the encoding already
    travels inside the source dict.
  * XMLFileReader reads the encoding with src.get('file_encoding', 'utf-8')
    so callers that build a source dict without that key do not hit KeyError.
  * test_custom_file_encoding copies the source dicts before editing them;
    dict.copy() is shallow and would mutate the shared test configuration.
  * Indentation was reconstructed from a whitespace-collapsed copy, and the
    new-blob hashes on the 'index' lines of the edited files are stale;
    regenerate the patch with git before applying.

diff --git a/ingen/data_source/file_source.py b/ingen/data_source/file_source.py
index f25841e..9d1bf9e 100644
--- a/ingen/data_source/file_source.py
+++ b/ingen/data_source/file_source.py
@@ -64,4 +64,6 @@ def format_file_path(self, source, params_map):
         path_parser = PathParser(run_date)
         if 'file_path' in source:
             source['file_path'] = path_parser.parse(source['file_path'])
+        if not source.get('file_encoding'):
+            source['file_encoding'] = 'utf-8'
         return source
diff --git a/ingen/reader/file_reader.py b/ingen/reader/file_reader.py
index df8524b..3f78656 100644
--- a/ingen/reader/file_reader.py
+++ b/ingen/reader/file_reader.py
@@ -101,8 +101,8 @@ def read(self, src):
 
 
 def get_config(src):
-    header_size = src.get('skip_header_size' , 0)
-    trailer_size = src.get('skip_trailer_size' , 0)
+    header_size = src.get('skip_header_size', 0)
+    trailer_size = src.get('skip_trailer_size', 0)
     all_cols = src.get('columns')
     return {
         "header_size": header_size,
@@ -115,13 +115,13 @@ class ReaderFactory:
 
     @classmethod
     def get_reader(cls, src):
-        factory_types = {'delimited_file': CSVFileReader,
-                         'excel': ExcelFileReader,
-                         'xml': XMLFileReader,
-                         'json': JSONFileReader,
-                         "fixed_width": FixedWidthFileReader
-                         }
+        factory_types = {
+            'delimited_file': CSVFileReader,
+            'excel': ExcelFileReader,
+            'xml': XMLFileReader,
+            'json': JSONFileReader,
+            'fixed_width': FixedWidthFileReader
+        }
         reader_cls = factory_types.get(src.get('file_type'))
         if reader_cls:
             return reader_cls()
-
diff --git a/ingen/reader/xml_file_reader.py b/ingen/reader/xml_file_reader.py
index b3869ec..99536ec 100644
--- a/ingen/reader/xml_file_reader.py
+++ b/ingen/reader/xml_file_reader.py
@@ -13,7 +13,7 @@ class XMLFileReader:
 
     def read(self, src):
 
-        xml_file = open(src['file_path'], 'r')
+        xml_file = open(src['file_path'], 'r', encoding=src.get('file_encoding', 'utf-8'))
         try:
             data = xmltodict.parse(xml_file.read())
             tree = et.parse(src['file_path'])
diff --git a/test/data_source/test_file_source.py b/test/data_source/test_file_source.py
index c5aaa24..b37045b 100644
--- a/test/data_source/test_file_source.py
+++ b/test/data_source/test_file_source.py
@@ -20,8 +20,7 @@ def setUp(self):
             'file_path': 'test',
             'skip_header_size': 1,
             'skip_trailer_size': 1,
-            'columns': ['col1', 'col2']
-
+            'columns': ['col1', 'col2'],
         }
         self.params_map = {'query_params': {'table_name': 'positions'}, 'infile': {}}
         self.source = FileSource(self._src, self.params_map)
diff --git a/test/input/test_utf_16.xml b/test/input/test_utf_16.xml
new file mode 100644
index 0000000..c65dea7
Binary files /dev/null and b/test/input/test_utf_16.xml differ
diff --git a/test/metadata/test_metadata.py b/test/metadata/test_metadata.py
index 7d7f1c1..b08cd00 100644
--- a/test/metadata/test_metadata.py
+++ b/test/metadata/test_metadata.py
@@ -131,7 +131,7 @@ def test_output_type(self):
         output = self.metadata.output
         self.assertEqual("file", output["type"])
 
-    def test_output_type(self):
+    def test_split_output_type(self):
         self.setUp({"use_split_file_config": True})
         output = self.metadata.output
         self.assertEqual("splitted_file", output["type"])
@@ -239,6 +239,22 @@ def test_sources_with_dynamic_data(self):
         self.assertIsInstance(metadata.sources[1], JsonSource)
         self.assertEqual("json_payload", metadata.sources[1]._id)
 
+    def test_default_file_encoding(self):
+        metadata = MetaData(
+            self.test_md_name, self.test_md_configurations, self.params_map
+        )
+        self.assertEqual("utf-8", metadata.sources[0]._src["file_encoding"])
+
+    def test_custom_file_encoding(self):
+        # Copy each source dict so the shared configuration is not mutated.
+        sources = [dict(src) for src in self.test_md_configurations["sources"]]
+        sources[0]["file_encoding"] = "utf-16"
+        metadata_config = {**self.test_md_configurations, "sources": sources}
+        metadata = MetaData(
+            self.test_md_name, metadata_config, self.params_map
+        )
+        self.assertEqual("utf-16", metadata.sources[0]._src["file_encoding"])
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/test/reader/test_xml_file_reader.py b/test/reader/test_xml_file_reader.py
index 592f37c..f670ab0 100644
--- a/test/reader/test_xml_file_reader.py
+++ b/test/reader/test_xml_file_reader.py
@@ -77,6 +77,19 @@ def setUp(self):
             ],
             'root_tag': 'ORDER'
         }
+        self.xml_src_utf_16_encoding = {
+            'id': 'order_xml',
+            'type': 'file',
+            'file_type': 'xml',
+            'file_encoding': 'utf-16',
+            'file_path': f'{THIS_DIR.parent}/input/test_utf_16.xml',
+            'columns': [
+                'ORDER_ID',
+                'ORD_DETAIL_set.ORD_DETAIL.ITEM_NAME',
+                'ORD_DETAIL_set.ORD_DETAIL.QUANTITY'
+            ],
+            'root_tag': 'ORDER'
+        }
 
     def test_xml(self):
         source = self.xml_src
@@ -133,6 +146,16 @@ def test_empty_xml_file(self, mock_logging):
         error_msg = "XML file is empty"
         mock_logging.error.assert_called_with(error_msg)
 
+    def test_xml_utf_16_encoding(self):
+        source = self.xml_src_utf_16_encoding
+        reader = XMLFileReader()
+        data = reader.read(source)
+        keys = {}
+        expected_data = pd.DataFrame(
+            keys,
+            columns=source['columns'])
+        pd.testing.assert_frame_equal(expected_data, data)
+
 
 if __name__ == '__main__':
     unittest.main()