From 277d1d653f5a3def564f6dd8521897a6edcc4e78 Mon Sep 17 00:00:00 2001 From: ChillarAnand Date: Tue, 28 May 2024 19:31:55 +0530 Subject: [PATCH] [FEATURE] Add support for file_encoding in sources --- ingen/data_source/file_source.py | 4 +++- ingen/reader/file_reader.py | 18 +++++++++--------- ingen/reader/xml_file_reader.py | 2 +- test/data_source/test_file_source.py | 3 +-- test/input/test_utf_16.xml | Bin 0 -> 486 bytes test/metadata/test_metadata.py | 16 +++++++++++++++- test/reader/test_xml_file_reader.py | 23 +++++++++++++++++++++++ 7 files changed, 52 insertions(+), 14 deletions(-) create mode 100644 test/input/test_utf_16.xml diff --git a/ingen/data_source/file_source.py b/ingen/data_source/file_source.py index f25841e..9d1bf9e 100644 --- a/ingen/data_source/file_source.py +++ b/ingen/data_source/file_source.py @@ -46,7 +46,7 @@ def fetch_data(self, reader): """ returns a DataFrame of data fetched from input FileSource. """ - return reader.read(self._src) + return reader.read(self._src, encoding=self._src['file_encoding']) def fetch_validations(self): """ @@ -64,4 +64,6 @@ def format_file_path(self, source, params_map): path_parser = PathParser(run_date) if 'file_path' in source: source['file_path'] = path_parser.parse(source['file_path']) + if not source.get('file_encoding'): + source['file_encoding'] = 'utf-8' return source diff --git a/ingen/reader/file_reader.py b/ingen/reader/file_reader.py index df8524b..3f78656 100644 --- a/ingen/reader/file_reader.py +++ b/ingen/reader/file_reader.py @@ -101,8 +101,8 @@ def read(self, src): def get_config(src): - header_size = src.get('skip_header_size' , 0) - trailer_size = src.get('skip_trailer_size' , 0) + header_size = src.get('skip_header_size', 0) + trailer_size = src.get('skip_trailer_size', 0) all_cols = src.get('columns') return { "header_size": header_size, @@ -115,13 +115,13 @@ class ReaderFactory: @classmethod def get_reader(cls, src): - factory_types = {'delimited_file': CSVFileReader, - 'excel': ExcelFileReader, - 'xml': XMLFileReader, - 'json': JSONFileReader, - "fixed_width": FixedWidthFileReader - } + factory_types = { + 'delimited_file': CSVFileReader, + 'excel': ExcelFileReader, + 'xml': XMLFileReader, + 'json': JSONFileReader, + 'fixed_width': FixedWidthFileReader + } reader_cls = factory_types.get(src.get('file_type')) if reader_cls: return reader_cls() - diff --git a/ingen/reader/xml_file_reader.py b/ingen/reader/xml_file_reader.py index b3869ec..99536ec 100644 --- a/ingen/reader/xml_file_reader.py +++ b/ingen/reader/xml_file_reader.py @@ -13,7 +13,7 @@ class XMLFileReader: def read(self, src): - xml_file = open(src['file_path'], 'r') + xml_file = open(src['file_path'], 'r', encoding=src['file_encoding']) try: data = xmltodict.parse(xml_file.read()) tree = et.parse(src['file_path']) diff --git a/test/data_source/test_file_source.py b/test/data_source/test_file_source.py index c5aaa24..b37045b 100644 --- a/test/data_source/test_file_source.py +++ b/test/data_source/test_file_source.py @@ -20,8 +20,7 @@ def setUp(self): 'file_path': 'test', 'skip_header_size': 1, 'skip_trailer_size': 1, - 'columns': ['col1', 'col2'] - + 'columns': ['col1', 'col2'], } self.params_map = {'query_params': {'table_name': 'positions'}, 'infile': {}} self.source = FileSource(self._src, self.params_map) diff --git a/test/input/test_utf_16.xml b/test/input/test_utf_16.xml new file mode 100644 index 0000000000000000000000000000000000000000..c65dea7d17bc6ef8684a4d5ff52076a0bac61876 GIT binary patch literal 486 zcma)(&1%9>5QWdW;5(#QRnS-?bXAo!MsOi1iCwrVqGF|ODk@?(KDzqdo2X>x;%5HN zoH;W&p7g4&_WIS29PKpMS{p6(r%Y|RYRc0y)}fMVN`6K))52CiG*Vx6P%mJ*>Zqaz z7$3>it?$rz;DvOq^bh-z-Y;#*J;7JS`b_ZE!h1u-gzUEx@W$524(w#F#20`xijynf z#zu40n;pz1Gqf8$MuX9ltGg)dDMr;uW4$W^5yREPOOz6&