Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FEATURE] issue-44: Add support for file_encoding in sources #46

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions ingen/data_source/file_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ def __init__(self, source, params_map):
self._src = source
else:
self._src = self.format_file_path(source, params_map)
if not source.get('file_encoding'):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ChillarAnand I was thinking of removing this default setting here and adding the defaults in the pd.read_csv calls in file_reader and xml_reader instead, e.g. encoding=src.get('file_encoding', 'utf-8'). The benefit is that if we set the default closer to where it is used, it will be easier for the reader to see that utf-8 is the default encoding. Let me know your thoughts.

source['file_encoding'] = 'utf-8'

def fetch(self):
"""
Expand Down
35 changes: 19 additions & 16 deletions ingen/reader/file_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,16 @@ def read(self, src):
config = get_config(src)
dtype = src.get('dtype')
try:
result = pd.read_csv(src['file_path'],
sep=src.get('delimiter'),
index_col=False,
skiprows=config['header_size'],
skipfooter=config['trailer_size'],
names=config['all_cols'],
dtype=dtype)
result = pd.read_csv(
src['file_path'],
sep=src.get('delimiter'),
index_col=False,
skiprows=config['header_size'],
skipfooter=config['trailer_size'],
names=config['all_cols'],
dtype=dtype,
encoding=src['file_encoding'],
)
except TypeError:
logging.error(self.DTYPE_LOG_MSG)
raise
Expand Down Expand Up @@ -101,8 +104,8 @@ def read(self, src):


def get_config(src):
header_size = src.get('skip_header_size' , 0)
trailer_size = src.get('skip_trailer_size' , 0)
header_size = src.get('skip_header_size', 0)
trailer_size = src.get('skip_trailer_size', 0)
all_cols = src.get('columns')
return {
"header_size": header_size,
Expand All @@ -115,13 +118,13 @@ class ReaderFactory:

@classmethod
def get_reader(cls, src):
factory_types = {'delimited_file': CSVFileReader,
'excel': ExcelFileReader,
'xml': XMLFileReader,
'json': JSONFileReader,
"fixed_width": FixedWidthFileReader
}
factory_types = {
'delimited_file': CSVFileReader,
'excel': ExcelFileReader,
'xml': XMLFileReader,
'json': JSONFileReader,
'fixed_width': FixedWidthFileReader
}
reader_cls = factory_types.get(src.get('file_type'))
if reader_cls:
return reader_cls()

2 changes: 1 addition & 1 deletion ingen/reader/xml_file_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
class XMLFileReader:

def read(self, src):
xml_file = open(src['file_path'], 'r')
xml_file = open(src['file_path'], 'r', encoding=src['file_encoding'])
try:
data = xmltodict.parse(xml_file.read())
tree = et.parse(src['file_path'])
Expand Down
3 changes: 1 addition & 2 deletions test/data_source/test_file_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,7 @@ def setUp(self):
'file_path': 'test',
'skip_header_size': 1,
'skip_trailer_size': 1,
'columns': ['col1', 'col2']

'columns': ['col1', 'col2'],
}
self.params_map = {'query_params': {'table_name': 'positions'}, 'infile': {}}
self.source = FileSource(self._src, self.params_map)
Expand Down
Binary file added test/input/test_utf_16.xml
Binary file not shown.
16 changes: 15 additions & 1 deletion test/metadata/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def test_output_type(self):
output = self.metadata.output
self.assertEqual("file", output["type"])

def test_output_type(self):
def test_split_output_type(self):
self.setUp({"use_split_file_config": True})
output = self.metadata.output
self.assertEqual("splitted_file", output["type"])
Expand Down Expand Up @@ -239,6 +239,20 @@ def test_sources_with_dynamic_data(self):
self.assertIsInstance(metadata.sources[1], JsonSource)
self.assertEqual("json_payload", metadata.sources[1]._id)

def test_default_file_encoding(self):
    """Check that the first source ends up with ``file_encoding == "utf-8"``.

    Presumably the first fixture source declares no ``file_encoding`` of its
    own, so this exercises the default -- verify against the fixture.
    """
    built = MetaData(self.test_md_name, self.test_md_configurations, self.params_map)
    first_source = built.sources[0]
    self.assertEqual("utf-8", first_source._src["file_encoding"])

def test_custom_file_encoding(self):
    """Check that an explicit ``file_encoding`` on a source is preserved.

    The first source of the configuration is given ``file_encoding="utf-16"``
    and the resulting source object must still report that value.
    """
    import copy

    # ``dict.copy()`` is shallow: the nested ``sources`` list and its dicts
    # would still be the fixture's own objects, so writing the encoding below
    # would also modify ``self.test_md_configurations``. Deep-copy so the
    # override stays local to this test.
    metadata_config = copy.deepcopy(self.test_md_configurations)
    metadata_config["sources"][0]["file_encoding"] = "utf-16"
    metadata = MetaData(self.test_md_name, metadata_config, self.params_map)
    self.assertEqual("utf-16", metadata.sources[0]._src["file_encoding"])


if __name__ == "__main__":
unittest.main()
23 changes: 23 additions & 0 deletions test/reader/test_xml_file_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,19 @@ def setUp(self):
],
'root_tag': 'ORDER'
}
self.xml_src_utf_16_encoding = {
'id': 'order_xml',
'type': 'file',
'file_type': 'xml',
'file_encoding': 'utf-16',
'file_path': f'{THIS_DIR.parent}/input/test_utf_16.xml',
'columns': [
'ORDER_ID',
'ORD_DETAIL_set.ORD_DETAIL.ITEM_NAME',
'ORD_DETAIL_set.ORD_DETAIL.QUANTITY'
],
'root_tag': 'ORDER'
}

def test_xml(self):
source = self.xml_src
Expand Down Expand Up @@ -133,6 +146,16 @@ def test_empty_xml_file(self, mock_logging):
error_msg = "XML file is empty"
mock_logging.error.assert_called_with(error_msg)

def test_xml_utf_16_encoding(self):
    """Read the source configured with ``file_encoding='utf-16'``.

    The result is compared against a frame that has the configured columns
    and no rows.
    """
    src_config = self.xml_src_utf_16_encoding
    actual = XMLFileReader().read(src_config)
    # Expected frame: the configured columns, built from an empty mapping.
    expected = pd.DataFrame({}, columns=src_config['columns'])
    pd.testing.assert_frame_equal(expected, actual)


if __name__ == '__main__':
unittest.main()