From d64e2cba43dba53d59aa130d2f573e047e60ef7d Mon Sep 17 00:00:00 2001 From: Joos Kiener Date: Sat, 16 Sep 2023 10:44:42 +0200 Subject: [PATCH] better error handling with invalid characters - fixes issue #34 --- pycdxml/cdxml_converter/chemdraw_types.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pycdxml/cdxml_converter/chemdraw_types.py b/pycdxml/cdxml_converter/chemdraw_types.py index 0bad553..b34ed6c 100644 --- a/pycdxml/cdxml_converter/chemdraw_types.py +++ b/pycdxml/cdxml_converter/chemdraw_types.py @@ -118,9 +118,14 @@ def from_bytes(property_bytes: bytes, charset='iso-8859-1', fonttable=None) -> ' stream.seek(stream.tell() - text_length) value = stream.read(text_length).decode('utf8') except UnicodeDecodeError: - logger.warning("Found unsupported character. Retrying with 'utf8'.") stream.seek(stream.tell() - text_length) - value = stream.read(text_length).decode('utf8') + if charset == 'utf8': + logger.warning("Found unsupported character for utf8. Retrying with errors=='replace'.") + else: + logger.warning(f"Found unsupported character for charset {charset}. " + f"Retrying with 'utf8' and errors=='replace'.") + value = stream.read(text_length).decode('utf8', errors="replace") + # Normalize to xml spec where all line breaks in attributes are represented by \n value = value.replace("\r", "\n") logger.debug(f"Read String '{value}' with {len(font_styles)} different styles.")