diff --git a/changelog.md b/changelog.md index bc3a36910..5b88713f9 100644 --- a/changelog.md +++ b/changelog.md @@ -9,6 +9,7 @@ ### Fixed - Numbers are now only detected without trying to remove the pollution in between digits, ie `55 @ 77777` could be detected as a full number before, but not anymore. +- Standoff text and annotation files are now explicitly opened with "utf-8" encoding when read through fsspec. ### Changed diff --git a/edsnlp/data/standoff.py b/edsnlp/data/standoff.py index 6dece8673..59a46aaca 100644 --- a/edsnlp/data/standoff.py +++ b/edsnlp/data/standoff.py @@ -77,7 +77,7 @@ def parse_standoff_file( relations = [] events = {} - with fs.open(txt_path, "r") as f: + with fs.open(txt_path, "r", encoding="utf-8") as f: text = f.read() if not len(ann_paths): @@ -86,7 +86,7 @@ def parse_standoff_file( } for ann_file in ann_paths: - with fs.open(ann_file, "r") as f: + with fs.open(ann_file, "r", encoding="utf-8") as f: for line_idx, line in enumerate(f): try: if line.startswith("T"):