diff --git a/HISTORY.md b/HISTORY.md index 1cc8be5d..8b6b9468 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,28 @@ ## History / Changelog +### 1.12.0 + +Breaking change: +- enforce fixed list of output formats, deprecate `-out` on the CLI (#647) + +Faster, more accurate extraction: +- review link and structure checks (#653) +- improve justext fallback (#652) +- baseline: prevent LXML error in JSON-LD (#643), do not use as backup extraction (#646) +- review XPaths for undesirable content (#645) + +Bugfixes and maintenance: +- CLI fix: markdown format should trigger `include_formatting` (#649) +- images fix: use a length threshold on src attribute (#654) +- XML-TEI: replace RelaxNG with DTD, remove pickle, and update (#655) +- formatting & markdown fix: add newlines (#656) +- table fix: prevent `MemoryError` & `ValueError` during conversion to text (#658) + +Documentation: +- update `crawls.rst`: `known` is an unexpected argument, by @tommytyc in #638 + + ### 1.11.0 Breaking change: diff --git a/setup.py b/setup.py index 62f1d3de..29c74109 100644 --- a/setup.py +++ b/setup.py @@ -110,7 +110,7 @@ def get_long_description(): "certifi", "charset_normalizer >= 3.0.1; python_version < '3.7'", "charset_normalizer >= 3.2.0; python_version >= '3.7'", - "courlan >= 1.1.0", + "courlan >= 1.2.0", "htmldate >= 1.8.1", "importlib_metadata; python_version < '3.8'", "justext >= 3.0.1", diff --git a/trafilatura/__init__.py b/trafilatura/__init__.py index a0245c6c..d5d2cc96 100644 --- a/trafilatura/__init__.py +++ b/trafilatura/__init__.py @@ -9,7 +9,7 @@ __author__ = 'Adrien Barbaresi and contributors' __license__ = "Apache-2.0" __copyright__ = 'Copyright 2019-2024, Adrien Barbaresi' -__version__ = '1.11.0' +__version__ = '1.12.0' import logging