From f5b5460d5a5491ee97aa23905028a594b93f8ecf Mon Sep 17 00:00:00 2001 From: IMayBeABitShy Date: Tue, 6 Feb 2024 16:09:18 +0100 Subject: [PATCH 1/5] Fix handling of tag list in Creator.add_metadata (fix #125) This fixes issue #125. Previously, the type annotations of Creator.config_metadata() allowed "Tags" to be a list of strings, but didn't handle the conversion to a string, which resulted in python-libzim raising an error. This commit modifies Creator.add_metadata() to accept a list of strings and, if the key of the metadata is "Tags", join them into a single string. A regression test is included. --- src/zimscraperlib/zim/creator.py | 7 ++++- tests/zim/test_zim_creator.py | 48 ++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/src/zimscraperlib/zim/creator.py b/src/zimscraperlib/zim/creator.py index 6ab905db..a703fdc4 100644 --- a/src/zimscraperlib/zim/creator.py +++ b/src/zimscraperlib/zim/creator.py @@ -183,10 +183,15 @@ def validate_metadata( def add_metadata( self, name: str, - content: Union[str, bytes, datetime.date, datetime.datetime], + content: Union[str, bytes, datetime.date, datetime.datetime, Iterable[str]], mimetype: str = "text/plain;charset=UTF-8", ): self.validate_metadata(name, content) + # handle necessary type conversions + if name == "Tags" and not isinstance(content, str): + # join list of tags into a single string + content = ";".join(content) + # NOTE: conversion of "Date" is handled in python-libzim super().add_metadata(name, content, mimetype) def config_metadata( diff --git a/tests/zim/test_zim_creator.py b/tests/zim/test_zim_creator.py index 9c27685c..2041d312 100644 --- a/tests/zim/test_zim_creator.py +++ b/tests/zim/test_zim_creator.py @@ -573,6 +573,54 @@ def test_config_metadata(tmp_path, png_image): assert reader.get_text_metadata("TestMetadata") == "Test Metadata" +def test_creator_metadata_list_as_tags(tmp_path, png_image): + """ + Test creator metadata setup when "Tags" is a list of strings. + + The test case is based on test_config_metadata() + + This is also a regression test for #125. + """ + fpath = tmp_path / "test_config.zim" + with open(png_image, "rb") as fh: + png_data = fh.read() + creator = Creator(fpath, "").config_metadata( + Name="wikipedia_fr_football", + Title="English Wikipedia", + Creator="English speaking Wikipedia contributors", + Publisher="Wikipedia user Foobar", + Date="2009-11-21", + Description="All articles (without images) from the english Wikipedia", + LongDescription="This ZIM file contains all articles (without images)" + " from the english Wikipedia by 2009-11-10. The topics are...", + Language="eng", + License="CC-BY", + Tags=[ + "wikipedia", + "_category:wikipedia", + "_pictures:no", + "_videos:no", + "_details:yes", + "_ftindex:yes", + ], + Flavour="nopic", + Source="https://en.wikipedia.org/", + Scraper="mwoffliner 1.2.3", + Illustration_48x48_at_1=png_data, + TestMetadata="Test Metadata", + ) + with creator: + pass + + assert fpath.exists() + reader = Archive(fpath) + assert ( + reader.get_text_metadata("Tags") + == "wikipedia;_category:wikipedia;_pictures:no;_videos:no;" + "_details:yes;_ftindex:yes" + ) + + @pytest.mark.parametrize( "name,value,valid", [ From efb824d7dab70cf107ea599c41740be12482f9e9 Mon Sep 17 00:00:00 2001 From: IMayBeABitShy Date: Fri, 9 Feb 2024 14:31:21 +0100 Subject: [PATCH 2/5] Remove comments I didn't think I'd ever encounter an open source project opposed to comments, but here we are. Some comments will soon be outdated anyway. --- src/zimscraperlib/zim/creator.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/zimscraperlib/zim/creator.py b/src/zimscraperlib/zim/creator.py index a703fdc4..d209da60 100644 --- a/src/zimscraperlib/zim/creator.py +++ b/src/zimscraperlib/zim/creator.py @@ -187,11 +187,8 @@ def add_metadata( mimetype: str = "text/plain;charset=UTF-8", ): self.validate_metadata(name, content) - # handle necessary type conversions if name == "Tags" and not isinstance(content, str): - # join list of tags into a single string content = ";".join(content) - # NOTE: conversion of "Date" is handled in python-libzim super().add_metadata(name, content, mimetype) def config_metadata( From 38daa9c0e809d2de312f1ed50580e9840d551575 Mon Sep 17 00:00:00 2001 From: IMayBeABitShy Date: Fri, 9 Feb 2024 14:52:26 +0100 Subject: [PATCH 3/5] Convert date/datetime for Date metadata --- src/zimscraperlib/zim/creator.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/zimscraperlib/zim/creator.py b/src/zimscraperlib/zim/creator.py index d209da60..b3b4f1e0 100644 --- a/src/zimscraperlib/zim/creator.py +++ b/src/zimscraperlib/zim/creator.py @@ -187,6 +187,8 @@ def add_metadata( mimetype: str = "text/plain;charset=UTF-8", ): self.validate_metadata(name, content) + if name == "Date" and isinstance(content, (datetime.date, datetime.datetime)): + content = content.strftime("%Y-%m-%d").encode("UTF-8") if name == "Tags" and not isinstance(content, str): content = ";".join(content) super().add_metadata(name, content, mimetype) From fba609de41f97910523a35a795894a4166e98d20 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 12 Feb 2024 17:48:14 +0100 Subject: [PATCH 4/5] Mutualize test code base + add pyright hint pyright ignore is mandatory because the type checker has no idea about the impact of validate_metadata/validate_tag. This might be fixed by a shared logic and a TypeGuard but is not available until Python 3.10 ; other solution would be to transfer metadata to a way more typed container after validation. --- src/zimscraperlib/zim/creator.py | 8 +++- tests/zim/test_zim_creator.py | 72 ++++++++++---------------------- 2 files changed, 28 insertions(+), 52 deletions(-) diff --git a/src/zimscraperlib/zim/creator.py b/src/zimscraperlib/zim/creator.py index b3b4f1e0..4e25d1de 100644 --- a/src/zimscraperlib/zim/creator.py +++ b/src/zimscraperlib/zim/creator.py @@ -17,6 +17,7 @@ - content stored on object - can be used to store a filepath and content read from it (not stored) """ +import collections.abc import datetime import pathlib import re @@ -189,7 +190,12 @@ def add_metadata( self.validate_metadata(name, content) if name == "Date" and isinstance(content, (datetime.date, datetime.datetime)): content = content.strftime("%Y-%m-%d").encode("UTF-8") - if name == "Tags" and not isinstance(content, str): + if ( + name == "Tags" + and not isinstance(content, str) + and not isinstance(content, bytes) + and isinstance(content, collections.abc.Iterable) + ): content = ";".join(content) super().add_metadata(name, content, mimetype) diff --git a/tests/zim/test_zim_creator.py b/tests/zim/test_zim_creator.py index 2041d312..9db1b1ca 100644 --- a/tests/zim/test_zim_creator.py +++ b/tests/zim/test_zim_creator.py @@ -514,7 +514,26 @@ def test_check_metadata(tmp_path): Creator(tmp_path, "").config_dev_metadata(LongDescription="T" * 5000).start() -def test_config_metadata(tmp_path, png_image): +@pytest.mark.parametrize( + "tags", + [ + ( + "wikipedia;_category:wikipedia;_pictures:no;_videos:no;_details:yes;" + "_ftindex:yes" + ), + ( + [ + "wikipedia", + "_category:wikipedia", + "_pictures:no", + "_videos:no", + "_details:yes", + "_ftindex:yes", + ] + ), + ], +) +def test_config_metadata(tmp_path, png_image, tags): fpath = tmp_path / "test_config.zim" with open(png_image, "rb") as fh: png_data = fh.read() @@ -529,8 +548,7 @@ def test_config_metadata(tmp_path, png_image): " from the english Wikipedia by 2009-11-10. The topics are...", Language="eng", License="CC-BY", - Tags="wikipedia;_category:wikipedia;_pictures:no;_videos:no;" - "_details:yes;_ftindex:yes", + Tags=tags, Flavour="nopic", Source="https://en.wikipedia.org/", Scraper="mwoffliner 1.2.3", @@ -573,54 +591,6 @@ def test_config_metadata(tmp_path, png_image): assert reader.get_text_metadata("TestMetadata") == "Test Metadata" -def test_creator_metadata_list_as_tags(tmp_path, png_image): - """ - Test creator metadata setup when "Tags" is a list of strings. - - The test case is based on test_config_metadata() - - This is also a regression test for #125. - """ - fpath = tmp_path / "test_config.zim" - with open(png_image, "rb") as fh: - png_data = fh.read() - creator = Creator(fpath, "").config_metadata( - Name="wikipedia_fr_football", - Title="English Wikipedia", - Creator="English speaking Wikipedia contributors", - Publisher="Wikipedia user Foobar", - Date="2009-11-21", - Description="All articles (without images) from the english Wikipedia", - LongDescription="This ZIM file contains all articles (without images)" - " from the english Wikipedia by 2009-11-10. The topics are...", - Language="eng", - License="CC-BY", - Tags=[ - "wikipedia", - "_category:wikipedia", - "_pictures:no", - "_videos:no", - "_details:yes", - "_ftindex:yes", - ], - Flavour="nopic", - Source="https://en.wikipedia.org/", - Scraper="mwoffliner 1.2.3", - Illustration_48x48_at_1=png_data, - TestMetadata="Test Metadata", - ) - with creator: - pass - - assert fpath.exists() - reader = Archive(fpath) - assert ( - reader.get_text_metadata("Tags") - == "wikipedia;_category:wikipedia;_pictures:no;_videos:no;" - "_details:yes;_ftindex:yes" - ) - - @pytest.mark.parametrize( "name,value,valid", [ From 10ca3fac77904492f5ac6f5bd3f2b5fc588a83c7 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 12 Feb 2024 20:03:56 +0100 Subject: [PATCH 5/5] Disable pyright bytes type promotion --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 779791ea..1184b0b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -240,3 +240,4 @@ exclude = [".env/**", ".venv/**"] extraPaths = ["src"] pythonVersion = "3.8" typeCheckingMode="basic" +disableBytesTypePromotions = true