diff --git a/CHANGELOG.md b/CHANGELOG.md index ac640f3..81dc91d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - New `--version` flag to display Zimit version - New `--logging` flag to adjust Browsertrix Crawler logging (#273) - Use new `--scraper-suffix` flag of warc2zim to enhance ZIM "Scraper" metadata (#275) +- New `--noMobileDevice` CLI argument ### Changed @@ -21,6 +22,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Adopt Python bootstrap conventions - Removed handling of redirects by zimit, they are handled by browsertrix crawler and detected properly by warc2zim - Upgrade to Python 3.12 + upgrade dependencies +- `--userAgent` CLI argument again overrides the `--userAgentSuffix` and `--adminEmail` values +- `--userAgent` CLI argument is not mandatory anymore +- Upgraded Browsertrix Crawler to 1.0.3 + +### Fixed + +- Fix support for YouTube videos (#291) ## [1.6.3] - 2024-01-18 diff --git a/Dockerfile b/Dockerfile index 53306f3..c1731d9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM webrecorder/browsertrix-crawler:1.0.0-beta.6 +FROM webrecorder/browsertrix-crawler:1.0.3 LABEL org.opencontainers.image.source https://github.com/openzim/zimit # add deadsnakes ppa for Python 3.12 on Ubuntu Jammy diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index dbf3965..7c2764a 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -26,11 +26,6 @@ from zimit.__about__ import __version__ -DEFAULT_USER_AGENT = ( - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 " - "(KHTML, like Gecko) Version/17.0 Safari/605.1.15" -) - EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2 EXIT_CODE_CRAWLER_LIMIT_HIT = 11 NORMAL_WARC2ZIM_EXIT_CODE = 100 @@ -230,13 +225,21 @@ def run(raw_args): help="Emulate mobile device by name from " "https://github.com/puppeteer/puppeteer/blob/" 
"main/packages/puppeteer-core/src/common/Device.ts", + default="Pixel 2", + ) + + parser.add_argument( + "--noMobileDevice", + help="Do not emulate a mobile device (use at your own risk, behavior is" + "uncertain)", + action="store_true", + default=False, ) parser.add_argument( "--userAgent", - help="Override default user-agent with specified value ; --userAgentSuffix is " - "still applied", - default=DEFAULT_USER_AGENT, + help="Override default user-agent with specified value ; --userAgentSuffix and " + "--adminEmail have no effect when this is set", ) parser.add_argument( @@ -367,7 +370,7 @@ def run(raw_args): except Exception: logger.error("Failed to get Browsertrix crawler version") raise - crawler_version = crawl.stdout + crawler_version = crawl.stdout.strip() logger.info(f"Browsertrix crawler: version {crawler_version}") # pass a scraper suffix to warc2zim so that both zimit, warc2zim and crawler @@ -384,11 +387,9 @@ def run(raw_args): url = zimit_args.url - user_agent = zimit_args.userAgent - if zimit_args.userAgentSuffix: - user_agent += f" {zimit_args.userAgentSuffix}" + user_agent_suffix = zimit_args.userAgentSuffix if zimit_args.adminEmail: - user_agent += f" {zimit_args.adminEmail}" + user_agent_suffix += f" {zimit_args.adminEmail}" if url: url = get_cleaned_url(url) @@ -443,8 +444,12 @@ def cleanup(): cmd_args.append("--url") cmd_args.append(url) - cmd_args.append("--userAgent") - cmd_args.append(user_agent) + cmd_args.append("--userAgentSuffix") + cmd_args.append(user_agent_suffix) + + if not zimit_args.noMobileDevice: + cmd_args.append("--mobileDevice") + cmd_args.append(zimit_args.mobileDevice) cmd_args.append("--cwd") cmd_args.append(str(temp_root_dir)) @@ -538,7 +543,7 @@ def get_node_cmd_line(args): "collection", "allowHashUrls", "lang", - "mobileDevice", + "userAgent", "useSitemap", "behaviors", "behaviorTimeout", diff --git a/tests/test_dummy.py b/tests/test_dummy.py index dd89067..54af094 100644 --- a/tests/test_dummy.py +++ 
b/tests/test_dummy.py @@ -1,6 +1,6 @@ -from zimit.zimit import DEFAULT_USER_AGENT +from zimit.zimit import NORMAL_WARC2ZIM_EXIT_CODE # dummy test, just to have coverage report done -def test_default_user_agent(): - assert DEFAULT_USER_AGENT +def test_something_exists(): + assert NORMAL_WARC2ZIM_EXIT_CODE