Skip to content

Commit

Permalink
Handle case where the redirect target is bad
Browse files Browse the repository at this point in the history
  • Loading branch information
benoit74 committed Jul 24, 2024
1 parent 315d908 commit b86ed7c
Show file tree
Hide file tree
Showing 7 changed files with 37 additions and 16 deletions.
28 changes: 20 additions & 8 deletions src/warc2zim/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -412,15 +412,27 @@ def gather_information_from_warc(self):
# check for duplicates, might happen due to fuzzy rules
if zim_path not in self.redirections:
if redirect_location := record.http_headers.get("Location"):
redirection_zim_path = normalize(
HttpUrl(urljoin(url, redirect_location))
)
# Redirections to the same ZIM path have to be ignored (occurs for
# instance when redirecting from http to https)
if zim_path != redirection_zim_path:
self.redirections[zim_path] = redirection_zim_path
try:
redirection_zim_path = normalize(
HttpUrl(urljoin(url, redirect_location))
)
# Redirections to the same ZIM path have to be ignored (occurs
# for instance when redirecting from http to https)
if zim_path != redirection_zim_path:
self.redirections[zim_path] = redirection_zim_path
except Exception as exc:
# Ignore exceptions in redirection handling: bad redirection
# targets are too common, just like the many bad URLs we find in
# HTML code
logger.debug(
f"Failed to process redirection of "
f"{zim_path.value} to {redirect_location} : {exc} ; "
"no ZIM item will be created"
)
else:
logger.warning(f"Redirection target is empty for {zim_path}")
logger.warning(
f"Redirection target is empty for {zim_path.value}"
)
else:
self.expected_zim_items.add(zim_path)

Expand Down
2 changes: 2 additions & 0 deletions test-website/Caddyfile
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@
redir /bad-redir-loop-B /bad-redir-loop-C 307
redir /bad-redir-loop-C /bad-redir-loop-D 307
redir /bad-redir-loop-D /bad-redir-loop-B 307
redir /bad-redir-target-A https://I%20mNotAhostname 307
redir /bad-redir-target-B intent://example.com/path#Intent;scheme=http;package=com.example.myapp;component=com.example.myapp/.MainActivity;end 307

header /content-types/script1.js Content-Type application/javascript
header /content-types/script2.js Content-Type text/javascript
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,20 @@

<body>

<h2>Redirection loops</h2>
<h2>Bad redirections</h2>

<p>Links below are indefinitely redirecting</p>
<p>Links below are indefinitely redirecting and hence not working</p>

<p><a href="/bad-redir-loop-B">Redirect to self through loop</a></p>

<p><a href="/bad-redir-loop-A">Redirect to inner-loop</a></p>

<p>Links below are targeting something which is not working</p>

<p><a href="/bad-redir-target-A">Redirect to silly HTTP URL</a></p>

<p><a href="/bad-redir-target-B">Redirect to an intent (not working inside ZIM)</a></p>

</body>

</html>
2 changes: 1 addition & 1 deletion test-website/content/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
<li><a href="./base-href.html">Base href</a></li>
<li><a href="./onxxx.html">onxxx HTML events</a></li>
<li><a href="./href-to-folder/">links to folder instead of file</a></li>
<li><a href="./redirection-loops.html">Bad redirections loops</a></li>
<li><a href="./bad-redirections.html">Bad redirections</a></li>
<li><a href="./content-types/index.html">Handling of content types</a></li>
</ul>
</body>
Expand Down
Binary file added tests/data/bad-redirections.warc.gz
Binary file not shown.
Binary file removed tests/data/redir-loops.warc.gz
Binary file not shown.
11 changes: 6 additions & 5 deletions tests/test_warc_to_zim.py
Original file line number Diff line number Diff line change
Expand Up @@ -788,24 +788,24 @@ def test_http_return_codes(self, tmp_path):
zim_output, f"website.test.openzim.org/{ignored_website_items}"
)

def test_redirection_loops(self, tmp_path):
zim_output = "test-redir-loops.zim"
def test_bad_redirections(self, tmp_path):
zim_output = "test-bad-redirections.zim"

main(
[
os.path.join(TEST_DATA_DIR, "redir-loops.warc.gz"),
os.path.join(TEST_DATA_DIR, "bad-redirections.warc.gz"),
"--output",
str(tmp_path),
"--zim-file",
zim_output,
"--name",
"test-redir-loops",
"test-bad-redirections",
]
)
zim_output = tmp_path / zim_output

for exising_website_items in [
"redirection-loops.html",
"bad-redirections.html",
]:
self.assert_item_exist(
zim_output, f"website.test.openzim.org/{exising_website_items}"
Expand All @@ -816,6 +816,7 @@ def test_redirection_loops(self, tmp_path):
"/bad-redir-loop-B",
"/bad-redir-loop-C",
"/bad-redir-loop-D",
"/bad-redir-target",
]:
self.assert_item_does_not_exist(
zim_output, f"website.test.openzim.org/{ignored_website_items}"
Expand Down

0 comments on commit b86ed7c

Please sign in to comment.