Skip to content

Commit

Permalink
feat: try using readability to parse web pages
Browse files: browse the repository at this point in the history
  • Loading branch information
CNSeniorious000 committed Nov 22, 2024
1 parent 8f498ed commit da15652
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 2 deletions.
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ name = "reasonify-headless"
description = "headless implementation for the Reasonify Agent"
license = { text = "MIT" }
dynamic = [ "version" ]
-requires-python = ">=3.12"
+requires-python = "~=3.12"
 authors = [ { name = "Muspi Merol", email = "[email protected]" } ]
 readme = "README.md"
 dependencies = [
Expand All @@ -16,6 +16,7 @@ dependencies = [
 "html-text~=0.6.2",
 "html2text2~=1.0.0",
 "isomorphic-fetch~=0.0.0.0.dev2",
+"python-readability~=0.0.2",
]

[build-system]
Expand Down
6 changes: 5 additions & 1 deletion reasonify-headless/reasonify/tools/fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,8 @@ async def _fetch(url: str, strategy: Strategy):
     except OSError:
         res = await fetch(f"/api/proxy?url={url}")
 
-    return res.status, post_process(pre_process(res.text, strategy))
+    from readability import parse
+
+    text = parse(res.text, keep_classes=True).content or res.text
+
+    return res.status, post_process(pre_process(text, strategy))

0 comments on commit da15652

Please sign in to comment.