Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor fetch node, add integration for preprocessing #37

Merged
merged 2 commits into from
Mar 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion scrapegraphai/nodes/fetch_html_node.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,24 @@
"""
Module for fetching the HTML node
"""
from typing import Any
from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_core.documents import Document
from .base_node import BaseNode
from ..utils.remover import remover


def _build_metadata(soup: Any, url: str) -> dict:
    """Build a metadata dictionary from a parsed HTML document.

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find`` method.
        url: Address the document came from; stored under ``"source"``.

    Returns:
        dict: Always contains ``"source"``. The ``"title"``,
        ``"description"`` and ``"language"`` keys are added only when the
        ``<title>``, ``<meta name="description">`` and ``<html>`` elements
        are found in ``soup``.
    """
    metadata = {"source": url}
    if title := soup.find("title"):
        metadata["title"] = title.get_text()
    if description := soup.find("meta", attrs={"name": "description"}):
        # The tag may exist without a ``content`` attribute, so fall back
        # to a placeholder instead of raising.
        metadata["description"] = description.get(
            "content", "No description found.")
    if html := soup.find("html"):
        # Same fallback when <html> carries no ``lang`` attribute.
        metadata["language"] = html.get("lang", "No language found.")
    return metadata


class FetchHTMLNode(BaseNode):
Expand Down Expand Up @@ -65,7 +81,10 @@ def execute(self, state: dict) -> dict:

loader = AsyncHtmlLoader(url)
document = loader.load()
metadata = document[0].metadata
document = remover(str(document[0]))

state["document"] = document
state["document"] = [
Document(page_content=document, metadata=metadata)]

return state
40 changes: 16 additions & 24 deletions scrapegraphai/utils/remover.py
Original file line number Diff line number Diff line change
@@ -1,40 +1,32 @@
"""
Module for removing the unused html tags
"""
from bs4 import BeautifulSoup


def remover(html_content: str) -> str:
    """
    Process HTML content, drop every script tag, and return the title
    followed by the body.

    Parameters:
        html_content (str): the HTML content to parse

    Returns:
        str: the text of the <title> tag (empty string if there is none)
        followed by the serialized <body> element (empty string if there
        is none), with all <script> tags removed
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract the title text, if the document has a <title>.
    title_tag = soup.find('title')
    title = title_tag.get_text() if title_tag else ""

    # Remove <script> tags across the whole document before the body is
    # serialized, so none of them end up in the returned string.
    for script in soup.find_all('script'):
        script.extract()

    # Serialize the <body> element (markup included), if present.
    body_tag = soup.find('body')
    body = str(body_tag) if body_tag else ""

    return title + body
Loading