diff --git a/scrapegraphai/nodes/fetch_html_node.py b/scrapegraphai/nodes/fetch_html_node.py
index 71d1aa06..e68a5f26 100644
--- a/scrapegraphai/nodes/fetch_html_node.py
+++ b/scrapegraphai/nodes/fetch_html_node.py
@@ -1,8 +1,34 @@
 """
 Module for fetching the HTML node
 """
+from typing import Any
 from langchain_community.document_loaders import AsyncHtmlLoader
+from langchain_core.documents import Document
 from .base_node import BaseNode
+from ..utils.remover import remover
+
+
+def _build_metadata(soup: Any, url: str) -> dict:
+    """
+    Build a metadata dictionary from a parsed HTML page.
+
+    Parameters:
+        soup (Any): parsed page exposing a BeautifulSoup-style find()
+        url (str): page address, stored under the "source" key
+
+    Returns:
+        dict: always holds "source"; "title", "description" and
+        "language" are added only when the matching tag is found
+    """
+    metadata = {"source": url}
+    if title := soup.find("title"):
+        metadata["title"] = title.get_text()
+    if description := soup.find("meta", attrs={"name": "description"}):
+        metadata["description"] = description.get(
+            "content", "No description found.")
+    if html := soup.find("html"):
+        metadata["language"] = html.get("lang", "No language found.")
+    return metadata
 
 
 class FetchHTMLNode(BaseNode):
@@ -65,7 +91,11 @@ def execute(self, state: dict) -> dict:
 
         loader = AsyncHtmlLoader(url)
         document = loader.load()
+        metadata = document[0].metadata
+        # Feed remover the page's HTML text, not the Document's string form.
+        cleaned_html = remover(document[0].page_content)
 
-        state["document"] = document
+        state["document"] = [
+            Document(page_content=cleaned_html, metadata=metadata)]
 
         return state
diff --git a/scrapegraphai/utils/remover.py b/scrapegraphai/utils/remover.py
index add1a973..36bf5ada 100644
--- a/scrapegraphai/utils/remover.py
+++ b/scrapegraphai/utils/remover.py
@@ -1,40 +1,34 @@
 """
 Module for removing the unused html tags
 """
+from bs4 import BeautifulSoup
 
 
-def remover(file: str, only_body: bool = False) -> str:
+def remover(html_content: str) -> str:
     """
-    This function elaborates the HTML file and remove all the not necessary tag
+    Parse the HTML content, drop every <script> tag, and return the
+    text of the <title> tag followed by the markup of the <body> tag.
 
     Parameters:
-        file (str): the file to parse
+        html_content (str): the HTML content to parse
 
     Returns:
-        str: the parsed file
+        str: the title text immediately followed by the serialized <body>
+        element; either part is an empty string when its tag is missing
     """
 
-    res = ""
+    soup = BeautifulSoup(html_content, 'html.parser')
 
-    if only_body:
-        is_body = True
-    else:
-        is_body = False
+    # Extract the title
+    title_tag = soup.find('title')
+    title = title_tag.get_text() if title_tag else ""
 
-    for elem in file.splitlines():
-        if "" in elem:
-            res = res + elem
+    # Remove every <script> tag in the whole document
+    for script in soup.find_all('script'):
+        script.extract()
 
-        if "<body>" in elem:
-            is_body = True
+    # Extract the body of the document
+    body_content = soup.find('body')
+    body = str(body_content) if body_content else ""
 
-        if "</body>" in elem:
-            break
-
-        if "<script>" in elem:
-            continue
-
-        if is_body:
-            res = res + elem
-
-    return res.replace("\\n", "")
+    return title + body