Skip to content

Commit

Permalink
Add new progress bar and finish the node
Browse files Browse the repository at this point in the history
  • Loading branch information
VinciGit00 committed Mar 2, 2024
1 parent 1250aee commit a0c7749
Show file tree
Hide file tree
Showing 7 changed files with 132 additions and 4,853 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# 🕷️ ScrapeGraphAI: You Only Scrape Once

ScrapeGraphAI is a *web scraping* python library based on LangChain which uses LLM and direct graph logic to create scraping pipelines.
ScrapeGraphAI is a *web scraping* python library based on LangChain which uses LLM and direct graph logic to create scraping pipelines for websites and documents.
Just say which information you want to extract and the library will do it for you!

<p align="center">
Expand Down
12 changes: 7 additions & 5 deletions examples/graph_examples/graph_from_text_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from dotenv import load_dotenv
from scrapegraphai.models import OpenAI
from scrapegraphai.graphs import BaseGraph
from scrapegraphai.nodes import TextNode, ParseHTMLNode, GenerateAnswerNode
from scrapegraphai.nodes import TextNode, ParseTextNode, GenerateAnswerNode

load_dotenv()

Expand All @@ -20,11 +20,13 @@
}
model = OpenAI(llm_config)

text = open("text_example.txt", "r", encoding="utf-8")
with open("text_example.txt", "r", encoding="utf-8") as file:
text = file.read()


# define the nodes for the graph
fetch_html_node = TextNode("fetch_html")
parse_document_node = ParseHTMLNode("parse_document", )
fetch_html_node = TextNode("load_html")
parse_document_node = ParseTextNode("parse_document")
generate_answer_node = GenerateAnswerNode(model, "generate_answer")

# create the graph
Expand All @@ -42,7 +44,7 @@
)

# execute the graph
inputs = {"user_input": "Give me the news",
inputs = {"user_input": "Give me the name of all the news",
"url": text}
result = graph.execute(inputs)

Expand Down
4,874 changes: 35 additions & 4,839 deletions examples/graph_examples/text_example.txt

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions scrapegraphai/nodes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,5 @@
from .rag_node import RAGNode
from .text_to_speech_node import TextToSpeechNode
from .image_to_text_node import ImageToTextNode
from .text_node import TextNode
from .parse_text_node import ParseTextNode
11 changes: 9 additions & 2 deletions scrapegraphai/nodes/generate_answer_node.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
"""
Module for generating the answer node
"""
from langchain_core.output_parsers import JsonOutputParser
# Imports from standard library
from tqdm import tqdm

# Imports from Langchain
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.runnables import RunnableParallel

# Imports from the library
from .base_node import BaseNode


Expand Down Expand Up @@ -99,7 +105,8 @@ def execute(self, state: dict) -> dict:

chains_dict = {}

for i, chunk in enumerate(context):
# Use tqdm to add progress bar
for i, chunk in enumerate(tqdm(context, desc="Processing chunks")):
prompt = PromptTemplate(
template=template_chunks,
input_variables=["question"],
Expand Down
74 changes: 74 additions & 0 deletions scrapegraphai/nodes/parse_text_node.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
"""
Module for parsing the HTML node
"""
from langchain.text_splitter import RecursiveCharacterTextSplitter
from .base_node import BaseNode


class ParseTextNode(BaseNode):
    """
    A node responsible for splitting a plain-text document into chunks
    suitable for downstream LLM processing.

    It uses LangChain's RecursiveCharacterTextSplitter (tiktoken-based) to
    divide the document stored in the state into token-bounded chunks, which
    are written back to the state under the 'document_chunks' key.

    Attributes:
        chunk_size (int): Maximum token size of each chunk.
        chunk_overlap (int): Number of overlapping tokens between chunks.

    Args:
        node_name (str): The unique identifier name for the node.
        chunk_size (int, optional): Maximum token size of each chunk.
            Defaults to 4000.
        chunk_overlap (int, optional): Number of overlapping tokens between
            consecutive chunks. Defaults to 0.

    Methods:
        execute(state): Splits the document contained within the state into
        chunks and updates the state with them.
    """

    def __init__(self, node_name: str, chunk_size: int = 4000,
                 chunk_overlap: int = 0):
        """
        Initializes the ParseTextNode with a node name and optional
        chunking parameters.

        Args:
            node_name (str): name of the node
            chunk_size (int, optional): maximum token size of each chunk
            chunk_overlap (int, optional): token overlap between chunks
        """
        # "node" marks this as a standard operational node type.
        super().__init__(node_name, "node")
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def execute(self, state: dict) -> dict:
        """
        Executes the node's logic: splits the text document found in the
        state into token-bounded chunks and stores them in the state under
        the 'document_chunks' key.

        Args:
            state (dict): The current state of the graph, expected to
                contain a 'document' key with the raw text to split.

        Returns:
            dict: The updated state with a new 'document_chunks' key
            containing the list of text chunks.

        Raises:
            KeyError: If 'document' is not found in the state, indicating
                that the necessary information for parsing is missing.
        """

        print("---PARSING TEXT DOCUMENT---")
        try:
            document = state["document"]
        except KeyError as e:
            print(f"Error: {e} not found in state.")
            raise

        # Token-based splitting keeps each chunk within the model's
        # context window; overlap is configurable via the constructor.
        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )

        chunks = text_splitter.split_text(document)
        state.update({"document_chunks": chunks})

        return state
10 changes: 4 additions & 6 deletions scrapegraphai/nodes/text_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,12 @@ def __init__(self, node_name: str):
"""
super().__init__(node_name, "node")

def execute(self, state: dict, text: str) -> dict:
def execute(self, state: dict) -> dict:
"""
Executes the node's logic to fetch HTML content from a specified URL and
update the state with this content.
Add to the state the text as a document
Args:
state (dict): The current state of the graph, expected to contain a 'url' key.
text (str):
Returns:
dict: The updated state with a new 'document' key containing the fetched HTML content.
Expand All @@ -54,8 +52,8 @@ def execute(self, state: dict, text: str) -> dict:
KeyError: If the 'url' key is not found in the state, indicating that the
necessary information to perform the operation is missing.
"""
print("---LOADING HTML CODE---")
print("---LOADING TEXT CODE---")

state["document"] = text
state["document"] = state["url"]

return state

0 comments on commit a0c7749

Please sign in to comment.