Skip to content

Commit

Permalink
Merge pull request #38 from am9zZWY/summarization
Browse files Browse the repository at this point in the history
Add summarization of text
  • Loading branch information
am9zZWY authored Jul 17, 2024
2 parents ac433cc + 6a23d08 commit ea5a968
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 19 deletions.
5 changes: 4 additions & 1 deletion engine/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@
from crawl import Crawler
from custom_db import index_pages, access_index, save_pages
from download import Downloader, Loader
from summarize import Summarizer
from tokenizer import Tokenizer
from index import Indexer
# Server
from server import start_server
print('Test')

# Threading
MAX_THREADS = 10
Expand Down Expand Up @@ -48,14 +48,17 @@ async def pipeline(from_crawl: bool = False):
tokenizer = Tokenizer()
downloader = Downloader()
loader = Loader()
summarizer = Summarizer()

# Add the pipeline elements
# Crawler: Crawl the website
crawler.add_next(downloader)
crawler.add_next(indexer)
#crawler.add_next(summarizer)

# Loader: Load the pages from the disk
loader.add_next(indexer)
loader.add_next(summarizer)

# Indexer: Index the pages
indexer.add_next(tokenizer)
Expand Down
10 changes: 4 additions & 6 deletions engine/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,18 +1,16 @@
# Automatically generated by https://github.com/damnever/pigar.

aiofiles==24.1.0
aiohttp==3.9.5
beautifulsoup4==4.12.3
duckdb==1.0.0
eld==1.0.6
Flask==3.0.3
Flask-Cors==4.0.1
lxml==5.2.2
nest-asyncio==1.6.0
nltk==3.8.1
numpy==1.26.4
pandas==2.2.2
scikit-learn==1.5.1
aiohttp==3.9.5
spacy==3.7.5
lxml==5.2.2

aiofiles==24.1.0
gensim==4.3.2
transformers==4.42.4
62 changes: 50 additions & 12 deletions engine/summarize.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,53 @@
from gensim.summarization import summarize
from transformers import pipeline

# Input text to be summarized
input_text = """
Your input text goes here. It can be a long paragraph or multiple paragraphs.
"""
from pipeline import PipelineElement

# Generate the summary using TextRank algorithm
summary = summarize(input_text, ratio=0.3) # You can adjust the ratio parameter based on the summary length you desire
# Load summarization pipeline
# Model used for abstractive summarization; Pegasus-XSum produces short,
# single-sentence style summaries.
MODEL = "google/pegasus-xsum"
# NOTE(review): this downloads/loads the model at import time, so importing
# this module is slow and requires network/disk access — consider lazy
# initialization if import-time cost becomes a problem.
print(f"Loading summarization model {MODEL} ... This may take a few minutes.")
# Shared module-level pipeline instance used by summarize_text() below.
summarizer = pipeline("summarization", model=MODEL, tokenizer=MODEL)

# Output the summary
print("Original Text:")
print(input_text)
print("\nSummary:")
print(summary)

def summarize_text(text: str, max_words: int = 15) -> str:
    """
    Summarize ``text`` to at most ``max_words`` words.

    Args:
        text: The input text to summarize.
        max_words: Upper bound on the number of words in the returned summary.

    Returns:
        The summary string, truncated to ``max_words`` words (with a trailing
        ``'...'`` if truncation occurred). Returns ``''`` for empty or
        whitespace-only input.
    """
    # Guard: the HF summarization pipeline raises on empty/whitespace input.
    if not text or not text.strip():
        return ""

    # max_length / min_length are measured in model *tokens*, not words, so
    # the raw summary may exceed max_words; we enforce the word budget below.
    # truncation=True prevents errors when the input exceeds the model's
    # maximum input length (Pegasus accepts at most 512 tokens).
    summary = summarizer(
        text,
        max_length=max_words * 2,
        min_length=max_words,
        do_sample=False,
        truncation=True,
    )[0]['summary_text']

    # Truncate to the specified number of words, marking the cut with '...'.
    words = summary.split()
    if len(words) > max_words:
        summary = ' '.join(words[:max_words]) + '...'

    return summary


class Summarizer(PipelineElement):
    """
    Pipeline element that condenses a crawled page into a short summary.
    """

    def __init__(self):
        super().__init__("Summarizer")

    async def process(self, data, link):
        """
        Summarizes the input text.
        """

        soup = data
        if soup is None:
            print(f"Failed to summarize {link} because the data was empty.")
            return

        # Prefer the most specific content container available, in order.
        content = None
        for tag_name in ("main", "article", "section", "body"):
            content = soup.find(tag_name)
            if content is not None:
                break

        if content is None:
            print(f"Warning: No main content found for {link}. Using entire body.")
            content = soup

        # Reduce the extracted text to a short summary.
        summary = summarize_text(content.get_text())
        print(f"Summarized {link} to: {summary}")

        # Forward the summary downstream unless the pipeline is shutting down.
        if not self.is_shutdown():
            await self.call_next(summary)

0 comments on commit ea5a968

Please sign in to comment.