Skip to content

Commit

Permalink
Merge pull request #38 from am9zZWY/summarization
Browse files Browse the repository at this point in the history
Add summarization of text
  • Loading branch information
am9zZWY authored Jul 17, 2024
2 parents ac433cc + 6a23d08 commit ea5a968
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 19 deletions.
5 changes: 4 additions & 1 deletion engine/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@
from crawl import Crawler
from custom_db import index_pages, access_index, save_pages
from download import Downloader, Loader
from summarize import Summarizer
from tokenizer import Tokenizer
from index import Indexer
# Server
from server import start_server
print('Test')

# Threading
MAX_THREADS = 10
Expand Down Expand Up @@ -48,14 +48,17 @@ async def pipeline(from_crawl: bool = False):
tokenizer = Tokenizer()
downloader = Downloader()
loader = Loader()
summarizer = Summarizer()

# Add the pipeline elements
# Crawler: Crawl the website
crawler.add_next(downloader)
crawler.add_next(indexer)
#crawler.add_next(summarizer)

# Loader: Load the pages from the disk
loader.add_next(indexer)
loader.add_next(summarizer)

# Indexer: Index the pages
indexer.add_next(tokenizer)
Expand Down
10 changes: 4 additions & 6 deletions engine/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,18 +1,16 @@
# Automatically generated by https://github.com/damnever/pigar.

aiofiles==24.1.0
aiohttp==3.9.5
beautifulsoup4==4.12.3
duckdb==1.0.0
eld==1.0.6
Flask==3.0.3
Flask-Cors==4.0.1
lxml==5.2.2
nest-asyncio==1.6.0
nltk==3.8.1
numpy==1.26.4
pandas==2.2.2
scikit-learn==1.5.1
aiohttp==3.9.5
spacy==3.7.5
lxml==5.2.2

aiofiles==24.1.0
gensim==4.3.2
transformers==4.42.4
62 changes: 50 additions & 12 deletions engine/summarize.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,53 @@
from gensim.summarization import summarize
from transformers import pipeline

# Input text to be summarized
input_text = """
Your input text goes here. It can be a long paragraph or multiple paragraphs.
"""
from pipeline import PipelineElement

# Generate the summary using TextRank algorithm
summary = summarize(input_text, ratio=0.3) # You can adjust the ratio parameter based on the summary length you desire
# Load summarization pipeline
# Model used for abstractive summarization; Pegasus-XSum produces short,
# single-sentence style summaries.
MODEL = "google/pegasus-xsum"
# NOTE(review): this downloads/loads the model at import time, so importing
# this module is slow and requires network/disk access — consider lazy
# initialization if import-time cost becomes a problem.
print(f"Loading summarization model {MODEL} ... This may take a few minutes.")
# Shared module-level pipeline instance used by summarize_text() below.
summarizer = pipeline("summarization", model=MODEL, tokenizer=MODEL)

# Output the summary
print("Original Text:")
print(input_text)
print("\nSummary:")
print(summary)

def summarize_text(text: str, max_words: int = 15) -> str:
    """
    Summarize ``text`` to at most ``max_words`` words.

    Args:
        text: The input text to summarize.
        max_words: Upper bound on the number of words in the returned summary.

    Returns:
        The summary string, truncated to ``max_words`` words (with a trailing
        ``'...'`` if truncation occurred). Returns ``''`` for empty or
        whitespace-only input.
    """
    # Guard: the HF summarization pipeline raises on empty/whitespace input.
    if not text or not text.strip():
        return ""

    # max_length / min_length are measured in model *tokens*, not words, so
    # the raw summary may exceed max_words; we enforce the word budget below.
    # truncation=True prevents errors when the input exceeds the model's
    # maximum input length (Pegasus accepts at most 512 tokens).
    summary = summarizer(
        text,
        max_length=max_words * 2,
        min_length=max_words,
        do_sample=False,
        truncation=True,
    )[0]['summary_text']

    # Truncate to the specified number of words, marking the cut with '...'.
    words = summary.split()
    if len(words) > max_words:
        summary = ' '.join(words[:max_words]) + '...'

    return summary


class Summarizer(PipelineElement):
    """
    Pipeline element that condenses a crawled page into a short summary.
    """

    def __init__(self):
        super().__init__("Summarizer")

    async def process(self, data, link):
        """
        Summarizes the input text.
        """

        soup = data
        if soup is None:
            print(f"Failed to summarize {link} because the data was empty.")
            return

        # Prefer the most specific content container available, in order.
        content = None
        for tag_name in ("main", "article", "section", "body"):
            content = soup.find(tag_name)
            if content is not None:
                break

        if content is None:
            print(f"Warning: No main content found for {link}. Using entire body.")
            content = soup

        # Reduce the extracted text to a short summary.
        summary = summarize_text(content.get_text())
        print(f"Summarized {link} to: {summary}")

        # Forward the summary downstream unless the pipeline is shutting down.
        if not self.is_shutdown():
            await self.call_next(summary)

0 comments on commit ea5a968

Please sign in to comment.