-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #38 from am9zZWY/summarization
Add summarization of text
- Loading branch information
Showing
3 changed files
with
58 additions
and
19 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,18 +1,16 @@ | ||
# Automatically generated by https://github.com/damnever/pigar. | ||
|
||
aiofiles==24.1.0 | ||
aiohttp==3.9.5 | ||
beautifulsoup4==4.12.3 | ||
duckdb==1.0.0 | ||
eld==1.0.6 | ||
Flask==3.0.3 | ||
Flask-Cors==4.0.1 | ||
lxml==5.2.2 | ||
nest-asyncio==1.6.0 | ||
nltk==3.8.1 | ||
numpy==1.26.4 | ||
pandas==2.2.2 | ||
scikit-learn==1.5.1 | ||
aiohttp==3.9.5 | ||
spacy==3.7.5 | ||
lxml==5.2.2 | ||
|
||
aiofiles==24.1.0 | ||
gensim==4.3.2 | ||
transformers==4.42.4 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,15 +1,53 @@ | ||
from gensim.summarization import summarize | ||
from transformers import pipeline | ||
|
||
# Input text to be summarized | ||
input_text = """ | ||
Your input text goes here. It can be a long paragraph or multiple paragraphs. | ||
""" | ||
from pipeline import PipelineElement | ||
|
||
# Generate the summary using TextRank algorithm | ||
summary = summarize(input_text, ratio=0.3) # You can adjust the ratio parameter based on the summary length you desire | ||
# Load the summarization pipeline once, at module import time, so that
# summarize_text() can reuse the same model instance on every call.
# The model name is used for both the model weights and the tokenizer.
MODEL = "google/pegasus-xsum"
print(f"Loading summarization model {MODEL} ... This may take a few minutes.")
summarizer = pipeline("summarization", model=MODEL, tokenizer=MODEL)
|
||
# Output the summary | ||
print("Original Text:") | ||
print(input_text) | ||
print("\nSummary:") | ||
print(summary) | ||
|
||
def summarize_text(text: str, max_words: int = 15) -> str:
    """Return a short summary of *text*, capped at *max_words* words.

    Args:
        text: The text to summarize.
        max_words: Maximum number of whitespace-separated words kept in the
            returned summary. It is also used to derive the generation
            bounds handed to the model (``min_length=max_words`` and
            ``max_length=2 * max_words``); note that the model interprets
            those bounds as token counts, not words.

    Returns:
        The summary. If the model output is longer than *max_words* words it
        is cut to the first *max_words* words and ``'...'`` is appended.
        An empty string is returned when *text* is empty or whitespace-only.
    """
    # Nothing to summarize: avoid feeding an empty prompt to the model.
    if not text or not text.strip():
        return ""

    result = summarizer(
        text,
        max_length=max_words * 2,
        min_length=max_words,
        do_sample=False,
        # Page text is routinely longer than the model's maximum input
        # length; let the tokenizer truncate instead of overflowing.
        truncation=True,
    )
    summary = result[0]['summary_text']

    # Truncate to the specified number of words
    words = summary.split()
    if len(words) > max_words:
        summary = ' '.join(words[:max_words]) + '...'

    return summary
|
||
|
||
class Summarizer(PipelineElement):
    """
    Pipeline element that summarizes the main text content of a parsed page
    and forwards the summary to the next element.
    """

    def __init__(self):
        super().__init__("Summarizer")

    async def process(self, data, link):
        """
        Summarize the text of a page and pass the summary on.

        :param data: Parsed document exposing ``find`` and ``get_text``
            (presumably a BeautifulSoup object - confirm against the caller),
            or ``None`` when the page could not be loaded.
        :param link: URL of the page; only used in log messages.
        :return: ``None``. The summary is handed to ``call_next`` unless the
            element has been shut down or there is nothing to summarize.
        """

        soup = data
        if soup is None:
            print(f"Failed to summarize {link} because the data was empty.")
            return

        # Get the text from the main content, preferring the most specific
        # container that is present.
        main_content = soup.find("main") or soup.find("article") or soup.find("section") or soup.find("body")

        if main_content is None:
            print(f"Warning: No main content found for {link}. Using entire body.")
            main_content = soup

        # Join text nodes with a space; without a separator the text of
        # adjacent elements is glued together ("TitleParagraph"), which
        # degrades the input given to the summarizer.
        text = main_content.get_text(separator=" ", strip=True)

        if not text:
            # A page without any text cannot be summarized meaningfully.
            print(f"Failed to summarize {link} because no text was found.")
            return

        summary = summarize_text(text)
        print(f"Summarized {link} to: {summary}")

        if not self.is_shutdown():
            await self.call_next(summary)