-
-
Notifications
You must be signed in to change notification settings - Fork 1
/
article_parser.py
49 lines (41 loc) · 1.49 KB
/
article_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
from typing import List
import terminal as terminal
unimportant_words = ["of", "the", "he", "they", "she", "it", "a", "all", "none", "to", "and", "with", "a", "an", "the", "in", "on", "at", "for", "by", "from", "of", "with", "as", "well", "also", "or", "is"]
def most_frequent(List):
counter = 0
num = List[0]
for i in List:
curr_frequency = List.count(i)
if(curr_frequency> counter):
counter = curr_frequency
num = i
return num
def normalize_words(words: List[str]):
normalized_words = []
for word in words:
new_word = ''.join(e for e in word if e.isalnum())
normalized_words.append(new_word.lower())
return normalized_words
def find_relevant_words(words: List[str]):
relevant_words = []
for word in words:
if word not in unimportant_words:
relevant_words.append(word)
normalized_words = normalize_words(relevant_words)
most_frequent_word = most_frequent(normalized_words)
return most_frequent_word
def parse_article(article: List[str]):
parsed_article = []
paragraph = article[0]
for paragraph in article:
words = paragraph.split(" ")
relevant_words = find_relevant_words(words)
duration = int(len(words) / 50) + 1
parsed_paragraph = {
"text": paragraph,
"duration": duration,
"words": words,
"images": []
}
parsed_article.append(parsed_paragraph)
return parsed_article