"""
Simple Extractive Text Summarization
Author: Peb Ruswono Aryan
Description: extract N most-similar sentences with title (first sentence)
"""
import re

# sentence_extraction, cleaning, and tokenisasi_kalimat (sentence tokenizer)
# come from the accompanying tokenization module
from tokenization import *

# load the Indonesian stopword list, one word per line
with open('indonesian') as f:
    stopwords = [w.strip() for w in f]
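
# The 'indonesian' file is expected to contain one stopword per line, e.g.
# (illustrative entries, not quoted from the actual file):
#   yang
#   dan
#   di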

def distance(s1, s2):
    """Count shared non-stopword tokens, normalized by the longer sentence."""
    score = 0.
    for t in s1['tokens']:
        # ignore punctuation delimiters
        if t in [',', '"', '.', '(', ')', '[', ']']: continue
        if t.lower() in stopwords: continue
        if t in s2['tokens']:
            score += 1.
    return score / max(len(s1['tokens']), len(s2['tokens']))
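
# Worked example (hypothetical tokens, none of them stopwords): with
# s1['tokens'] = ['Presiden', 'membuka', 'rapat'] and
# s2['tokens'] = ['Presiden', 'menutup', 'rapat', 'kabinet'],
# two tokens overlap, so distance(s1, s2) = 2 / max(3, 4) = 0.5.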

def strip_stopword_affix(tmp):
    """Trim leading and trailing stopwords from a sentence."""
    tl = tmp.split(' ')
    startidx = 0
    while startidx < len(tl) and tl[startidx].lower() in stopwords:
        startidx += 1
    endidx = len(tl)
    while endidx > startidx and tl[endidx - 1].lower() in stopwords:
        endidx -= 1
    return " ".join(tl[startidx:endidx]).strip()

def make_summary(text, title=None, maxresult=5, minthreshold=1e-5):
    # drop non-ASCII bytes, then split the text into physical lines
    lines = re.sub('[\x80-\xff]', '', text).split('\n')
    sentences = []
    for l in lines:
        if len(l) == 0: continue
        out = sentence_extraction(cleaning(l))
        for o in out:
            sentences.append({'original': o, 'tokens': tokenisasi_kalimat(o)})
    # the title is the first sentence unless one is supplied explicitly
    if title is None:
        t = sentences[0]
        start = 1  # do not score the title sentence against itself
    else:
        t = {'original': title, 'tokens': tokenisasi_kalimat(title)}
        start = 0
    # score each candidate sentence against the title
    dist = []
    for si in range(start, len(sentences)):
        d = distance(t, sentences[si])
        dist.append((si, d))
    dist = sorted(dist, key=lambda p: p[1], reverse=True)
    # keep the best-scoring sentences, at most maxresult of them
    result = []
    for si, d in dist:
        if d < minthreshold:
            break
        result.append((si, sentences[si]))
        if len(result) >= maxresult:
            break
    # restore document order before joining
    summary = sorted(result, key=lambda ss: ss[0])
    output = []
    for s in summary:
        otmp = s[1]['original']
        tmp = s[1]['tokens']
        # if a comma falls in the first 40% of the tokens, drop the leading
        # clause (often a dateline or attribution)
        if ',' in tmp:
            ci = tmp.index(',')
            cio = otmp.index(',')
            if ci * 1.0 / len(tmp) < 0.4:
                otmp = otmp[cio + 1:]
        output.append(strip_stopword_affix(otmp.strip()))
    return " ".join(output)

if __name__ == '__main__':
    # read the input document as text; UTF-8 is an assumption about the data
    with open('data.txt', encoding='utf-8') as f:
        teks = f.read()
    print(make_summary(teks))