forked from TideDra/zotero-arxiv-daily
-
Notifications
You must be signed in to change notification settings - Fork 0
/
tldr.py
72 lines (69 loc) · 3.24 KB
/
tldr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
from tempfile import TemporaryDirectory
import arxiv
import tarfile
import re
from llama_cpp import Llama
def get_paper_summary(paper:arxiv.Result) -> str:
with TemporaryDirectory() as tmpdirname:
file = paper.download_source(dirpath=tmpdirname)
with tarfile.open(file) as tar:
tex_files = [f for f in tar.getnames() if f.endswith('.tex')]
if len(tex_files) == 0:
return None
#read all tex files
introduction = ""
conclusion = ""
for t in tex_files:
f = tar.extractfile(t)
content = f.read().decode('utf-8')
#remove comments
content = re.sub(r'%.*\n', '\n', content)
content = re.sub(r'\\begin{comment}.*?\\end{comment}', '', content, flags=re.DOTALL)
content = re.sub(r'\\iffalse.*?\\fi', '', content, flags=re.DOTALL)
#remove redundant \n
content = re.sub(r'\n+', '\n', content)
#remove cite
content = re.sub(r'~?\\cite.?\{.*?\}', '', content)
#remove figure
content = re.sub(r'\\begin\{figure\}.*?\\end\{figure\}', '', content, flags=re.DOTALL)
#remove table
content = re.sub(r'\\begin\{table\}.*?\\end\{table\}', '', content, flags=re.DOTALL)
#find introduction and conclusion
# end word can be \section or \end{document} or \bibliography or \appendix
match = re.search(r'\\section\{Introduction\}.*?(\\section|\\end\{document\}|\\bibliography|\\appendix|$)', content, flags=re.DOTALL)
if match:
introduction = match.group(0)
match = re.search(r'\\section\{Conclusion\}.*?(\\section|\\end\{document\}|\\bibliography|\\appendix|$)', content, flags=re.DOTALL)
if match:
conclusion = match.group(0)
return introduction, conclusion
def get_paper_tldr(paper:arxiv.Result, model:Llama) -> str:
try:
introduction, conclusion = get_paper_summary(paper)
except:
introduction, conclusion = "", ""
prompt = """Given the title, abstract, introduction and the conclusion (if any) of a paper in latex format, generate a one-sentence TLDR summary:
\\title{__TITLE__}
\\begin{abstract}__ABSTRACT__\\end{abstract}
__INTRODUCTION__
__CONCLUSION__
"""
prompt = prompt.replace('__TITLE__', paper.title)
prompt = prompt.replace('__ABSTRACT__', paper.summary)
prompt = prompt.replace('__INTRODUCTION__', introduction)
prompt = prompt.replace('__CONCLUSION__', conclusion)
prompt_tokens = model.tokenize(prompt.encode('utf-8'))
prompt_tokens = prompt_tokens[:3800] # truncate to 3800 tokens
prompt = model.detokenize(prompt_tokens).decode('utf-8')
response = model.create_chat_completion(
messages=[
{"role": "system", "content": "You are an assistant who perfectly summarizes scientific paper, and gives the core idea of the paper to the user."},
{
"role": "user",
"content": prompt
}
],
top_k=0,
temperature=0
)
return response['choices'][0]['message']['content']