-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathscidir_reader.py
60 lines (50 loc) · 1.92 KB
/
scidir_reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import requests
from bs4 import BeautifulSoup
import os
import json
import re
def get_sciencedirect_content(url):
response = requests.get('https://r.jina.ai/' + url)
# sanitize the content by removing lines starting with '>'
sanitized_content = []
for line in response.text.split('\n'):
if not line.startswith('>'):
sanitized_content.append(line)
return '\n'.join(sanitized_content)
def extract_title_abstract(text):
# Extract title
title_match = re.search(r'Title:\s*(.*)', text)
title = title_match.group(1).strip() if title_match else 'Title not found'
# Extract abstract
abstract_match = re.search(r'Abstract\s*--------\s*(.*)', text, re.DOTALL)
abstract = abstract_match.group(1).strip().split("Introduction")[0] if abstract_match else 'Abstract not found'
return title, abstract
def get_sciencedirect(url, use_cache=True):
folder = 'sciencedirect'
# get the article ID from the URL
article_id = url.split('/')[-1]
if os.path.exists(os.path.join(folder, article_id + '.json')) and use_cache:
with open(os.path.join(folder, article_id+ '.json')) as file:
return json.load(file)
else:
content = get_sciencedirect_content(url)
title, abstract = extract_title_abstract(content)
return {
'article_id': article_id,
'title': title,
'abstract': abstract,
'content': content
}
def save_article(article_dict):
import os
import json
folder = 'sciencedirect'
if not os.path.exists(folder):
os.makedirs(folder)
with open(os.path.join(folder, article_dict['article_id'] + '.json'), 'w') as file:
json.dump(article_dict, file)
if __name__ == '__main__':
url = 'https://www.sciencedirect.com/science/article/abs/pii/S0079742124000033'
article_dict = get_sciencedirect(url)
print(article_dict)
save_article(article_dict)