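"""Scrape the list of ICLR 2025 active submissions from OpenReview.

Drives OpenReview's client-side paginator through a Playwright hook, expands
each submission's collapsed details, and extracts title, forum link, PDF link,
keywords, and abstract per page via a CSS extraction schema. Each page is
written to its own JSON file in --output_dir.

Usage:
    python main.py --start_page 1 --end_page 464 --output_dir outputs
"""
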
import argparse
import asyncio
import json
import os
import random

from crawl4ai import AsyncWebCrawler
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from playwright.async_api import Page

parser = argparse.ArgumentParser()
parser.add_argument('--start_page', type=int, default=1)
parser.add_argument('--end_page', type=int, default=464)
parser.add_argument('--output_dir', type=str, default='outputs')
args = parser.parse_args()
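

## OpenReview renders the submission list client-side, so a plain fetch only
## sees the first page of results. This hook runs inside crawl4ai's Playwright
## session before the final HTML is retrieved: it clicks the paginator buttons,
## waits for each page to render, and extracts and saves records page by page.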
async def page_loading_func(page: Page):
    global args
    global extraction_strategy
    start_page = args.start_page
    end_page = args.end_page

    ## Navigate to the start page. The paginator only shows a window of page
    ## buttons, so repeatedly jump to the last visible page until start_page
    ## becomes reachable.
    print(f"[HOOK] to the page {start_page} ...")
    while True:
        first_title = await page.evaluate("document.querySelectorAll('#active-submissions > div > div > ul > li > div > h4 > a')[0].textContent")
        page_btns = await page.evaluate("Array.from(document.querySelectorAll('#active-submissions > div > div > nav > ul > li > a[href=\"#\"]')).map((i) => i.text)")
        available_pages = [int(label) for label in page_btns if label.isdigit()]
        if available_pages[0] <= start_page <= available_pages[-1]:
            ## The target page button is visible: jump straight to it.
            await page.evaluate(
                f"""
                var page_btns = document.querySelectorAll('#active-submissions > div > div > nav > ul > li > a[href="#"]');
                var target_btn = Array.from(page_btns).filter(function(page_btn) {{
                    return page_btn.textContent.trim() === '{start_page}';
                }})[0];
                target_btn.click();
                """
            )
        else:
            ## Otherwise jump to the last visible page to slide the window forward.
            await page.evaluate(
                f"""
                var page_btns = document.querySelectorAll('#active-submissions > div > div > nav > ul > li > a[href="#"]');
                var target_btn = Array.from(page_btns).filter(function(page_btn) {{
                    return page_btn.textContent.trim() === '{available_pages[-1]}';
                }})[0];
                target_btn.click();
                """
            )
        while True:
            ## Wait for the page to load: the first listed title changes once
            ## the new results have rendered (page 1 needs no change).
            await asyncio.sleep(0.5 + random.random())
            next_first_title = await page.evaluate("document.querySelectorAll('#active-submissions > div > div > ul > li > div > h4 > a')[0].textContent")
            if next_first_title != first_title or start_page == 1:
                first_title = next_first_title
                break
        if available_pages[0] <= start_page <= available_pages[-1]:
            print(f"current at {start_page}, {first_title=}")
            break
        else:
            print(f"current at {available_pages[-1]}, {first_title=}")
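
    ## Main scraping loop: step through pages start_page..end_page; on each
    ## page, expand all collapsed details, extract the fields, and dump them
    ## to JSON.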
    for page_num in range(start_page, end_page + 1):
        ## Click the numbered button for page_num.
        print(f"[HOOK] in the main loop, to the page {page_num}")
        page_btns = await page.evaluate("Array.from(document.querySelectorAll('#active-submissions > div > div > nav > ul > li > a[href=\"#\"]')).map((i) => i.text)")
        available_pages = [int(label) for label in page_btns if label.isdigit()]
        await page.evaluate(
            f"""
            var page_btns = document.querySelectorAll('#active-submissions > div > div > nav > ul > li > a[href="#"]');
            var target_btn = Array.from(page_btns).filter(function(page_btn) {{
                return page_btn.textContent.trim() === '{page_num}';
            }})[0];
            target_btn.click();
            """
        )
        while True:
            ## Wait for the page to load, with random jitter between polls.
            await asyncio.sleep(0.5 + random.random())
            next_first_title = await page.evaluate("document.querySelectorAll('#active-submissions > div > div > ul > li > div > h4 > a')[0].textContent")
            if next_first_title != first_title or page_num == start_page:
                first_title = next_first_title
                break
        print("[HOOK] unroll all details")
        ## Expand every collapsed entry so keywords and abstracts are in the DOM.
        await page.evaluate(
            """
            var show_detail_btns = document.querySelectorAll('a[data-toggle="collapse"]');
            show_detail_btns.forEach(function(btn) {
                btn.click();
            });
            """
        )
        html = await page.content()
        dict_list = extraction_strategy.extract("", html)
        ## Create the output directory if it does not exist yet.
        os.makedirs(args.output_dir, exist_ok=True)
        with open(os.path.join(args.output_dir, f"result{page_num}.json"), "wt") as f:
            json.dump(dict_list, f, indent=4)
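

## CSS extraction schema: `baseSelector` matches one <li> per submission and
## each field is scraped relative to it. The two "computed" fields build full
## URLs from the extracted forum href. Illustrative shape of one record (the
## values are made-up placeholders, not verified output):
##   {
##       "title": "Some Paper Title",
##       "link_suffix": "/forum?id=abc123",
##       "link": "https://openreview.net/forum?id=abc123",
##       "pdf_link": "https://openreview.net/pdf?id=abc123",
##       "keywords": "keyword1, keyword2",
##       "abstract": "..."
##   }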
extraction_strategy = JsonCssExtractionStrategy(
    schema={
        "name": "active submission extractor",
        "baseSelector": "#active-submissions > div > div > ul > li",
        "fields": [
            {
                "name": "title",
                "selector": "h4",
                "type": "text",
            },
            {
                "name": "link_suffix",
                "selector": "h4 > a:nth-child(1)",
                "type": "attribute",
                "attribute": "href",
            },
            {
                "name": "link",
                "type": "computed",
                "expression": "'https://openreview.net' + link_suffix",
            },
            {
                ## removeprefix strips the literal '/forum?' prefix (Python 3.9+).
                "name": "pdf_link",
                "type": "computed",
                "expression": "'https://openreview.net/pdf?' + link_suffix.removeprefix('/forum?')",
            },
            {
                "name": "keywords",
                "selector": "div.note-content > div:nth-child(1) > span",
                "type": "text",
            },
            {
                "name": "abstract",
                "selector": "div.note-content-value.markdown-rendered",
                "type": "text",
            },
        ],
    }
)
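

## Wire the hook into crawl4ai: 'before_retrieve_html' fires inside the
## Playwright session, presumably after `wait_for` has matched and before the
## final HTML is captured, so all of the per-page scraping above happens
## within a single crawler.arun() call.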
async def main():
    crawler_strategy = AsyncPlaywrightCrawlerStrategy(verbose=True)
    crawler_strategy.set_hook('before_retrieve_html', page_loading_func)
    async with AsyncWebCrawler(
        verbose=True,
        crawler_strategy=crawler_strategy,
    ) as crawler:
        await crawler.arun(
            url="https://openreview.net/group?id=ICLR.cc/2025/Conference#tab-active-submissions",
            bypass_cache=True,
            user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
            wait_for="div#active-submissions",
            extraction_strategy=extraction_strategy,
        )


if __name__ == "__main__":
    asyncio.run(main())