-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathted_scraper.py
83 lines (67 loc) · 2.47 KB
/
ted_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import bs4
import time
import re
import sys
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Path to the geckodriver binary handed to the Firefox WebDriver.
FIREFOX_DRIVER_PATH = './geckodriver'

# XPath matching the talk links on a listing page (used to count them).
TALK_XPATH = "//div[@id='browse-results']//div[1]//div//div//div//div//div[2]//h4[2]//a"
# XPath of the transcript link on a talk page.
TRANSCRIPT_BUTTON_XPATH = "//div[@id='content']//div//div[4]//div[1]//div/a[2]"
# XPath of the paragraphs that make up the transcript on the transcript page.
TRANSCRIPT_SECTION_XPATH = "//div[@id='content']//div//div[4]//div[2]//section//div//div[2]//p"
# XPath of the last pagination anchor on a listing page, followed to reach
# the next page.
NEXT_PAGE_XPATH = "//*[@id='browse-results']/div[2]/div/a[last()]"

# Minimum number of LAUGHTER markers a transcript needs in order to be saved.
LAUGHTER_THRESHOLD = 3
# Stop once this many transcripts have been saved.
NUM_TALKS = 100
# Listing page number the crawl starts from.
START_PAGE = 41
# Marker counted in each transcript.
LAUGHTER = '(Laughter)'
# When True, empty the output file before scraping.
RESET = False

# ---------------------------------------------------------------------------
# Setup
# ---------------------------------------------------------------------------

if RESET:
    # Opening in 'w' mode already truncates the file, so an explicit
    # truncate() call is unnecessary.
    with open('ted_scraped.txt', 'w'):
        pass

# NOTE(review): `executable_path` is the Selenium 3 style of locating the
# driver binary; newer Selenium releases expect a Service object instead.
driver = webdriver.Firefox(executable_path=FIREFOX_DRIVER_PATH)
driver.get(f'https://www.ted.com/talks?page={START_PAGE}&sort=newest&language=en')

# Number of transcripts saved so far.
talk_count = 0
def get_ith_talk(i: int) -> str:
    """Return the XPath selecting the link of the i-th talk on a listing page.

    Args:
        i: Zero-based index of the talk. XPath positions are one-based, so
            the generated expression uses ``i + 1``.

    Returns:
        An XPath expression string rooted at the ``browse-results`` element.
    """
    return f"//div[@id='browse-results']//div[1]//div[{i + 1}]//div//div//div//div[2]//h4[2]//a"
# Walk the listing pages one by one, open every talk, and append the
# transcript to 'ted_scraped.txt' when it contains at least
# LAUGHTER_THRESHOLD occurrences of LAUGHTER. Stops after NUM_TALKS saved
# transcripts or when no further listing page can be reached.
#
# NOTE(review): the `find_element(s)_by_xpath` helpers are the Selenium 3
# API; confirm the pinned Selenium version before upgrading.
try:
    while True:
        # Count the talk links on the current listing page. Each link is
        # looked up again by index inside the loop, after the browser has
        # navigated back to the listing.
        n_talk_links = len(driver.find_elements_by_xpath(TALK_XPATH))

        for i in range(n_talk_links):
            # Open the i-th talk of the listing page.
            talk_link = driver.find_element_by_xpath(get_ith_talk(i))
            driver.get(talk_link.get_attribute('href'))

            # Follow the transcript link if the talk has one; skip otherwise.
            transcript_buttons = driver.find_elements_by_xpath(TRANSCRIPT_BUTTON_XPATH)
            if not transcript_buttons:
                print('No transcript found, skipping')
                # Only the talk page was opened, so a single step back
                # returns to the listing.
                driver.back()
                continue
            driver.get(transcript_buttons[0].get_attribute('href'))

            # Build the transcript: one output line per paragraph, with the
            # markup stripped and embedded newlines flattened to spaces.
            paragraphs = driver.find_elements_by_xpath(TRANSCRIPT_SECTION_XPATH)
            transcript = ''.join(
                bs4.BeautifulSoup(
                    paragraph.get_attribute('innerHTML'), features="html.parser"
                ).get_text().replace('\n', ' ') + '\n'
                for paragraph in paragraphs
            )

            # Keep the transcript only if it is "funny" enough.
            if transcript.count(LAUGHTER) >= LAUGHTER_THRESHOLD:
                # Explicit UTF-8 so non-ASCII transcript text cannot fail on
                # platforms whose default encoding is narrower.
                with open('ted_scraped.txt', 'a', encoding='utf-8') as f:
                    f.write(f"NNNNN\n{transcript}")
                talk_count += 1
                print(f'[Count: {talk_count}]')
                if talk_count >= NUM_TALKS:
                    break
            else:
                print('Not funny, skipping')

            # Two pages were opened (talk, then transcript), so step back
            # twice to return to the listing.
            driver.back()
            driver.back()

        if talk_count >= NUM_TALKS:
            break

        # Move on to the next listing page. Stop cleanly instead of raising
        # when there is no pagination link, and avoid looping forever if the
        # link merely points back at the page already displayed.
        next_links = driver.find_elements_by_xpath(NEXT_PAGE_XPATH)
        next_href = next_links[0].get_attribute('href') if next_links else None
        if not next_href or next_href == driver.current_url:
            print('No next page found, stopping')
            break
        driver.get(next_href)
finally:
    # Always shut the browser down, even if scraping failed part-way.
    driver.quit()