sec_filings_scraper_robust.py
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from bs4 import BeautifulSoup
import time
import json

from mongodb_connection import MongoDBHandler

def init_driver():
    """Configure and launch a Chrome WebDriver instance."""
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--start-maximized")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-notifications")
    # Repeated add_experimental_option calls with the same key overwrite each
    # other, so both excluded switches go in a single list: this suppresses
    # ChromeDriver console noise and the "controlled by automated software"
    # banner.
    chrome_options.add_experimental_option(
        "excludeSwitches", ["enable-logging", "enable-automation"])
    chrome_options.add_experimental_option("useAutomationExtension", False)
    driver = webdriver.Chrome(options=chrome_options)
    return driver

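# A headless run (no visible browser window) is an optional tweak, assuming
# Chrome 109+ where the new headless mode is available; add the flag before
# webdriver.Chrome() is created:
#
#     chrome_options.add_argument("--headless=new")
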
def safe_get(driver, url, retries=3, delay=5):
    """Load a URL, retrying on WebDriver errors before giving up."""
    while retries > 0:
        try:
            driver.get(url)
            return
        except WebDriverException:
            time.sleep(delay)
            retries -= 1
    raise WebDriverException(f"Failed to load {url} after multiple attempts")

def scroll_to_bottom(driver, pause_time=2, scroll_limit=10):
    """Scroll down until the page height stops growing or the limit is hit."""
    last_height = driver.execute_script("return document.body.scrollHeight")
    count = 0
    while count < scroll_limit:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause_time)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
        count += 1

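# A sketch of an explicit-wait alternative to the fixed time.sleep() pauses,
# using Selenium's WebDriverWait (the 10-second timeout is illustrative):
#
#     from selenium.webdriver.support.ui import WebDriverWait
#     WebDriverWait(driver, 10).until(
#         lambda d: d.execute_script("return document.readyState") == "complete"
#     )
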
def scrape_current_page(driver):
    """Parse the current EDGAR results page into a list of filing records."""
    src = driver.page_source
    soup = BeautifulSoup(src, 'html.parser')
    data = []
    rows = soup.find_all('tr')[1:]  # skip the header row
    for row in rows:
        try:
            tds = row.find_all('td')
            if len(tds) == 6:
                # Pull the issuer name out of the description column, if present.
                issuer_links = tds[2].find_all('a')
                issuer = ''
                for link in issuer_links:
                    if 'Issuer' in link.text:
                        issuer = link.text.split('(')[0].strip()
                data.append({
                    'Form Type': tds[0].get_text(strip=True),
                    'Links': "https://www.sec.gov" + tds[1].find('a', href=True)['href'],
                    'Description': tds[2].get_text(strip=True),
                    'Accepted Date': tds[3].get_text(strip=True),
                    'Filing Date': tds[4].get_text(strip=True),
                    'File/Film Number': tds[5].get_text(strip=True)#,
                    #'Filing Content': '',
                    #'issuer': issuer
                })
        except (IndexError, TypeError, KeyError, AttributeError) as e:
            print(f"Error processing row: {e}")
    return data

def main():
    driver = init_driver()
    base_url = "https://www.sec.gov/cgi-bin/browse-edgar?action=getcurrent&datea=&dateb=&company=&type=&SIC=&State=&Country=&CIK=&owner=include&accno=&start={}&count=40"
    all_data = []
    try:
        # range(0, 40, 40) yields a single page of 40 results; raise the stop
        # value in steps of 40 to scrape further pages.
        for pagenum in range(0, 40, 40):
            safe_get(driver, base_url.format(pagenum))
            scroll_to_bottom(driver)
            data = scrape_current_page(driver)
            # Visit each filing's detail page and capture its raw text.
            for entry in data:
                try:
                    safe_get(driver, entry['Links'])
                    scroll_to_bottom(driver, pause_time=2, scroll_limit=5)
                    soup = BeautifulSoup(driver.page_source, 'html.parser')
                    pre = soup.find('pre')
                    entry['Filing Content'] = pre.get_text() if pre else 'Content not found'
                except Exception as e:
                    print(f"Error accessing filing details: {e}")
            all_data.extend(data)
    finally:
        driver.quit()
    # The credentials file is assumed to hold the MongoDB connection URI.
    with open("mongodb_credentials.json", "r") as f:
        uri = json.load(f)
    handler = MongoDBHandler(uri)
    handler.insert_records("finance_database",
                           "sec_filings",
                           all_data,
                           avoid_duplicates=True)


if __name__ == "__main__":
    main()
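
# Assumptions worth noting (a sketch, not a contract):
#
# - mongodb_credentials.json is assumed to contain the bare connection URI as
#   a JSON string, e.g.:
#
#       "mongodb+srv://user:pass@cluster0.example.mongodb.net"
#
# - MongoDBHandler from the local mongodb_connection module is assumed, based
#   on how it is called above, to expose roughly this interface (the actual
#   module may differ):
#
#       class MongoDBHandler:
#           def __init__(self, uri): ...
#           def insert_records(self, db_name, collection_name, records,
#                              avoid_duplicates=False): ...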