# analyzer.py
import pandas as pd
from bs4 import BeautifulSoup
import requests
import time

# Load the article URLs collected earlier.
articles = pd.read_csv("articles_by_year.csv")

# --- Sequential baseline: fetch and parse one article at a time ---
start_time = time.time()

crime_soups = list()
for url in articles["Article"][1:10]:  # sample of nine article URLs (positions 1-9)
    session = requests.Session()
    page = session.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    crime_soups.append(soup)

end_time = time.time()
execution_time = end_time - start_time
print(f"Sequential fetch took {execution_time:.2f} seconds")
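
# A variant sketch (not in the original script): creating one Session per URL,
# as above, discards the connection pool each iteration. Reusing a single
# Session usually speeds up repeated requests to the same host. The `delay`
# parameter is a hypothetical politeness pause between requests.
def fetch_sequential(urls, delay=0.0):
    session = requests.Session()
    soups = []
    for url in urls:
        page = session.get(url)
        soups.append(BeautifulSoup(page.content, "html.parser"))
        if delay:
            time.sleep(delay)  # optional pause so we don't hammer the server
    return soups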
import concurrent.futures
import requests
from bs4 import BeautifulSoup


def fetch_page(url):
    # Each worker thread builds its own Session, since requests.Session
    # is not guaranteed to be thread-safe.
    session = requests.Session()
    page = session.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    return soup


# Define the list of URLs
urls = articles["Article"][1:10]

# Initialize a list to store the results
crime_soups = []

# Define the maximum number of concurrent workers
MAX_WORKERS = 5  # Adjust this based on your system resources

start_time = time.time()

# Use ThreadPoolExecutor for concurrency
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # Submit a task for each URL
    future_to_url = {executor.submit(fetch_page, url): url for url in urls}
    # Retrieve results as they finish (completion order, not input order)
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        try:
            soup = future.result()
            crime_soups.append(soup)
        except Exception as e:
            print(f"Error fetching URL {url}: {e}")

end_time = time.time()
execution_time = end_time - start_time
print(f"Concurrent fetch took {execution_time:.2f} seconds")
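
# A minimal sketch (an addition, not part of the original script):
# as_completed() yields futures in completion order, so crime_soups above may
# not line up with urls. executor.map() preserves input order instead, at the
# cost of raising the first exception when its result is consumed.
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    ordered_soups = list(executor.map(fetch_page, urls))  # same order as urls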