Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug fix and list replaced with deque #115

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 25 additions & 10 deletions script/Scraper/gutenberg/gutenberg_scrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import json
import os
from bs4 import BeautifulSoup
from collections import deque
import time

PROXY_CONNECTION_TIMEOUT = 5 # Timeout value in seconds
Expand All @@ -10,24 +11,35 @@ class GutenbergScraper:
def __init__(self):
    """
    Set up the scraper's state.

    The proxy pool is kept in a deque (filled interactively via
    load_proxy_list so it can be rotated cheaply), and scraping resumes
    from the book number recorded in the progress file, if one exists.
    """
    # File locations are fixed relative to the repository root.
    self.progress_file = r"script\Scraper\gutenberg\progress.txt"
    self.json_file_path = r"StoreHouse\Literature\gutenberg_bibliographic_records.json"
    # A deque supports O(1) rotation when cycling through proxies.
    self.proxy_list = deque(self.load_proxy_list())
    # Depends on progress_file being set above.
    self.last_book_number = self.load_progress()


def load_proxy_list(self):
    """
    Prompt the user for the path to a proxy list file and load it.

    Returns:
        list[str]: one proxy per line from the chosen file, or an empty
        list when the user presses Enter to skip. The caller wraps the
        result in a deque.

    Raises:
        OSError: if the given path cannot be opened.
    """
    user_input = input("Enter the path for a proxy list (or press Enter to skip without using a proxy list):\n")

    # No path given: run without proxies.
    if not user_input:
        print("Skipping proxy list, loading...")
        return []

    # NOTE: os.path.join() with a single argument was a no-op; the raw
    # input is used directly as the path.
    with open(user_input, "r") as file:
        proxy_list = file.read().splitlines()

    if proxy_list:
        # Bug fix: confirm with the FIRST entry (index 0, not 1) — a
        # file with a single proxy used to raise IndexError here.
        print("Confirming proxy list was loaded:", proxy_list[0])
    else:
        print("Proxy list file was empty.")
    return proxy_list


def load_progress(self):
"""
Loads the last processed book number from the progress file.
Expand Down Expand Up @@ -67,11 +79,14 @@ def save_progress(self, book_number, book_data):

def rotate_proxy(self):
"""
Rotates the proxy list by moving the first proxy to the end.
Rotates the deque by moving the first proxy to the end.
"""
if self.proxy_list:
proxy = self.proxy_list.pop(0) # Get the first proxy from the list
self.proxy_list.append(proxy)
try:
if self.proxy_list:
self.proxy_list.rotate(-1) # Rotate the deque for better efficiency
except IndexError:
pass


def get_html_content(self, url, use_proxy=False):
"""
Expand Down Expand Up @@ -107,6 +122,7 @@ def get_html_content(self, url, use_proxy=False):
print(f"\nFailed to fetch URL: {url}")
return None


def scrape_gutenberg(self):
"""
Scrapes the Gutenberg website for bibliographic records.
Expand Down Expand Up @@ -159,7 +175,6 @@ def scrape_gutenberg(self):
with open(self.json_file_path, "a", encoding="utf-8") as file:
file.write("}")


# Remove progress file after scraping is complete
if os.path.exists(self.progress_file):
os.remove(self.progress_file)
Expand Down