Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug fix and list replaced with deque #115

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 25 additions & 10 deletions script/Scraper/gutenberg/gutenberg_scrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import json
import os
from bs4 import BeautifulSoup
from collections import deque
import time

PROXY_CONNECTION_TIMEOUT = 5 # Timeout value in seconds
Expand All @@ -10,24 +11,35 @@ class GutenbergScraper:
def __init__(self):
    """
    Set up the scraper's state.

    The proxy pool is kept in a deque (filled interactively via
    load_proxy_list so it can be rotated cheaply), and scraping resumes
    from the book number recorded in the progress file, if one exists.
    """
    # File locations are fixed relative to the repository root.
    self.progress_file = r"script\Scraper\gutenberg\progress.txt"
    self.json_file_path = r"StoreHouse\Literature\gutenberg_bibliographic_records.json"
    # A deque supports O(1) rotation when cycling through proxies.
    self.proxy_list = deque(self.load_proxy_list())
    # Depends on progress_file being set above.
    self.last_book_number = self.load_progress()


def load_proxy_list(self):
    """
    Prompt the user for the path to a proxy list file and load it.

    Returns:
        list[str]: one proxy per line from the chosen file, or an empty
        list when the user presses Enter to skip. The caller wraps the
        result in a deque.

    Raises:
        OSError: if the given path cannot be opened.
    """
    user_input = input("Enter the path for a proxy list (or press Enter to skip without using a proxy list):\n")

    # No path given: run without proxies.
    if not user_input:
        print("Skipping proxy list, loading...")
        return []

    # NOTE: os.path.join() with a single argument was a no-op; the raw
    # input is used directly as the path.
    with open(user_input, "r") as file:
        proxy_list = file.read().splitlines()

    if proxy_list:
        # Bug fix: confirm with the FIRST entry (index 0, not 1) — a
        # file with a single proxy used to raise IndexError here.
        print("Confirming proxy list was loaded:", proxy_list[0])
    else:
        print("Proxy list file was empty.")
    return proxy_list


def load_progress(self):
"""
Loads the last processed book number from the progress file.
Expand Down Expand Up @@ -67,11 +79,14 @@ def save_progress(self, book_number, book_data):

def rotate_proxy(self):
"""
Rotates the proxy list by moving the first proxy to the end.
Rotates the deque by moving the first proxy to the end.
"""
if self.proxy_list:
proxy = self.proxy_list.pop(0) # Get the first proxy from the list
self.proxy_list.append(proxy)
try:
if self.proxy_list:
self.proxy_list.rotate(-1) # Rotate the deque for better efficiency
except IndexError:
pass


def get_html_content(self, url, use_proxy=False):
"""
Expand Down Expand Up @@ -107,6 +122,7 @@ def get_html_content(self, url, use_proxy=False):
print(f"\nFailed to fetch URL: {url}")
return None


def scrape_gutenberg(self):
"""
Scrapes the Gutenberg website for bibliographic records.
Expand Down Expand Up @@ -159,7 +175,6 @@ def scrape_gutenberg(self):
with open(self.json_file_path, "a", encoding="utf-8") as file:
file.write("}")


# Remove progress file after scraping is complete
if os.path.exists(self.progress_file):
os.remove(self.progress_file)
Expand Down