diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml new file mode 100644 index 0000000..6dd0c01 --- /dev/null +++ b/.github/workflows/ci.yaml @@ -0,0 +1,53 @@ +name: CI + +on: + push: + pull_request: + branches: + - 'main' + +env: + SRC_DIR: scrapers + TEST_DIR: test + +jobs: + style_lint_and_type_check: + name: style lint and type check + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: '3.10' + - run: | + pip3 install --upgrade pip + pip3 install flake8 mypy black types-PyYAML + - run: | + python3 -m black --check $TEST_DIR + python3 -m flake8 $SRC_DIR # $TEST_DIR # should also run against scrapers but that will need more cleanup + python3 -m mypy $TEST_DIR --follow-imports=skip --disallow-untyped-defs + secrets_check: + name: check secrets + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: '3.10' + - run: | + pip3 install --upgrade pip + pip3 install detect-secrets + - run: | + set -e + detect-secrets scan $TEST_DIR | python3 -c ' + import sys, json + sys.exit(len(json.load(sys.stdin)["results"])) + ' + if [ $? -ne 0 ]; then + echo "STOP : [ $? ] secrets detected" + exit 1 + fi + echo "No secrets detected" + +# Run mypy on diff +# git diff --name-only --diff-filter=d origin/main -- '*.py' | xargs mypy --follow-imports=skip --disallow-untyped-defs diff --git a/requirements.txt b/requirements.txt index c48cf93..5a1a921 100644 --- a/requirements.txt +++ b/requirements.txt @@ -43,3 +43,7 @@ urllib3==1.26.9 Werkzeug==2.1.1 xlrd==2.0.1 zipp==3.8.0 +flask +google-cloud +google-cloud-pubsub +xlrd diff --git a/scrapers/abstract/base_workflow.py b/scrapers/abstract/base_workflow.py index 72dcbf5..9b1a17c 100644 --- a/scrapers/abstract/base_workflow.py +++ b/scrapers/abstract/base_workflow.py @@ -4,6 +4,7 @@ from abc import ABC, abstractmethod, abstractproperty from logging import Logger +from typing import Optional class BaseWorkflow(ABC): @@ -39,7 +40,7 @@ def execute( job_id: str, task_id: str, source: str, - url: str=None) -> None: + url: Optional[str]=None) -> None: """Executes the workflow. Args: diff --git a/scrapers/banks/adb.py b/scrapers/banks/adb.py index 72b0c4c..1105057 100644 --- a/scrapers/banks/adb.py +++ b/scrapers/banks/adb.py @@ -5,6 +5,7 @@ """ import re +import logging import requests from bs4 import BeautifulSoup from datetime import datetime @@ -81,10 +82,12 @@ def generate_seed_urls(self) -> List[str]: """ try: last_page_num = self.find_last_page() + logging.debug(f"Last page num found for ADB : {last_page_num}") result_pages = [ self.search_results_base_url.format(page_num=n) for n in range(self.first_page_num, last_page_num + 1) ] + # print(f"result pages list : {result_pages}") return result_pages except Exception as e: raise Exception("Failed to generate ADB search " @@ -102,13 +105,17 @@ def find_last_page(self) -> int: (int): The page number. 
""" try: - first_results_page = self.search_results_base_url.format( + first_results_page: str = self.search_results_base_url.format( page_num=self.first_page_num) html = requests.get(first_results_page).text + # print(f"Got text from results page, type : {type(html)}") + # print(f"Got text from results page, length : {len(html)}") soup = BeautifulSoup(html, "html.parser") - last_page_btn = soup.find('li', {"class": "pager-last"}) + last_page_btn = soup.find('li', {"class": "pager__item--last"}) + # print(f"Got last page button : {last_page_btn}") last_page_num = int(last_page_btn.find("a")["href"].split('=')[-1]) + # print("Last page number :", last_page_num) return last_page_num except Exception as e: @@ -152,7 +159,7 @@ def __init__( def project_page_base_url(self) -> str: """The base URL for individual ADB project pages. """ - return 'https://www.adb.org/print' + return 'https://www.adb.org' def scrape_results_page(self, results_page_url: str) -> List[str]: @@ -225,12 +232,13 @@ def scrape_project_page(self, url: str) -> List[Dict]: Args: url (str): The URL for a project. Has the form: - 'https://www.adb.org/print/projects/{project_id}/main'. + 'https://www.adb.org/projects/{project_id}/main'. Returns: (list of dict): The list of project records. """ # Request page and parse HTML + # print(f"URL to scrape : {url}") response = self._data_request_client.get( url=url, use_random_user_agent=True, @@ -241,7 +249,8 @@ def scrape_project_page(self, url: str) -> List[Dict]: soup = BeautifulSoup(response.text, features='html.parser') # Find first project table holding project background details - table = soup.find('table') + table = soup.find('article') + # print(f"Table here : {table}") # Extract project name, number, and status def get_field(detail_name): @@ -258,9 +267,11 @@ def get_field(detail_name): name = get_field("Project Name") number = get_field("Project Number") status = get_field("Project Status") + # print(f"table : {table}") # Extract and format countries country_label = table.find(string="Country / Economy") + # print(f"country label : {country_label}") if not country_label: country_label = table.find(string="Country") parent = country_label.findParent() @@ -374,16 +385,16 @@ def get_field(detail_name): raise Exception(f"Failed to open configuration file. 
{e}") # Test 'SeedUrlsWorkflow' - w = AdbSeedUrlsWorkflow(None, None, None) - print(w.generate_seed_urls()) + seed_workflow = AdbSeedUrlsWorkflow(None, None, None) + print(seed_workflow.generate_seed_urls()) # Test 'ResultsScrapeWorkflow' - w = AdbResultsScrapeWorkflow(data_request_client, None, None, None) + res_scrape_workflow = AdbResultsScrapeWorkflow(data_request_client, None, None, None) url = 'https://www.adb.org/projects?page=558' - project_page_urls = w.scrape_results_page(url) + project_page_urls = res_scrape_workflow.scrape_results_page(url) print(project_page_urls) # Test 'ProjectScrapeWorkflow' - w = AdbProjectScrapeWorkflow(data_request_client, None, None) - url = 'https://www.adb.org/print/projects/53303-001/main' - print(w.scrape_project_page(url)) + proj_scrape_workflow = AdbProjectScrapeWorkflow(data_request_client, None, None) + url = 'https://www.adb.org/projects/53303-001/main' + print(proj_scrape_workflow.scrape_project_page(url)) diff --git a/scrapers/banks/idb.py b/scrapers/banks/idb.py index 982f3d6..171db30 100644 --- a/scrapers/banks/idb.py +++ b/scrapers/banks/idb.py @@ -14,6 +14,11 @@ from scrapers.services.database import DbClient from scrapers.services.pubsub import PubSubClient from typing import Dict, List +import time, random + +USER_AGENT_HEADER = headers = { + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36' +} class IdbSeedUrlsWorkflow(SeedUrlsWorkflow): """Retrieves the first set of IBD URLs to scrape. @@ -75,12 +80,14 @@ def generate_seed_urls(self) -> List[str]: Returns: (list of str): The unique list of search result pages. """ + print("Generating seed URL's for IDB") try: last_page_num = self.find_last_page() result_page_urls = [ self.search_results_base_url.format(page_num=str(n)) for n in range(self.first_page_num, last_page_num + 1) ] + print(f"Resulting pages : {result_page_urls}") return result_page_urls except Exception as e: raise Exception("Failed to generate search " @@ -97,14 +104,19 @@ def find_last_page(self) -> int: Returns: (int): The page number. """ + print("Finding last page for IDB") try: first_results_page_url = self.search_results_base_url.format( page_num=self.first_page_num) - html = requests.get(first_results_page_url).text + print(f"URL for first IDB page : {first_results_page_url}") + html = requests.get(first_results_page_url, headers = USER_AGENT_HEADER).text + print(f"html for first page of results from IDB : {html}") soup = BeautifulSoup(html, "html.parser") - - last_page_item = soup.find('li', {"class":"pager__item pager__item--last"}) - return int(last_page_item.find("a")["href"].split('=')[-1]) + last_page_item = soup.find('li', {"class":"pager__item--last"}) + print(f"Text from last pager item for IDB : {last_page_item.text}") + last_page_url = last_page_item.find('idb-button')['button-url'] + last_page_number = int(last_page_url.split('=')[-1]) + return last_page_number except Exception as e: raise Exception(f"Error retrieving last page number. 
{e}") @@ -210,40 +222,35 @@ def scrape_project_page(self, url: str) -> List[Dict]: """ try: # Request and parse page into BeautifulSoup object - response = self._data_request_client.get(url, use_random_user_agent=False) + time.sleep(.2) # site breaks if requests come too fast + response = self._data_request_client.get( + url, + use_random_user_agent=True, + # custom_headers=USER_AGENT_HEADER + ) soup = BeautifulSoup(response.text, 'html.parser') - # Abort process if no project data available - project_title = soup.find("h1", {"class":"project-title"}).text + project_title = soup.find("idb-section-wrapper")["heading"] if not project_title or project_title.strip() == ":": return - # Parse project detail and information sections - project_detail_section = soup.find("div", {"class": "project-detail project-section"}) - project_info_section = soup.find("div", {"class": "project-information project-section"}) + project_info_table = soup.find('idb-project-table') + table_info = {} + for row in project_info_table.findAll('idb-project-table-row'): + fact, value, *_ = [p.text for p in row.findAll('p')] + table_info[fact] = value # Define local function for extracting data from a project section - def extract_field(project_section, field_name: str): - try: - title_div = project_section.find( - name="div", - attrs={"class": "project-field-title"}, - string=re.compile(field_name, re.IGNORECASE) - ) - data = title_div.find_next_sibling("span").text.strip() - return data if data else None - except AttributeError: - return None # Retrieve fields - number = extract_field(project_detail_section, "PROJECT NUMBER") + number = table_info['Project Number'] name = project_title.split(":")[1].strip() if ":" in project_title else project_title - status = extract_field(project_detail_section, "PROJECT STATUS") - date = extract_field(project_detail_section, "APPROVAL DATE") - loan_amount = extract_field(project_info_section, "AMOUNT") - sectors = extract_field(project_detail_section, "PROJECT SECTOR") - subsectors = extract_field(project_detail_section, "PROJECT SUBSECTOR") - countries = extract_field(project_detail_section, "PROJECT COUNTRY") + status = table_info['Project Status'] + date = table_info['Approval Date'] + loan_amount = table_info['Total Cost'] + sectors = table_info['Sector'] + subsectors = table_info['Subsector'] + countries = table_info['Country'] # Parse project approval date to retrieve year, month, and day if date: diff --git a/scrapers/banks/pro.py b/scrapers/banks/pro.py index b28f9ad..04f7b15 100644 --- a/scrapers/banks/pro.py +++ b/scrapers/banks/pro.py @@ -56,7 +56,7 @@ def search_results_base_url(self) -> str: """The base URL for development bank project search results page on Proparco's website. """ - return 'https://www.proparco.fr/en/carte-des-projets?filter=&page=all&query=%2A&view=list' + return 'https://www.proparco.fr/en/carte-des-projets-list?page=all&query=%2A&view=start' @property @@ -75,6 +75,7 @@ def generate_seed_urls(self) -> List[str]: Returns: (list of str): The project page URLs. """ + print("Seeding URLs for PRO") try: response = requests.get(self.search_results_base_url) html = response.text diff --git a/scrapers/banks/wb.py b/scrapers/banks/wb.py index 39b7da7..4c7e3df 100644 --- a/scrapers/banks/wb.py +++ b/scrapers/banks/wb.py @@ -59,10 +59,14 @@ def get_projects(self) -> pd.DataFrame: (`pd.DataFrame`): The raw project records. 
""" try: - return pd.read_excel(self.download_url, skiprows=2) + return pd.read_excel(self.download_url, skiprows=2, engine='xlrd') + # return pd.read_html(self.download_url, flavor='bs4') except Exception as e: - raise Exception(f"Error retrieving Excel project data " - f"from the World Bank. {e}") + try: + return pd.read_csv(self.download_url, skiprows=2) + except Exception as ee: + raise Exception(f"Error retrieving Excel project data " + f"from the World Bank. {e}") def clean_projects(self, df: pd.DataFrame) -> pd.DataFrame: diff --git a/scrapers/services/pubsub.py b/scrapers/services/pubsub.py index 52dc000..16d2436 100644 --- a/scrapers/services/pubsub.py +++ b/scrapers/services/pubsub.py @@ -67,7 +67,7 @@ def publish_message(self, data: Dict) -> None: try: # Encode data data_str = json.dumps(data) - encoded_data = data_str.encode('utf-8') + encoded_data: bytes = data_str.encode('utf-8') # Initiate message publishing publish_future = self._publisher.publish(self._topic_path, encoded_data) diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/test/integration/__init__.py b/test/integration/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/test/integration/test_breaking_scrapes.py b/test/integration/test_breaking_scrapes.py new file mode 100644 index 0000000..3a76e21 --- /dev/null +++ b/test/integration/test_breaking_scrapes.py @@ -0,0 +1,177 @@ +import json + +import yaml + +from scrapers.banks.adb import ( + AdbProjectScrapeWorkflow, + AdbResultsScrapeWorkflow, + AdbSeedUrlsWorkflow, +) +from scrapers.banks.afdb import AfdbProjectScrapeWorkflow, AfdbSeedUrlsWorkflow +from scrapers.banks.idb import ( + IdbProjectScrapeWorkflow, + IdbResultsScrapeWorkflow, + IdbSeedUrlsWorkflow, +) +from scrapers.banks.pro import ProProjectScrapeWorkflow, ProSeedUrlsWorkflow +from scrapers.banks.wb import WbDownloadWorkflow +from scrapers.constants import CONFIG_DIR_PATH +from scrapers.services.data_request import DataRequestClient + + +def test_adb_seed() -> None: + """Test scrape process for adb""" + with open( + f"{CONFIG_DIR_PATH}/user_agent_headers.json", "r", encoding="utf-8" + ) as stream: + try: + user_agent_headers = json.load(stream) + data_request_client = DataRequestClient(user_agent_headers) + except yaml.YAMLError as exc: + raise RuntimeError(f"Failed to open config file. {exc}") from exc + + # Test 'SeedUrlsWorkflow' + seed_workflow = AdbSeedUrlsWorkflow(None, None, None) + print(seed_workflow.generate_seed_urls()) + assert seed_workflow.generate_seed_urls() + + +def test_adb_result_scrape() -> None: + """Test scrape process for adb""" + with open( + f"{CONFIG_DIR_PATH}/user_agent_headers.json", "r", encoding="utf-8" + ) as stream: + try: + user_agent_headers = json.load(stream) + data_request_client = DataRequestClient(user_agent_headers) + except yaml.YAMLError as exc: + raise RuntimeError(f"Failed to open config file. 
{exc}") from exc + + # Test 'ResultsScrapeWorkflow' + res_scrape_workflow = AdbResultsScrapeWorkflow(data_request_client, None, None, None) + url = "https://www.adb.org/projects?page=558" + project_page_urls: list[str] = res_scrape_workflow.scrape_results_page(url) + + print(project_page_urls) + assert len(project_page_urls) > 0 + + +def test_adb_project_scrape() -> None: + """Test scrape process for adb""" + with open( + f"{CONFIG_DIR_PATH}/user_agent_headers.json", "r", encoding="utf-8" + ) as stream: + try: + user_agent_headers = json.load(stream) + data_request_client = DataRequestClient(user_agent_headers) + except yaml.YAMLError as exc: + raise RuntimeError(f"Failed to open config file. {exc}") from exc + + # Test 'ProjectScrapeWorkflow' + proj_scrape_workflow = AdbProjectScrapeWorkflow(data_request_client, None, None) + url = "https://www.adb.org/projects/53303-001/main" + print(proj_scrape_workflow.scrape_project_page(url)) + +# BROKEN 403 -- detecting scraper? +def test_idb_seed() -> None: + """Test workflow for idb scrape""" + # Set up DataRequestClient to rotate HTTP headers and add random delays + with open( + f"{CONFIG_DIR_PATH}/user_agent_headers.json", "r", encoding="UTF-8" + ) as stream: + try: + user_agent_headers = json.load(stream) + data_request_client = DataRequestClient(user_agent_headers) + except yaml.YAMLError as yml_err: + raise RuntimeError( + f"Failed to open configuration file. {yml_err}" + ) from yml_err + + # Test 'SeedUrlsWorkflow' + seed_workflow = IdbSeedUrlsWorkflow(None, None, None) + print(seed_workflow.generate_seed_urls()) + + +def test_idb_result_scrape() -> None: + """Test workflow for idb scrape""" + # Set up DataRequestClient to rotate HTTP headers and add random delays + with open( + f"{CONFIG_DIR_PATH}/user_agent_headers.json", "r", encoding="UTF-8" + ) as stream: + try: + user_agent_headers = json.load(stream) + data_request_client = DataRequestClient(user_agent_headers) + except yaml.YAMLError as yml_err: + raise RuntimeError( + f"Failed to open configuration file. {yml_err}" + ) from yml_err + + # Test 'ResultsScrapeWorkflow' + res_scrape_workflow = IdbResultsScrapeWorkflow(data_request_client, None, None, None) + url = ( + "https://www.iadb.org/en/projects-search" + "?country=§or=&status=&query=&page=120" + ) + print(res_scrape_workflow.scrape_results_page(url)) + + +def test_idb_project_scrape() -> None: + """Test workflow for idb scrape""" + # Set up DataRequestClient to rotate HTTP headers and add random delays + with open( + f"{CONFIG_DIR_PATH}/user_agent_headers.json", "r", encoding="UTF-8" + ) as stream: + try: + user_agent_headers = json.load(stream) + data_request_client = DataRequestClient(user_agent_headers) + except yaml.YAMLError as yml_err: + raise RuntimeError( + f"Failed to open configuration file. 
{yml_err}" + ) from yml_err + + # Test 'ProjectScrapeWorkflow' + proj_scrape_workflow = IdbProjectScrapeWorkflow(data_request_client, None, None) + url = "https://www.iadb.org/en/project/TC9409295" + print(proj_scrape_workflow.scrape_project_page(url)) + + +def test_pro_seed() -> None: + """test workflow for pro scrape""" + # Test 'StartScrapeWorkflow' + seed_workflow = ProSeedUrlsWorkflow(None, None, None) + print(seed_workflow.generate_seed_urls()) + + +def test_pro_scrape() -> None: + # Test 'ProjectScrapeWorkflow' + scrape_workflow = ProProjectScrapeWorkflow(None, None, None) + url = "https://www.proparco.fr/en/carte-des-projets/ecobank-trade-finance" + print(scrape_workflow.scrape_project_page(url)) + + +def test_wb() -> None: + """test workflow for wb scrape""" + # Test 'DownloadWorkflow' + download_workflow = WbDownloadWorkflow(None, None, None) + raw_df = download_workflow.get_projects() + clean_df = download_workflow.clean_projects(raw_df) + print(f"Found {len(clean_df)} record(s).") + print(clean_df.head()) + + +def test_afdb_seed() -> None: + """Test afdb scrape""" + # Test 'SeedUrlsWorkflow' + # NOTE: Performs a download that takes + # several seconds to complete. + seed_workflow = AfdbSeedUrlsWorkflow(None, None, None) + print(seed_workflow.generate_seed_urls()) + + +def test_afdb_scrape_project() -> None: + """Test afdb scrape""" + # Test 'ProjectScrapeWorkflow' + scrape_workflow = AfdbProjectScrapeWorkflow(None, None, None) + url = ("https://projectsportal.afdb.org" + "/dataportal/VProject/show/P-Z1-FAB-030") + print(scrape_workflow.scrape_project_page(url)) \ No newline at end of file diff --git a/test/integration/test_fast_scrapes.py b/test/integration/test_fast_scrapes.py new file mode 100644 index 0000000..6b14160 --- /dev/null +++ b/test/integration/test_fast_scrapes.py @@ -0,0 +1,128 @@ +"""Tests the fastest bank scrapers in the project""" + +import json + +import yaml + +from scrapers.banks.aiib import AiibProjectScrapeWorkflow, AiibSeedUrlsWorkflow +from scrapers.banks.bio import ( + BioProjectPartialScrapeWorkflow, + BioResultsMultiScrapeWorkflow, + BioSeedUrlsWorkflow, +) +from scrapers.banks.deg import DegDownloadWorkflow +from scrapers.banks.dfc import DfcDownloadWorkflow +from scrapers.banks.fmo import ( + FmoProjectScrapeWorkflow, + FmoResultsScrapeWorkflow, + FmoSeedUrlsWorkflow, +) +from scrapers.banks.kfw import KfwDownloadWorkflow +from scrapers.constants import CONFIG_DIR_PATH +from scrapers.services.data_request import DataRequestClient + + +def test_kfw() -> None: + """Test KfW scrape""" + # Test 'DownloadWorkflow' + w = KfwDownloadWorkflow(None, None, None) + raw_df = w.get_projects() + clean_df = w.clean_projects(raw_df) + print(f"Found {len(clean_df)} record(s).") + print(clean_df.head()) + + +def test_aiib() -> None: + """Test aiib scrape""" + # Set up DataRequestClient to rotate HTTP headers and add random delays + with open( + f"{CONFIG_DIR_PATH}/user_agent_headers.json", "r", encoding="utf-8" + ) as stream: + try: + user_agent_headers = json.load(stream) + data_request_client = DataRequestClient(user_agent_headers) + except yaml.YAMLError as yml_err: + raise RuntimeError( + f"Failed to open configuration file. 
{yml_err}" + ) from yml_err + + # Test 'SeedUrlsWorkflow' + seed_workflow = AiibSeedUrlsWorkflow(None, None, None) + print(seed_workflow.generate_seed_urls()) + + # Test 'ProjectScrapeWorkflow' + scrape_workflow = AiibProjectScrapeWorkflow(data_request_client, None, None) + url = ( + "https://www.aiib.org/en/projects/details/2021/proposed" + "/India-Extension-Renovation-and-Modernization-of-Grand-Anicut-" + "Canal-System.html" + ) + print(scrape_workflow.scrape_project_page(url)) + + +def test_bio() -> None: + """Test scrape for bio""" + # Test 'StartScrape' workflow + seed_workflow = BioSeedUrlsWorkflow(None, None, None) + print(seed_workflow.generate_seed_urls()) + + # Test 'ResultsPageMultiScrape' workflow + res_multi_scrape_workflow = BioResultsMultiScrapeWorkflow(None, None, None, None) + url = "https://www.bio-invest.be/en/investments/p5?search=" + urls, project_records = res_multi_scrape_workflow.scrape_results_page(url) + print(urls) + print(project_records) + + # Test 'ProjectPartialScrapeWorkflow' workflow + proj_partial_scrape_workflow = BioProjectPartialScrapeWorkflow(None, None, None) + url = "https://www.bio-invest.be/en/investments/zoscales-fund-i" + print(proj_partial_scrape_workflow.scrape_project_page(url)) + + +def test_deg() -> None: + """Test deg workflow""" + # Test 'DownloadWorkflow' + w = DegDownloadWorkflow(None, None, None) + raw_df = w.get_projects() + clean_df = w.clean_projects(raw_df) + print(f"Found {len(clean_df)} record(s).") + print(clean_df.head()) + + +def test_dfc() -> None: + """Test dfc scrape workflow""" + # Set up DataRequestClient to rotate HTTP headers and add random delays + with open( + f"{CONFIG_DIR_PATH}/user_agent_headers.json", "r", encoding="utf-8" + ) as stream: + try: + user_agent_headers = json.load(stream) + data_request_client = DataRequestClient(user_agent_headers) + except yaml.YAMLError as yml_err: + raise RuntimeError( + f"Failed to open configuration file. 
{yml_err}" + ) from yml_err + + # Test 'DownloadWorkflow' + w = DfcDownloadWorkflow(data_request_client, None, None) + raw_df = w.get_projects() + clean_df = w.clean_projects(raw_df) + print(f"Found {len(clean_df)} record(s).") + print(clean_df.head()) + + +def test_fmo() -> None: + """Test workflow for fmo scrape""" + # Test 'StartScrape' workflow + seed_workflow = FmoSeedUrlsWorkflow(None, None, None) + print(seed_workflow.generate_seed_urls()) + + # Test 'ResultsPageScrape' workflow + res_scrape_workflow = FmoResultsScrapeWorkflow(None, None, None, None) + url = "https://www.fmo.nl/worldmap?page=21" + print(res_scrape_workflow.scrape_results_page(url)) + + # Test 'ProjectPageScrape' workflow + proj_scrape_workflow = FmoProjectScrapeWorkflow(None, None, None) + url = "https://www.fmo.nl/project-detail/60377" + print(proj_scrape_workflow.scrape_project_page(url)) diff --git a/test/integration/test_slow_scrapes.py b/test/integration/test_slow_scrapes.py new file mode 100644 index 0000000..41675c9 --- /dev/null +++ b/test/integration/test_slow_scrapes.py @@ -0,0 +1,137 @@ +import json + +import yaml + +from scrapers.banks.ebrd import ( + EbrdProjectScrapeWorkflow, + EbrdResultsScrapeWorkflow, + EbrdSeedUrlsWorkflow, +) +from scrapers.banks.eib import EibProjectScrapeWorkflow, EibSeedUrlsWorkflow +from scrapers.banks.ifc import IfcProjectScrapeWorkflow, IfcSeedUrlsWorkflow +from scrapers.banks.miga import ( + MigaProjectScrapeWorkflow, + MigaResultsScrapeWorkflow, + MigaSeedUrlsWorkflow, +) +from scrapers.banks.nbim import NbimDownloadWorkflow +from scrapers.banks.undp import UndpProjectScrapeWorkflow, UndpSeedUrlsWorkflow +from scrapers.constants import CONFIG_DIR_PATH +from scrapers.services.data_request import DataRequestClient + + +def test_ebrd() -> None: + """Test workflow for ebfd""" + # Test 'StartScrape' workflow + seed_workflow = EbrdSeedUrlsWorkflow(None, None, None) + print(seed_workflow.generate_seed_urls()) + + # # Test 'ResultsPageScrape' workflow + res_scrape_workflow = EbrdResultsScrapeWorkflow(None, None, None, None) + url = ( + "https://www.ebrd.com/cs/Satellite" + "?c=Page&cid=1395238314964&d=&pagename=EBRD" + "/Page/SolrSearchAndFilterPSD&page=65&safSortBy=PublicationDate_sort" + "&safSortOrder=descending" + ) + print(res_scrape_workflow.scrape_results_page(url)) + + # Test 'ProjectPageScrape' workflow + proj_scrape_workflow = EbrdProjectScrapeWorkflow(None, None, None) + url = "https://www.ebrd.com/work-with-us/projects/psd/52642.html" + print(proj_scrape_workflow.scrape_project_page(url)) + + +def test_eib() -> None: + """Test workflow for eib""" + # Test 'SeedUrlsWorkflow' + seed_workflow = EibSeedUrlsWorkflow(None, None, None) + print(seed_workflow.generate_seed_urls()) + + # Test 'ProjectScrapeWorkflow' + scrape_workflow = EibProjectScrapeWorkflow(None, None, None) + url = ( + "https://www.eib.org/page-provider/projects/list" + "?pageNumber=17&itemPerPage=500&pageable=true&sortColumn=id" + ) + print(scrape_workflow.scrape_project_page(url)) + + +def test_ifc() -> None: + """Test workflow for ifc scrape""" + # Test 'SeedUrlsWorkflow' + seed_workflow = IfcSeedUrlsWorkflow(None, None, None) + print(seed_workflow.generate_seed_urls()) + + # Test 'ProjectScrapeWorkflow' + scrape_workflow = IfcProjectScrapeWorkflow(None, None, None) + url = ( + "https://externalsearch.ifc.org/spi/api/searchxls" + "?qterm=*&start=8000&srt=disclosed_date&order=desc&rows=1000" + ) + records = scrape_workflow.scrape_project_page(url) + print(records) + print(f"Found {len(records)} 
record(s).") + + +def test_miga() -> None: + """Test workflow for miga scrape""" + # Set up DataRequestClient to rotate HTTP headers and add random delays + with open( + f"{CONFIG_DIR_PATH}/user_agent_headers.json", "r", encoding="utf-8" + ) as stream: + try: + user_agent_headers = json.load(stream) + data_request_client = DataRequestClient(user_agent_headers) + except yaml.YAMLError as yml_err: + raise RuntimeError( + f"Failed to open configuration file. {yml_err}" + ) from yml_err + + # Test 'SeedUrlsWorkflow' + seed_workflow = MigaSeedUrlsWorkflow(None, None, None) + print(seed_workflow.generate_seed_urls()) + + # Test 'ResultsScrapeWorkflow' + res_scrape_workflow = MigaResultsScrapeWorkflow(data_request_client, None, None, None) + url = "https://www.miga.org/projects?page=1" + print(res_scrape_workflow.scrape_results_page(url)) + + # Test 'ProjectScrapeWorkflow' + proj_scrape_workflow = MigaProjectScrapeWorkflow(data_request_client, None, None) + url = "https://www.miga.org/project/bboxx-rwanda-kenya-and-drc-0" + print(proj_scrape_workflow.scrape_project_page(url)) + + +def test_nbim() -> None: + """test workflow for nbim scrape""" + # Test 'DownloadWorkflow' + download_workflow = NbimDownloadWorkflow(None, None, None) + raw_df = download_workflow.get_projects() + clean_df = download_workflow.clean_projects(raw_df) + print(f"Found {len(clean_df)} record(s).") + print(clean_df.head()) + + +def test_undp() -> None: + """test workflow for undp scrape""" + # Set up DataRequestClient to rotate HTTP headers and add random delays + with open( + f"{CONFIG_DIR_PATH}/user_agent_headers.json", "r", encoding="utf-8" + ) as stream: + try: + user_agent_headers = json.load(stream) + data_request_client = DataRequestClient(user_agent_headers) + except yaml.YAMLError as yml_err: + raise RuntimeError( + f"Failed to open configuration file. {yml_err}" + ) from yml_err + + # Test 'SeedUrlsWorkflow' + seed_workflow = UndpSeedUrlsWorkflow(None, None, None) + print(seed_workflow.generate_seed_urls()) + + # Test 'ProjectPageScrapeWorkflow' + scrape_workflow = UndpProjectScrapeWorkflow(data_request_client, None, None) + url = "https://api.open.undp.org/api/projects/00110684.json" + print(scrape_workflow.scrape_project_page(url))