Cr/issue 1 #6

Open · wants to merge 20 commits into main
53 changes: 53 additions & 0 deletions .github/workflows/ci.yaml
@@ -0,0 +1,53 @@
name: CI

on:
  push:
  pull_request:
    branches:
      - 'main'

env:
  SRC_DIR: scrapers
  TEST_DIR: test

jobs:
  style_lint_and_type_check:
    name: style lint and type check
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
        with:
          python-version: '3.10'
      - run: |
          pip3 install --upgrade pip
          pip3 install flake8 mypy black types-PyYAML
      - run: |
          python3 -m black --check $TEST_DIR
          python3 -m flake8 $SRC_DIR # $TEST_DIR # should also run against scrapers but that will need more cleanup
          python3 -m mypy $TEST_DIR --follow-imports=skip --disallow-untyped-defs
  secrets_check:
    name: check secrets
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
        with:
          python-version: '3.10'
      - run: |
          pip3 install --upgrade pip
          pip3 install detect-secrets
      - run: |
          set +e
          detect-secrets scan $TEST_DIR | python3 -c '
          import sys, json
          sys.exit(len(json.load(sys.stdin)["results"]))
          '
          num_findings=$?
          if [ $num_findings -ne 0 ]; then
            echo "STOP : [ $num_findings ] secrets detected"
            exit 1
          fi
          echo "No secrets detected"

# Run mypy on diff
# git diff --name-only --diff-filter=d origin/main -- '*.py' | xargs mypy --follow-imports=skip --disallow-untyped-defs
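Regarding the commented-out idea above (type-checking only the files a branch touches): one way to wire that up is a small helper that asks git for the changed Python files and hands them to mypy. The sketch below is only an illustration of that approach and is not part of this PR; the origin/main base ref and the mypy flags are copied from the comment above, everything else is assumed.

# sketch_mypy_on_diff.py -- illustration only, not part of this PR.
# Runs mypy against the Python files changed relative to origin/main.
import subprocess
import sys


def changed_python_files(base_ref: str = "origin/main") -> list:
    # Added/modified (not deleted) .py files relative to the base branch.
    out = subprocess.run(
        ["git", "diff", "--name-only", "--diff-filter=d", base_ref, "--", "*.py"],
        capture_output=True, text=True, check=True,
    )
    return [line for line in out.stdout.splitlines() if line]


def main() -> int:
    files = changed_python_files()
    if not files:
        print("No changed Python files to check.")
        return 0
    # Same flags as the commented-out xargs pipeline above.
    return subprocess.run(
        ["mypy", "--follow-imports=skip", "--disallow-untyped-defs", *files]
    ).returncode


if __name__ == "__main__":
    sys.exit(main())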
4 changes: 4 additions & 0 deletions requirements.txt
@@ -43,3 +43,7 @@ urllib3==1.26.9
Werkzeug==2.1.1
xlrd==2.0.1
zipp==3.8.0
flask
google-cloud
google-cloud-pubsub
xlrd
3 changes: 2 additions & 1 deletion scrapers/abstract/base_workflow.py
@@ -4,6 +4,7 @@

from abc import ABC, abstractmethod, abstractproperty
from logging import Logger
from typing import Optional


class BaseWorkflow(ABC):
@@ -39,7 +40,7 @@ def execute(
job_id: str,
task_id: str,
source: str,
url: str=None) -> None:
url: Optional[str] = None) -> None:
"""Executes the workflow.

Args:
35 changes: 23 additions & 12 deletions scrapers/banks/adb.py
@@ -5,6 +5,7 @@
"""

import re
import logging
import requests
from bs4 import BeautifulSoup
from datetime import datetime
@@ -81,10 +82,12 @@ def generate_seed_urls(self) -> List[str]:
"""
try:
last_page_num = self.find_last_page()
logging.debug(f"Last page num found for ADB : {last_page_num}")
result_pages = [
self.search_results_base_url.format(page_num=n)
for n in range(self.first_page_num, last_page_num + 1)
]
# print(f"result pages list : {result_pages}")
return result_pages
except Exception as e:
raise Exception("Failed to generate ADB search "
@@ -102,13 +105,17 @@ def find_last_page(self) -> int:
(int): The page number.
"""
try:
first_results_page = self.search_results_base_url.format(
first_results_page: str = self.search_results_base_url.format(
page_num=self.first_page_num)
html = requests.get(first_results_page).text
# print(f"Got text from results page, type : {type(html)}")
# print(f"Got text from results page, length : {len(html)}")
soup = BeautifulSoup(html, "html.parser")

last_page_btn = soup.find('li', {"class": "pager-last"})
last_page_btn = soup.find('li', {"class": "pager__item--last"})
# print(f"Got last page button : {last_page_btn}")
last_page_num = int(last_page_btn.find("a")["href"].split('=')[-1])
# print("Last page number :", last_page_num)
return last_page_num

except Exception as e:
@@ -152,7 +159,7 @@ def __init__(
def project_page_base_url(self) -> str:
"""The base URL for individual ADB project pages.
"""
return 'https://www.adb.org/print'
return 'https://www.adb.org'


def scrape_results_page(self, results_page_url: str) -> List[str]:
@@ -225,12 +232,13 @@ def scrape_project_page(self, url: str) -> List[Dict]:

Args:
url (str): The URL for a project. Has the form:
'https://www.adb.org/print/projects/{project_id}/main'.
'https://www.adb.org/projects/{project_id}/main'.

Returns:
(list of dict): The list of project records.
"""
# Request page and parse HTML
# print(f"URL to scrape : {url}")
response = self._data_request_client.get(
url=url,
use_random_user_agent=True,
@@ -241,7 +249,8 @@
soup = BeautifulSoup(response.text, features='html.parser')

# Find first project table holding project background details
table = soup.find('table')
table = soup.find('article')
# print(f"Table here : {table}")

# Extract project name, number, and status
def get_field(detail_name):
@@ -258,9 +267,11 @@ def get_field(detail_name):
name = get_field("Project Name")
number = get_field("Project Number")
status = get_field("Project Status")
# print(f"table : {table}")

# Extract and format countries
country_label = table.find(string="Country / Economy")
# print(f"country label : {country_label}")
if not country_label:
country_label = table.find(string="Country")
parent = country_label.findParent()
@@ -374,16 +385,16 @@ def get_field(detail_name):
raise Exception(f"Failed to open configuration file. {e}")

# Test 'SeedUrlsWorkflow'
w = AdbSeedUrlsWorkflow(None, None, None)
print(w.generate_seed_urls())
seed_workflow = AdbSeedUrlsWorkflow(None, None, None)
print(seed_workflow.generate_seed_urls())

# Test 'ResultsScrapeWorkflow'
w = AdbResultsScrapeWorkflow(data_request_client, None, None, None)
res_scrape_workflow = AdbResultsScrapeWorkflow(data_request_client, None, None, None)
url = 'https://www.adb.org/projects?page=558'
project_page_urls = w.scrape_results_page(url)
project_page_urls = res_scrape_workflow.scrape_results_page(url)
print(project_page_urls)

# Test 'ProjectScrapeWorkflow'
w = AdbProjectScrapeWorkflow(data_request_client, None, None)
url = 'https://www.adb.org/print/projects/53303-001/main'
print(w.scrape_project_page(url))
proj_scrape_workflow = AdbProjectScrapeWorkflow(data_request_client, None, None)
url = 'https://www.adb.org/projects/53303-001/main'
print(proj_scrape_workflow.scrape_project_page(url))
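As a side note on the pager change above: the last-page lookup now reduces to reading the page number off the href of the final pager item. A minimal, self-contained sketch of that parsing step follows; the class name matches the selector used in find_last_page, but the sample HTML is an invented stand-in for ADB's real markup.

from bs4 import BeautifulSoup

# Invented pager markup; the real ADB results page may differ.
SAMPLE_PAGER_HTML = """
<ul class="pager">
  <li class="pager__item pager__item--last">
    <a href="/projects?page=558">Last</a>
  </li>
</ul>
"""

soup = BeautifulSoup(SAMPLE_PAGER_HTML, "html.parser")
last_page_btn = soup.find("li", {"class": "pager__item--last"})
last_page_num = int(last_page_btn.find("a")["href"].split("=")[-1])
print(last_page_num)  # -> 558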
63 changes: 35 additions & 28 deletions scrapers/banks/idb.py
@@ -14,6 +14,11 @@
from scrapers.services.database import DbClient
from scrapers.services.pubsub import PubSubClient
from typing import Dict, List
import random
import time

USER_AGENT_HEADER = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
}

class IdbSeedUrlsWorkflow(SeedUrlsWorkflow):
"""Retrieves the first set of IBD URLs to scrape.
@@ -75,12 +80,14 @@ def generate_seed_urls(self) -> List[str]:
Returns:
(list of str): The unique list of search result pages.
"""
print("Generating seed URL's for IDB")
try:
last_page_num = self.find_last_page()
result_page_urls = [
self.search_results_base_url.format(page_num=str(n))
for n in range(self.first_page_num, last_page_num + 1)
]
print(f"Resulting pages : {result_page_urls}")
return result_page_urls
except Exception as e:
raise Exception("Failed to generate search "
@@ -97,14 +104,19 @@ def find_last_page(self) -> int:
Returns:
(int): The page number.
"""
print("Finding last page for IDB")
try:
first_results_page_url = self.search_results_base_url.format(
page_num=self.first_page_num)
html = requests.get(first_results_page_url).text
print(f"URL for first IDB page : {first_results_page_url}")
html = requests.get(first_results_page_url, headers=USER_AGENT_HEADER).text
print(f"html for first page of results from IDB : {html}")
soup = BeautifulSoup(html, "html.parser")

last_page_item = soup.find('li', {"class":"pager__item pager__item--last"})
return int(last_page_item.find("a")["href"].split('=')[-1])
last_page_item = soup.find('li', {"class":"pager__item--last"})
print(f"Text from last pager item for IDB : {last_page_item.text}")
last_page_url = last_page_item.find('idb-button')['button-url']
last_page_number = int(last_page_url.split('=')[-1])
return last_page_number
except Exception as e:
raise Exception(f"Error retrieving last page number. {e}")

@@ -210,40 +222,35 @@ def scrape_project_page(self, url: str) -> List[Dict]:
"""
try:
# Request and parse page into BeautifulSoup object
response = self._data_request_client.get(url, use_random_user_agent=False)
time.sleep(.2) # site breaks if requests come too fast
response = self._data_request_client.get(
url,
use_random_user_agent=True,
# custom_headers=USER_AGENT_HEADER
)
soup = BeautifulSoup(response.text, 'html.parser')

# Abort process if no project data available
project_title = soup.find("h1", {"class":"project-title"}).text
project_title = soup.find("idb-section-wrapper")["heading"]
if not project_title or project_title.strip() == ":":
return

# Parse project detail and information sections
project_detail_section = soup.find("div", {"class": "project-detail project-section"})
project_info_section = soup.find("div", {"class": "project-information project-section"})
project_info_table = soup.find('idb-project-table')
table_info = {}
for row in project_info_table.findAll('idb-project-table-row'):
fact, value, *_ = [p.text for p in row.findAll('p')]
table_info[fact] = value

# Define local function for extracting data from a project section
def extract_field(project_section, field_name: str):
try:
title_div = project_section.find(
name="div",
attrs={"class": "project-field-title"},
string=re.compile(field_name, re.IGNORECASE)
)
data = title_div.find_next_sibling("span").text.strip()
return data if data else None
except AttributeError:
return None

# Retrieve fields
number = extract_field(project_detail_section, "PROJECT NUMBER")
number = table_info['Project Number']
name = project_title.split(":")[1].strip() if ":" in project_title else project_title
status = extract_field(project_detail_section, "PROJECT STATUS")
date = extract_field(project_detail_section, "APPROVAL DATE")
loan_amount = extract_field(project_info_section, "AMOUNT")
sectors = extract_field(project_detail_section, "PROJECT SECTOR")
subsectors = extract_field(project_detail_section, "PROJECT SUBSECTOR")
countries = extract_field(project_detail_section, "PROJECT COUNTRY")
status = table_info['Project Status']
date = table_info['Approval Date']
loan_amount = table_info['Total Cost']
sectors = table_info['Sector']
subsectors = table_info['Subsector']
countries = table_info['Country']

# Parse project approval date to retrieve year, month, and day
if date:
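A note on the new table parsing above: scrape_project_page now builds table_info by pairing the first two <p> elements inside each idb-project-table-row. The standalone sketch below shows that extraction against invented sample markup; the element names mirror the selectors in the code, while the HTML itself is only an assumption about IDB's page.

from bs4 import BeautifulSoup

# Invented sample markup; element names mirror the selectors used above.
SAMPLE_TABLE_HTML = """
<idb-project-table>
  <idb-project-table-row><p>Project Number</p><p>AR-L1234</p></idb-project-table-row>
  <idb-project-table-row><p>Project Status</p><p>Approved</p></idb-project-table-row>
  <idb-project-table-row><p>Approval Date</p><p>January 1, 2022</p></idb-project-table-row>
</idb-project-table>
"""

soup = BeautifulSoup(SAMPLE_TABLE_HTML, "html.parser")
table_info = {}
for row in soup.find("idb-project-table").findAll("idb-project-table-row"):
    fact, value, *_ = [p.text for p in row.findAll("p")]
    table_info[fact] = value

print(table_info["Project Number"])  # -> AR-L1234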
3 changes: 2 additions & 1 deletion scrapers/banks/pro.py
@@ -56,7 +56,7 @@ def search_results_base_url(self) -> str:
"""The base URL for development bank project search
results page on Proparco's website.
"""
return 'https://www.proparco.fr/en/carte-des-projets?filter=&page=all&query=%2A&view=list'
return 'https://www.proparco.fr/en/carte-des-projets-list?page=all&query=%2A&view=start'


@property
@@ -75,6 +75,7 @@ def generate_seed_urls(self) -> List[str]:
Returns:
(list of str): The project page URLs.
"""
print("Seeding URLs for PRO")
try:
response = requests.get(self.search_results_base_url)
html = response.text
10 changes: 7 additions & 3 deletions scrapers/banks/wb.py
@@ -59,10 +59,14 @@ def get_projects(self) -> pd.DataFrame:
(`pd.DataFrame`): The raw project records.
"""
try:
return pd.read_excel(self.download_url, skiprows=2)
return pd.read_excel(self.download_url, skiprows=2, engine='xlrd')
# return pd.read_html(self.download_url, flavor='bs4')
except Exception as e:
raise Exception(f"Error retrieving Excel project data "
f"from the World Bank. {e}")
try:
return pd.read_csv(self.download_url, skiprows=2)
except Exception as ee:
raise Exception(f"Error retrieving Excel project data "
f"from the World Bank. {e}")


def clean_projects(self, df: pd.DataFrame) -> pd.DataFrame:
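The change above appears to be aiming for an Excel download with a CSV fallback. For clarity, here is a hedged sketch of that fallback shape in isolation; the skiprows value and xlrd engine are taken from the diff, and the function name is hypothetical rather than the project's actual method.

import pandas as pd


def fetch_projects(download_url: str) -> pd.DataFrame:
    """Try the Excel export first, then fall back to CSV."""
    try:
        return pd.read_excel(download_url, skiprows=2, engine="xlrd")
    except Exception as excel_err:
        try:
            return pd.read_csv(download_url, skiprows=2)
        except Exception as csv_err:
            raise Exception(
                "Error retrieving World Bank project data. "
                f"Excel read failed ({excel_err}); CSV fallback failed ({csv_err})."
            )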
2 changes: 1 addition & 1 deletion scrapers/services/pubsub.py
@@ -67,7 +67,7 @@ def publish_message(self, data: Dict) -> None:
try:
# Encode data
data_str = json.dumps(data)
encoded_data = data_str.encode('utf-8')
encoded_data: bytes = data_str.encode('utf-8')

# Initiate message publishing
publish_future = self._publisher.publish(self._topic_path, encoded_data)
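For context on the annotation above: the Pub/Sub client accepts message data only as bytes, which is why the dict is JSON-serialized and UTF-8 encoded before publishing. A minimal standalone usage sketch with the official google-cloud-pubsub client follows; the project and topic IDs are placeholders.

import json

from google.cloud import pubsub_v1

publisher = pubsub_v1.PublisherClient()
# Placeholder project and topic IDs.
topic_path = publisher.topic_path("my-gcp-project", "my-topic")

payload = {"job_id": "123", "source": "adb"}
encoded_data: bytes = json.dumps(payload).encode("utf-8")  # Pub/Sub requires bytes

publish_future = publisher.publish(topic_path, encoded_data)
print(publish_future.result())  # blocks until the server returns a message ID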
Empty file added test/__init__.py
Empty file added test/integration/__init__.py