Cr/issue 1 #6

Open · wants to merge 20 commits into main
53 changes: 53 additions & 0 deletions .github/workflows/ci.yaml
@@ -0,0 +1,53 @@
name: CI

on:
  push:
  pull_request:
    branches:
      - 'main'

env:
  SRC_DIR: scrapers
  TEST_DIR: test

jobs:
  style_lint_and_type_check:
    name: style lint and type check
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
        with:
          python-version: '3.10'
      - run: |
          pip3 install --upgrade pip
          pip3 install flake8 mypy black types-PyYAML
      - run: |
          python3 -m black --check $TEST_DIR
          python3 -m flake8 $SRC_DIR # $TEST_DIR # should also run against scrapers but that will need more cleanup
          python3 -m mypy $TEST_DIR --follow-imports=skip --disallow-untyped-defs
  secrets_check:
    name: check secrets
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
        with:
          python-version: '3.10'
      - run: |
          pip3 install --upgrade pip
          pip3 install detect-secrets
      - run: |
          set +e
          detect-secrets scan $TEST_DIR | python3 -c '
          import sys, json
          sys.exit(len(json.load(sys.stdin)["results"]))
          '
          num_findings=$?
          if [ $num_findings -ne 0 ]; then
            echo "STOP : [ $num_findings ] secrets detected"
            exit 1
          fi
          echo "No secrets detected"

# Run mypy on diff
# git diff --name-only --diff-filter=d origin/main -- '*.py' | xargs mypy --follow-imports=skip --disallow-untyped-defs
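Regarding the commented-out idea above (type-checking only the files a branch touches): one way to wire that up is a small helper that asks git for the changed Python files and hands them to mypy. The sketch below is only an illustration of that approach and is not part of this PR; the origin/main base ref and the mypy flags are copied from the comment above, everything else is assumed.

# sketch_mypy_on_diff.py -- illustration only, not part of this PR.
# Runs mypy against the Python files changed relative to origin/main.
import subprocess
import sys


def changed_python_files(base_ref: str = "origin/main") -> list:
    # Added/modified (not deleted) .py files relative to the base branch.
    out = subprocess.run(
        ["git", "diff", "--name-only", "--diff-filter=d", base_ref, "--", "*.py"],
        capture_output=True, text=True, check=True,
    )
    return [line for line in out.stdout.splitlines() if line]


def main() -> int:
    files = changed_python_files()
    if not files:
        print("No changed Python files to check.")
        return 0
    # Same flags as the commented-out xargs pipeline above.
    return subprocess.run(
        ["mypy", "--follow-imports=skip", "--disallow-untyped-defs", *files]
    ).returncode


if __name__ == "__main__":
    sys.exit(main())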
4 changes: 4 additions & 0 deletions requirements.txt
@@ -43,3 +43,7 @@ urllib3==1.26.9
Werkzeug==2.1.1
xlrd==2.0.1
zipp==3.8.0
flask
google-cloud
google-cloud-pubsub
xlrd
3 changes: 2 additions & 1 deletion scrapers/abstract/base_workflow.py
@@ -4,6 +4,7 @@

from abc import ABC, abstractmethod, abstractproperty
from logging import Logger
from typing import Optional


class BaseWorkflow(ABC):
@@ -39,7 +40,7 @@ def execute(
job_id: str,
task_id: str,
source: str,
url: str=None) -> None:
url: Optional[str] = None) -> None:
"""Executes the workflow.

Args:
35 changes: 23 additions & 12 deletions scrapers/banks/adb.py
@@ -5,6 +5,7 @@
"""

import re
import logging
import requests
from bs4 import BeautifulSoup
from datetime import datetime
@@ -81,10 +82,12 @@ def generate_seed_urls(self) -> List[str]:
"""
try:
last_page_num = self.find_last_page()
logging.debug(f"Last page num found for ADB : {last_page_num}")
result_pages = [
self.search_results_base_url.format(page_num=n)
for n in range(self.first_page_num, last_page_num + 1)
]
# print(f"result pages list : {result_pages}")
return result_pages
except Exception as e:
raise Exception("Failed to generate ADB search "
@@ -102,13 +105,17 @@ def find_last_page(self) -> int:
(int): The page number.
"""
try:
first_results_page = self.search_results_base_url.format(
first_results_page: str = self.search_results_base_url.format(
page_num=self.first_page_num)
html = requests.get(first_results_page).text
# print(f"Got text from results page, type : {type(html)}")
# print(f"Got text from results page, length : {len(html)}")
soup = BeautifulSoup(html, "html.parser")

last_page_btn = soup.find('li', {"class": "pager-last"})
last_page_btn = soup.find('li', {"class": "pager__item--last"})
# print(f"Got last page button : {last_page_btn}")
last_page_num = int(last_page_btn.find("a")["href"].split('=')[-1])
# print("Last page number :", last_page_num)
return last_page_num

except Exception as e:
@@ -152,7 +159,7 @@ def __init__(
def project_page_base_url(self) -> str:
"""The base URL for individual ADB project pages.
"""
return 'https://www.adb.org/print'
return 'https://www.adb.org'


def scrape_results_page(self, results_page_url: str) -> List[str]:
@@ -225,12 +232,13 @@ def scrape_project_page(self, url: str) -> List[Dict]:

Args:
url (str): The URL for a project. Has the form:
'https://www.adb.org/print/projects/{project_id}/main'.
'https://www.adb.org/projects/{project_id}/main'.

Returns:
(list of dict): The list of project records.
"""
# Request page and parse HTML
# print(f"URL to scrape : {url}")
response = self._data_request_client.get(
url=url,
use_random_user_agent=True,
@@ -241,7 +249,8 @@
soup = BeautifulSoup(response.text, features='html.parser')

# Find first project table holding project background details
table = soup.find('table')
table = soup.find('article')
# print(f"Table here : {table}")

# Extract project name, number, and status
def get_field(detail_name):
@@ -258,9 +267,11 @@ def get_field(detail_name):
name = get_field("Project Name")
number = get_field("Project Number")
status = get_field("Project Status")
# print(f"table : {table}")

# Extract and format countries
country_label = table.find(string="Country / Economy")
# print(f"country label : {country_label}")
if not country_label:
country_label = table.find(string="Country")
parent = country_label.findParent()
@@ -374,16 +385,16 @@ def get_field(detail_name):
raise Exception(f"Failed to open configuration file. {e}")

# Test 'SeedUrlsWorkflow'
w = AdbSeedUrlsWorkflow(None, None, None)
print(w.generate_seed_urls())
seed_workflow = AdbSeedUrlsWorkflow(None, None, None)
print(seed_workflow.generate_seed_urls())

# Test 'ResultsScrapeWorkflow'
w = AdbResultsScrapeWorkflow(data_request_client, None, None, None)
res_scrape_workflow = AdbResultsScrapeWorkflow(data_request_client, None, None, None)
url = 'https://www.adb.org/projects?page=558'
project_page_urls = w.scrape_results_page(url)
project_page_urls = res_scrape_workflow.scrape_results_page(url)
print(project_page_urls)

# Test 'ProjectScrapeWorkflow'
w = AdbProjectScrapeWorkflow(data_request_client, None, None)
url = 'https://www.adb.org/print/projects/53303-001/main'
print(w.scrape_project_page(url))
proj_scrape_workflow = AdbProjectScrapeWorkflow(data_request_client, None, None)
url = 'https://www.adb.org/projects/53303-001/main'
print(proj_scrape_workflow.scrape_project_page(url))
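As a side note on the pager change above: the last-page lookup now reduces to reading the page number off the href of the final pager item. A minimal, self-contained sketch of that parsing step follows; the class name matches the selector used in find_last_page, but the sample HTML is an invented stand-in for ADB's real markup.

from bs4 import BeautifulSoup

# Invented pager markup; the real ADB results page may differ.
SAMPLE_PAGER_HTML = """
<ul class="pager">
  <li class="pager__item pager__item--last">
    <a href="/projects?page=558">Last</a>
  </li>
</ul>
"""

soup = BeautifulSoup(SAMPLE_PAGER_HTML, "html.parser")
last_page_btn = soup.find("li", {"class": "pager__item--last"})
last_page_num = int(last_page_btn.find("a")["href"].split("=")[-1])
print(last_page_num)  # -> 558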
63 changes: 35 additions & 28 deletions scrapers/banks/idb.py
@@ -14,6 +14,11 @@
from scrapers.services.database import DbClient
from scrapers.services.pubsub import PubSubClient
from typing import Dict, List
import random
import time

USER_AGENT_HEADER = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
}

class IdbSeedUrlsWorkflow(SeedUrlsWorkflow):
"""Retrieves the first set of IBD URLs to scrape.
@@ -75,12 +80,14 @@ def generate_seed_urls(self) -> List[str]:
Returns:
(list of str): The unique list of search result pages.
"""
print("Generating seed URL's for IDB")
try:
last_page_num = self.find_last_page()
result_page_urls = [
self.search_results_base_url.format(page_num=str(n))
for n in range(self.first_page_num, last_page_num + 1)
]
print(f"Resulting pages : {result_page_urls}")
return result_page_urls
except Exception as e:
raise Exception("Failed to generate search "
@@ -97,14 +104,19 @@ def find_last_page(self) -> int:
Returns:
(int): The page number.
"""
print("Finding last page for IDB")
try:
first_results_page_url = self.search_results_base_url.format(
page_num=self.first_page_num)
html = requests.get(first_results_page_url).text
print(f"URL for first IDB page : {first_results_page_url}")
html = requests.get(first_results_page_url, headers=USER_AGENT_HEADER).text
print(f"html for first page of results from IDB : {html}")
soup = BeautifulSoup(html, "html.parser")

last_page_item = soup.find('li', {"class":"pager__item pager__item--last"})
return int(last_page_item.find("a")["href"].split('=')[-1])
last_page_item = soup.find('li', {"class":"pager__item--last"})
print(f"Text from last pager item for IDB : {last_page_item.text}")
last_page_url = last_page_item.find('idb-button')['button-url']
last_page_number = int(last_page_url.split('=')[-1])
return last_page_number
except Exception as e:
raise Exception(f"Error retrieving last page number. {e}")

@@ -210,40 +222,35 @@ def scrape_project_page(self, url: str) -> List[Dict]:
"""
try:
# Request and parse page into BeautifulSoup object
response = self._data_request_client.get(url, use_random_user_agent=False)
time.sleep(.2) # site breaks if requests come too fast
response = self._data_request_client.get(
url,
use_random_user_agent=True,
# custom_headers=USER_AGENT_HEADER
)
soup = BeautifulSoup(response.text, 'html.parser')

# Abort process if no project data available
project_title = soup.find("h1", {"class":"project-title"}).text
project_title = soup.find("idb-section-wrapper")["heading"]
if not project_title or project_title.strip() == ":":
return

# Parse project detail and information sections
project_detail_section = soup.find("div", {"class": "project-detail project-section"})
project_info_section = soup.find("div", {"class": "project-information project-section"})
project_info_table = soup.find('idb-project-table')
table_info = {}
for row in project_info_table.findAll('idb-project-table-row'):
fact, value, *_ = [p.text for p in row.findAll('p')]
table_info[fact] = value

# Define local function for extracting data from a project section
def extract_field(project_section, field_name: str):
try:
title_div = project_section.find(
name="div",
attrs={"class": "project-field-title"},
string=re.compile(field_name, re.IGNORECASE)
)
data = title_div.find_next_sibling("span").text.strip()
return data if data else None
except AttributeError:
return None

# Retrieve fields
number = extract_field(project_detail_section, "PROJECT NUMBER")
number = table_info['Project Number']
name = project_title.split(":")[1].strip() if ":" in project_title else project_title
status = extract_field(project_detail_section, "PROJECT STATUS")
date = extract_field(project_detail_section, "APPROVAL DATE")
loan_amount = extract_field(project_info_section, "AMOUNT")
sectors = extract_field(project_detail_section, "PROJECT SECTOR")
subsectors = extract_field(project_detail_section, "PROJECT SUBSECTOR")
countries = extract_field(project_detail_section, "PROJECT COUNTRY")
status = table_info['Project Status']
date = table_info['Approval Date']
loan_amount = table_info['Total Cost']
sectors = table_info['Sector']
subsectors = table_info['Subsector']
countries = table_info['Country']

# Parse project approval date to retrieve year, month, and day
if date:
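A note on the new table parsing above: scrape_project_page now builds table_info by pairing the first two <p> elements inside each idb-project-table-row. The standalone sketch below shows that extraction against invented sample markup; the element names mirror the selectors in the code, while the HTML itself is only an assumption about IDB's page.

from bs4 import BeautifulSoup

# Invented sample markup; element names mirror the selectors used above.
SAMPLE_TABLE_HTML = """
<idb-project-table>
  <idb-project-table-row><p>Project Number</p><p>AR-L1234</p></idb-project-table-row>
  <idb-project-table-row><p>Project Status</p><p>Approved</p></idb-project-table-row>
  <idb-project-table-row><p>Approval Date</p><p>January 1, 2022</p></idb-project-table-row>
</idb-project-table>
"""

soup = BeautifulSoup(SAMPLE_TABLE_HTML, "html.parser")
table_info = {}
for row in soup.find("idb-project-table").findAll("idb-project-table-row"):
    fact, value, *_ = [p.text for p in row.findAll("p")]
    table_info[fact] = value

print(table_info["Project Number"])  # -> AR-L1234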
3 changes: 2 additions & 1 deletion scrapers/banks/pro.py
@@ -56,7 +56,7 @@ def search_results_base_url(self) -> str:
"""The base URL for development bank project search
results page on Proparco's website.
"""
return 'https://www.proparco.fr/en/carte-des-projets?filter=&page=all&query=%2A&view=list'
return 'https://www.proparco.fr/en/carte-des-projets-list?page=all&query=%2A&view=start'


@property
@@ -75,6 +75,7 @@ def generate_seed_urls(self) -> List[str]:
Returns:
(list of str): The project page URLs.
"""
print("Seeding URLs for PRO")
try:
response = requests.get(self.search_results_base_url)
html = response.text
10 changes: 7 additions & 3 deletions scrapers/banks/wb.py
@@ -59,10 +59,14 @@ def get_projects(self) -> pd.DataFrame:
(`pd.DataFrame`): The raw project records.
"""
try:
return pd.read_excel(self.download_url, skiprows=2)
return pd.read_excel(self.download_url, skiprows=2, engine='xlrd')
# return pd.read_html(self.download_url, flavor='bs4')
except Exception as e:
raise Exception(f"Error retrieving Excel project data "
f"from the World Bank. {e}")
try:
return pd.read_csv(self.download_url, skiprows=2)
except Exception as ee:
raise Exception(f"Error retrieving Excel project data "
f"from the World Bank. {e}")


def clean_projects(self, df: pd.DataFrame) -> pd.DataFrame:
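The change above appears to be aiming for an Excel download with a CSV fallback. For clarity, here is a hedged sketch of that fallback shape in isolation; the skiprows value and xlrd engine are taken from the diff, and the function name is hypothetical rather than the project's actual method.

import pandas as pd


def fetch_projects(download_url: str) -> pd.DataFrame:
    """Try the Excel export first, then fall back to CSV."""
    try:
        return pd.read_excel(download_url, skiprows=2, engine="xlrd")
    except Exception as excel_err:
        try:
            return pd.read_csv(download_url, skiprows=2)
        except Exception as csv_err:
            raise Exception(
                "Error retrieving World Bank project data. "
                f"Excel read failed ({excel_err}); CSV fallback failed ({csv_err})."
            )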
2 changes: 1 addition & 1 deletion scrapers/services/pubsub.py
@@ -67,7 +67,7 @@ def publish_message(self, data: Dict) -> None:
try:
# Encode data
data_str = json.dumps(data)
encoded_data = data_str.encode('utf-8')
encoded_data: bytes = data_str.encode('utf-8')

# Initiate message publishing
publish_future = self._publisher.publish(self._topic_path, encoded_data)
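For context on the annotation above: the Pub/Sub client accepts message data only as bytes, which is why the dict is JSON-serialized and UTF-8 encoded before publishing. A minimal standalone usage sketch with the official google-cloud-pubsub client follows; the project and topic IDs are placeholders.

import json

from google.cloud import pubsub_v1

publisher = pubsub_v1.PublisherClient()
# Placeholder project and topic IDs.
topic_path = publisher.topic_path("my-gcp-project", "my-topic")

payload = {"job_id": "123", "source": "adb"}
encoded_data: bytes = json.dumps(payload).encode("utf-8")  # Pub/Sub requires bytes

publish_future = publisher.publish(topic_path, encoded_data)
print(publish_future.result())  # blocks until the server returns a message ID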
Empty file added test/__init__.py
Empty file added test/integration/__init__.py