Skip to content

Commit

Permalink
Fixed an error that occurred after the BNA website change.
Browse files Browse the repository at this point in the history
Created a GitHub workflow to check data sources for validity daily (currently for DE/BNA only)
  • Loading branch information
dkling-reply committed Dec 18, 2023
1 parent 2597ef7 commit ae5c176
Show file tree
Hide file tree
Showing 15 changed files with 328 additions and 44 deletions.
36 changes: 36 additions & 0 deletions .github/workflows/check-datasources.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# This workflow checks the data sources for validity

name: Data source validity checks

on:
  schedule:
    # Runs the workflow every day at midnight (UTC)
    - cron: '0 0 * * *'
  # Allows the workflow to be started manually from the Actions tab,
  # e.g. to re-check a data source right after a fix, without waiting
  # for the next scheduled run.
  workflow_dispatch:

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
      - name: Check out code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: 3.9.18

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      # - name: Run integration tests (only)
      #   run: |
      #     pip install -r test/requirements.txt
      #     pytest -m "integration_test"

      - name: "[DE/BNA] Real data validity checks"
        run: |
          pip install -r test/requirements.txt
          pytest tests/integration/test_int_de_bna.py
20 changes: 20 additions & 0 deletions charging_stations_pipelines/pipelines/de/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,23 @@

DATA_SOURCE_KEY: Final[str] = "BNA"
"""The data source key for the BNA (Bundesnetzagentur) data source."""


class BnaCrawlerException(Exception):
    """Base class for exceptions raised by the BNA crawler."""
    # NOTE: the redundant `pass` statements were removed — a docstring is a
    # sufficient class body on its own.


class FetchWebsiteException(BnaCrawlerException):
    """Raised when there is an error fetching the website (e.g. a non-200 response)."""


class ExtractURLException(BnaCrawlerException):
    """Raised when there is an error extracting the download URL from the website."""


class DownloadFileException(BnaCrawlerException):
    """Raised when there is an error downloading a file."""
5 changes: 3 additions & 2 deletions charging_stations_pipelines/pipelines/de/bna.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@ def run(self):
logger.info(f"Running {self.country_code}/{DATA_SOURCE_KEY} Pipeline...")
self.retrieve_data()

logger.info(f"Mapping {DATA_SOURCE_KEY} data...")
station_updater = StationTableUpdater(session=self.session, logger=logger)
row: pd.Series
logger.info(f"Mapping {DATA_SOURCE_KEY} data...")
for _, row in tqdm(iterable=self.data.iterrows(), total=self.data.shape[0]):
try:
mapped_station = map_station_bna(row)
Expand All @@ -59,8 +59,9 @@ def run(self):
except Exception as e:
logger.error(f"{DATA_SOURCE_KEY} entry could not be mapped! Error: {e}")
continue

station_updater.update_station(mapped_station, DATA_SOURCE_KEY)

station_updater.log_update_station_counts()
logger.info("Finished mapping!")

logger.info(f"Finished {self.country_code}/{DATA_SOURCE_KEY} Pipeline!")
59 changes: 39 additions & 20 deletions charging_stations_pipelines/pipelines/de/bna_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,37 +2,56 @@

import logging
import os
from typing import Optional

import requests as requests
from bs4 import BeautifulSoup

from charging_stations_pipelines.pipelines.de import DownloadFileException, ExtractURLException, FetchWebsiteException
from charging_stations_pipelines.shared import download_file

logger = logging.getLogger(__name__)

# Landing page that links to the BNA charging-station register download.
BASE_URL = "https://www.bundesnetzagentur.de/DE/Fachthemen/ElektrizitaetundGas/E-Mobilitaet/start.html"
# CSS classes of the anchor element that carries the Excel download link.
LINK_CLASS = "downloadLink Publication FTxlsx"
# Substring identifying the charging-station register file ("Ladesaeulenregister").
SEARCH_TERM = "ladesaeulenregister"
FILE_EXTENSION = ".xlsx"  # expected file type of the register download
USER_AGENT = "Mozilla/5.0"  # minimal user agent; the site may reject requests without one
PARSER = "html.parser"  # stdlib HTML parser used by BeautifulSoup

def get_bna_data(tmp_data_path: str) -> None:
    """Downloads BNA (Bundesnetzagentur) data (Excel sheet 'Ladesaeulenregister.xlsx') from its website into a
    temporary file.

    :param tmp_data_path: path where the downloaded Excel file is stored
    :raises FetchWebsiteException: if the BNA website does not respond with HTTP 200
    :raises ExtractURLException: if no matching download link is found on the page
    :raises DownloadFileException: if downloading the file fails
    """
    headers = {"User-Agent": USER_AGENT}
    # Fetch the website for further processing
    response = requests.get(BASE_URL, headers=headers)

    # Check if the request has been successful
    if response.status_code != 200:
        raise FetchWebsiteException(f"Failed to fetch the website: {BASE_URL}")

    soup = BeautifulSoup(response.content, PARSER)
    # Lookup for the link in the html
    # Example URL from "href" attribute:
    # https://data.bundesnetzagentur.de/Bundesnetzagentur/SharedDocs/Downloads/DE/Sachgebiete/Energie/Unternehmen_Institutionen/E_Mobilitaet/ladesaeulenregister.xlsx
    download_link_url: Optional[str] = None
    for link in soup.find_all("a", class_=LINK_CLASS):
        candidate = link.get('href')
        # Keep only a link that points to the register's Excel file.
        # BUGFIX: only assign on a match — previously the last inspected
        # (non-matching) href leaked out of the loop, so the `is None` guard
        # below never fired and an unrelated file could be downloaded instead
        # of raising ExtractURLException.
        if (candidate
                and SEARCH_TERM in candidate.lower()
                and candidate.lower().endswith(FILE_EXTENSION)):
            download_link_url = candidate
            break

    # Check if the url extraction is successful
    if download_link_url is None:
        raise ExtractURLException("Failed to extract the download url from the website.")

    logger.debug(f"Downloading BNA data from '{download_link_url}'")
    try:
        download_file(download_link_url, tmp_data_path)
    except Exception as e:
        # Chain the original cause so the underlying error stays visible in tracebacks.
        raise DownloadFileException(f"Could not download the file: {e}") from e

    logger.debug(f"Downloaded BNA data to '{tmp_data_path}'")
    logger.debug(f"Downloaded file size: {os.path.getsize(tmp_data_path)} bytes")
4 changes: 2 additions & 2 deletions charging_stations_pipelines/shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,8 +231,8 @@ def load_excel_file(path: str) -> pd.DataFrame:
return df_dropped


def download_file(url, target_file):
"""Downloads a file from the given url and saves it to the given target file."""
def download_file(url: str, target_file: str) -> None:
"""Downloads a file from the specified url and saves it to the target file path."""
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36"
Expand Down
12 changes: 11 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,19 @@ build-backend = "setuptools.build_meta"

[tool.black]
line-length = 88
target_version = ['py38']
target_version = ['py39']

[tool.pytest.ini_options]

# Set logging for code under test
addopts = "--verbose --capture=no --log-cli-level=DEBUG --log-cli-format='%(asctime)s %(levelname)s %(message)s' --log-cli-date-format='%Y-%m-%d %H:%M:%S' -W ignore::DeprecationWarning"

# Limit search for tests to following folders
testpaths = [
"test", # TODO: move content to tests/unit/ folder
"tests",
]

# Declare custom markers
markers = [
"integration_test: marks tests as integration tests, which are bit slow (deselect with '-m \"integration_test\"')",
Expand Down
3 changes: 1 addition & 2 deletions test/pipelines/at/test_econtrol_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from charging_stations_pipelines.pipelines.at.econtrol_crawler import (
__name__ as test_module_name,
)
# "local_caplog" is pytest fixture from test.shared.local_caplog
# NOTE: "local_caplog" is a pytest fixture from test.shared.local_caplog
from test.shared import local_caplog, LogLocalCaptureFixture # noqa: F401


Expand Down Expand Up @@ -112,7 +112,6 @@ def test_get_data(
]
)

# Call method under test... with mocked logging
logger = logging.getLogger(test_module_name)
with local_caplog(level=logging.DEBUG, logger=logger):
# Call method under test... with mocked logging
Expand Down
110 changes: 98 additions & 12 deletions test/pipelines/de/test_bna_crawler.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,104 @@
"""Integration tests for the crawler of the BNA pipeline."""

"""Unit tests for the crawler of the BNA pipeline."""
import logging
import os
import tempfile
from unittest.mock import Mock, patch

import pytest
import requests

import charging_stations_pipelines.pipelines.de.bna_crawler
from charging_stations_pipelines.pipelines.de import bna_crawler
from charging_stations_pipelines.pipelines.de.bna_crawler import __name__ as test_module_name
# NOTE: "local_caplog" is a pytest fixture from test.shared.local_caplog
from test.shared import local_caplog, LogLocalCaptureFixture # noqa: F401


@patch.object(charging_stations_pipelines.pipelines.de.bna_crawler, 'BeautifulSoup')
@patch.object(charging_stations_pipelines.pipelines.de.bna_crawler, 'download_file')
@patch.object(requests, 'get')
@patch.object(os.path, 'getsize')
def test_get_bna_data_downloads_file_with_correct_url(mock_getsize, mock_requests_get, mock_download_file,
                                                      mock_beautiful_soup):
    """Checks that get_bna_data() fetches the BNA landing page and downloads the file behind the
    extracted link into the given target path."""
    # Mock the requests.get response
    mock_response = Mock()
    mock_response.content = b'something, something...'
    mock_response.status_code = 200
    mock_requests_get.return_value = mock_response

    # Mock the BeautifulSoup find_all method
    # (plain dicts work here because the crawler only calls `.get('href')` on each link)
    mock_beautiful_soup.return_value.find_all.return_value = [{'href': 'https://some_ladesaeulenregister_url.xlsx'}]

    # Mock the os.path.getsize method
    mock_getsize.return_value = 4321

    # Call the method under test
    bna_crawler.get_bna_data('./tmp_data_path/some_ladesaeulenregister_url.xlsx')

    # Ensure these functions were called with the expected arguments.
    mock_requests_get.assert_called_with(
        "https://www.bundesnetzagentur.de/DE/Fachthemen/ElektrizitaetundGas/E-Mobilitaet/start.html",
        headers={"User-Agent": "Mozilla/5.0"})

    # Assert that the download_file method was called with the correct parameters
    mock_download_file.assert_called_once_with(
        'https://some_ladesaeulenregister_url.xlsx', './tmp_data_path/some_ladesaeulenregister_url.xlsx')

    # Assert that the os.path.getsize method was called with the correct parameters
    mock_getsize.assert_called_once_with('./tmp_data_path/some_ladesaeulenregister_url.xlsx')


@patch.object(requests, 'get')
@patch.object(charging_stations_pipelines.pipelines.de.bna_crawler, 'BeautifulSoup')
def test_get_bna_data_logs_error_when_no_download_link_found(mock_beautiful_soup, mock_requests_get, caplog):
    """Checks that get_bna_data() raises ExtractURLException when the page contains no download links."""
    # Mock the requests.get response
    mock_requests_get.return_value = Mock(content=b'some content', status_code=200)

    # Mock the BeautifulSoup find_all method to return no links
    mock_beautiful_soup.return_value.find_all.return_value = []

    with pytest.raises(bna_crawler.ExtractURLException, match='Failed to extract the download url from the website.'):
        # Call the function under test
        bna_crawler.get_bna_data('tmp_data_path')


@patch.object(requests, 'get')
@patch.object(charging_stations_pipelines.pipelines.de.bna_crawler, 'BeautifulSoup')
@patch.object(charging_stations_pipelines.pipelines.de.bna_crawler, 'download_file')
@patch.object(os.path, 'getsize')
def test_get_bna_data_logs_file_size_after_download(mock_getsize, mock_download_file, mock_beautiful_soup,
                                                    mock_requests_get, local_caplog: LogLocalCaptureFixture):
    """Checks that get_bna_data() picks the first link matching the search term and logs the size
    of the downloaded file."""
    # Mock the requests.get response
    mock_requests_get.return_value = Mock(content=b'some content')
    mock_requests_get.return_value.status_code = 200

    # Mock the BeautifulSoup find_all method: the first link does not contain the
    # search term and must be skipped; the second one matches and gets downloaded
    mock_beautiful_soup.return_value.find_all.return_value = [
        {'href': 'some_url_without_search_term.xlsx'},
        {'href': 'tmp_data_path/ladesaeulenregister.xlsx'}
    ]

    # Mock the os.path.getsize method
    mock_getsize.return_value = 1234

    logger = logging.getLogger(test_module_name)
    with local_caplog(level=logging.DEBUG, logger=logger):
        # Call method under test... with mocked logging
        bna_crawler.get_bna_data('tmp_data_path/some_url1_with_search_term.xlsx')

    # Assert that the file size was logged
    assert "Downloaded file size: 1234 bytes" in local_caplog.logs

    # Assert that requests.get was called correctly
    mock_requests_get.assert_called_once_with(
        'https://www.bundesnetzagentur.de/DE/Fachthemen/ElektrizitaetundGas/E-Mobilitaet/start.html',
        headers={'User-Agent': 'Mozilla/5.0'})

    # Assert that BeautifulSoup was called correctly
    mock_beautiful_soup.assert_called_once_with(b'some content', 'html.parser')

    # Assert that download_file was called correctly
    mock_download_file.assert_called_once_with('tmp_data_path/ladesaeulenregister.xlsx',
                                               'tmp_data_path/some_url1_with_search_term.xlsx')

    # Assert that os.path.getsize was called correctly
    mock_getsize.assert_called_once_with('tmp_data_path/some_url1_with_search_term.xlsx')
10 changes: 10 additions & 0 deletions test/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,15 @@ def test_parse_no_task_arg():


def test_parse_invalid_task_arg():
    """Checks that parse_args() exits (argparse raises SystemExit) for an unknown task name."""
    # NOTE: in the pytest output, error messages like...:
    # ----
    # test/test_main.py::test_parse_invalid_task_arg usage: pytest [-h] [-c <country-code> [<country-code> ...]]
    # [-v] [-o] [-d] [--export_file_descriptor <file descriptor>] [--export_format {csv,GeoJSON}] [--export_charging]
    # [--export_merged_stations] [--export_all_countries] [--export_area <lon> <lat> <radius in m>]
    # <task> [<task> ...]
    # pytest: error: argument <task>:
    # invalid choice: 'invalid_task' (choose from 'import', 'merge', 'export', 'testdata')
    # ----
    # ... are an expected side-effect of using `pytest.raises(SystemExit)`
    with pytest.raises(SystemExit):
        parse_args('invalid_task --countries de'.split())
Empty file added tests/__init__.py
Empty file.
Empty file added tests/integration/__init__.py
Empty file.
Loading

0 comments on commit ae5c176

Please sign in to comment.