Skip to content

Commit

Permalink
Fixed an error that occurred after the BNA website change.
Browse files Browse the repository at this point in the history
Created a GitHub workflow to check data sources for validity daily (currently for DE/BNA only)
  • Loading branch information
dkling-reply committed Dec 18, 2023
1 parent 2597ef7 commit ae5c176
Show file tree
Hide file tree
Showing 15 changed files with 328 additions and 44 deletions.
36 changes: 36 additions & 0 deletions .github/workflows/check-datasources.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# This workflow checks the data sources for validity

name: Data source validity checks

on:
  schedule:
    # Runs the workflow every day at midnight (UTC)
    - cron: '0 0 * * *'
  # Allows the workflow to be started manually from the Actions tab,
  # e.g. to re-check a data source right after a fix, without waiting
  # for the next scheduled run.
  workflow_dispatch:

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
      - name: Check out code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: 3.9.18

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      # - name: Run integration tests (only)
      #   run: |
      #     pip install -r test/requirements.txt
      #     pytest -m "integration_test"

      - name: "[DE/BNA] Real data validity checks"
        run: |
          pip install -r test/requirements.txt
          pytest tests/integration/test_int_de_bna.py
20 changes: 20 additions & 0 deletions charging_stations_pipelines/pipelines/de/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,23 @@

DATA_SOURCE_KEY: Final[str] = "BNA"
"""The data source key for the BNA (Bundesnetzagentur) data source."""


class BnaCrawlerException(Exception):
    """Base class for exceptions raised by the BNA crawler."""
    # NOTE: the redundant `pass` statements were removed — a docstring is a
    # sufficient class body on its own.


class FetchWebsiteException(BnaCrawlerException):
    """Raised when there is an error fetching the website (e.g. a non-200 response)."""


class ExtractURLException(BnaCrawlerException):
    """Raised when there is an error extracting the download URL from the website."""


class DownloadFileException(BnaCrawlerException):
    """Raised when there is an error downloading a file."""
5 changes: 3 additions & 2 deletions charging_stations_pipelines/pipelines/de/bna.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@ def run(self):
logger.info(f"Running {self.country_code}/{DATA_SOURCE_KEY} Pipeline...")
self.retrieve_data()

logger.info(f"Mapping {DATA_SOURCE_KEY} data...")
station_updater = StationTableUpdater(session=self.session, logger=logger)
row: pd.Series
logger.info(f"Mapping {DATA_SOURCE_KEY} data...")
for _, row in tqdm(iterable=self.data.iterrows(), total=self.data.shape[0]):
try:
mapped_station = map_station_bna(row)
Expand All @@ -59,8 +59,9 @@ def run(self):
except Exception as e:
logger.error(f"{DATA_SOURCE_KEY} entry could not be mapped! Error: {e}")
continue

station_updater.update_station(mapped_station, DATA_SOURCE_KEY)

station_updater.log_update_station_counts()
logger.info("Finished mapping!")

logger.info(f"Finished {self.country_code}/{DATA_SOURCE_KEY} Pipeline!")
59 changes: 39 additions & 20 deletions charging_stations_pipelines/pipelines/de/bna_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,37 +2,56 @@

import logging
import os
from typing import Optional

import requests as requests
from bs4 import BeautifulSoup

from charging_stations_pipelines.pipelines.de import DownloadFileException, ExtractURLException, FetchWebsiteException
from charging_stations_pipelines.shared import download_file

logger = logging.getLogger(__name__)

# Landing page that links to the BNA charging-station register download.
BASE_URL = "https://www.bundesnetzagentur.de/DE/Fachthemen/ElektrizitaetundGas/E-Mobilitaet/start.html"
# CSS classes of the anchor element that carries the Excel download link.
LINK_CLASS = "downloadLink Publication FTxlsx"
# Substring identifying the charging-station register file ("Ladesaeulenregister").
SEARCH_TERM = "ladesaeulenregister"
FILE_EXTENSION = ".xlsx"  # expected file type of the register download
USER_AGENT = "Mozilla/5.0"  # minimal user agent; the site may reject requests without one
PARSER = "html.parser"  # stdlib HTML parser used by BeautifulSoup

def get_bna_data(tmp_data_path: str) -> None:
    """Downloads BNA (Bundesnetzagentur) data (Excel sheet 'Ladesaeulenregister.xlsx') from its website into a
    temporary file.

    :param tmp_data_path: path where the downloaded Excel file is stored
    :raises FetchWebsiteException: if the BNA website does not respond with HTTP 200
    :raises ExtractURLException: if no matching download link is found on the page
    :raises DownloadFileException: if downloading the file fails
    """
    headers = {"User-Agent": USER_AGENT}
    # Fetch the website for further processing
    response = requests.get(BASE_URL, headers=headers)

    # Check if the request has been successful
    if response.status_code != 200:
        raise FetchWebsiteException(f"Failed to fetch the website: {BASE_URL}")

    soup = BeautifulSoup(response.content, PARSER)
    # Lookup for the link in the html
    # Example URL from "href" attribute:
    # https://data.bundesnetzagentur.de/Bundesnetzagentur/SharedDocs/Downloads/DE/Sachgebiete/Energie/Unternehmen_Institutionen/E_Mobilitaet/ladesaeulenregister.xlsx
    download_link_url: Optional[str] = None
    for link in soup.find_all("a", class_=LINK_CLASS):
        candidate = link.get('href')
        # Keep only a link that points to the register's Excel file.
        # BUGFIX: only assign on a match — previously the last inspected
        # (non-matching) href leaked out of the loop, so the `is None` guard
        # below never fired and an unrelated file could be downloaded instead
        # of raising ExtractURLException.
        if (candidate
                and SEARCH_TERM in candidate.lower()
                and candidate.lower().endswith(FILE_EXTENSION)):
            download_link_url = candidate
            break

    # Check if the url extraction is successful
    if download_link_url is None:
        raise ExtractURLException("Failed to extract the download url from the website.")

    logger.debug(f"Downloading BNA data from '{download_link_url}'")
    try:
        download_file(download_link_url, tmp_data_path)
    except Exception as e:
        # Chain the original cause so the underlying error stays visible in tracebacks.
        raise DownloadFileException(f"Could not download the file: {e}") from e

    logger.debug(f"Downloaded BNA data to '{tmp_data_path}'")
    logger.debug(f"Downloaded file size: {os.path.getsize(tmp_data_path)} bytes")
4 changes: 2 additions & 2 deletions charging_stations_pipelines/shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,8 +231,8 @@ def load_excel_file(path: str) -> pd.DataFrame:
return df_dropped


def download_file(url, target_file):
"""Downloads a file from the given url and saves it to the given target file."""
def download_file(url: str, target_file: str) -> None:
"""Downloads a file from the specified url and saves it to the target file path."""
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36"
Expand Down
12 changes: 11 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,19 @@ build-backend = "setuptools.build_meta"

[tool.black]
line-length = 88
target_version = ['py38']
target_version = ['py39']

[tool.pytest.ini_options]

# Set logging for code under test
addopts = "--verbose --capture=no --log-cli-level=DEBUG --log-cli-format='%(asctime)s %(levelname)s %(message)s' --log-cli-date-format='%Y-%m-%d %H:%M:%S' -W ignore::DeprecationWarning"

# Limit search for tests to following folders
testpaths = [
"test", # TODO: move content to tests/unit/ folder
"tests",
]

# Declare custom markers
markers = [
"integration_test: marks tests as integration tests, which are bit slow (deselect with '-m \"integration_test\"')",
Expand Down
3 changes: 1 addition & 2 deletions test/pipelines/at/test_econtrol_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from charging_stations_pipelines.pipelines.at.econtrol_crawler import (
__name__ as test_module_name,
)
# "local_caplog" is pytest fixture from test.shared.local_caplog
# NOTE: "local_caplog" is a pytest fixture from test.shared.local_caplog
from test.shared import local_caplog, LogLocalCaptureFixture # noqa: F401


Expand Down Expand Up @@ -112,7 +112,6 @@ def test_get_data(
]
)

# Call method under test... with mocked logging
logger = logging.getLogger(test_module_name)
with local_caplog(level=logging.DEBUG, logger=logger):
# Call method under test... with mocked logging
Expand Down
110 changes: 98 additions & 12 deletions test/pipelines/de/test_bna_crawler.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,104 @@
"""Integration tests for the crawler of the BNA pipeline."""

"""Unit tests for the crawler of the BNA pipeline."""
import logging
import os
import tempfile
from unittest.mock import Mock, patch

import pytest
import requests

import charging_stations_pipelines.pipelines.de.bna_crawler
from charging_stations_pipelines.pipelines.de import bna_crawler
from charging_stations_pipelines.pipelines.de.bna_crawler import __name__ as test_module_name
# NOTE: "local_caplog" is a pytest fixture from test.shared.local_caplog
from test.shared import local_caplog, LogLocalCaptureFixture # noqa: F401


@patch.object(charging_stations_pipelines.pipelines.de.bna_crawler, 'BeautifulSoup')
@patch.object(charging_stations_pipelines.pipelines.de.bna_crawler, 'download_file')
@patch.object(requests, 'get')
@patch.object(os.path, 'getsize')
def test_get_bna_data_downloads_file_with_correct_url(mock_getsize, mock_requests_get, mock_download_file,
                                                      mock_beautiful_soup):
    """Checks that get_bna_data() fetches the BNA landing page and downloads the file behind the
    extracted link into the given target path."""
    # Mock the requests.get response
    mock_response = Mock()
    mock_response.content = b'something, something...'
    mock_response.status_code = 200
    mock_requests_get.return_value = mock_response

    # Mock the BeautifulSoup find_all method
    # (plain dicts work here because the crawler only calls `.get('href')` on each link)
    mock_beautiful_soup.return_value.find_all.return_value = [{'href': 'https://some_ladesaeulenregister_url.xlsx'}]

    # Mock the os.path.getsize method
    mock_getsize.return_value = 4321

    # Call the method under test
    bna_crawler.get_bna_data('./tmp_data_path/some_ladesaeulenregister_url.xlsx')

    # Ensure these functions were called with the expected arguments.
    mock_requests_get.assert_called_with(
        "https://www.bundesnetzagentur.de/DE/Fachthemen/ElektrizitaetundGas/E-Mobilitaet/start.html",
        headers={"User-Agent": "Mozilla/5.0"})

    # Assert that the download_file method was called with the correct parameters
    mock_download_file.assert_called_once_with(
        'https://some_ladesaeulenregister_url.xlsx', './tmp_data_path/some_ladesaeulenregister_url.xlsx')

    # Assert that the os.path.getsize method was called with the correct parameters
    mock_getsize.assert_called_once_with('./tmp_data_path/some_ladesaeulenregister_url.xlsx')


@patch.object(requests, 'get')
@patch.object(charging_stations_pipelines.pipelines.de.bna_crawler, 'BeautifulSoup')
def test_get_bna_data_logs_error_when_no_download_link_found(mock_beautiful_soup, mock_requests_get, caplog):
    """Checks that get_bna_data() raises ExtractURLException when the page contains no download links."""
    # Mock the requests.get response
    mock_requests_get.return_value = Mock(content=b'some content', status_code=200)

    # Mock the BeautifulSoup find_all method to return no links
    mock_beautiful_soup.return_value.find_all.return_value = []

    with pytest.raises(bna_crawler.ExtractURLException, match='Failed to extract the download url from the website.'):
        # Call the function under test
        bna_crawler.get_bna_data('tmp_data_path')


@patch.object(requests, 'get')
@patch.object(charging_stations_pipelines.pipelines.de.bna_crawler, 'BeautifulSoup')
@patch.object(charging_stations_pipelines.pipelines.de.bna_crawler, 'download_file')
@patch.object(os.path, 'getsize')
def test_get_bna_data_logs_file_size_after_download(mock_getsize, mock_download_file, mock_beautiful_soup,
                                                    mock_requests_get, local_caplog: LogLocalCaptureFixture):
    """Checks that get_bna_data() picks the first link matching the search term and logs the size
    of the downloaded file."""
    # Mock the requests.get response
    mock_requests_get.return_value = Mock(content=b'some content')
    mock_requests_get.return_value.status_code = 200

    # Mock the BeautifulSoup find_all method: the first link does not contain the
    # search term and must be skipped; the second one matches and gets downloaded
    mock_beautiful_soup.return_value.find_all.return_value = [
        {'href': 'some_url_without_search_term.xlsx'},
        {'href': 'tmp_data_path/ladesaeulenregister.xlsx'}
    ]

    # Mock the os.path.getsize method
    mock_getsize.return_value = 1234

    logger = logging.getLogger(test_module_name)
    with local_caplog(level=logging.DEBUG, logger=logger):
        # Call method under test... with mocked logging
        bna_crawler.get_bna_data('tmp_data_path/some_url1_with_search_term.xlsx')

    # Assert that the file size was logged
    assert "Downloaded file size: 1234 bytes" in local_caplog.logs

    # Assert that requests.get was called correctly
    mock_requests_get.assert_called_once_with(
        'https://www.bundesnetzagentur.de/DE/Fachthemen/ElektrizitaetundGas/E-Mobilitaet/start.html',
        headers={'User-Agent': 'Mozilla/5.0'})

    # Assert that BeautifulSoup was called correctly
    mock_beautiful_soup.assert_called_once_with(b'some content', 'html.parser')

    # Assert that download_file was called correctly
    mock_download_file.assert_called_once_with('tmp_data_path/ladesaeulenregister.xlsx',
                                               'tmp_data_path/some_url1_with_search_term.xlsx')

    # Assert that os.path.getsize was called correctly
    mock_getsize.assert_called_once_with('tmp_data_path/some_url1_with_search_term.xlsx')
10 changes: 10 additions & 0 deletions test/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,15 @@ def test_parse_no_task_arg():


def test_parse_invalid_task_arg():
    """Checks that parse_args() exits (argparse raises SystemExit) for an unknown task name."""
    # NOTE: in the pytest output, error messages like...:
    # ----
    # test/test_main.py::test_parse_invalid_task_arg usage: pytest [-h] [-c <country-code> [<country-code> ...]]
    # [-v] [-o] [-d] [--export_file_descriptor <file descriptor>] [--export_format {csv,GeoJSON}] [--export_charging]
    # [--export_merged_stations] [--export_all_countries] [--export_area <lon> <lat> <radius in m>]
    # <task> [<task> ...]
    # pytest: error: argument <task>:
    # invalid choice: 'invalid_task' (choose from 'import', 'merge', 'export', 'testdata')
    # ----
    # ... are an expected side-effect of using `pytest.raises(SystemExit)`
    with pytest.raises(SystemExit):
        parse_args('invalid_task --countries de'.split())
Empty file added tests/__init__.py
Empty file.
Empty file added tests/integration/__init__.py
Empty file.
Loading

0 comments on commit ae5c176

Please sign in to comment.