Speed up data processing #683

Merged
13 commits merged on Oct 23, 2024
7 changes: 7 additions & 0 deletions data/README.md
@@ -40,6 +40,13 @@ We utilize Python libraries such as Pandas and NumPy to perform various calculations

If you notice any test failing, please submit a ticket about it.

### Performance profiling

Requires Graphviz installed on the OS: https://www.graphviz.org/download/
```sh
py.test tests --profile-svg && open prof/combined.svg
```
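
For a text summary instead of the SVG call graph, the raw profile written by `pytest-profiling` (assuming the default `prof/combined.prof` output location) can be inspected with Python's standard `pstats` module, e.g.:

```python
import pstats

# Load the combined profile and print the 20 entries with the highest cumulative time.
stats = pstats.Stats("prof/combined.prof")
stats.sort_stats("cumulative").print_stats(20)
```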

### How to Update Data on Site

To recalculate and refresh the site's data, navigate to the `/data` folder and execute the following command:
109 changes: 109 additions & 0 deletions data/issues/emissions/cache_utilities.py
@@ -0,0 +1,109 @@
import hashlib
import os
from typing import Callable, Optional

import pandas as pd
import pyarrow.feather as feather


def cache_df(f: Optional[Callable] = None, path: str = '', freq: str = '1Y') -> Callable:
"""
    Cache the DataFrame to an intermediate file and reuse it if it was created within the current period.

    Args:
        f: The function to cache (e.g., a function that loads a DataFrame). If omitted,
            cache_df returns a decorator configured with the given path and freq.
        path: Default path to the input file, used when the wrapped function is called
            without a path argument.
        freq: Cache period, e.g. '1D', '1M', '1Y'. Defaults to '1Y'.

    Returns:
        A wrapped function that returns the cached output and only calls the original
        function when no cache exists for the current period.

Example usage:

Create a test Excel file:
>>> df_test = pd.DataFrame({"A": [1], "B": [2]})
>>> test_path = "test_data.xlsx"
>>> df_test.to_excel(test_path, index=False)

Use the decorator to cache the DataFrame loaded from the file:
>>> @cache_df
... def load_data(path):
... print("Creating DataFrame from file (first call)...")
... return pd.read_excel(path)
>>> load_data.__name__
'cache_df_load_data'
>>> print(load_data(test_path))
Creating DataFrame from file (first call)...
A B
0 1 2

>>> print(load_data(test_path)) # Data loaded from cache, no print output
A B
0 1 2

Use the decorator with a short expiration time:
>>> @cache_df(path=test_path, freq='1ms')
... def load_data_short_expiry(path=test_path):
... print("Creating DataFrame from file (short expiration)...")
... return pd.read_excel(path)
>>> print(load_data_short_expiry())
Creating DataFrame from file (short expiration)...
A B
0 1 2

>>> import time
>>> time.sleep(0.001) # Sleep for 1 millisecond

>>> print(load_data_short_expiry()) # Data expired, loading again from file
Creating DataFrame from file (short expiration)...
A B
0 1 2

Clean up the test files:
>>> file_hash = hashlib.md5(test_path.encode()).hexdigest()
>>> os.remove(test_path)
>>> os.remove(f"cache_df_load_data_{file_hash}.feather")
>>> os.remove(f"cache_df_load_data_{file_hash}.pkl")
>>> os.remove(f"cache_df_load_data_short_expiry_{file_hash}.feather")
>>> os.remove(f"cache_df_load_data_short_expiry_{file_hash}.pkl")
"""
if f is None:
return lambda f: cache_df(f, path=path, freq=freq)

def caching_f(*args, **kwargs):
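        # Resolve the input path: an explicit 'path' keyword wins, then the first
        # positional argument, then the default given to the decorator.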
input_path = kwargs.get('path') or (args[0] if args else path)

# Create a hash of the path for the cache file
path_hash = hashlib.md5(input_path.encode()).hexdigest()
df_file = f'cache_df_{f.__name__}_{path_hash}.feather'
columns_file = f'cache_df_{f.__name__}_{path_hash}.pkl'

        # Use the cache only if the cached DataFrame file exists and was written in the current period
if os.path.exists(df_file):
stat = os.stat(df_file)
cache_mtime = pd.Timestamp(stat.st_mtime_ns // 1_000_000, unit='ms')
if pd.Period(pd.Timestamp.now(), freq=freq) == pd.Period(cache_mtime, freq=freq):
# Load cached data
df = pd.read_feather(df_file)
# Load original column names
if os.path.exists(columns_file):
original_columns = pd.read_pickle(columns_file)
df.columns = original_columns
return df

# Process and cache the data
df = f(*args, **kwargs)
feather.write_feather(df, df_file)

        # Save the original column names separately, since Feather stores headers as strings
        # and would not preserve non-string column names
pd.to_pickle(df.columns, columns_file)

return df

caching_f.__name__ = 'cache_df_' + f.__name__
return caching_f


if __name__ == "__main__":
import doctest

doctest.testmod()
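
Since `cache_df` keys its cache files on the wrapped function's name and an MD5 hash of the input path, stale data can be cleared by deleting those files. A minimal sketch of such a helper (hypothetical, not part of this PR; the name `clear_cache` is an assumption):

```python
import hashlib
import os


def clear_cache(func_name: str, path: str) -> None:
    """Delete the feather/pickle files that cache_df wrote for func_name and path."""
    path_hash = hashlib.md5(path.encode()).hexdigest()
    for suffix in ("feather", "pkl"):
        cache_file = f"cache_df_{func_name}_{path_hash}.{suffix}"
        if os.path.exists(cache_file):
            os.remove(cache_file)


# Example: clear the cache written for the doctest's load_data function.
# clear_cache("load_data", "test_data.xlsx")
```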
7 changes: 5 additions & 2 deletions data/issues/emissions/historical_data_calculations.py
@@ -1,18 +1,21 @@
import pandas as pd

from .cache_utilities import cache_df

PATH_SMHI = 'https://nationellaemissionsdatabasen.smhi.se/' + \
'api/getexcelfile/?county=0&municipality=0&sub=CO2'


def get_smhi_data():
@cache_df(path=PATH_SMHI)
def get_smhi_data(path=PATH_SMHI):
"""
Downloads data from SMHI and loads it into a pandas dataframe.

Returns:
pandas.DataFrame: The dataframe containing the SMHI data.
"""

df_raw = pd.read_excel(PATH_SMHI)
df_raw = pd.read_excel(path, engine="openpyxl")

# Remove the first 4 rows and reset the index
df_raw = df_raw.drop([0, 1, 2]).reset_index(drop=True)
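
A short usage sketch (not part of the diff) of the decorated loader; the import path is assumed from the repository layout:

```python
from data.issues.emissions.historical_data_calculations import get_smhi_data  # assumed import path

# The first call in the current '1Y' period downloads the SMHI workbook and writes
# cache_df_get_smhi_data_<md5-of-url>.feather / .pkl to the working directory.
df = get_smhi_data()

# A second call within the same period is served from the Feather cache,
# so the SMHI endpoint is not hit again.
df_again = get_smhi_data()
```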
6 changes: 6 additions & 0 deletions data/pyproject.toml
@@ -0,0 +1,6 @@
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "klimatkollen-data"
4 changes: 3 additions & 1 deletion data/requirements.txt
@@ -4,4 +4,6 @@ xlrd
openpyxl
pyarrow
scipy
pytest
pytest
pytest-profiling
graphviz