From d985543a5dff6f67f310114c09558b63d5c9c80c Mon Sep 17 00:00:00 2001 From: David Gasquez Date: Sat, 26 Oct 2024 19:31:18 +0200 Subject: [PATCH] =?UTF-8?q?refactor:=20=F0=9F=94=84=20remove=20outdated=20?= =?UTF-8?q?assets=20and=20simplify=20data=20processing?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Deleted outdated assets for OWID energy and CO2 data retrieval. - Simplified World Bank WDI data processing by removing unnecessary pivot & renaming steps. - Removed `slugify` dependency for column name cleaning. --- datadex/assets/indicators.py | 59 ------------------------------------ datadex/indicators/assets.py | 27 ++--------------- 2 files changed, 3 insertions(+), 83 deletions(-) delete mode 100644 datadex/assets/indicators.py diff --git a/datadex/assets/indicators.py b/datadex/assets/indicators.py deleted file mode 100644 index b5674a5..0000000 --- a/datadex/assets/indicators.py +++ /dev/null @@ -1,59 +0,0 @@ -import io -import zipfile - -import httpx -import polars as pl -from dagster import asset - - -@asset() -def owid_energy_data() -> pl.DataFrame: - """ - Raw Energy data from Our World in Data. - """ - energy_owid_url = ( - "https://raw.githubusercontent.com/owid/energy-data/master/owid-energy-data.csv" - ) - - return pl.read_csv(energy_owid_url) - - -@asset() -def owid_co2_data() -> pl.DataFrame: - """ - Raw CO2 data from Our World in Data. - """ - co2_owid_url = ( - "https://raw.githubusercontent.com/owid/co2-data/master/owid-co2-data.csv" - ) - - return pl.read_csv(co2_owid_url) - - -@asset() -def world_bank_wdi() -> pl.DataFrame: - """ - World Development Indicators (WDI) is the World Bank's premier compilation of cross-country comparable data on development. - - Bulk data download is available at https://datatopics.worldbank.org/world-development-indicators/ - """ - - url = "https://databankfiles.worldbank.org/public/ddpext_download/WDI_CSV.zip" - - response = httpx.get(url) - - zipfile.ZipFile(io.BytesIO(response.content)).extractall(path="/tmp/") - - # Load the WDICSV.csv file as a DataFrame - df = pl.read_csv("/tmp/WDICSV.csv") - - # Reshape the dataframe - df = df.unpivot( - index=["Country Name", "Country Code", "Indicator Name", "Indicator Code"], - value_name="Indicator Value", - variable_name="Year", - ) - - df = df.with_columns(pl.col("Year").cast(pl.Int32)) - - return df diff --git a/datadex/indicators/assets.py b/datadex/indicators/assets.py index c16d404..c404dd5 100644 --- a/datadex/indicators/assets.py +++ b/datadex/indicators/assets.py @@ -4,7 +4,6 @@ import dagster as dg import httpx import polars as pl -from slugify import slugify @dg.asset() @@ -49,32 +48,12 @@ def world_bank_wdi() -> pl.DataFrame: df = pl.read_csv("/tmp/WDICSV.csv") # Reshape the dataframe - df = df.melt( - id_vars=["Country Name", "Country Code", "Indicator Name", "Indicator Code"], + df = df.unpivot( + index=["Country Name", "Country Code", "Indicator Name", "Indicator Code"], value_name="Indicator Value", variable_name="Year", ) - # Make one column per Indicator Name - df = df.pivot( - index=["Country Name", "Country Code", "Year"], - values="Indicator Value", - on="Indicator Value", - ) - - # Cast to floats - df = df.select( - [ - pl.col("Country Name"), - pl.col("Country Code"), - pl.col("Year").cast(pl.Int32), - *[pl.col(col).cast(pl.Float32) for col in df.columns[3:]], - ] - ) - - # Clean column names - df = df.rename( - lambda column_name: slugify(column_name.replace("%", "percent"), separator="_") - ) + df = df.with_columns(pl.col("Year").cast(pl.Int32)) return df