From d985543a5dff6f67f310114c09558b63d5c9c80c Mon Sep 17 00:00:00 2001
From: David Gasquez <davidgasquez@gmail.com>
Date: Sat, 26 Oct 2024 19:31:18 +0200
Subject: [PATCH] =?UTF-8?q?refactor:=20=F0=9F=94=84=20remove=20outdated=20?=
 =?UTF-8?q?assets=20and=20simplify=20data=20processing?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

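- Deleted the outdated `datadex/assets/indicators.py` module, which held the OWID energy and CO2 data retrieval assets plus a second definition of the `world_bank_wdi` asset (the one in `datadex/indicators/assets.py` remains).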
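- Simplified World Bank WDI data processing: switched `melt` to `unpivot` and removed the unnecessary pivot, float-cast & renaming steps, so the asset now returns the data in long format with `Year` cast to `Int32`.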
- Removed the `slugify` import, which was used for column name cleaning.
---
 datadex/assets/indicators.py | 59 ------------------------------------
 datadex/indicators/assets.py | 27 ++---------------
 2 files changed, 3 insertions(+), 83 deletions(-)
 delete mode 100644 datadex/assets/indicators.py

diff --git a/datadex/assets/indicators.py b/datadex/assets/indicators.py
deleted file mode 100644
index b5674a5..0000000
--- a/datadex/assets/indicators.py
+++ /dev/null
@@ -1,59 +0,0 @@
-import io
-import zipfile
-
-import httpx
-import polars as pl
-from dagster import asset
-
-
-@asset()
-def owid_energy_data() -> pl.DataFrame:
-    """
-    Raw Energy data from Our World in Data.
-    """
-    energy_owid_url = (
-        "https://raw.githubusercontent.com/owid/energy-data/master/owid-energy-data.csv"
-    )
-
-    return pl.read_csv(energy_owid_url)
-
-
-@asset()
-def owid_co2_data() -> pl.DataFrame:
-    """
-    Raw CO2 data from Our World in Data.
-    """
-    co2_owid_url = (
-        "https://raw.githubusercontent.com/owid/co2-data/master/owid-co2-data.csv"
-    )
-
-    return pl.read_csv(co2_owid_url)
-
-
-@asset()
-def world_bank_wdi() -> pl.DataFrame:
-    """
-    World Development Indicators (WDI) is the World Bank's premier compilation of cross-country comparable data on development.
-
-    Bulk data download is available at https://datatopics.worldbank.org/world-development-indicators/
-    """
-
-    url = "https://databankfiles.worldbank.org/public/ddpext_download/WDI_CSV.zip"
-
-    response = httpx.get(url)
-
-    zipfile.ZipFile(io.BytesIO(response.content)).extractall(path="/tmp/")
-
-    # Load the WDICSV.csv file as a DataFrame
-    df = pl.read_csv("/tmp/WDICSV.csv")
-
-    # Reshape the dataframe
-    df = df.unpivot(
-        index=["Country Name", "Country Code", "Indicator Name", "Indicator Code"],
-        value_name="Indicator Value",
-        variable_name="Year",
-    )
-
-    df = df.with_columns(pl.col("Year").cast(pl.Int32))
-
-    return df
diff --git a/datadex/indicators/assets.py b/datadex/indicators/assets.py
index c16d404..c404dd5 100644
--- a/datadex/indicators/assets.py
+++ b/datadex/indicators/assets.py
@@ -4,7 +4,6 @@
 import dagster as dg
 import httpx
 import polars as pl
-from slugify import slugify
 
 
 @dg.asset()
@@ -49,32 +48,12 @@ def world_bank_wdi() -> pl.DataFrame:
     df = pl.read_csv("/tmp/WDICSV.csv")
 
     # Reshape the dataframe
-    df = df.melt(
-        id_vars=["Country Name", "Country Code", "Indicator Name", "Indicator Code"],
+    df = df.unpivot(
+        index=["Country Name", "Country Code", "Indicator Name", "Indicator Code"],
         value_name="Indicator Value",
         variable_name="Year",
     )
 
-    # Make one column per Indicator Name
-    df = df.pivot(
-        index=["Country Name", "Country Code", "Year"],
-        values="Indicator Value",
-        on="Indicator Value",
-    )
-
-    # Cast to floats
-    df = df.select(
-        [
-            pl.col("Country Name"),
-            pl.col("Country Code"),
-            pl.col("Year").cast(pl.Int32),
-            *[pl.col(col).cast(pl.Float32) for col in df.columns[3:]],
-        ]
-    )
-
-    # Clean column names
-    df = df.rename(
-        lambda column_name: slugify(column_name.replace("%", "percent"), separator="_")
-    )
+    df = df.with_columns(pl.col("Year").cast(pl.Int32))
 
     return df