copy steps

owid · Dec 27, 2024 · b831d3c · b831d3c
1 parent 464de50
commit b831d3c
Show file tree

Hide file tree

Showing 14 changed files with 8,602 additions and 0 deletions.
diff --git a/dag/main.yml b/dag/main.yml
@@ -530,6 +530,7 @@ steps:
     - data://garden/news/2024-05-23/gdelt_v2
 
   # World Development Indicators - WDI
+  # TO BE ARCHIVED
   data://meadow/worldbank_wdi/2024-05-20/wdi:
     - snapshot://worldbank_wdi/2024-05-20/wdi.zip
 
@@ -543,6 +544,20 @@ steps:
   data://grapher/worldbank_wdi/2024-05-20/wdi:
     - data://garden/worldbank_wdi/2024-05-20/wdi
 
+  # World Development Indicators - WDI
+  data://meadow/worldbank_wdi/2024-12-27/wdi:
+    - snapshot://worldbank_wdi/2024-12-27/wdi.zip
+
+  data://garden/worldbank_wdi/2024-12-27/wdi:
+    - snapshot://worldbank_wdi/2024-12-27/wdi.zip
+    - data://meadow/worldbank_wdi/2024-12-27/wdi
+    - data://garden/demography/2024-07-15/population
+    - data://garden/regions/2023-01-01/regions
+    - data://garden/wb/2024-07-29/income_groups
+
+  data://grapher/worldbank_wdi/2024-12-27/wdi:
+    - data://garden/worldbank_wdi/2024-12-27/wdi
+
   #
   # Aviation Safety Network - Aviation Statistics.
   #

diff --git a/etl/steps/data/garden/worldbank_wdi/2024-12-27/update_metadata.ipynb b/etl/steps/data/garden/worldbank_wdi/2024-12-27/update_metadata.ipynb
@@ -0,0 +1,211 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Update metadata\n",
+    "\n",
+    "Update `wdi.meta.yml` from WDI metadata file. This notebook is intended to be run manually and all changes to the YAML file need to be verified."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from wdi import load_variable_metadata\n",
+    "\n",
+    "df_vars = load_variable_metadata()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_vars.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import ruamel.yaml\n",
+    "\n",
+    "yaml_path = \"wdi.meta.yml\"\n",
+    "\n",
+    "with open(yaml_path, \"r\") as f:\n",
+    "    yml = ruamel.yaml.load(f, Loader=ruamel.yaml.RoundTripLoader)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "from typing import Union\n",
+    "\n",
+    "\n",
+    "def replace_years(s: str, year: Union[int, str]) -> str:\n",
+    "    \"\"\"replaces all years in string with {year}.\n",
+    "\n",
+    "    Example:\n",
+    "\n",
+    "        >>> replace_years(\"GDP (constant 2010 US$)\", 2015)\n",
+    "        \"GDP (constant 2015 US$)\"\n",
+    "    \"\"\"\n",
+    "    year_regex = re.compile(r\"\\b([1-2]\\d{3})\\b\")\n",
+    "    s_new = year_regex.sub(str(year), s)\n",
+    "    return s_new\n",
+    "\n",
+    "\n",
+    "variables = yml[\"tables\"][\"wdi\"][\"variables\"]\n",
+    "\n",
+    "for indicator_code in df_vars.index:\n",
+    "    if indicator_code in variables:\n",
+    "        var = variables[indicator_code]\n",
+    "    else:\n",
+    "        var = {}\n",
+    "        variables[indicator_code] = var\n",
+    "\n",
+    "    # update titles from metadata file\n",
+    "    try:\n",
+    "        var[\"title\"] = df_vars.loc[indicator_code].indicator_name\n",
+    "    except KeyError:\n",
+    "        continue\n",
+    "\n",
+    "    # if title contains year, try to update units too\n",
+    "    year_regex = re.compile(r\"\\b([1-2]\\d{3})\\b\")\n",
+    "    regex_res = year_regex.search(df_vars.loc[indicator_code].indicator_name)\n",
+    "    if regex_res:\n",
+    "        assert len(regex_res.groups()) == 1\n",
+    "        year = regex_res.groups()[0]\n",
+    "\n",
+    "        if \"unit\" in var:\n",
+    "            var[\"unit\"] = replace_years(var[\"unit\"], year)\n",
+    "\n",
+    "        if \"short_unit\" in var:\n",
+    "            var[\"short_unit\"] = replace_years(var[\"short_unit\"], year)\n",
+    "\n",
+    "        for k in [\"name\", \"unit\", \"short_unit\"]:\n",
+    "            if var.get(\"display\", {}).get(\"unit\"):\n",
+    "                var[\"display\"][\"unit\"] = replace_years(var[\"display\"][\"unit\"], year)\n",
+    "\n",
+    "            if var.get(\"display\", {}).get(\"short_unit\"):\n",
+    "                var[\"display\"][\"short_unit\"] = replace_years(var[\"display\"][\"short_unit\"], year)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(yaml_path, \"w\") as f:\n",
+    "    ruamel.yaml.dump(yml, f, Dumper=ruamel.yaml.RoundTripDumper, width=120)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Update Sources"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "\n",
+    "with open(\"wdi.sources.json\", \"r\") as f:\n",
+    "    sources = json.load(f)\n",
+    "\n",
+    "sources = [s for s in sources if not s[\"name\"].startswith(\"TODO\")]\n",
+    "\n",
+    "missing_sources = list(set(df_vars[\"source\"]) - {s[\"rawName\"] for s in sources})\n",
+    "missing_sources"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from openai import OpenAI\n",
+    "import random\n",
+    "\n",
+    "SYSTEM_PROMPT = f\"\"\"\n",
+    "You are given list of examples in JSON format you should use for learning. Each example has\n",
+    "rawName and fields name and dataPublisherSource are derived from rawName.\n",
+    "I'll give you a list of rawNames and you should give me a JSON list of those\n",
+    "rawNames with name and dataPublisherSource fields filled in.\n",
+    "\n",
+    "Examples:\n",
+    "{json.dumps(random.sample(sources, 20))}\n",
+    "\"\"\"\n",
+    "\n",
+    "all_sources = \"\\n\".join(missing_sources)\n",
+    "\n",
+    "messages = [\n",
+    "    {\n",
+    "        \"role\": \"system\",\n",
+    "        \"content\": SYSTEM_PROMPT,\n",
+    "    },\n",
+    "    {\n",
+    "        \"role\": \"user\",\n",
+    "        \"content\": all_sources,\n",
+    "    },\n",
+    "]\n",
+    "\n",
+    "client = OpenAI()\n",
+    "\n",
+    "# 10 missing sources / 5 examples -> 2min\n",
+    "response = client.chat.completions.create(\n",
+    "    model=\"gpt-4o\",\n",
+    "    temperature=0,\n",
+    "    messages=messages,\n",
+    "    response_format={\"type\": \"json_object\"},\n",
+    ")\n",
+    "print(f\"Cost GPT4o: ${response.usage.total_tokens / 1e6 * 7.5:.2f}\")\n",
+    "r = json.loads(response.choices[0].message.content)\n",
+    "print(json.dumps(r, ensure_ascii=False, indent=2))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.0"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}