Skip to content

Commit

Permalink
copy steps
Browse files Browse the repository at this point in the history
  • Loading branch information
Marigold committed Dec 27, 2024
1 parent 464de50 commit b831d3c
Show file tree
Hide file tree
Showing 14 changed files with 8,602 additions and 0 deletions.
15 changes: 15 additions & 0 deletions dag/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -530,6 +530,7 @@ steps:
- data://garden/news/2024-05-23/gdelt_v2

# World Development Indicators - WDI
# TO BE ARCHIVED
data://meadow/worldbank_wdi/2024-05-20/wdi:
- snapshot://worldbank_wdi/2024-05-20/wdi.zip

Expand All @@ -543,6 +544,20 @@ steps:
data://grapher/worldbank_wdi/2024-05-20/wdi:
- data://garden/worldbank_wdi/2024-05-20/wdi

# World Development Indicators - WDI
data://meadow/worldbank_wdi/2024-12-27/wdi:
- snapshot://worldbank_wdi/2024-12-27/wdi.zip

data://garden/worldbank_wdi/2024-12-27/wdi:
- snapshot://worldbank_wdi/2024-12-27/wdi.zip
- data://meadow/worldbank_wdi/2024-12-27/wdi
- data://garden/demography/2024-07-15/population
- data://garden/regions/2023-01-01/regions
- data://garden/wb/2024-07-29/income_groups

data://grapher/worldbank_wdi/2024-12-27/wdi:
- data://garden/worldbank_wdi/2024-12-27/wdi

#
# Aviation Safety Network - Aviation Statistics.
#
Expand Down
211 changes: 211 additions & 0 deletions etl/steps/data/garden/worldbank_wdi/2024-12-27/update_metadata.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Update metadata\n",
"\n",
"Update `wdi.meta.yml` from WDI metadata file. This notebook is intended to be run manually and all changes to the YAML file need to be verified."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from wdi import load_variable_metadata\n",
"\n",
"df_vars = load_variable_metadata()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_vars.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import ruamel.yaml\n",
"\n",
"yaml_path = \"wdi.meta.yml\"\n",
"\n",
"with open(yaml_path, \"r\") as f:\n",
" yml = ruamel.yaml.load(f, Loader=ruamel.yaml.RoundTripLoader)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"from typing import Union\n",
"\n",
"\n",
"def replace_years(s: str, year: Union[int, str]) -> str:\n",
" \"\"\"replaces all years in string with {year}.\n",
"\n",
" Example:\n",
"\n",
" >>> replace_years(\"GDP (constant 2010 US$)\", 2015)\n",
" \"GDP (constant 2015 US$)\"\n",
" \"\"\"\n",
" year_regex = re.compile(r\"\\b([1-2]\\d{3})\\b\")\n",
" s_new = year_regex.sub(str(year), s)\n",
" return s_new\n",
"\n",
"\n",
"variables = yml[\"tables\"][\"wdi\"][\"variables\"]\n",
"\n",
"for indicator_code in df_vars.index:\n",
" if indicator_code in variables:\n",
" var = variables[indicator_code]\n",
" else:\n",
" var = {}\n",
" variables[indicator_code] = var\n",
"\n",
" # update titles from metadata file\n",
" try:\n",
" var[\"title\"] = df_vars.loc[indicator_code].indicator_name\n",
" except KeyError:\n",
" continue\n",
"\n",
" # if title contains year, try to update units too\n",
" year_regex = re.compile(r\"\\b([1-2]\\d{3})\\b\")\n",
" regex_res = year_regex.search(df_vars.loc[indicator_code].indicator_name)\n",
" if regex_res:\n",
" assert len(regex_res.groups()) == 1\n",
" year = regex_res.groups()[0]\n",
"\n",
" if \"unit\" in var:\n",
" var[\"unit\"] = replace_years(var[\"unit\"], year)\n",
"\n",
" if \"short_unit\" in var:\n",
" var[\"short_unit\"] = replace_years(var[\"short_unit\"], year)\n",
"\n",
" for k in [\"name\", \"unit\", \"short_unit\"]:\n",
" if var.get(\"display\", {}).get(\"unit\"):\n",
" var[\"display\"][\"unit\"] = replace_years(var[\"display\"][\"unit\"], year)\n",
"\n",
" if var.get(\"display\", {}).get(\"short_unit\"):\n",
" var[\"display\"][\"short_unit\"] = replace_years(var[\"display\"][\"short_unit\"], year)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with open(yaml_path, \"w\") as f:\n",
" ruamel.yaml.dump(yml, f, Dumper=ruamel.yaml.RoundTripDumper, width=120)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Update Sources"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"with open(\"wdi.sources.json\", \"r\") as f:\n",
" sources = json.load(f)\n",
"\n",
"sources = [s for s in sources if not s[\"name\"].startswith(\"TODO\")]\n",
"\n",
"missing_sources = list(set(df_vars[\"source\"]) - {s[\"rawName\"] for s in sources})\n",
"missing_sources"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from openai import OpenAI\n",
"import random\n",
"\n",
"SYSTEM_PROMPT = f\"\"\"\n",
"You are given list of examples in JSON format you should use for learning. Each example has\n",
"rawName and fields name and dataPublisherSource are derived from rawName.\n",
"I'll give you a list of rawNames and you should give me a JSON list of those\n",
"rawNames with name and dataPublisherSource fields filled in.\n",
"\n",
"Examples:\n",
"{json.dumps(random.sample(sources, 20))}\n",
"\"\"\"\n",
"\n",
"all_sources = \"\\n\".join(missing_sources)\n",
"\n",
"messages = [\n",
" {\n",
" \"role\": \"system\",\n",
" \"content\": SYSTEM_PROMPT,\n",
" },\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": all_sources,\n",
" },\n",
"]\n",
"\n",
"client = OpenAI()\n",
"\n",
"# 10 missing sources / 5 examples -> 2min\n",
"response = client.chat.completions.create(\n",
" model=\"gpt-4o\",\n",
" temperature=0,\n",
" messages=messages,\n",
" response_format={\"type\": \"json_object\"},\n",
")\n",
"print(f\"Cost GPT4o: ${response.usage.total_tokens / 1e6 * 7.5:.2f}\")\n",
"r = json.loads(response.choices[0].message.content)\n",
"print(json.dumps(r, ensure_ascii=False, indent=2))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.0"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit b831d3c

Please sign in to comment.