-
-
Notifications
You must be signed in to change notification settings - Fork 23
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
14 changed files
with
8,602 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
211 changes: 211 additions & 0 deletions
211
etl/steps/data/garden/worldbank_wdi/2024-12-27/update_metadata.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,211 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"attachments": {}, | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Update metadata\n", | ||
"\n", | ||
"Update `wdi.meta.yml` from WDI metadata file. This notebook is intended to be run manually and all changes to the YAML file need to be verified." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from wdi import load_variable_metadata\n", | ||
"\n", | ||
"df_vars = load_variable_metadata()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"df_vars.head()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import ruamel.yaml\n", | ||
"\n", | ||
"yaml_path = \"wdi.meta.yml\"\n", | ||
"\n", | ||
"with open(yaml_path, \"r\") as f:\n", | ||
" yml = ruamel.yaml.load(f, Loader=ruamel.yaml.RoundTripLoader)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import re\n", | ||
"from typing import Union\n", | ||
"\n", | ||
"\n", | ||
"def replace_years(s: str, year: Union[int, str]) -> str:\n", | ||
" \"\"\"replaces all years in string with {year}.\n", | ||
"\n", | ||
" Example:\n", | ||
"\n", | ||
" >>> replace_years(\"GDP (constant 2010 US$)\", 2015)\n", | ||
" \"GDP (constant 2015 US$)\"\n", | ||
" \"\"\"\n", | ||
" year_regex = re.compile(r\"\\b([1-2]\\d{3})\\b\")\n", | ||
" s_new = year_regex.sub(str(year), s)\n", | ||
" return s_new\n", | ||
"\n", | ||
"\n", | ||
"variables = yml[\"tables\"][\"wdi\"][\"variables\"]\n", | ||
"\n", | ||
"for indicator_code in df_vars.index:\n", | ||
" if indicator_code in variables:\n", | ||
" var = variables[indicator_code]\n", | ||
" else:\n", | ||
" var = {}\n", | ||
" variables[indicator_code] = var\n", | ||
"\n", | ||
" # update titles from metadata file\n", | ||
" try:\n", | ||
" var[\"title\"] = df_vars.loc[indicator_code].indicator_name\n", | ||
" except KeyError:\n", | ||
" continue\n", | ||
"\n", | ||
" # if title contains year, try to update units too\n", | ||
" year_regex = re.compile(r\"\\b([1-2]\\d{3})\\b\")\n", | ||
" regex_res = year_regex.search(df_vars.loc[indicator_code].indicator_name)\n", | ||
" if regex_res:\n", | ||
" assert len(regex_res.groups()) == 1\n", | ||
" year = regex_res.groups()[0]\n", | ||
"\n", | ||
" if \"unit\" in var:\n", | ||
" var[\"unit\"] = replace_years(var[\"unit\"], year)\n", | ||
"\n", | ||
" if \"short_unit\" in var:\n", | ||
" var[\"short_unit\"] = replace_years(var[\"short_unit\"], year)\n", | ||
"\n", | ||
" for k in [\"name\", \"unit\", \"short_unit\"]:\n", | ||
" if var.get(\"display\", {}).get(\"unit\"):\n", | ||
" var[\"display\"][\"unit\"] = replace_years(var[\"display\"][\"unit\"], year)\n", | ||
"\n", | ||
" if var.get(\"display\", {}).get(\"short_unit\"):\n", | ||
" var[\"display\"][\"short_unit\"] = replace_years(var[\"display\"][\"short_unit\"], year)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"with open(yaml_path, \"w\") as f:\n", | ||
" ruamel.yaml.dump(yml, f, Dumper=ruamel.yaml.RoundTripDumper, width=120)" | ||
] | ||
}, | ||
{ | ||
"attachments": {}, | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Update Sources" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import json\n", | ||
"\n", | ||
"with open(\"wdi.sources.json\", \"r\") as f:\n", | ||
" sources = json.load(f)\n", | ||
"\n", | ||
"sources = [s for s in sources if not s[\"name\"].startswith(\"TODO\")]\n", | ||
"\n", | ||
"missing_sources = list(set(df_vars[\"source\"]) - {s[\"rawName\"] for s in sources})\n", | ||
"missing_sources" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import os\n", | ||
"from openai import OpenAI\n", | ||
"import random\n", | ||
"\n", | ||
"SYSTEM_PROMPT = f\"\"\"\n", | ||
"You are given list of examples in JSON format you should use for learning. Each example has\n", | ||
"rawName and fields name and dataPublisherSource are derived from rawName.\n", | ||
"I'll give you a list of rawNames and you should give me a JSON list of those\n", | ||
"rawNames with name and dataPublisherSource fields filled in.\n", | ||
"\n", | ||
"Examples:\n", | ||
"{json.dumps(random.sample(sources, 20))}\n", | ||
"\"\"\"\n", | ||
"\n", | ||
"all_sources = \"\\n\".join(missing_sources)\n", | ||
"\n", | ||
"messages = [\n", | ||
" {\n", | ||
" \"role\": \"system\",\n", | ||
" \"content\": SYSTEM_PROMPT,\n", | ||
" },\n", | ||
" {\n", | ||
" \"role\": \"user\",\n", | ||
" \"content\": all_sources,\n", | ||
" },\n", | ||
"]\n", | ||
"\n", | ||
"client = OpenAI()\n", | ||
"\n", | ||
"# 10 missing sources / 5 examples -> 2min\n", | ||
"response = client.chat.completions.create(\n", | ||
" model=\"gpt-4o\",\n", | ||
" temperature=0,\n", | ||
" messages=messages,\n", | ||
" response_format={\"type\": \"json_object\"},\n", | ||
")\n", | ||
"print(f\"Cost GPT4o: ${response.usage.total_tokens / 1e6 * 7.5:.2f}\")\n", | ||
"r = json.loads(response.choices[0].message.content)\n", | ||
"print(json.dumps(r, ensure_ascii=False, indent=2))" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": ".venv", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.10.0" | ||
}, | ||
"orig_nbformat": 4 | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Oops, something went wrong.