Skip to content

Commit

Permalink
Update: transform.ipynb
Browse files Browse the repository at this point in the history
  • Loading branch information
luke-strange committed Aug 6, 2024
1 parent 2dd515a commit 69a45c4
Showing 1 changed file with 75 additions and 79 deletions.
154 changes: 75 additions & 79 deletions pipelines/metadata/transform.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 1,
"metadata": {},
"outputs": [
{
Expand All @@ -30,7 +30,7 @@
"PosixPath('/Users/lukestrange/Code/housing')"
]
},
"execution_count": 7,
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -44,131 +44,127 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"336"
"337"
]
},
"execution_count": 8,
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"frame = pd.DataFrame()\n",
"active_geographies = pd.DataFrame()\n",
"paths = [\n",
" \"metadata/lookups/Local_Authority_Districts_(April_2023)_Names_and_Codes_in_the_United_Kingdom.csv\", \n",
" \"metadata/lookups/Metropolitan_Counties_(December_2023)_Names_and_Codes_in_EN.csv\", \n",
" \"metadata/lookups/Regions_(December_2023)_Names_and_Codes_in_EN.csv\",\n",
" \"metadata/lookups/Combined_Authorities_(May_2024)_Names_and_Codes_in_England.csv\",\n",
" \"metadata/lookups/Counties_(April_2023)_Names_and_Codes_in_EN.csv\"\n",
" \"metadata/lookups/Counties_(April_2023)_Names_and_Codes_in_EN.csv\",\n",
" \"metadata/lookups/Countries_(December_2023)_Names_and_Codes_in_the_UK.csv\"\n",
" ]\n",
"for path in paths:\n",
" data = pd.read_csv(ROOT / path)\n",
" code_name = data.columns[data.columns.str.endswith('CD')].values[0]\n",
" geo_name = data.columns[data.columns.str.endswith('NM')].values[0]\n",
" data.rename(columns={f'{code_name}': 'geography_code', f'{geo_name}': 'geography_name'}, inplace=True)\n",
" data = data[['geography_code', 'geography_name']]\n",
" frame = pd.concat([frame, data])\n",
" active_geographies = pd.concat([active_geographies, data])\n",
"\n",
"frame = frame[~frame['geography_code'].str.startswith(('W', 'S', 'N'))]\n",
"frame.reset_index(inplace=True, drop=True)\n",
"frame['active'] = 'true'\n",
"frame.set_index('geography_code', inplace=True)\n",
"frame.to_json(ROOT / 'metadata/temp/active_geographies.json', orient='index', indent=4)\n",
"active_geographies = active_geographies[~active_geographies['geography_code'].str.startswith(('W', 'S', 'N', 'K'))]\n",
"active_geographies.reset_index(inplace=True, drop=True)\n",
"active_geographies['active'] = 'true'\n",
"active_geographies.set_index('geography_code', inplace=True)\n",
"active_geographies.to_json(ROOT / 'metadata/temp/active_geographies.json', orient='index', indent=4)\n",
"\n",
"len(frame.geography_name.unique())"
"len(active_geographies.geography_name.unique())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Inactive geographies"
]
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"p = pd.read_csv(ROOT / 'data/vacant-homes/absolute.csv')\n",
"p = p[p.Year == max(p.Year)]"
"files = [ROOT / 'data/vacant-homes/AllCombined_Cleaned_2024.csv', ROOT / 'data/house-prices/median_house_prices.csv', ROOT / 'data/affordable-homes/by_tenure.csv']\n",
"inactive_geographies = pd.DataFrame(columns=['geography_code', 'geography_name'])\n",
"for file in files:\n",
" # Read the file\n",
" d = pd.read_csv(file)\n",
" \n",
" columns = d.columns.to_list()\n",
" assert 'geography_code' in columns, 'No column geography_code'\n",
" assert 'geography_name' in columns, 'No column geography_name'\n",
"\n",
" # Group the names and codes to get unique combinations, drop the size column.\n",
" d = d.groupby(['geography_code', 'geography_name']).size().reset_index().drop(columns=0)\n",
"\n",
" # fix some known naming bugs.\n",
" d['geography_name'] = d['geography_name'].str.replace('&', 'and')\n",
" d['geography_name'] = d['geography_name'].str.replace('St Edmundsbury', 'St. Edmundsbury')\n",
"\n",
" # Ensure no duplicates remain\n",
" d.drop_duplicates(inplace=True)\n",
"\n",
" # Get lists of unique codes and names in the current dataset\n",
" unique_active_codes = active_geographies.index.unique()\n",
" unique_active_names = active_geographies['geography_name'].unique()\n",
" df_A = active_geographies.reset_index().drop(columns='active')\n",
" df_B = d\n",
" # Merge DataFrames with indicator to show the source of each row\n",
" merged_df = df_B.merge(df_A, how='left', indicator=True)\n",
"\n",
" # Filter rows that are only in DataFrame B\n",
" unique_to_B = merged_df[merged_df['_merge'] == 'left_only']\n",
"\n",
" # Drop the _merge column\n",
" unique_to_B = unique_to_B.drop(columns='_merge')\n",
" inactive_geographies = pd.concat([unique_to_B, inactive_geographies])\n",
"\n",
"# Set the active status remaining geographies to false\n",
"inactive_geographies['active'] = 'false'\n",
"\n",
"inactive_geographies.set_index('geography_code', inplace=True)\n",
"# Drop any duplicates that came from multiple files\n",
"inactive_geographies.drop_duplicates(inplace=True)\n",
"inactive_geographies.to_json(ROOT / 'metadata/temp/inactive_geographies.json', orient='index', indent=4)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"E06000028\n",
"E06000029\n",
"E07000004\n",
"E07000005\n",
"E07000006\n",
"E07000007\n",
"E07000026\n",
"E07000027\n",
"E07000028\n",
"E07000029\n",
"E07000030\n",
"E07000031\n",
"E07000048\n",
"E07000049\n",
"E07000050\n",
"E07000051\n",
"E07000052\n",
"E07000053\n",
"E07000150\n",
"E07000151\n",
"E07000152\n",
"E07000153\n",
"E07000154\n",
"E07000155\n",
"E07000156\n",
"E07000163\n",
"E07000164\n",
"E07000165\n",
"E07000166\n",
"E07000167\n",
"E07000168\n",
"E07000169\n",
"E07000187\n",
"E07000188\n",
"E07000189\n",
"E07000190\n",
"E07000191\n",
"E07000201\n",
"E07000204\n",
"E07000205\n",
"E07000206\n",
"E07000246\n",
"E10000002\n",
"E10000006\n",
"E10000009\n",
"E10000021\n",
"E10000023\n",
"E10000027\n",
"E11000004\n",
"E92000001\n"
"Contains no duplicates...\n",
" writing to JSON file.\n"
]
}
],
"source": [
"len(p.AreaCode.unique())\n",
"for i in p.AreaCode.unique():\n",
" if i not in frame.index.unique():\n",
" print(i) # This is a list of inactive codes that were in the vacant-homes data."
"combined = pd.concat([active_geographies, inactive_geographies])\n",
"dupes = combined[combined.index.duplicated()]\n",
"if dupes.empty:\n",
" print('Contains no duplicates...\\n writing to JSON file.')\n",
" combined.to_json(ROOT / \"metadata/UK_geo_activity_status.json\", orient='index', indent=4)\n",
"else: \n",
" print(dupes)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down

0 comments on commit 69a45c4

Please sign in to comment.