-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexplore_re_data.py
569 lines (474 loc) · 18.5 KB
/
explore_re_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
"""
get pricing data from the web or a CSV file, explore it, and plot it.
author: christoph kugler
date: 2022-06-13
"""
import textwrap
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
import seaborn as sns
import numpy as np
#%%
def read_csv(path="https://go.gv.at/l9kaufpreissammlungliegenschaften"):
"""
Read the data of real estate transaction from the internet (default)
or from your computer and return a dataframe.
The csv-file will be stored in the same folder as script!
(even if you did save it already somewhere else on your computer, e.g. in "Downloads".)
"""
# check if data is already saved as csv file in same folder as this script:
# if file exists, read csv file from folder:
csv_file = "kaufpreissammlung-liegenschaften.csv"
loc_path = Path(csv_file)
if loc_path.is_file():
print(f"\nThe file {csv_file} has already been downloaded")
print(f"Reading data from {loc_path}\n")
df = pd.read_csv(
csv_file,
sep=";",
encoding="Latin-1",
low_memory=False,
parse_dates=["Erwerbsdatum", "BJ"],
)
df["BJ"] = df["BJ"].dt.year.astype("Int64") # to get just year
else:
print(f"The file {csv_file} does not exist")
print("Reading data from: " + path)
df = pd.read_csv(
path,
sep=";",
low_memory=False,
thousands=".",
decimal=",",
header=0,
encoding="Latin-1",
names=[
"KG.Code",
"Katastralgemeinde",
"EZ",
"PLZ",
"Strasse",
"ON",
"Gst.",
"Gst.Fl.",
"ErwArt",
"Erwerbsdatum",
"Widmung",
"Bauklasse",
"Gebäudehöhe",
"Bauweise",
"Zusatz",
"Schutzzone",
"Wohnzone",
"öZ",
"Bausperre",
"seit/bis",
"zuordnung",
"Geschoße",
"parz.",
"VeräußererCode",
"Erwerbercode",
"Zähler",
"Nenner",
"BJ",
"TZ",
"Kaufpreis EUR",
"EUR/m2 Gfl.",
"AbbruchfixEU",
"m3 Abbruch",
"AbbruchkostEU",
"FreimachfixEU",
"Freimachfläche",
"FreimachkostEU",
"Baureifgest",
"% Widmung",
"Baurecht",
"Bis",
"auf EZ",
"Stammeinlage",
"sonst_wid",
"sonst_wid_prz",
"ber_Kaufpreis",
"Bauzins",
],
parse_dates=["BJ"],
)
df["Erwerbsdatum"] = pd.to_datetime(
df["Erwerbsdatum"].astype(str), format="%d%m%Y.0", errors="coerce"
)
# if "Erwerbsdatum" is later than dt.today (e.g. if the data is from the future),
# the date is set to null
df["Erwerbsdatum"] = df["Erwerbsdatum"].where(
df["Erwerbsdatum"] <= dt.datetime.today(), None
)
df["Strasse"] = (
df["Strasse"].astype(str).apply(lambda x: x.upper())
) # for address search
# Get Baujahr to just show the year, use Int64 bc there are NAs in the data
df["BJ"] = pd.to_datetime(
df["BJ"], format="%Y", errors="coerce"
).dt.year.astype("Int64")
# get "Bauzins" to type float (from "EUR 1.234.523,34" to 1234523.34)
df["Bauzins"] = df["Bauzins"].str.replace("EUR", "")
df["Bauzins"] = df["Bauzins"].str.replace(".", "")
df["Bauzins"] = df["Bauzins"].str.replace(",", ".")
df["Bauzins"] = df["Bauzins"].astype(float)
# if not already saved save file in folder
if loc_path.is_file():
return df
else:
# save the dataframe to a csv file and store it in the same folder as this script
df.to_csv(
"kaufpreissammlung-liegenschaften.csv",
sep=";",
encoding="Latin-1",
index=False,
)
return df
# %%
def show_specific_columns(df, which_colums):
"""return dataframe with certain columns
either all columns (which_columns = "all")
the following columns (which_columns = "predefined")
'zuordnung', 'BJ', 'Gebäudehöhe', 'KG.Code', 'Erwerbsdatum', 'Strasse', "ON", 'Gst.Fl.',
'AbbruchkostEU', 'ber_Kaufpreis', 'Bauzins'
or a user-input list of columns (which_columns = *user input*)
"""
if which_colums == [["predefined"]]:
return df[
[
"Erwerbsdatum",
"zuordnung",
"BJ",
"Gebäudehöhe",
"KG.Code",
"Strasse",
"ON",
"Gst.Fl.",
"AbbruchkostEU",
"ber_Kaufpreis",
"Bauzins",
]
]
else:
which_colums = which_colums[0]
return df[which_colums]
#%%
def sort_by_column(df, column, desc=False):
"""
Sort a dataframe by a column.
Default is ascending.
"""
return df.sort_values(by=column, ascending=not desc)
# %%
def show_certain_KG(df, KG_code, column="KG.Code"):
"""
Filter rows for a certain KG.
enter KG code without leading 0 (as shown in output of get_list_of_KG())
to get a list of all KG codes, run get_list_of_KG()
"""
df = df[df[column] == KG_code]
return df
def get_list_of_KG():
"""
scrape the KG from https://de.wikipedia.org/wiki/Wiener_Katastralgemeinden with BeautifulSoup
and return a dataframe with the KG code, name, municipal districts, and area.
"""
import requests
from bs4 import BeautifulSoup
url = "https://de.wikipedia.org/wiki/Wiener_Katastralgemeinden"
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")
table = soup.find("table", {"class": "wikitable sortable zebra"})
df = pd.read_html(str(table))[0]
df = df.iloc[:, [0, 1, 3, 5]]
return df
#%%
def describe_column(df, column):
"""
describe a column of a dataframe
"""
return df[column].describe()
#%%
def plot_column(df, ycolumn, xcolumn="Erwerbsdatum"):
"""
plot two column of a dataframe
one default column: "Erwerbsdatum"
one user-input column
"""
df.plot.scatter(x=xcolumn, y=ycolumn)
plt.title(f"{ycolumn} vs {xcolumn}")
if args.output:
"""Save plot to file"""
plt.savefig(str(Path(args.output)) + "/_" + ycolumn + "_" + xcolumn + ".png")
print("Plot saved to " + args.output)
else:
plt.show()
#%%
def column_correlation(df, column1, column2):
"""
correlation between two columns of a dataframe
"""
return df[column1].corr(df[column2])
#%%
def correlation_matrix(df):
"""
plot correlation matrix of a dataframe
if filtered by KG.code, remove KG.code column bc then there is only one unique KG.code in the dataframe
"""
if args.KG:
df_corr_mat = df.drop(columns=["KG.Code"])
corr = df_corr_mat.corr()
else:
corr = df.corr()
sns.heatmap(corr, annot=True, mask=corr.isnull())
plt.title("Correlation Matrix")
if args.output:
"""Save plot to file"""
plt.savefig(str(Path(args.output)) + "/correlation_matrix.png")
print("Plot saved to " + args.output)
else:
plt.show()
#%%
def filter_for_address(df, address):
"""
filter dataframe for address
"""
df_address = df[df["Strasse"].str.contains(address.upper())]
return df_address
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(
prog="PROG",
formatter_class=argparse.RawDescriptionHelpFormatter,
description=textwrap.dedent(
"""\
Explore data on purchasing prices of real estate transactions in Vienna (source: land register),
including regulations of the land use and town plan, on-site inspection etc.
Link to original data: https://www.data.gv.at/katalog/dataset/kaufpreissammlung-liegenschaften-wien
CC BY 4.0 „Datenquelle: Stadt Wien – data.wien.gv.at“
Known bugs/limitations:
-Tested only on Linux, not on Windows.
-If you want to save a plot you have to use '-o "./"' as command line argument for the plot to be saved to the
current folder. If you specify another path, there will be an error. Saving a .csv file works fine.
- probably hard to use if you don't speak German which I realized just now and don't have the time to translate
it
Recomended usage:
To start the tool type in "python explore_re_data.py" (if you have python 3.XX AND python 2 on your system, you
may have to type in python3 instead of just python)
In the first run, get the list of all KG codes with the options "--no-read -l". It doesn't read in the pricing
data and prints the list of all KG codes or saves it to a csv file if you specify the option -o "path/to/file/"
Now you can explore the dataset:
--KG 1805 -> filter the data for the KG with code 1805
-a "Mariahilfer" -> filter the data for a specific street (here: every address with Mariahilfer in it)
--show-columns "predefined" -> show only the predefined columns ('zuordnung', 'BJ', 'Gebäudehöhe', 'KG.Code',
'Erwerbsdatum', 'Strasse', "ON", 'Gst.Fl.', 'AbbruchkostEU', 'ber_Kaufpreis', 'Bauzins')
--show-columns "KG.Code" "Katastralgemeinde" "EZ" "PLZ" "Strasse" "ON" "Gst." "Gst.Fl." "ErwArt"
"Erwerbsdatum" "Widmung" "Bauklasse" "Gebäudehöhe" "Bauweise" "Zusatz" "Schutzzone"
"Wohnzone" "öZ" "Bausperre" "seit/bis" "zuordnung" "Geschoße" "parz." "VeräußererCode"
"Erwerbercode" "Zähler" "Nenner" "BJ" "TZ" "Kaufpreis EUR""EUR/m2 Gfl." "AbbruchfixEU"
"m3 Abbruch" "AbbruchkostEU" "FreimachfixEU""Freimachfläche" "FreimachkostEU"
"Baureifgest" "% Widmung" "Baurecht" "Bis" "auf EZ" "Stammeinlage" "sonst_wid"
"sonst_wid_prz" "ber_Kaufpreis" "Bauzins" -> will show all columns (which is the default, but so
you can see which columns are available)
-p "ber_Kaufpreis" -> will plot the two columns "ber_Kaufpreis" and "Erwerbsdatum" (default for the x-axis is
"Erwerbsdatum" but you can specify any other column)
--corr-c "Erwerbsdatum" "Bauzins" -> will show the correlation between the two columns "Erwerbsdatum" and
"Bauzins"
--corr-m -> will show the correlation matrix of the dataframe, can be messy if there are many columns,
use with e.g. --show-columns "predefined"
-r 100 -> will show the dataframe with the first 100 rows, will print all of them to the screen!
-o "./" -> will save the output(s) to the current folder
Those two options are mutually exclusive:
-s "Bauzins" -> sort the data by column "Bauzins" (ascending)
-D "Bauzins" -> describe the column "Bauzins" (mean, std, min, max, etc.)
-s "Bauzins" -d -> sort descending
"""
),
)
parser.add_argument(
"--read", action="store_true", help="read in the princing data file"
)
parser.add_argument(
"--no-read",
dest="read",
action="store_false",
help="""don't read in the pricing data file and
ignore all options other than -l and -o""",
)
parser.set_defaults(read=True)
parser.add_argument("-f", "--file", type=str, help="path to csv file")
parser.add_argument(
"-o",
"--output",
type=str,
help="""path to output file folder, if you use this no
data will be printed to the screen, but the data will be saved
in the same folder as this script""",
)
parser.add_argument(
"-K",
"--KG",
type=int,
help="""KG code, enter without the leading 0, (use -l to get a list of
all KG codes)
If you use -K and -a together the dataframe will be first
filtered for the KG Code and then for the address""",
)
parser.add_argument(
"-a",
"--address",
type=str,
help="""Enter (part of) an address to filter the dataframe
for that address. If you use -K and -a together
the dataframe will be first filterd for the KG Code and
then for the address""",
)
parser.add_argument("-d", "--desc", action="store_true", help="sort descending")
parser.add_argument(
"-l",
"--list",
action="store_true",
help="""Prints a list of all KG codes to the screen or to a
file if -o is used""",
)
parser.add_argument(
"--show-columns",
type=str,
nargs="+",
action="append",
help="""columns to be shown, use "all" for all columns, "predefined" for the predefined
columns, or a list of columns like "Erwerbsdatum,BJ,KG.Code"
""",
)
parser.add_argument(
"-p",
"--plot",
type=str,
nargs="+",
action="append",
help="""plot scatter plot of two columns,y-column is by default "Erwerbsdatum" """,
)
parser.add_argument(
"-r",
"--rows",
type=int,
help="""how many rows should be shown (= will be printed on your
screen! or saved in the output file)""",
)
parser.add_argument(
"--corr-c",
type=str,
nargs="+",
action="append",
help="Correlation between two columns",
)
parser.add_argument(
"--corr-m",
action="store_true",
help='''plot correlation matrix of all columns of the dataframe
can be messy if there are many columns, use "predefined"''',
)
# describing groups which can not be used together
group1 = parser.add_mutually_exclusive_group()
group1.add_argument(
"-s",
"--sort-column",
type=str,
help="""column to sort by (default: ascending, use -d for
descending)""",
)
group1.add_argument("-D", "--describe-column", type=str, help="column to describe")
args = parser.parse_args()
if args.read: # if read is true, read in and explore the data, plot, etc.
if args.file:
df = read_csv(args.file)
else:
df = read_csv()
if args.KG:
df = show_certain_KG(df, args.KG)
if args.address:
df = filter_for_address(df, args.address)
if args.show_columns:
df = show_specific_columns(df, args.show_columns)
if not args.desc:
args.desc = False # default is ascending
if args.sort_column:
if args.desc:
print(f"Sorting by '{args.sort_column}' descending\n")
else:
print(f"Sorting by '{args.sort_column}' ascending\n")
df = sort_by_column(df, args.sort_column, args.desc)
if args.describe_column:
pd.set_option("display.float_format", lambda x: "%.2f" % x)
desc_df = describe_column(df, args.describe_column)
if args.output:
file_path = Path(
args.output + "/description_of_" + args.describe_column + ".csv"
)
desc_df.to_csv(file_path)
print(
"The data has been saved to: "
+ str(file_path.parent)
+ ' as "'
+ str(file_path.name)
+ '"\n'
)
else:
print(desc_df)
print("\n")
if args.rows:
pd.set_option("display.max_rows", args.rows)
df = df.head(args.rows)
# does not work yet
if args.plot:
if len(args.plot[0]) == 1:
plot_column(df, args.plot[0][0])
elif len(args.plot[0]) == 2:
plot_column(df, args.plot[0][1], args.plot[0][0])
else:
print("Error: wrong number of arguments for --plot\n")
if args.output:
file_path = Path(args.output + "/exported_df.csv")
df.to_csv(file_path, index=False)
print(
"The data has been saved to: "
+ str(file_path.parent)
+ ' as "'
+ str(file_path.name)
+ '"\n'
)
else:
print(df)
print("\n")
if args.corr_c:
if len(args.corr_c[0]) == 2:
try:
print(
f" The standard correlation coefficient of {args.corr_c[0][0]} and {args.corr_c[0][1]} is "
f"{column_correlation(df, args.corr_c[0][0], args.corr_c[0][1])}\n"
)
except:
print(
"Error: wrong column type for --corr-c (only numeric columns are supported)\n"
)
else:
print("Error: wrong number of arguments for --corr-c\n")
if args.corr_m:
correlation_matrix(df)
else: # if args.read is False (--no-read)
print("No pricing data read in")
if args.list:
KG_list = get_list_of_KG()
if args.output:
KG_list.to_csv(Path(args.output + "_list_of_KG_codes.csv"))
print(
"The KG data has been saved to: "
+ str(Path(args.output + "list_of_KG_codes.csv"))
)
else:
pd.set_option("display.max_rows", None)
print(KG_list)
pd.set_option("display.max_rows", 10)