forked from OpenDataScotland/the_od_bods
-
Notifications
You must be signed in to change notification settings - Fork 0
/
merge_data.py
732 lines (686 loc) · 24.7 KB
/
merge_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
### Setting the environment
import pandas as pd
import os
import datetime as dt
def _load_csv_folder(folder, parse_dates=("DateCreated", "DateUpdated"), verbose=False, **read_csv_kwargs):
    """Read and concatenate every .csv file found under *folder*.

    Args:
        folder (str): directory to walk recursively.
        parse_dates: column names pandas should parse as datetimes.
        verbose (bool): if True, print each file name as it is read.
        **read_csv_kwargs: extra keyword arguments forwarded to pd.read_csv.

    Returns:
        pd.DataFrame: all rows from all CSV files found (an empty frame
        if the folder contains none).
    """
    frames = []
    for dirname, _, filenames in os.walk(folder):
        for filename in filenames:
            # endswith() is safe for extensionless names, unlike the old
            # rsplit(".", 1)[1] which raised IndexError on them.
            if not filename.lower().endswith(".csv"):
                continue
            if verbose:
                print(filename)
            frames.append(
                pd.read_csv(
                    # Join with the walked directory so files found in
                    # sub-folders resolve to the correct path (the old
                    # code always prefixed the top-level folder).
                    os.path.join(dirname, filename),
                    parse_dates=list(parse_dates),
                    **read_csv_kwargs,
                )
            )
    # Single concat at the end avoids quadratic re-copying per file.
    return pd.concat(frames) if frames else pd.DataFrame()


def merge_data():
    """Load harvested dataset records from every source, merge and clean them.

    Writes data/merged_output_untidy.csv (raw merge, kept for analysis)
    and data/merged_output.csv (cleaned), and returns the cleaned frame.

    Returns:
        pd.DataFrame: the merged, cleaned dataset table.
    """
    ### From ckan output; lineterminator needed because some fields
    ### contain embedded newlines
    source_ckan = _load_csv_folder("data/ckan/", verbose=True, lineterminator="\n")
    source_ckan["Source"] = "ckan API"

    ### From scotgov csv (the SPARQL export uses its own column names)
    source_scotgov = pd.read_csv("data/scotgov-datasets-sparkql.csv")
    source_scotgov = source_scotgov.rename(
        columns={
            "title": "Title",
            "category": "OriginalTags",
            "organization": "Owner",
            "notes": "Description",
            "date_created": "DateCreated",
            "date_updated": "DateUpdated",
            "url": "PageURL",
            "licence": "License",
        }
    )
    source_scotgov["Source"] = "sparql"
    # Normalise to tz-naive datetimes so all sources concatenate cleanly.
    source_scotgov["DateUpdated"] = pd.to_datetime(
        source_scotgov["DateUpdated"], utc=True
    ).dt.tz_localize(None)
    source_scotgov["DateCreated"] = pd.to_datetime(
        source_scotgov["DateCreated"], utc=True
    ).dt.tz_localize(None)

    ### From arcgis api
    source_arcgis = _load_csv_folder("data/arcgis/")
    source_arcgis["Source"] = "arcgis API"

    ### From usmart api
    source_usmart = _load_csv_folder("data/USMART/")
    source_usmart["Source"] = "USMART API"
    source_usmart["DateUpdated"] = source_usmart["DateUpdated"].dt.tz_localize(None)
    source_usmart["DateCreated"] = source_usmart["DateCreated"].dt.tz_localize(None)

    ### From DCAT
    source_dcat = _load_csv_folder("data/dcat/")
    source_dcat["DateUpdated"] = source_dcat["DateUpdated"].dt.tz_localize(None)
    ### DateCreated currently not picked up in dcat so all are NULL
    source_dcat["Source"] = "DCAT feed"

    ### From web scraped results
    source_scraped = _load_csv_folder("data/scraped-results/")
    source_scraped["Source"] = "Web Scraped"

    ### Combine all data into single table
    data = pd.concat(
        [
            source_ckan,
            source_arcgis,
            source_usmart,
            source_scotgov,
            source_dcat,
            source_scraped,
        ]
    ).reset_index(drop=True)

    ### Saves copy of data without cleaning - for analysis purposes
    data.to_csv("data/merged_output_untidy.csv", index=False)

    ### Clean data and output to csv
    data = clean_data(data)
    data.to_csv("data/merged_output.csv", index=False)
    return data
def clean_data(dataframe):
    """Clean a merged dataset table and return it.

    Steps, in order: harmonise owner names, parse dates, normalise file
    types, tidy and combine the tag columns, assign ODS categories and
    tidy licence names.

    Args:
        dataframe (pd.DataFrame): merged data to clean. Expected columns:
            Owner, DateCreated, DateUpdated, FileType, OriginalTags,
            ManualTags, License. NOTE: the frame is modified in place.

    Returns:
        pd.DataFrame: the same frame, cleaned, with added columns
            AssetStatus, CombinedTags and ODSCategories.
    """
    ### to avoid confusion and avoid re-naming everything...
    data = dataframe

    ### Renaming entries to match across sources
    owner_renames = {
        "Aberdeen": "Aberdeen City Council",
        "Dundee": "Dundee City Council",
        "Perth": "Perth and Kinross Council",
        "Stirling": "Stirling Council",
        "Angus": "Angus Council",
        "open.data@southayrshire": "South Ayrshire Council",
        "SEPA": "Scottish Environment Protection Agency",
        "South Ayrshire": "South Ayrshire Council",
        "East Ayrshire": "East Ayrshire Council",
        "Highland Council GIS Organisation": "Highland Council",
        "Scottish.Forestry": "Scottish Forestry",
        "Na h-Eileanan an Iar": "Comhairle nan Eilean Siar",
    }
    data["Owner"] = data["Owner"].replace(owner_renames)

    ### Format dates as date type; unparseable values are coerced to NaT
    ### rather than raising, so one bad row cannot stop the whole run
    data["DateCreated"] = pd.to_datetime(
        data["DateCreated"], format="%Y-%m-%d", errors="coerce", utc=True
    ).dt.date
    data["DateUpdated"] = pd.to_datetime(
        data["DateUpdated"], format="%Y-%m-%d", errors="coerce", utc=True
    ).dt.date

    ### Inconsistencies in casing for FileType
    data["FileType"] = data["FileType"].str.upper()

    ### Creating a dummy column (populated later in the pipeline)
    data["AssetStatus"] = None

    ### Cleaning dataset categories
    def tidy_categories(categories_string):
        """Tidy a category string: convert commas to ';', lower-case and
        strip each tag, and drop empty or 'nan' entries.

        Args:
            categories_string: the dataset categories as a string (may be NaN).

        Returns:
            str: ';'-separated tidied tags (possibly empty).
        """
        raw_tags = str(categories_string).replace(",", ";").split(";")
        # Filter AFTER stripping so whitespace-only tags are dropped too
        # (previously they survived, leaving stray ';;' runs and a
        # trailing ';' that needed special-casing).
        tidied = [tag.lower().strip() for tag in raw_tags]
        return ";".join(tag for tag in tidied if tag and tag != "nan")

    ### Combining dataset categories
    def combine_categories(dataset_row):
        """Union of a row's OriginalTags and ManualTags as one ';'-joined
        string (deduplicated; order unspecified, as before)."""
        combined = set()
        for column in ("OriginalTags", "ManualTags"):
            value = str(dataset_row[column])
            # Skip missing AND empty values so no empty-string tag leaks in.
            if value and value != "nan":
                combined.update(value.split(";"))
        return ";".join(combined)

    data["OriginalTags"] = data["OriginalTags"].apply(tidy_categories)
    data["ManualTags"] = data["ManualTags"].apply(tidy_categories)
    data["CombinedTags"] = data.apply(combine_categories, axis=1)

    ### Association between source dataset tags and ODS' 13 categories.
    ### All entries are lower-case because tidy_categories lower-cases and
    ### strips every tag before matching (previously "FOI", "MPs", "MSPs",
    ### "GP" and " sports activities" could never match).
    ### NOTE: misspellings ("eductional", "elecorate", "grafitti",
    ### "accomodation") are kept deliberately - they match equally
    ### misspelled tags in the source data.
    ods_categories = {
        "Arts / Culture / History": [
            "arts", "culture", "history", "military", "art gallery",
            "design", "fashion", "museum", "historic centre", "conservation",
            "archaeology", "events", "theatre",
        ],
        "Budget / Finance": [
            "tenders", "contracts", "lgcs finance", "budget", "finance",
            "payment", "grants", "financial year", "council tax",
        ],
        "Business and Economy": [
            "business and economy", "business", "business and trade",
            "economic information", "economic development", "business grants",
            "business awards", "health and safety", "trading standards",
            "food safety", "business rates",
            # Fixed: these two tags were accidentally fused into one string
            # by implicit literal concatenation (a missing comma), so
            # neither tag ever matched.
            "commercial land and property", "commercial waste",
            "pollution", "farming", "forestry", "crofting", "countryside",
            "emergency planning", "health and safety at work", "regeneration",
            "shopping", "shopping centres", "markets", "tenders", "contracts",
            "city centre management", "town centre management", "economy",
            "economic", "economic activity", "deprivation",
            "scottish index of multiple deprivation", "simd",
            "estimated population", "population", "labour force",
        ],
        "Council and Government": [
            "council buildings", "community development",
            "council and government", "council", "councils", "council tax",
            "benefits", "council grants", "grants", "council departments",
            "data protection", "foi", "freedom of information",
            "council housing", "politicians", "mps", "msps", "councillors",
            "elected members", "wards", "constituencies", "boundaries",
            "council minutes", "council agendas", "council plans",
            "council policies",
        ],
        "Education": [
            "primary schools", "lgcs education & skills", "education",
            "eductional", "library", "school meals", "schools", "school",
            "nurseries", "playgroups",
        ],
        "Elections / Politics": [
            "community councils", "political", "polling places", "elections",
            "politics", "elecorate", "election", "electoral", "electorate",
            "local authority", "council area", "democracy", "polling",
            "lgcs democracy", "democracy and governance", "local government",
            "councillor", "councillors", "community council",
        ],
        "Food and Environment": [
            "food", "school meals", "allotment", "public toilets", "air",
            "tree", "vacant and derelict land supply", "landscape", "nature",
            "rights of way", "tree preservation order", "preservation",
            "land", "contaminated", "green", "belt", "employment land audit",
            "environment", "forest woodland strategy", "waste", "recycling",
            "lgcs waste management", "water-network", "grafitti",
            "street occupations", "regeneration", "vandalism",
            "street cleansing", "litter", "toilets", "drains", "flytipping",
            "flyposting", "pollution", "air quality", "household waste",
            "commercial waste",
        ],
        "Health and Social Care": [
            "public toilets", "contraception", "implant", "cervical", "iud",
            "ius", "pis", "prescribing", "elderly", "screening",
            "screening programme", "cancer", "breast feeding",
            "defibrillators", "wards", "alcohol and drug partnership",
            "care homes", "waiting times", "drugs", "substance use",
            "pregnancy", "induced abortion", "therapeutic abortion",
            "termination", "abortion", "co-dependency", "sexual health",
            "outpatient", "waiting list", "stage of treatment", "daycase",
            "inpatient", "alcohol", "waiting time", "treatment",
            "community wellbeing and social environment", "health",
            "human services", "covid-19", "covid", "hospital",
            "health board", "health and social care partnership", "medicine",
            "health and social care", "health and fitness", "nhs24",
            "hospital admissions", "hospital mortality", "mental health",
            "pharmacy", "gp", "surgery", "fostering", "adoption",
            "social work", "asylum", "immigration", "citizenship", "carers",
        ],
        "Housing and Estates": [
            "buildings", "housing data supply 2020", "multiple occupation",
            "housing", "sheltered housing", "adaptations", "repairs",
            "council housing", "landlord", "landlord registration",
            "rent arrears", "parking", "garages", "homelessness",
            "temporary accommodation", "rent", "tenancy", "housing advice",
            "housing associations", "housing repairs", "lettings",
            "real estate", "land records", "land-cover", "woodland",
            "dwellings", "burial grounds", "cemeteries", "property",
            "vacant and derelict land", "scottish vacant and derelict land",
            "allotment",
        ],
        "Law and Licensing": [
            "law", "licensing", "regulation", "regulations", "licence",
            "licenses", "permit", "permits", "police", "court", "courts",
            "tribunal", "tribunals",
        ],
        "Parks / Recreation": [
            "parks", "recreation", "woodland", "parks and open spaces",
        ],
        "Planning and Development": [
            "buildings", "vacant and derelict land supply",
            "core paths. adopted", "employment land audit",
            "built environment", "planning", "zoning", "council area",
            "address", "addresses", "city development plan", "boundaries",
            "post-code", "dwellings", "planning permission",
            "postcode-units", "housing", "property", "building control",
            "conservation",
        ],
        "Public Safety": [
            "emergency planning", "public safety", "crime and justice",
            "lgcs community safety", "street lighting", "community safety",
            "cctv", "road safety",
        ],
        "Sport and Leisure": [
            "sport", "sports", "sports facilities", "sports activities",
            "countryside", "wildlife", "leisure", "leisure clubs", "clubs",
            "groups", "societies", "libraries", "archives", "local history",
            "heritage", "museums", "galleries", "parks", "gardens",
            "open spaces", "sports clubs", "leisure centres",
        ],
        "Tourism": [
            "public toilets", "tourism", "tourist", "attractions",
            "accomodation", "historic buildings", "tourist routes", "cafes",
            "restaurants", "hotels", "hotel",
        ],
        "Transportation": [
            "core paths. adopted", "lgcs transport infrastructure",
            "transportation", "mobility", "pedestrian", "walking", "walk",
            "cycle", "cycling", "parking", "car", "bus", "tram", "train",
            "taxi", "transport", "electric vehicle",
            "electric vehicle charging points", "transport / mobility",
            "active travel", "road safety", "roads", "community transport",
            "road works", "road closures", "speed limits", "port", "harbour",
        ],
    }

    ### Invert once into tag -> set of categories for O(1) lookup per tag
    ### (previously the whole mapping was rebuilt and scanned per row).
    tag_to_categories = {}
    for category, tags in ods_categories.items():
        for tag in tags:
            tag_to_categories.setdefault(tag, set()).add(category)

    ### Creating new dataset categories for ODS
    def assign_ODScategories(categories_string):
        """Assign one or more of ODS' 13 categories for a ';'-joined tag
        string, or 'Uncategorised' if no tag matches.

        Args:
            categories_string (str): the dataset's combined tags.

        Returns:
            str: ';'-joined set of ODS categories.
        """
        applied = set()
        for tag in categories_string.split(";"):
            applied.update(tag_to_categories.get(tag, ()))
        if not applied:
            applied = {"Uncategorised"}
        return ";".join(applied)

    ### Apply ODS categorisation
    data["ODSCategories"] = data["CombinedTags"].apply(assign_ODScategories)

    ### Tidy licence names
    known_licences = {
        "https://creativecommons.org/licenses/by-sa/3.0/": "Creative Commons Attribution Share-Alike 3.0",
        "https://creativecommons.org/licenses/by/4.0/legalcode": "Creative Commons Attribution 4.0 International",
        "https://creativecommons.org/licenses/by/4.0": "Creative Commons Attribution 4.0 International",
        "Creative Commons Attribution 4.0": "Creative Commons Attribution 4.0 International",
        "https://creativecommons.org/share-your-work/public-domain/cc0": "Creative Commons CC0",
        "https://rightsstatements.org/page/NoC-NC/1.0/": "Non-Commercial Use Only",
        "https://opendatacommons.org/licenses/odbl/1-0/": "Open Data Commons Open Database License 1.0",
        "Open Data Commons Open Database License 1.0": "Open Data Commons Open Database License 1.0",
        "https://www.nationalarchives.gov.uk/doc/open-government-licence/version/2/": "Open Government Licence v2.0",
        "https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/": "Open Government Licence v3.0",
        "Open Government Licence 3.0 (United Kingdom)": "Open Government Licence v3.0",
        "UK Open Government Licence (OGL)": "Open Government Licence v3.0",
        "Open Government": "Open Government Licence v3.0",
        "uk-ogl": "Open Government Licence v3.0",
        "OGL3": "Open Government Licence v3.0",
        "https://rightsstatements.org/vocab/NKC/1.0/": "No Known Copyright",
        "https://creativecommons.org/publicdomain/mark/1.0/": "Public Domain",
        "Other (Public Domain)": "Public Domain",
        "Public Domain": "Public Domain",
        "Public Sector End User Licence (Scotland)": "Public Sector End User Licence (Scotland)",
    }
    # Pre-normalise the keys once; matching is then a single dict lookup
    # per row, case- and trailing-slash-insensitive as before.
    licence_lookup = {
        key.lower().strip(" /"): value for key, value in known_licences.items()
    }

    def tidy_licence(licence_name):
        """Temporary licence conversion to match export2jkan -- FOR
        ANALYTICS ONLY, will discard in 2022Q2 Milestone.

        Returns:
            str: a tidied licence name.
        """
        tidied_licence = licence_lookup.get(str(licence_name).lower().strip(" /"))
        if tidied_licence is not None:
            return tidied_licence
        if str(licence_name) == "nan":
            return "No licence"
        return "Custom licence: " + str(licence_name)

    data["License"] = data["License"].apply(tidy_licence)

    ### Tidy file types
    file_types_to_tidy = {
        "application/x-7z-compressed": "7-Zip compressed file",
        "ArcGIS GeoServices REST API": "ARCGIS GEOSERVICE",
        "Esri REST": "ARCGIS GEOSERVICE",
        "Atom Feed": "ATOM FEED",
        "htm": "HTML",
        "ics": "iCalendar",
        "jpeg": "Image",
        "vnd.openxmlformats-officedocument.spreadsheetml.sheet": "MS EXCEL",
        "vnd.ms-excel": "MS EXCEL",
        "xls": "MS EXCEL",
        "xlsx": "MS EXCEL",
        "doc": "MS Word",
        "docx": "MS Word",
        "QGIS": "QGIS Shapefile",
        "text": "TXT",
        "web": "URL",
        "UK/DATA/#TABGB1900": "URL",
        "UK/ROY/GAZETTEER/#DOWNLOAD": "URL",
        "Web Mapping Application": "WEB MAP",
        "mets": "XML",
        "alto": "XML",
    }
    file_type_lookup = {
        key.lower().strip(". /"): value for key, value in file_types_to_tidy.items()
    }

    def tidy_file_type(file_type):
        """Temporary data type conversion.

        Args:
            file_type (str): the data type name.

        Returns:
            str: a tidied data type name.
        """
        tidied_file_type = file_type_lookup.get(str(file_type).lower().strip(". /"))
        if tidied_file_type is not None:
            return tidied_file_type
        if str(file_type) in ("nan", ""):
            return "No file type"
        return str(file_type).strip(". /").upper()

    ### Inconsistencies in casing for FileType
    data["FileType"] = data["FileType"].apply(tidy_file_type)
    return data
### Run the full merge-and-clean pipeline when executed as a script
if __name__ == "__main__":
    merge_data()