-
Notifications
You must be signed in to change notification settings - Fork 3
/
LakeCat_write_edg_json.py
121 lines (106 loc) · 6.12 KB
/
LakeCat_write_edg_json.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
from datetime import datetime as dt
import json
import pandas as pd
title = 'The LakeCat Dataset: Accumulated Attributes for NHDPlusV2 (Version 2.1) Catchments for the Conterminous United States: '
KEYWORDS =['inlandwaters', 'ecosystem', 'environment', 'monitoring', 'natural resources',
'surface water', 'modeling', 'united states of america', 'united states',
'usa', 'alabama', 'arizona', 'arkansas', 'california', 'colorado', 'connecticut',
'delaware', 'district of columbia', 'florida', 'georgia', 'idaho', 'illinois',
'indiana', 'iowa', 'kansas', 'kentucky', 'louisiana', 'maine', 'maryland',
'massachusetts', 'michigan', 'minnesota', 'mississippi', 'missouri', 'montana',
'nebraska', 'nevada', 'new hampshire', 'new jersey', 'new mexico', 'new york',
'north carolina', 'north dakota', 'ohio', 'oklahoma', 'oregon', 'pennsylvania',
'rhode island', 'south carolina', 'south dakota', 'tennessee', 'texas', 'utah',
'vermont', 'virginia', 'washington', 'west virginia', 'wisconsin', 'wyoming']
organization = 'U.S. Environmental Protection Agency, Office of Research and Development (ORD), Center for Public Health and Environmental Assessment (CPHEA), Pacific Ecological Systems Division (PESD), '
temporal = '2015/2030'
LakeCat = ('LakeCat currently contains over 300 metrics that include local catchment (Cat),'
' watershed (Ws), and special metrics. See Geospatial Framework and Terms in the'
' ReadMe for definitions of the terms \u2018catchment\u2019 and \u2018watershed\u2019'
' as used with the LakeCat Dataset. An additional metric, inStreamCat, indicates whether'
' the variable was pulled from the StreamCat Dataset or calculated with a geospatial'
' framework that was developed for LakeCat.\n\nThese metrics are available for 378,088'
' lakes and their associated catchments across the conterminous US. LakeCat metrics'
' represent both natural (e.g., soils and geology) and anthropogenic (e.g, urban areas'
' and agriculture) landscape information.')
describeBy = 'https://www.epa.gov/national-aquatic-resource-surveys/lakecat-metrics-and-definitions' #
access = 'https://www.epa.gov/national-aquatic-resource-surveys/lakecat-dataset'
gaft = 'https://gaftp.epa.gov/epadatacommons/ORD/NHDPlusLandscapeAttributes/LakeCat/FinalTables/'
tbl = pd.read_csv(
"O:/PRIV/CPHEA/PESD/COR/CORFILES/Geospatial_Library_Projects/LakeCat/Metadata"
"/submit_metadata_LakeCat.csv" #replace
)
records = []
for _, row in tbl.iterrows():
# break
records.append(
{'@type': "dcat:Dataset",
'title': title + row.title, #made title variable
'description': row.description,
'keyword': KEYWORDS,
'modified': dt.now().strftime('%Y-%m-%d'),
'publisher':
{'@type': 'org:Organization',
'name': organization # made organization variable
},
'contactPoint': {
'@type': 'vcard:Contact',
'fn': organization + row.contact, # made 'contact' variable and fill "Marc Weber"
'hasEmail': ('mailto:' + row.contact_Email) # made 'contact_Email' variable and fill [email protected]
},
'identifier': row.uuid, # got rid of f"{{{row.uuid}}}" in favor of row.uuid
'accessLevel': "public",
'bureauCode': ['020:00'],
'programCode': ['020:072'],
'license': "https://edg.epa.gov/EPA_Data_License.html",
'rights': "public (Data asset is or could be made publicly available to all without restrictions)",
'spatial': '-125.0,24.5,-66.5,49.5',
'temporal': temporal,
'distribution': [
{'@type': 'dcat:Distribution',
'accessURL': access, #
'title': 'LakeCat Dataset',
'description': LakeCat, #made LakeCat a variable to make cleaner
'format': 'API', #changed to API
'describedBy': describeBy, # made URL variable
'describedByType': 'text/html'
},
{'@type': 'dcat:Distribution',
'downloadURL': gaft, #
'format': 'Comma-Separated Values (.csv)',
'title': row.final_table_name,
'description': row.description,
"mediaType": "text/csv",
"describedBy": describeBy,
"describedByType": "text/html"
}
],
'accrualPeriodicity': "irregular",
'dataQuality': True,
'describedBy': describeBy,
'describedByType': "text/html",
'issued': "2015-04-23",
'accrualPeriodicity': "R/P3Y",
'language': ['en-US'],
'landingPage': access, #
'references': [
(gaft), #
('https://edg.epa.gov/metadata/catalog/search/resource/details.page?uuid=' + row.uuid) ###removed f'{{{row.uuid}}}
],
'theme': ['environment'],
}
)
blob = {
"@context": 'https://project-open-data.cio.gov/v1.1/schema/catalog.jsonld', #
"@id": "https://edg.epa.gov/data/Public/ORD/NHEERL/WED/NonGeo/LakeCat.json", #
"@type": "dcat:Catalog",
"conformsTo": "https://project-open-data.cio.gov/v1.1/schema", #check validation in open
"describedBy": "https://project-open-data.cio.gov/v1.1/schema/catalog.json",
"dataset": records,
}
with open(
"O:/PRIV/CPHEA/PESD/COR/CORFILES/Geospatial_Library_Projects/LakeCat/Metadata/complete_metadata"
f"/LakeCat_metadata_{dt.now().strftime('%m_%d_%Y')}.json", #replace
"w"
) as fifi:
json.dump(blob, fifi, sort_keys=False, indent=4,)