xtract_netcdf_main.py

"""
This is the code for the NetCDF extractor. It takes a file deemed
a NetCDF file and extracts all metadata from it as a JSON.
@Inputs: file_handle -- path to a NetCDF file.
@Outputs: metadata -- metadata JSON
@Author: Tyler J. Skluzacek, derived from code by Paul Beckman.
@LastEdited: 07/27/2017
"""
import argparse
import json
import os
import time

from netCDF4 import Dataset
import numpy as np


class ExtractionFailed(Exception):
    """Basic error to throw when an extractor fails"""


class ExtractionPassed(Exception):
    """Indicator to throw when extractor passes for fast file
    classification."""


class NumpyDecoder(json.JSONEncoder):
    """Serializer used to convert numpy types to normal json
    serializable types. Since netCDF4 produces numpy types, this is
    necessary for compatibility with other metadata scrapers like the
    tabular, which returns a python dict."""

    def default(self, obj):
        if isinstance(obj, np.generic):
            return obj.item()
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        elif isinstance(obj, np.dtype):
            return str(obj)
        else:
            return super(NumpyDecoder, self).default(obj)
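
# Illustrative example (not part of the original extractor): the encoder lets
# json.dumps handle numpy scalars and arrays, e.g.
#   json.dumps({"size": np.int64(3), "values": np.arange(3)}, cls=NumpyDecoder)
# produces the plain JSON string '{"size": 3, "values": [0, 1, 2]}'.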


def execute_extractor(filename):
    """Extract metadata from the NetCDF file at `filename`, timing the run."""
    t0 = time.time()
    if not filename:
        return None
    metadata = extract_netcdf(file_handle=filename)
    t1 = time.time()
    metadata.update({"extract time": (t1 - t0)})
    return metadata


def extract_netcdf(file_handle, pass_fail=False):
    """Create netcdf metadata dictionary from file.

    Parameters:
    file_handle (str): File path of netcdf file.
    pass_fail (bool): Whether to exit after loading file_handle to a
    netCDF4 dataset.

    Return:
    (dictionary): Dictionary of file format, global attributes,
    dimensions, variables and attributes.
    """
    t0 = time.time()
    try:
        dataset = Dataset(os.path.realpath(file_handle))
    except IOError:
        raise ExtractionFailed
    if pass_fail:
        raise ExtractionPassed

    metadata = {
        "file_format": dataset.file_format,
    }

    if len(dataset.ncattrs()) > 0:
        metadata["global_attributes"] = {}
    for attr in dataset.ncattrs():
        metadata["global_attributes"][attr] = dataset.getncattr(attr)

    dims = dataset.dimensions
    if len(dims) > 0:
        metadata["dimensions"] = {}
    for dim in dims:
        metadata["dimensions"][dim] = {
            "size": len(dataset.dimensions[dim])
        }
        add_ncattr_metadata(dataset, dim, "dimensions", metadata)

    variables = dataset.variables
    if len(variables) > 0:
        metadata["variables"] = {}
    for var in variables:
        if var not in dims:
            metadata["variables"][var] = {
                "dimensions": dataset.variables[var].dimensions,
                "size": dataset.variables[var].size
            }
        add_ncattr_metadata(dataset, var, "variables", metadata)

    meta = {"netcdf": json.loads(json.dumps(metadata, cls=NumpyDecoder))}
    meta.update({"extract time": time.time() - t0})
    return meta
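
# Illustrative sketch of the dictionary returned above (values hypothetical,
# e.g. for a file with a single "time" dimension and a "temp" variable; the
# exact keys depend on the attributes present in the file):
#
#   {
#       "netcdf": {
#           "file_format": "NETCDF4",
#           "global_attributes": {"title": "example dataset"},
#           "dimensions": {"time": {"size": 24}},
#           "variables": {"temp": {"dimensions": ["time"], "size": 24,
#                                  "type": "float32", "units": "K"}}
#       },
#       "extract time": 0.01
#   }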


def add_ncattr_metadata(dataset, name, dim_or_var, metadata):
    """Gets attributes from a netCDF variable or dimension.

    Parameters:
    dataset (netCDF4 dataset): netCDF4 dataset loaded from a netcdf
    file.
    name (str): Name of the dimension or variable.
    dim_or_var (str): Metadata key for attribute info ("dimensions"
    or "variables")
    metadata (dictionary): Dictionary to add attribute info to.
    """
    try:
        metadata[dim_or_var][name]["type"] = dataset.variables[name].dtype
        for attr in dataset.variables[name].ncattrs():
            metadata[dim_or_var][name][attr] = dataset.variables[name].getncattr(attr)
    except KeyError:
        pass


if __name__ == "__main__":
    parse = argparse.ArgumentParser()
    parse.add_argument("--path", help="path to netcdf file",
                       type=str, required=True)
    args = parse.parse_args()
    meta = extract_netcdf(args.path)
    print(meta)
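
# Example invocation (sketch; "sample.nc" stands in for a real NetCDF file):
#   python xtract_netcdf_main.py --path sample.nc
# This prints the dictionary returned by extract_netcdf(), including the
# nested "netcdf" metadata and the "extract time" entry.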