-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathxtract_hdf_main.py
144 lines (105 loc) · 3.88 KB
/
xtract_hdf_main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import h5py
import time
from queue import Queue
def extract_attribute_metadata(h5py_attributes):
    """Convert an h5py attribute manager into a plain dictionary.
    Parameters
    ----------
    h5py_attributes : h5py.AttributeManager
        h5py attribute manager class as returned by the .attrs method.
    Returns
    -------
    dict
        Dictionary mapping each attribute name to its stored value.
    """
    # One-to-one copy of every (name, value) pair exposed by the manager.
    return {attr_name: attr_value for attr_name, attr_value in h5py_attributes.items()}
def extract_group_metadata(h5py_group_obj):
    """Collect descriptive metadata for a single h5py group.
    Parameters
    ----------
    h5py_group_obj : h5py.Group
        h5py group object.
    Returns
    -------
    dict
        The group's name, a "group" type tag, its attributes, the list of
        child keys, and the name of its parent group.
    """
    return {
        "name": h5py_group_obj.name,
        "type": "group",
        "attributes": extract_attribute_metadata(h5py_group_obj.attrs),
        "keys": list(h5py_group_obj.keys()),
        "parent": h5py_group_obj.parent.name,
    }
def extract_dataset_metadata(h5py_dataset_obj):
    """Collect descriptive metadata for a single h5py dataset.
    Parameters
    ----------
    h5py_dataset_obj : h5py.Dataset
        h5py dataset object.
    Returns
    -------
    dict
        The dataset's name, a "dataset" type tag, its attributes, parent
        name, and array properties (shape, dtype, size, nbytes, ndim,
        compression).
    """
    return {
        "name": h5py_dataset_obj.name,
        "type": "dataset",
        "attributes": extract_attribute_metadata(h5py_dataset_obj.attrs),
        "parent": h5py_dataset_obj.parent.name,
        "shape": h5py_dataset_obj.shape,
        "dtype": h5py_dataset_obj.dtype,
        "size": h5py_dataset_obj.size,
        "nbytes": h5py_dataset_obj.nbytes,
        "ndim": h5py_dataset_obj.ndim,
        "compression": h5py_dataset_obj.compression,
    }
def extract_file_metadata(h5py_file_obj):
    """Collect descriptive metadata for the h5py file object itself.
    Parameters
    ----------
    h5py_file_obj : h5py.File
        h5py file object.
    Returns
    -------
    dict
        The file object's name, a "file" type tag, and its attributes.
    """
    return {
        "name": h5py_file_obj.name,
        "type": "file",
        "attributes": extract_attribute_metadata(h5py_file_obj.attrs),
    }
def extract_hdf_main(hdf_file_path):
    """Extracts metadata from .hdf files via breadth-first traversal.

    Walks every group and dataset reachable from the file root, recording
    one metadata entry per object under the "hdf" key.

    Parameters
    ----------
    hdf_file_path : str
        File path of .hdf file to process.

    Returns
    -------
    dict
        {"hdf": {object_name: metadata_dict, ...},
         "extract time": elapsed_seconds}.
        If the file cannot be opened, {"hdf": {}} is returned (no
        "extract time" key), matching the original best-effort contract.
    """
    t0 = time.time()
    metadata_dictionary = {"hdf": {}}
    # Best-effort open: an unreadable/corrupt file yields the empty result
    # instead of raising. Narrowed from a bare `except:` so that
    # KeyboardInterrupt/SystemExit still propagate.
    try:
        hdf_file = h5py.File(hdf_file_path, "r")
    except Exception:
        return metadata_dictionary

    # `with` guarantees the handle is closed; the original leaked it.
    with hdf_file:
        metadata_dictionary["hdf"][hdf_file.name] = extract_file_metadata(hdf_file)
        unprocessed = Queue()
        unprocessed.put(hdf_file)
        while not unprocessed.empty():
            current = unprocessed.get()
            # h5py.File is a subclass of h5py.Group, so the File check must
            # come FIRST: in the original ordering the Group branch caught
            # the root file object, overwrote its file metadata with group
            # metadata, and left the File branch unreachable.
            if isinstance(current, h5py.File):
                for value in current.values():
                    unprocessed.put(value)
            elif isinstance(current, h5py.Group):
                metadata_dictionary["hdf"][current.name] = extract_group_metadata(current)
                for value in current.values():
                    unprocessed.put(value)
            elif isinstance(current, h5py.Dataset):
                metadata_dictionary["hdf"][current.name] = extract_dataset_metadata(current)

    metadata_dictionary.update({"extract time": time.time() - t0})
    return metadata_dictionary