preprocess-marc.py
import pymarc
import argparse
import logging
import os
import urllib.request
import urllib.error
import re
import json
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
sh = logging.StreamHandler()
sh.setFormatter(formatter)
logger.addHandler(sh)
# This script is custom written for Zenon:
# * Strip internal notes from holding data (952).
# * Strip the summary field (520) because of copyright concerns.
# * Automatically add helper holding fields (953) for holding data (952) in parent records.
# * While generating 953 fields, some sanity checks are run on the links between records. Irregularities get reported.
# * In our initial Koha setup, 001 fields were not generated automatically but with a click in Koha's cataloguing interface. This required some sanity checks:
# ** Is a 001 present? If not, the record gets rejected. A librarian has to click a new 001.
# ** Is the 001 valid (meaning: 9 digits)? If not, the record gets rejected. This could happen with external records imported into Koha without generating a new 001.
# ** Is a record with the same biblionumber (Koha's internal ID) present in VuFind, but with a different 001? If so, the record gets rejected.
# * Since early 2022 we have changed the 001 generation process:
# ** New records have a biblionumber >= 3000000 and will always have biblionumber == 001 automatically.
# ** Changes to 001 by users have been disabled in Koha's interface.
# ** This makes the 001 sanity checks obsolete, but their assumptions are still valid, so they were kept in regardless.
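# Illustration with hypothetical values: a record whose 001 is "003000123"
# (nine digits) passes the format check below, while "ocm12345" or a missing
# 001 leads to the record being rejected and logged.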
def is_writable_directory(path: str):
    if os.path.exists(path):
        if not os.path.isdir(path) or not os.access(path, os.W_OK):
            raise argparse.ArgumentTypeError("Please provide a writable directory.")
    else:
        os.makedirs(path)
    return path
MARCXML_OPENING_ELEMENTS = bytes(
'<?xml version="1.0" encoding="UTF-8" ?><collection xmlns="http://www.loc.gov/MARC21/slim" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd">',
'utf-8'
)
MARCXML_CLOSING_ELEMENTS = bytes(
'</collection>', 'utf-8'
)
parser = argparse.ArgumentParser(description='Preprocess MARCXML data to be imported into VuFind.')
parser.add_argument('input_file', type=str, help="The MARCXML file (or a directory containing .xml files) to be processed.")
parser.add_argument('output_directory', type=is_writable_directory, help="Output directory for the updated MARC file.")
parser.add_argument('--url', dest='server_url', type=str, default="https://zenon.dainst.org", help="Optional server URL for creating additional holding information.")
parser.add_argument('--check_biblio', dest='check_biblio', action='store_true', help="Check if datasets with given biblionumber already exist on server. If the system number differs, the new record is logged as an error." )
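# Example invocation (paths are placeholders):
#   python preprocess-marc.py /path/to/exports/ /path/to/output/ --check_biblio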
holdings_mapping = {}
invalid_zenon_ids = []
valid_zenon_id = re.compile(r"\d{9}")
contains_only_numbers = re.compile(r"\d+")
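# Illustration: with re.fullmatch, valid_zenon_id accepts exactly nine digits
# (e.g. "000123456"), while contains_only_numbers accepts any all-digit string
# (e.g. "42"); the latter is used below when repairing broken parent links.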
def is_record_valid(record):
global server_url
global check_biblio_no
    if '001' not in record:
return (False, "No system number 001 in https://koha.dainst.de:8443/cgi-bin/koha/catalogue/detail.pl?biblionumber={0}. Returning None record.".format(record['999']['c']))
sys_number = record['001'].data
matcher = re.fullmatch(valid_zenon_id, sys_number)
if not matcher:
return (False, "Unusual system number 001 {1} in https://koha.dainst.de:8443/cgi-bin/koha/catalogue/detail.pl?biblionumber={0}.".format(record['999']['c'], sys_number))
if check_biblio_no:
url = "{0}/api/v1/search?lookfor=biblio_no:{1}&type=AllFields".format(server_url, record['999']['c'])
req = urllib.request.Request(url)
try:
with urllib.request.urlopen(req) as response:
result = json.loads(response.read().decode("utf-8"))
if "records" in result:
records = result["records"]
if len(records) > 1:
return (False, "There are multiple records with biblio number {0}, see {1}. https://koha.dainst.de:8443/cgi-bin/koha/catalogue/detail.pl?biblionumber={0}".format(record['999']['c'], url))
if records[0]['id'] != sys_number:
return (False, "There is already a record with biblio number {0}, but the system number differs: {1} (old) : {2} (new). https://koha.dainst.de:8443/cgi-bin/koha/catalogue/detail.pl?biblionumber={0}".format(record['999']['c'], records[0]['id'], sys_number))
except Exception as e:
logger.error(e)
return (False, "Failed to load {0}.".format(url))
url = "{0}/api/v1/search?lookfor=id:{1}&type=AllFields&field[]=biblioNumber".format(server_url, sys_number)
req = urllib.request.Request(url)
try:
with urllib.request.urlopen(req) as response:
result = json.loads(response.read().decode("utf-8"))
if "records" in result and len(result["records"]) != 0 and result["records"][0]['biblioNumber'] != record['999']['c']:
return (False, "There is already a record with system number {0}, but the new biblio number differs: {1} (old) : {2} (new). https://koha.dainst.de:8443/cgi-bin/koha/catalogue/detail.pl?biblionumber={2}".format(sys_number, result["records"][0]['biblioNumber'], record['999']['c']))
except Exception as e:
logger.error(e)
return (False, "Failed to load {0}.".format(url))
return (True, "")
def extract_parent_ids(sys_number, parents):
global invalid_zenon_ids
parent_ids = []
for parent in parents:
if 'w' not in parent:
continue
parent_sys_number = parent['w']
matcher = re.fullmatch(valid_zenon_id, parent_sys_number)
if not matcher:
fixed_value = None
number_match = re.fullmatch(contains_only_numbers, parent_sys_number)
if number_match:
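                # Repair purely numeric ids to nine digits, e.g. the
                # hypothetical "1234567890" becomes "234567890" (last nine
                # digits) and "1234" becomes "000001234" (zero-padded).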
                if len(parent_sys_number) > 9:
parent['w'] = parent_sys_number[(len(parent_sys_number)-9):]
else:
pad = '0' * (9 - len(parent_sys_number))
parent['w'] = pad + parent_sys_number
parent_ids += [parent['w']]
fixed_value = parent['w']
invalid_zenon_ids += [(sys_number, parent_sys_number, fixed_value)]
else:
parent_ids += [parent['w']]
return list(set(parent_ids))
def extract_holding_branch_codes(holding_fields):
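    # 952 subfield b carries the holding branch code in Koha's default framework.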
holding_branches = []
for holding in holding_fields:
holding_branches.append(holding['b'])
return holding_branches
def accumulate_ancestor_holdings(sys_number_first, ids, current_depth=0):
    global server_url
    global holdings_mapping
    if current_depth > 10:
        logger.error("Unusually deeply nested hierarchy for {0}. Aborting recursion.".format(ids))
        return []
    parent_ids = []
    holding_branches = []
    for id in ids:
        if id in holdings_mapping:
            (current_parents, current_branches) = holdings_mapping[id]
        else:
            # Not cached yet: fetch the record from the server and extract its
            # holdings (952) and parent links (773).
            current_parents = []
            current_branches = []
            url = "{1}/Record/{0}/Export?style=MARCXML".format(id, server_url)
            req = urllib.request.Request(url)
            try:
                with urllib.request.urlopen(req) as response:
                    record = pymarc.parse_xml_to_array(response)[0]
                    holdings = record.get_fields('952')
                    current_branches = extract_holding_branch_codes(holdings)
                    parents = record.get_fields('773')
                    current_parents = extract_parent_ids(id, parents)
                    holdings_mapping[id] = (current_parents, current_branches)
            except Exception as e:
                logger.error("{1}, {3}/Record/{0}, initial record: {3}/Record/{2}.".format(id, e, sys_number_first, server_url))
        # Accumulate across all ids instead of keeping only the last id's data.
        parent_ids += current_parents
        holding_branches += current_branches
    if parent_ids:
        return list(set(holding_branches + accumulate_ancestor_holdings(sys_number_first, parent_ids, current_depth=current_depth + 1)))
    return holding_branches
def add_to_holding_mapping(record):
global holdings_mapping
    (valid, _message) = is_record_valid(record)
if not valid:
return
sys_number = record['001'].data.strip()
holdings = record.get_fields('952')
holding_branches = extract_holding_branch_codes(holdings)
    parents = record.get_fields('773')
    parent_ids = extract_parent_ids(sys_number, parents)
holdings_mapping[sys_number] = (parent_ids, holding_branches)
def preprocess_record(record):
global holdings_mapping
(valid, message) = is_record_valid(record)
if not valid:
logger.error(message)
return None
    sys_number = record['001'].data.strip()
(parent_ids, holding_branches) = holdings_mapping[sys_number]
ancestor_holding_branches = accumulate_ancestor_holdings(sys_number, parent_ids)
ancestor_holding_branches = [x for x in ancestor_holding_branches if x not in holding_branches]
holdings = record.get_fields('952')
internal_subfields = ['d', 'e', 'f', 'g', 'w', 'x', 'A', 'C', 'P', 'T', 'U']
    for holding in holdings:
        for subfield in internal_subfields:
            # delete_subfield removes only the first occurrence of a code, so
            # loop until every occurrence of the internal subfield is gone.
            while holding.delete_subfield(subfield) is not None:
                pass
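        # pymarc < 5 stores Field.subfields as a flat [code, value, code,
        # value, ...] list, so even indices are codes and odd indices values.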
keys = holding.subfields[0::2]
vals = holding.subfields[1::2]
subs = dict()
for key, val in zip(keys, vals):
if key in subs:
subs[key].append(val)
else:
subs[key] = [val]
        for key, values in subs.items():
            values = list(filter(lambda val: val.strip() != '', values))
            if len(values) == 0:
                # VuFind seems unable to handle empty elements like
                # `<subfield code="D" />` correctly, which results in broken
                # XML data, so all occurrences of the empty subfield are dropped.
                while holding.delete_subfield(key) is not None:
                    pass
# Remove summaries, see SD-1798
for field in record.get_fields('520'):
record.remove_field(field)
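    # With a hypothetical inherited branch code "BER", the helper field added
    # below serializes to:
    #   <datafield tag="953" ind1=" " ind2=" ">
    #     <subfield code="b">BER</subfield>
    #     <subfield code="z">Automatically added holding branch key.</subfield>
    #   </datafield>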
if ancestor_holding_branches:
branches_added = []
for branch in ancestor_holding_branches:
if branch in branches_added:
continue
record.add_field(
pymarc.Field(
'953',
indicators=[' ', ' '],
subfields=[ 'b', branch, 'z', "Automatically added holding branch key." ]
)
)
branches_added.append(branch)
return record
def run(file_paths, output_directory):
global invalid_zenon_ids
logger.info("Creating holding mappings.")
for file_path in file_paths:
with open(file_path, 'rb') as input_file:
logger.info(file_path)
reader = pymarc.parse_xml_to_array(input_file)
for record in reader:
add_to_holding_mapping(record)
logger.info("Preprocessing files.")
for file_path in file_paths:
with open(file_path, 'rb') as input_file, open("{0}/{1}".format(output_directory, os.path.basename(file_path)), 'wb') as output_file:
logger.info(file_path)
reader = pymarc.parse_xml_to_array(input_file)
output_file.write(MARCXML_OPENING_ELEMENTS)
for record in reader:
record = preprocess_record(record)
if record is None:
logger.error("Received None record after processing, skipping.")
else:
output_file.write(pymarc.record_to_xml(record))
output_file.write(MARCXML_CLOSING_ELEMENTS)
logger.warning("Encountered {0} invalid zenon IDs:".format(len(invalid_zenon_ids)))
invalid_zenon_ids = list(set(invalid_zenon_ids))
for entry in invalid_zenon_ids:
logger.warning("{0} contained {1} as parent, fixed: {2}.".format(entry[0], entry[1], entry[2]))
if __name__ == '__main__':
options = vars(parser.parse_args())
server_url = options['server_url']
check_biblio_no = options['check_biblio']
try:
files = [ os.path.join(options['input_file'], file) for file in os.listdir(options['input_file']) if os.path.splitext(file)[1] == '.xml' ]
except NotADirectoryError:
files = [options['input_file']]
    if not files:
        logger.error("Found no XML files at {0}.".format(options['input_file']))
    else:
        run(files, options['output_directory'])