-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcrabKillXMLCheck.py
executable file
·90 lines (69 loc) · 3.34 KB
/
crabKillXMLCheck.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#!/usr/bin/env python
"""Check XML against CRAB log & remove files that crab thought were still transferring
This is necessary if you then use notFinishedLumis.json from crab report,
since that will contain jobs that CRAB thought were transferring
*even if* they look OK on T2.
This therefore avoids duplicate events.
"""
from __future__ import print_function
import os
import re
import argparse
def get_transferring_job_numbers(crab_log):
"""Get all job numbers that were transferring at the last status check"""
if not os.path.isfile(crab_log):
raise IOError("Cannot find crab log!")
# First find the last status of all jobs in the log file
status_text = None
with open(crab_log) as cf:
for line in cf:
if "Got information from status cache file" in line:
status_text = line.split("{", 1)[1] # divide at first {
if status_text is None:
raise RuntimeError("Cannot find status in crab log")
# Now go through and identify all bad jobs
bad_ntuple_names = []
# Would parse this as JSON, but has some single quotes that are
# impossible to remove since everything in crab.log is single quotes, bah
# Note that if they change the format of crab.log, this will need updating
this_job = None
for w in status_text.split():
this_word = w.lower().strip()
# Find job number, e.g. '123':
match = re.search(r'.[0-9]+.:', this_word)
if match:
this_job = match.group().strip(":").replace('"', "").replace("'", "")
# Figure out if job was transferring
# Note that a job can be "finished" but still have errors,
# so can't use for successful jobs
# Also, a job may have "Errors" but be successful - maybe was error from
# early retry?
# So very difficult to get *only* successful jobs
elif "transferring" in this_word:
bad_ntuple_names.append(this_job)
return bad_ntuple_names
def job_numbers_to_filenames(job_nums):
"""Translate job numbers to ntuple filenames"""
return ["Ntuple_%s.root" % this_job for this_job in job_nums]
def create_good_xml(xml_filename, new_xml_filename, bad_ntuple_names):
"""Copy lines from original XML to new XML if they don't contain bad ntuple"""
if not os.path.isfile(xml_filename):
raise IOError("Cannot find xml_filename!")
with open(xml_filename) as inf, open(new_xml_filename, "w") as of:
for line in inf:
if not any([r in line for r in bad_ntuple_names]):
of.write(line) # already has newline at end
if __name__ == "__main__":
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("crabLog", help="crab.log file for this set of crab jobs")
parser.add_argument("inXML", help="Input XML file to be cleaned up")
parser.add_argument("outXML", help="Output XML file to be written")
args = parser.parse_args()
job_nums = get_transferring_job_numbers(args.crabLog)
bad_ntuple_names = job_numbers_to_filenames(job_nums)
if len(bad_ntuple_names) > 0:
print(len(bad_ntuple_names), "'bad' ntuple filenames:", bad_ntuple_names)
create_good_xml(args.inXML, args.outXML, bad_ntuple_names)
print("Written XML without bad ntuples to", args.outXML)
else:
print("No bad filenames to replace, no updated XML will be produced")