forked from mrceyhun/ppdgui
-
Notifications
You must be signed in to change notification settings - Fork 0
/
eos_grinder.py
154 lines (124 loc) · 6.89 KB
/
eos_grinder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Author : Ceyhun Uzunoglu <ceyhunuzngl AT gmail [DOT] com>
Description : Stores metadata of DQMGui ROOT files by parsing last N years Runs. Metadata consists of all EOS ROOT files with their paths, run number, dataset name and detector name
How :
- Get the main EOS directory of DQMGUI and `find` all the ROOT files with their full paths depending on Run years, and store them in intermediary txt file
- There is a simple regex pattern to extract run year, dataset name, run number, detector group(naming refers to HLT, L1T, etc.)
- All these metadata are strictly structured using a pydantic model class: DqmFileMetadata
- The parsed and formatted metadata is then stored in a JSON file.
- We use this metadata file to find histograms of detector groups or histograms of specific run number.
- eos_grinder runs at the start of the container, and it is also added as a CRON job with a one-hour period.
- Process time is less than ~2 minutes and JSON file size is <40 MB for Run2022 and Run2023.
- In the future, it can be moved to a DB according to requirements.
"""
import logging
import os
import re
import subprocess
import time
from datetime import datetime
from typing import Union
from backend.config import Config, get_config
from .models import DqmMetaStore, DqmMeta
# Configure the root logger at import time; the level name is read from the
# application config and upper-cased before being handed to logging.
logging.basicConfig(level=get_config().loglevel.upper())
# Cache refresh period in seconds. It is not referenced in the visible part of
# this module; presumably consumed by importing modules - TODO confirm.
CACHE_REFRESH_PERIOD_SECS = 10 * 60 # 10 minutes
# EOS GRINDER ----------------------------------------------------------------
def run(conf: Config | None):
    """Grind the DQMGui EOS directories and persist the resulting metadata.

    Looks for DQMGui ROOT files under the Run-year directories of the last N
    years, converts the raw ``find`` output to the DqmMetaStore schema and
    writes it as JSON to the configured metadata store file.

    Args:
        conf: application config; when falsy (e.g. ``None`` in tests) the
            config returned by ``get_config()`` is used instead.
    """
    started_at = time.time()

    # Fall back to the default config when none is supplied (used by tests)
    conf = conf or get_config()
    store_conf = conf.dqm_meta_store

    # Only ROOT files under these group directories are kept
    allowed_group_directories = conf.get_config_group_directories()
    logging.info(f"DQM EOS grinder is starting... Allowed directories: {allowed_group_directories}")

    # Settings of the metadata store
    eos_base_dir = store_conf.base_dqm_eos_dir
    raw_find_output = store_conf.find_tmp_results_file
    json_output = store_conf.meta_store_json_file
    n_years = store_conf.last_n_run_years
    suffix_pattern = store_conf.file_suffix_pat

    # Run years to scan: the current year and the (n_years - 1) years before it
    this_year = datetime.now().year
    first_year = this_year - n_years + 1

    # Keep only the Run-year directories that actually exist
    existing_run_dirs = []
    for year in range(first_year, this_year + 1):
        candidate = f"{eos_base_dir.rstrip('/')}/Run{year}"
        if not os.path.exists(candidate):
            logging.warning(f"Run directory not exist: {candidate}")
            continue
        existing_run_dirs.append(candidate)

    # Dump the raw find results to the intermediary file, then parse them
    run_sh_find_cmd(existing_run_dirs, raw_find_output, suffix_pattern)
    meta_store = get_formatted_meta_from_raw_input(raw_find_output, allowed_group_directories)

    with open(json_output, "w+") as json_file:
        json_file.write(meta_store.model_dump_json())

    elapsed_secs = int(time.time() - started_at)
    logging.info(f"DQM EOS grinder is finished. Elapsed time : {elapsed_secs} seconds.")
def run_sh_find_cmd(base_search_dirs: list[str], outfile: str, file_suffix_pat: str):
    """Run the Linux ``find`` command and save its sorted results to ``outfile``.

    Equivalent to ``find <dirs...> -iname '<pattern>' | sort -nr > outfile``,
    but the commands are executed without a shell, so directories or patterns
    that contain spaces or shell metacharacters are passed through verbatim.

    Args:
        base_search_dirs: Base EOS directories to run find command
        outfile: file to store find command results
        file_suffix_pat: find command "-iname" suffix pattern like '*DQMIO.root'

    Raises:
        subprocess.CalledProcessError: if the ``sort`` step fails.
        OSError: if ``outfile`` cannot be opened for writing or a command
            cannot be started.
    """
    if not base_search_dirs:
        # Without a start path, find would silently scan the current working
        # directory, so skip it and leave an empty result file instead.
        logging.warning("No base search directory is given, find command is skipped.")
        with open(outfile, "wb"):
            pass
        return

    find_result = subprocess.run(
        ["find", *base_search_dirs, "-iname", file_suffix_pat],
        capture_output=True,
        check=False,
    )
    if find_result.returncode:
        # find exits non-zero on partial failures (e.g. unreadable sub-directory)
        # while still printing every file it could reach. The exit status used
        # to be hidden by the shell pipeline; keep the run going with the
        # partial results, but make the problem visible in the logs.
        logging.warning(
            f"Exit code: {find_result.returncode} , "
            f"Stderr: {find_result.stderr.decode('utf-8', errors='replace')}"
        )

    with open(outfile, "wb") as fout:
        sort_result = subprocess.run(
            ["sort", "-nr"],
            input=find_result.stdout,
            stdout=fout,
            stderr=subprocess.PIPE,
            check=False,
        )
    if sort_result.returncode:
        logging.warning(
            f"Exit code: {sort_result.returncode} , "
            f"Stderr: {sort_result.stderr.decode('utf-8', errors='replace')}"
        )
        sort_result.check_returncode()
def get_formatted_meta_from_raw_input(input_file, allowed_group_directories) -> DqmMetaStore:
    """Build a DqmMetaStore from the raw ROOT file names listed in a file.

    Each line of the input file is handed to ``get_group_meta``; lines for
    which it returns ``None`` are dropped.

    Args:
        input_file: file that stores raw output of find results
        allowed_group_directories: group directories from config

    Raises:
        Exception: any error raised while reading or parsing is logged and
            re-raised unchanged.
    """
    try:
        with open(input_file) as raw_lines:
            # Parse line by line and keep only the entries that produced metadata
            parsed_items = [
                meta
                for line in raw_lines
                if (meta := get_group_meta(line, allowed_group_directories)) is not None
            ]
        return DqmMetaStore(parsed_items)
    except Exception as e:
        logging.error(f"Cannot parse data of given input file. input file:{input_file}. Error: {str(e)}")
        raise
# Compiled regex pattern to parse the full EOS path of a DQM ROOT file
# Expected input: /eos/cms/store/group/comm_dqm/DQMGUI_data/Run2023/AlCaPPSPrompt/0003658xx/DQM_V0001_R000365835__AlCaPPSPrompt__Run2023A-PromptReco-v1__DQMIO.root
# Expected output of re groupdict: {'year': '2023', 'group_directory': 'AlCaPPSPrompt', 'run': '000365835', 'dataset_prefix': 'AlCaPPSPrompt', 'era': 'Run2023A', 'dataset_suffix': 'PromptReco-v1'}
# The dot of the ".root" extension is escaped so that it matches only a literal "."
DQM_EOS_ROOT_RE = re.compile(
    r".+?/Run(?P<year>\d+)/(?P<group_directory>.+?)/(.+?)/DQM_V(\d+)_R(?P<run>\d+)__(?P<dataset_prefix>.+?)__(?P<era>.+?)-(?P<dataset_suffix>.+?)__DQMIO\.root"
)
def get_group_meta(file_name, allowed_group_directories) -> Union[DqmMeta, None]:
    """Parse and format a single DQM EOS ROOT file name.

    Args:
        file_name: EOS full path of ROOT file, i.e /eos/cms/store/group/comm_dqm/DQMGUI_data/Run2023/AlCa.../...x/DQM_V0001_...__DQMIO.root
        allowed_group_directories: Only given group directories in the config file will be used.

    Returns:
        A DqmMeta built from the parsed path, or None when the path does not
        match DQM_EOS_ROOT_RE or its group directory is not an allowed one.
    """
    file_name = file_name.strip()  # Remove new line

    match = DQM_EOS_ROOT_RE.match(file_name)
    if match is None:
        # A single unexpected (or blank) line must not abort the whole
        # metadata refresh: skip it, the caller already drops None results.
        if file_name:
            logging.warning(f"Skipping ROOT file with unexpected name format: {file_name}")
        return None

    # Key-value pairs of the named groups defined in the regex pattern
    fields = match.groupdict()
    if fields["group_directory"] not in allowed_group_directories:
        return None

    # e.g. 'dataset_prefix': 'AlCaPPSPrompt', 'era': 'Run2023A', 'dataset_suffix': 'PromptReco-v1'
    # gives the dataset name 'AlCaPPSPrompt/Run2023A-PromptReco-v1'
    dataset_name = fields["dataset_prefix"] + "/" + fields["era"] + "-" + fields["dataset_suffix"]
    return DqmMeta(
        dataset=dataset_name,
        eos_directory=fields["group_directory"],
        era=fields["era"],
        root_file=file_name,
        run=fields["run"],
    )