forked from CMSCompOps/WmAgentScripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
unifiedConfiguration.json
executable file
·472 lines (472 loc) · 18.4 KB
/
unifiedConfiguration.json
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
{
"email_destination" : {
"value" : ["[email protected]","[email protected]","[email protected]","[email protected]","[email protected]","[email protected]"],
"description" : "The list of people that get the emails notifications"
},
"busy_agent_fraction" : {
"value" : 0.9,
"description" : "Running/MaxRunning job fraction above which to consider an agent busy. Default 0.9"
},
"idle_agent_fraction" : {
"value" : 0.1,
"description" : "Running/MaxRunning job fraction below which to consider an agent idling. Default 0.1"
},
"full_task_path_name_limit" : {
"value" : 1250,
"description" : "Maximum number of characters allowed for full task path name in a workflow"
},
"site_out_of_overflow": {
"value" : [
"NONO_T1_UK_RAL",
"NONO_T1_IT_CNAF",
"T0_CH_CSCS_HPC"],
"description" : "The sites that should be taken out completely from the overflow mapping"
},
"site_for_overflow": {
"value" : ["T2_CH_CERN_HLT",
"T0_CH_CERN"],
"description" : "The sites that we set to overflow and require a specific treatment"
},
"overflow_pressure": {
"value" : 0.5,
"description" : "The ratio pending/running over which to consider overflowing"
},
"PU_overflow_overflow_reco":{
"value" : false,
"description" : "Whether or not to overflow the second step (reco) for workflows with PU_overflow enabled"
},
"warning_short_time" : {
"value" : 60,
"description" : "Time in minutes per core below which jobs are considered too short and a warning is issued."
},
"warning_long_time" : {
"value" : 1440,
"description" : "Time in minutes per core above which jobs are considered too long and have a chance at timing out."
},
"warning_mem" : {
"value" : 2500,
"description" : "Value in MB/core above which a workflow is considered to be using, or to be configured with, too much memory"
},
"tune_min_stats" :{
"value" : 300,
"description" : "The number of finished jobs required before making a performance estimate"
},
"mem_quanta": {
"value" : 1000,
"description" : "The quanta of memory in MB for the JobRouter"
},
"time_quanta": {
"value" : 60,
"description" : "The quanta of time in min for the JobRouter"
},
"slope_quanta": {
"value" : 100,
"description" : "The quanta of the slope of memory/core in MB for the JobRouter"
},
"read_quanta": {
"value" : 100,
"description" : "The quanta of read requirements in kb/something for the JobRouter"
},
"htcondor_binning": {
"value" : 5,
"description" : "The number of bins to use for quantizing values used for tuning job requirements"
},
"min_core_for_resize": {
"value" : 4,
"description" : "The minimum of original core requested to allow resizing"
},
"DDM_buffer_level": {
"value" : 0.8,
"description" : "The fraction of the DDM quota we are allowed to use"
},
"sites_banned": {
"value" : ["T2_CH_CERN_AI",
"T2_TH_CUNSTDA",
"T0_CH_CERN",
"T0_CH_CSCS_HPC"
],
"description" : "The sites that are banned from production"
},
"sites_auto_approve": {
"value" : ["T0_CH_CERN_MSS","T1_FR_CCIN2P3_MSS","T1_IT_CNAF_MSS", "T1_RU_JINR_MSS"],
"description" : "The sites we can autoapprove tape request to"
},
"sites_space_override": {
"value" : {
},
"description" : "Over-ride the available space at a phedex end point"
},
"sites_with_goodIO": {
"value" : ["T2_CH_CSCS","T2_DE_DESY","T2_DE_RWTH","T2_ES_CIEMAT","T2_FR_GRIF_LLR", "T2_FR_GRIF_IRFU", "T2_FR_IPHC","T2_FR_CCIN2P3","T2_IT_Bari", "T2_IT_Legnaro", "T2_IT_Pisa", "T2_IT_Rome","T2_UK_London_Brunel", "T2_UK_London_IC","T2_US_Caltech","T2_US_MIT","T2_US_Nebraska","T2_US_Purdue","T2_US_UCSD","T2_US_Wisconsin","T2_US_Florida","T2_BE_IIHE","T2_EE_Estonia","T2_PL_Swierk","T2_CH_CERN","T2_CH_CERN_HLT","T2_KR_KISTI","T2_UK_SGrid_RALPP","T2_BR_SPRACE","T2_UA_KIPT","T2_BR_UERJ","T2_BE_UCL","T2_RU_JINR"],
"description" : "The sites identified as having good storage bandwidth"
},
"sites_with_goodAAA": {
"value" : ["T2_PT_NCG_Lisbon", "T2_CH_CSCS","T2_DE_DESY","T2_DE_RWTH","T2_ES_CIEMAT","T2_FR_GRIF_LLR", "T2_FR_GRIF_IRFU", "T2_FR_IPHC","T2_FR_CCIN2P3","T2_IT_Bari", "T2_IT_Legnaro", "T2_IT_Pisa", "T2_IT_Rome","T2_UK_London_Brunel", "T2_UK_London_IC","T2_US_Caltech","T2_US_MIT","T2_US_Nebraska","T2_US_Purdue","T2_US_UCSD","T2_US_Wisconsin","T2_US_Florida","T2_BE_IIHE","T2_EE_Estonia","T2_PL_Swierk","T2_CH_CERN","T2_CH_CERN_HLT","T2_TW_NCHC","T2_IN_TIFR","T2_HU_Budapest","T2_KR_KISTI","T2_BR_SPRACE","T2_RU_JINR","T2_UK_SGrid_RALPP","T2_US_Vanderbilt","T2_BE_UCL","T2_CN_Beijing","T2_UA_KIPT","T2_AT_Vienna","T2_ES_IFCA","T3_US_Baylor","T3_US_Colorado","T3_UK_London_RHUL","T3_UK_SGrid_Oxford","T3_US_OSG"],
"description" : "The sites identified as having good AAA bandwidth"
},
"blow_up_limits" : {
"value" : [5,10],
"description" : "Pair of Blow up factor for taskchain above which to act, and needed amount of cores to receive large such taskchain. Probably deprecated now with GQ taking this into account. Was (5,4000) before"
},
"GB_space_limit" : {
"value" : 20,
"description" : "The value in GB that we allow the output size to be before adapting the splitting"
},
"output_size_correction": {
"value" : { "Phase" : 1 },
"description" : "The correction factor to be applied to SizePerEvent per keyword."
},
"memory_correction": {
"value" : { "RunIIFall17DRPremix" : 14900 },
"description" : "The memory setting to be set for task with that keyword in"
},
"max_cpuh_block" : {
"value" : 40000000,
"description" : "Value of CPUh above which a wf is blocked from assigning"
},
"block_repositionning": {
"value" : true,
"description" : "Whether or not to retransfer block from WQE without location"
},
"allowed_bypass": {
"description" : "Who is allowed to bypass and force complete",
"value" : [
["vlimant","[email protected]"],
["sagarwal","[email protected]"],
["knaskar","[email protected]"],
["jen_a","[email protected]"],
["wangz","[email protected]"],
["haozturk","[email protected]"],
["tyjyang","[email protected]"],
["fuyan","[email protected]"],
["llavezzo","[email protected]"],
["madamec", "[email protected]"],
["wjang", "[email protected]"]
]
},
"max_tail_priority" : {
"value" : 5,
"description" : "Number of workflow to increase the priority of at a time"
},
"injection_delay_threshold" : {
"value" : 50,
"description" : "Number of days after which to increase the priority of a workflow"
},
"delay_priority_increase" : {
"value" : 10000,
"description" : "Priority from original increase per week over the delay threshold"
},
"injection_delay_priority" : {
"value" : 2000000,
"description" : "Priority above which we can increase the priority of a workflow after running too long"
},
"max_force_complete" : {
"value" : 50,
"description" : "Number of workflow that can be forced complete at a time"
},
"max_per_round" : {
"description" : "limitation on the number of wf to process per module",
"value" : {
"transferor" : 500,
"assignor" : 500,
"actor" : 10,
"closor" : null,
"checkor" : 500,
"completor" : null
}
},
"default_fraction_overdoing" :{
"value" : 1.05,
"description" : "Completion fraction above which a workflow will force complete"
},
"default_fraction_pass" : {
"value" : 1.0,
"description" : "completion fraction above which to announce dataset"
},
"min_events_per_lumi_output" : {
"value" : 20,
"description": "minimum of events per lumisection allowed in the output"
},
"cumulative_fraction_pass": {
"value" : false,
"description" : "Whether or not the pass threshold per campaign are counted as is, or cumulatively."
},
"onhold_timeout": {
"value" : 30,
"description" : "Number of days from completed after which a workflow onhold gets by-passed automatically."
},
"damping_fraction_pass" : {
"value" : 10,
"description" : "Time in days (since completed) after which to start reducing the pass threshold."
},
"damping_fraction_pass_rate" : {
"value" : 7,
"description" : "Time in days (since started running) after which to reduce the passing fraction by 1%, every such time."
},
"damping_fraction_pass_max" : {
"value" : 5,
"description" : "Value in % of the max damping of passing fraction"
},
"acdc_rank_for_truncate" : {
"value" : 2,
"description" : "Rank of the acdc after which we truncate at the pass threshold (-1 means any order, 0 means after first round,... )"
},
"pattern_fraction_pass": {
"value" : { },
"description" : "override of the completion fraction of dataset with keyword"
},
"tiers_with_no_custodial": {
"value" : ["DQM","DQMIO","RECO","RAWAODSIM", "FEVTDEBUGHLT"],
"description": "The data tiers that do not go to tape. Can be overridden by custodial override at campaign level"
},
"use_parent_custodial": {
"value" : false,
"description": "Use the location of the parent dataset for custodial copy"
},
"tape_size_limit" : {
"value" : 300,
"description" : "Size over which to prevent transfer to tape automatically"
},
"tiers_with_no_check": {
"value" : ["DQM","DQMIO"],
"description": "The data tiers that do not pass closeout checks. Can be overridden by custodial override at campaign level"
},
"tiers_no_DDM": {
"value" : ["GEN-SIM","GEN","SIM","DQM","DQMIO","GEN-SIM-DIGI-RAW","RAW","GEN-SIM-DIGI","GEN-SIM-DIGI-RAW-HLTDEBUG"],
"description": "The data tiers that do not go to AnaOps"
},
"tiers_to_DDM": {
"value" : ["LHE", "GEN-SIM-DIGI-RAW-MINIAOD","AODSIM","MINIAODSIM","GEN-SIM-RAW","GEN-SIM-RECO","GEN-SIM-RECODEBUG","AOD","RECO","MINIAOD","ALCARECO","USER","RAW-RECO","RAWAODSIM","NANOAOD","NANOAODSIM","FEVT","PREMIX", "GEN-SIM-DIGI-RAW-HLTDEBUG-RECO", "FEVTDEBUGHLT"],
"description": "The data tiers that go to AnaOps"
},
"tiers_to_rucio_relval": {
"value" : ["NANOAOD", "NANOAODSIM"],
"description": "The data tiers not to be considered for output data placement - RelVal workflows."
},
"tiers_to_rucio_nonrelval": {
"value" : ["NANOAOD", "NANOAODSIM"],
"description": "The data tiers not to be considered for output data placement - NonRelVal workflows."
},
"secondary_lock_timeout": {
"value" : 30,
"description": "The number of days to keep a secondary input locked"
},
"tiers_keep_on_disk": {
"value" : ["LHE"],
"description": "the data tier not unlocked until used again"
},
"check_fullcopy_to_announce": {
"value" : false,
"description": "Whether to check for a full copy being present prior to announcing a dataset"
},
"check_parentage_to_announce": {
"value" : true,
"description": "Whether to check if all blocks are having their parentage dependencies resolved before announcing a dataset"
},
"stagor_sends_back": {
"value" : true,
"description": "Whether the stagor module can send workflow back to considered"
},
"max_handled_workflows": {
"value" : 8000,
"description": "The total number of workflows that we allow to handle at a time (transfer, running, assistance)"
},
"max_staging_workflows": {
"value" : 700,
"description": "The total number of workflows that we allow to stage at a time"
},
"max_staging_workflows_per_site": {
"value" : 700,
"description": "The total number of workflows that we allow to stage at a time per site"
},
"max_transfer_in_GB": {
"value" : 2000000,
"description": "The total size of the input datasets that can be transferred at a given time. Default value: 800000. Removed for now."
},
"transfer_timeout": {
"value" : 7,
"description": "Time in days after which to consider a transfer to be stuck"
},
"transfer_lowrate": {
"value" : 0.004,
"description": "Rate in GB/s under which to consider a transfer to be stuck, after transfer_timeout days"
},
"less_copies_than_requested": {
"value" : 1,
"description": "Decrease the number of requested copies by that number, floored to 1"
},
"chopping_threshold_in_GB": {
"value" : 4000,
"description": "The threshold before chopping an input dataset into chunks of that size for spreading to sites"
},
"retrieve_errors_timeout": {
"value" : 10,
"description" : "Time in minutes that we allow to get logs from where they are before shutting retrieval down."
},
"full_report_top_N":{
"value" : 10,
"description" : "The number of top N workflows with failures and cooloff to show."
},
"expose_top_N":{
"value" : 3,
"description" :"The number of top N error code to expose logs of."
},
"n_error_exposed":{
"value" : 2,
"description" :"The default number of job cmsrun logs to retrieve"
},
"expose_archive_code":{
"value" : [134,139,99109,99303,60450,50513,50660,8001,8002,8004,8026,11003,73,74,87,84,85,92],
"description" : "The error codes that should get a cmsrun log exposed"
},
"expose_condor_code":{
"value" : [99109,99303,60450,61202,50513,50660,11003,84,85,92],
"description" : "The error codes that should get a condor log exposed."
},
"use_recoveror":{
"value" : false,
"description" : "Whether automatic recovery can be tried."
},
"error_codes_to_recover": {
"value" : { "50664" : [ { "legend" : "time-out",
"solution" : "split-2" ,
"details" : null,
"rate" : 20
} ],
"50660" : [ { "legend" : "memory excess",
"solution" : "mem-+1000" ,
"details" : null,
"rate" : 20
} ],
"61104" : [ { "legend" : "failed submit",
"solution" : "recover" ,
"details" : null,
"rate" : 20
} ],
"8028" : [ { "legend" : "read error",
"solution" : "recover" ,
"details" : null,
"rate" : 20
} ],
"8021" : [ { "legend" : "cmssw failure",
"solution" : "recover" ,
"details" : "FileReadError",
"rate" : 20
} ],
"71305" : [ { "legend" : "long pending",
"solution" : "recover" ,
"details" : null,
"rate" : 20
} ],
"8001" : [ { "legend" : "lhe failure",
"solution" : "split-4" ,
"details" : "No lhe event found in ExternalLHEProducer::produce()",
"rate" : 20
} ]
},
"description" : "The error code, threshold and rules for auto-recovery"
},
"error_codes_to_block" : {
"value" :
{
"99109" : [{ "legend" : "stage-out",
"solution" : "recover",
"details" : null,
"rate" : 20
}]
},
"description" : "The error code, threshold and rules to prevent auto-recovery"
},
"error_codes_to_notify" : {
"value" : {
"8021" : { "message" : "Please take a look and come back to Ops." }
},
"description" : "The error code, threshold and rules to notify the user of an error in production"
},
"user_rereco":{
"description" : "The users from which we expect ReReco requests",
"value" : ["cerminar","fabozzi","prebello","amaltaro","pgunnell"]
},
"convert_to_stepchain":{
"description" : "A list of keywords to match workflow campaigns and processing strings, and convert to StepChain if technical constraints are met",
"value" : [ "StepChainTest","NOTYETENABLED_RunIIFall17wmLHEGS","UL" ]
},
"user_relval":{
"description" : "The users from which we expect relval requests",
"value" : ["jinfeng","nwickram","asikdar","asahasra","chayanit","jrumsevi","ameyer","srappocc","mmeena","pdmvserv","srimanob","bbilin","kskovpen","pkalbhor","kaura","jordanm", "alcauser", "alcadb.user", "haozturk", "subansal"]
},
"user_storeresults":{
"description" : "The users from which we expect StoreResults requests",
"value" : ["cmsunified", "haozturk", "sagarwal", "wangz","snorberg"]
},
"relval_routing":{
"description" : "Set of keywords and special settings for relvals",
"value" : { "highPU" : {"parameters" : { "SiteWhitelist" : ["T2_CH_CERN"]}},
"highIO" : {"parameters" : { "SiteWhitelist" : ["T2_CH_CERN"]}},
"pLHE" : {"parameters" : { "SiteWhitelist" : ["T2_CH_CERN"]}},
"ALCA_" : { "primary_AAA" : true ,
"parameters" : { "SiteWhitelist" : ["T2_CH_CERN"]}
},
"NANO_" : { "parameters" : { "MaxMergeSize" : 100 ,"MinMergeSize": 50, "LumisPerJob" : 1000 }},
"M100PPC" : {"parameters" : { "SiteWhitelist" : ["T1_IT_CNAF"]}},
"UWGPU" : {"parameters" : { "SiteWhitelist" : ["T2_US_Wisconsin"]}},
"KITGPU" : {"parameters" : { "SiteWhitelist" : ["T1_DE_KIT"]}}
}
},
"batch_goodness":{
"description" : "Level below which to include a note in the batch report",
"value" : 90
},
"efficiency_threshold_for_stepchain":{
"description" : "Minimum efficiency threshold for step chain conversion",
"value" : 0.80
},
"stepchain_threshold_for_high_priority_requests":{
"description" : "Minimum efficiency threshold for step chain conversion of high priority requests",
"value" : 0.50
},
"block1_priority":{
"description" : "The value determines the priority of block1 requests",
"value" : 110000
},
"sites_for_DQMHarvest": {
"value" : ["T2_CH_CERN","T1_US_FNAL"],
"description" : "The sites to assign DQM/DQMIO harvesting workflows"
},
"HEPCloud_sites": {
"value" : ["T3_US_NERSC","T3_US_PSC","T3_US_SDSC","T3_US_TACC", "T3_US_Anvil", "T3_US_Lancium"],
"description" : "HEPCloud sites"
},
"acdc_console_url": {
"value" : "https://wfrecovery.cern.ch",
"description" : "Base url to the ACDC console"
},
"max_nCores_for_stepchain": {
"value" : 4,
"description" : "Maximum number of cores(Multicore) for stepchain workflows"
},
"max_memory_for_stepchain": {
"value" : 7900,
"description" : "Maximum memory for stepchain workflows"
},
"pnr_users": {
"value" : ["haozturk"],
"description" : "PnR users"
},
"prepIDs_to_skip": {
"value" : [],
"description" : "PrepIDs to skip from workflow assignment"
},
"acquired_threshold_per_priority_block" : {
"description" : "threshold per priority block above which assignment will be stalled",
"value" : {
"block1" : 20000,
"block2" : 16800,
"block3" : 3000,
"block4" : 0,
"block5" : 0,
"block6" : 0,
"block7" : 0
}
}
}