Skip to content

Commit

Permalink
Merge pull request #5 from rlcee/updates_230804
Browse files Browse the repository at this point in the history
Updates 230804
  • Loading branch information
rlcee authored Aug 4, 2023
2 parents 7898968 + 7c71326 commit 0beab2c
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 13 deletions.
7 changes: 5 additions & 2 deletions Util/functions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -361,12 +361,12 @@ release_SAM_file() {
watchdog() {

local TL=$1
[ -z "$TLIMIT" ] && TL=7200
[ -z "$TL" ] && TL=7200

DD=/pnfs/mu2e/scratch/users/$GRID_USER/watchdog

[ -z "$MU2E" ] && source /cvmfs/mu2e.opensciencegrid.org/setupmu2e-art.sh
[ -z "$SETUP_IFDH" ] && setup ifdhc
[ -z "$SETUP_IFDHC" ] && setup ifdhc

ifdh mkdir_p $DD

Expand Down Expand Up @@ -399,3 +399,6 @@ watchdog() {


}

# so watchdog can be run in a subshell
export -f watchdog
42 changes: 31 additions & 11 deletions Util/pushOutput.py
Original file line number Diff line number Diff line change
Expand Up @@ -457,21 +457,23 @@ def copyFile(dfile):

localurl = "file://"+os.path.realpath(dfile.localfs)

rc = 999
for itry, tsleep in enumerate(retries) :
time.sleep(tsleep)

try:
rc = ctx.filecopy(params, localurl, dfile.url)
break
except Exception as e:
rc = 1
# gfal only raises generic errors, so have to parse the text
message = str(e)
# if the output file already exists, then this function is done
if "file exists" in message :
teeDate(1,"WARNING - output file exists for {}/{}"\
.format(dfile.path,dfile.fn))
return 1
print("ERROR - copy failed for try {} for {}".\
teeDate(0,"ERROR - copy failed for try {} for {}".\
format(itry,dfile.url))
print("message: "+message)

Expand Down Expand Up @@ -576,7 +578,7 @@ def declareSam(dfile):
dfile.donesam = False
dfile.samtime = -1

rc = 0
rc = 999
for itry, tsleep in enumerate(retries) :
time.sleep(tsleep)
try :
Expand All @@ -598,6 +600,9 @@ def declareSam(dfile):
if rc == 2 :
teeDate(0,"ERROR - SAM declare retries exhausted for "+dfile.fn)
return 2
elif rc != 0 :
teeDate(0,"ERROR - SAM declare errors "+str(rc)+" for "+dfile.fn)
return 2

# must be ok so far, add location
loc = dfile.samdisk + ":" + dfile.path
Expand Down Expand Up @@ -758,7 +763,7 @@ def writeLog(dfile):
ferr = os.path.expandvars("jsb_tmp/JOBSUB_ERR_FILE")

if not os.path.exists(fout) :
teeDate(0,"ERROR - writeLog could not find {} " + fout)
teeDate(0,"ERROR - writeLog could not find " + fout)
return 2

with open(fn,"w") as f:
Expand Down Expand Up @@ -866,6 +871,8 @@ def writeLog(dfile):
line = f.readline()

rcWrite = 0
rcRecover = 0
rcCheck = 0
for dfile in dflist :

#df = DataFile()
Expand Down Expand Up @@ -904,15 +911,13 @@ def writeLog(dfile):
# go into recovery algorithm
teeDate(0,"INFO - running checkTimes")
# check if previous output is recent or stale
rcCheck = 0
for dfile in dflist :
if dfile.isLog :
continue
rcCheck = checkTimes(dfile)
if rcCheck != 0 :
break

rcRecover = 0
if rcCheck == 1 :
# recent files from another job exist
teeDate(0,"INFO - running rollback")
Expand All @@ -932,20 +937,35 @@ def writeLog(dfile):
if rcRecover == 2 :
break

# always try to write a log file
# initial job rc, before writing log files
rcJob = 0
if rcWrite == 0 or rcRecover == 0 :
teeDate(0,"Success status before log write")
if rcWrite == 0 :
# normal successful write
rcJob = 0
elif rcWrite == 1 :
# found existing output files
if rcCheck == 0 :
# result of attempt to overwrite old files
rcJob = rcRecover
else :
# old files were not that old, no overwrite attempted
# this job must fail, to cause continued recoveries
rcJob = 3
else :
teeDate(0,"ERROR status before log write rcWrite={} rcCheck={} rcRecover={}"
.format(rcWrite,rcCheck,rcRecover))
# error during normal write attempt
rcJob = 2


teeDate(0,"pushOutput status before log write: " + str(rcJob))

# always try to write the log

for dfile in dflist :
if dfile.isLog :
writeLog(dfile)
rc = writeLog(dfile)
if rc != 0 :
rcJob = rc

teeDate(0,"pushOutput status at exit: " + str(rcJob))

sys.exit(rcJob)
5 changes: 5 additions & 0 deletions Util/wrapper.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@ source /cvmfs/mu2e.opensciencegrid.org/bin/OfflineOps/functions.sh

tee_date Starting OfflineOps/wrapper.sh

tee_date Check LANG
printenv | grep LC_
printenv LANG
unset LC_CTYPE

# always need to find setup
source /cvmfs/mu2e.opensciencegrid.org/setupmu2e-art.sh

Expand Down

0 comments on commit 0beab2c

Please sign in to comment.