From 63bbb68743d18a3dd24609d197d44358d85babd9 Mon Sep 17 00:00:00 2001 From: Jakub Moscicki Date: Wed, 1 Jun 2016 11:53:22 +0200 Subject: [PATCH] merged changes from cboxsls: simple graphite monitoring, infinite loop support and fscheck in nplusone --- bin/smash | 4 ++- corruption_test/run_nplusone_loop | 8 +++-- lib/test_nplusone.py | 44 +++++++++++++++++++++++-- python/smashbox/utilities/monitoring.py | 16 +++++++++ 4 files changed, 67 insertions(+), 5 deletions(-) create mode 100644 python/smashbox/utilities/monitoring.py diff --git a/bin/smash b/bin/smash index 3e72466..990c5dc 100755 --- a/bin/smash +++ b/bin/smash @@ -242,7 +242,8 @@ def main(): logger.critical("Wrong testset specification: %d index out of range for %s",args.testset,barename(t)) sys.exit(1) - for j in range(1,args.loop+1): + j = 1 + while j <= args.loop or args.loop == 0: log_quiet ("Running iteration %d" % j) if not smashbox.no_engine.testsets: @@ -254,6 +255,7 @@ def main(): else: i = args.testset # this may be None if no testset indicated run_test(t,j,i) + j=j+1 if args.dry_run: log_quiet('*** DRY RUN ***') diff --git a/corruption_test/run_nplusone_loop b/corruption_test/run_nplusone_loop index 3c1659f..24f4f62 100755 --- a/corruption_test/run_nplusone_loop +++ b/corruption_test/run_nplusone_loop @@ -10,9 +10,10 @@ conf_file = os.environ.get('SMASHBOX_CONF',os.path.join(thisdir,"smashbox.conf") dirs = {'thisdir':thisdir,'smashdir':smashdir, 'conf_file':conf_file} -os.environ['OWNCLOUD_MAX_PARALLEL'] = '10' +os.environ['OWNCLOUD_MAX_PARALLEL'] = '3' -os.environ['OWNCLOUD_USE_LEGACY_JOBS'] = '1' +# this disables the checksumming! 
+#os.environ['OWNCLOUD_USE_LEGACY_JOBS'] = '1' import datetime now = datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S') @@ -36,6 +37,9 @@ i = 1 dirs['options']="-o nplusone_nfiles=20 -o nplusone_filesize='(5.0,1.37)'" # --keep-state" #dirs['options']="-o nplusone_nfiles=20 -o nplusone_filesize=30000000" +# infinite loop and ignore any casual errors (stop on fatal errors only) +dirs['options'] += " --loop=0 --keep-going " + cmd = '%(smashdir)s/bin/smash -c %(conf_file)s %(options)s %(smashdir)s/lib/test_nplusone.py' % dirs #print cmd diff --git a/lib/test_nplusone.py b/lib/test_nplusone.py index f156bcc..7e8b10a 100755 --- a/lib/test_nplusone.py +++ b/lib/test_nplusone.py @@ -8,10 +8,14 @@ from smashbox.utilities import * from smashbox.utilities.hash_files import * +from smashbox.utilities.monitoring import push_to_monitoring nfiles = int(config.get('nplusone_nfiles',10)) filesize = config.get('nplusone_filesize',1000) +# optional fs check before files are uploaded by worker0 +fscheck = config.get('nplusone_fscheck',False) + if type(filesize) is type(''): filesize = eval(filesize) @@ -54,8 +58,29 @@ def worker0(step): step(2,'Add %s files and check if we still have k1+nfiles after resync'%nfiles) + total_size=0 + sizes=[] + + # compute the file sizes in the set for i in range(nfiles): - create_hashfile(d,size=filesize) + size=size2nbytes(filesize) + sizes.append(size) + total_size+=size + + time0=time.time() + + logger.log(35,"Timestamp %f Files %d TotalSize %d",time.time(),nfiles,total_size) + + # create the test files + for size in sizes: + create_hashfile(d,size=size) + + if fscheck: + # drop the caches (must be running as root on Linux) + runcmd('echo 3 > /proc/sys/vm/drop_caches') + + ncorrupt = analyse_hashfiles(d)[2] + fatal_check(ncorrupt==0, 'Corrupted files ON THE FILESYSTEM (%s) found'%ncorrupt) run_ocsync(d) @@ -68,6 +93,17 @@ def worker0(step): fatal_check(ncorrupt==0, 'Corrupted files (%s) found'%ncorrupt) logger.info('SUCCESS: %d files found',k1) + + 
# simple monitoring (disabled if 'monitoring_host' is not set in config)

def push_to_monitoring(metric,value,timestamp=None):
    """Send a single datapoint to the monitoring server (best effort).

    The datapoint is written as one plaintext line ``"<metric> <value>
    <timestamp>\\n"`` over a TCP connection to ``monitoring_host`` /
    ``monitoring_port`` (default port 2003), both read from ``config``.
    ``config`` is expected to be supplied by the module-level
    ``from smashbox.utilities import *``.

    Parameters:
        metric: name of the metric, e.g. "cernbox.cboxsls.nplusone.nfiles".
        value: value of the datapoint; formatted with ``%s``.
        timestamp: time of the datapoint; when None the current
            ``time.time()`` is used.

    Returns:
        None. Nothing is sent when ``monitoring_host`` is not configured.
        Connection problems are logged as warnings and never raised, so a
        broken monitoring endpoint cannot fail the test that reports to it.
    """
    # Local imports keep the function self-contained and avoid relying on
    # the star import to leak stdlib modules into this namespace.
    import contextlib
    import logging
    import socket
    import time

    monitoring_host = config.get('monitoring_host', None)
    monitoring_port = config.get('monitoring_port', 2003)

    # Monitoring is optional: without a configured host this is a no-op.
    if not monitoring_host:
        return

    # Compare against None so that an explicit timestamp of 0 is honoured
    # instead of being silently replaced with the current time.
    if timestamp is None:
        timestamp = time.time()

    payload = "%s %s %s\n" % (metric, value, timestamp)
    if not isinstance(payload, bytes):
        # text string (Python 3 str / Python 2 unicode): sockets need bytes
        payload = payload.encode('utf-8')

    # Bound the time spent on an unreachable monitoring host so that it
    # cannot stall the test run.
    timeout_seconds = 5.0

    try:
        connection = socket.create_connection(
            (monitoring_host, monitoring_port), timeout_seconds)
        # contextlib.closing works on both Python 2 and 3 sockets
        with contextlib.closing(connection):
            connection.sendall(payload)
    except (socket.error, OverflowError) as exc:
        # socket.error also covers timeouts and name resolution failures;
        # OverflowError is raised for an out-of-range port number.
        logging.getLogger(__name__).warning(
            "could not push metric %s to %s:%s: %s",
            metric, monitoring_host, monitoring_port, exc)