diff --git a/.gitignore b/.gitignore index 4c7384d..2cc4ee8 100644 --- a/.gitignore +++ b/.gitignore @@ -140,7 +140,7 @@ cython_debug/ *.swp userportal/settings/*-local.py userportal/local.py -slurm_prolog/api_config.ini +slurm_jobscripts/slurm_jobscripts.ini private.key public.cert idp_metadata.xml diff --git a/docs/data.md b/docs/data.md index f85a7e8..3682560 100644 --- a/docs/data.md +++ b/docs/data.md @@ -91,4 +91,4 @@ groups: The information in this database is used to show the current utilization per user within a group. ## Slurm jobscript -The script `slurm_jobscript/slurm_jobscripts_userportal.py` can be used to add the submitted script to the database of the portal. This should run on the Slurm server, it will collect the scripts from `/var/spool/slurmctld`. This script uses the REST API of Django to push the job script. A user with a token need to be created, check the [installation documentation](install.md) on how to create this API token. +The script `slurm_jobscripts/slurm_jobscripts.py` can be used to add the submitted script to the database of the portal. This should run on the Slurm server; it will collect the scripts from the `spool` directory of Slurm. This script uses the REST API of Django to push the job script. A user with a token needs to be created, check the [installation documentation](install.md) on how to create this API token. diff --git a/docs/jobstats.md b/docs/jobstats.md index 0c19a67..565d625 100644 --- a/docs/jobstats.md +++ b/docs/jobstats.md @@ -1,5 +1,5 @@ # Jobstats -Each user can see their current uses on the cluster and a few hours in the past. The stats for each job are also available. Information about CPU, GPU, memory, filesystem, InfiniBand, power, etc. is also available per job. The submitted job script can also be collected and displayed on this page. Some automatic recommendations are also given to the user, based on the content of their job script and the stats of their job. 
+Each user can see their current usage on the cluster and a few hours in the past. The stats for each job are also available. Information about CPU, GPU, memory, filesystem, InfiniBand, power, etc. is also available per job. The submitted job script can also be collected from the Slurm server and then stored and displayed in the portal. Some automatic recommendations are also given to the user, based on the content of their job script and the stats of their job. Stats per user Stats per job @@ -13,4 +13,4 @@ Optional: * node\_exporter (show node information) * redfish\_exporter (show power information) * lustre\_exporter and lustre\_exporter\_slurm (show Lustre information) -* jobscript collector (show the submitted jobscript) +* slurm_jobscripts.py (show the submitted jobscript) diff --git a/slurm_jobscripts/api_config.ini.dist b/slurm_jobscripts/slurm_jobscripts.ini.dist similarity index 66% rename from slurm_jobscripts/api_config.ini.dist rename to slurm_jobscripts/slurm_jobscripts.ini.dist index d8fe58b..04b83a8 100644 --- a/slurm_jobscripts/api_config.ini.dist +++ b/slurm_jobscripts/slurm_jobscripts.ini.dist @@ -2,3 +2,6 @@ token = changeme host = http://localhost:8000 script_length = 100000 + +[slurm] +spool = /var/spool/slurmctld diff --git a/slurm_jobscripts/slurm_jobscripts.py b/slurm_jobscripts/slurm_jobscripts.py new file mode 100644 index 0000000..05972f3 --- /dev/null +++ b/slurm_jobscripts/slurm_jobscripts.py @@ -0,0 +1,81 @@ +import requests +import configparser +import os +import time +import argparse +import logging + +# This script takes the submitted script on the slurmctld server +# and sends it to the userportal so it can be stored in a database + + +def send_job(jobid): + try: + with open('{spool}/hash.{mod}/job.{jobid}/script'.format( + spool=spool, + mod=jobid % 10, + jobid=jobid), 'r') as f: + content = f.read()[:script_length].strip('\x00') + logging.debug('Job script {}: {}'.format(jobid, content[:100])) # Only log first 100 
characters into DEBUG log + r = requests.post('{}/api/jobscripts/'.format(host), + json={'id_job': int(jobid), 'submit_script': content}, + headers={'Authorization': 'Token ' + token}) + if r.status_code != 201: + if r.status_code == 401: + logging.error('Token is invalid') + elif 'job script with this id job already exists' in r.text: + logging.debug('Job script already exists') + else: + logging.error('Job script {} not saved: {}'.format(jobid, r.text)) + + except UnicodeDecodeError: + # Ignore problems with wrong file encoding + pass + except FileNotFoundError: + # The script disappeared before we could read it + pass + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--config', + help='Path to the config file (default: %(default)s)', + type=str, + default='/etc/slurm/slurm_jobscripts.ini') + parser.add_argument('--verbose', help='Verbose output', action='store_true') + args = parser.parse_args() + + if args.verbose: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + + config = configparser.ConfigParser() + logging.debug('Reading config file: {}'.format(args.config)) + config.read(args.config) + token = config['api']['token'] + host = config['api']['host'] + script_length = int(config['api']['script_length']) + spool = config['slurm']['spool'] + + jobs = set() + + while True: + updated_jobs = set() + for mod in range(10): + try: + listing = os.listdir('{spool}/hash.{mod}'.format(spool=spool, mod=mod)) + except FileNotFoundError: + logging.debug('hash.{mod} does not exist yet'.format(mod=mod)) + continue + for job in filter(lambda x: 'job' in x, listing): + jobid = int(job[4:]) # parse the jobid (job.12345 -> 12345) + updated_jobs.add(jobid) + + if jobid not in jobs: + logging.debug('New job: {}'.format(jobid)) + send_job(jobid) + + jobs = updated_jobs + time.sleep(5) diff --git a/slurm_jobscripts/slurm_jobscripts_userportal.py b/slurm_jobscripts/slurm_jobscripts_userportal.py 
deleted file mode 100644 index 4bee8f9..0000000 --- a/slurm_jobscripts/slurm_jobscripts_userportal.py +++ /dev/null @@ -1,48 +0,0 @@ -import requests -import configparser -import os -import time - -# This is script is taking the submitted script on the slurmctld server -# and send it to the userportal so it can be stored in a database - - -def send_job(jobid): - try: - with open('/var/spool/slurmctld/hash.{mod}/job.{jobid}/script'.format(mod=jobid % 10, jobid=jobid), - 'r') as f: - content = f.read()[:script_length].strip('\x00') - requests.post('{}/api/jobscripts/'.format(host), - json={'id_job': int(jobid), 'submit_script': content}, - headers={'Authorization': 'Token ' + token}) - - except UnicodeDecodeError: - # Ignore problems with wrond file encoding - pass - except FileNotFoundError: - # The script disappeared before we could read it - pass - - -config = configparser.ConfigParser() -config.read('/root/api_config.ini') -token = config['api']['token'] -host = config['api']['host'] -script_length = int(config['api']['script_length']) - -jobs = set() - -while True: - updated_jobs = set() - for mod in range(10): - listing = os.listdir('/var/spool/slurmctld/hash.{mod}'.format(mod=mod)) - for job in filter(lambda x: 'job' in x, listing): - jobid = int(job[4:]) - updated_jobs.add(jobid) - - if jobid not in jobs: - # This is a new job - send_job(jobid) - - jobs = updated_jobs - time.sleep(5)