diff --git a/.github/workflows/docker-publish.yaml b/.github/workflows/docker-publish.yaml index 46597ad..b3dd671 100644 --- a/.github/workflows/docker-publish.yaml +++ b/.github/workflows/docker-publish.yaml @@ -8,11 +8,11 @@ on: # schedule: # - cron: '41 9 * * *' push: - branches: [ master, gpu ] + branches: [ master, gpu, htcondor-10 ] # Publish semver tags as releases. tags: [ 'v*.*.*' ] pull_request: - branches: [ master, gpu ] + branches: [ master, gpu, htcondor-10 ] env: # Dockhub @@ -86,6 +86,7 @@ jobs: tag_list=() tag_list+=(${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:"latest") tag_list+=(${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:"execute-3.6-gpu") # This causes the tag_list array to be comma-separated below, # which is required for build-push-action IFS=, diff --git a/Dockerfile b/Dockerfile index f02eedc..74b6fc5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -ARG BASE_IMAGE=opensciencegrid/software-base:3.6-el7-release +ARG BASE_IMAGE=hub.opensciencegrid.org/opensciencegrid/software-base:3.6-el7-release FROM ${BASE_IMAGE} ARG BASE_IMAGE @@ -47,34 +47,14 @@ RUN yum install --enablerepo=osg-upcoming -y condor RUN yum-config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo RUN yum install -y docker-ce-cli -RUN yum install -y http://mirror.grid.uchicago.edu/pub/mwt2/sw/el7/mwt2-sysview-worker-2.0.3-1.noarch.rpm +RUN yum install -y http://mirror.grid.uchicago.edu/pub/mwt2/sw/el7/mwt2-sysview-worker-2.0.5-1.noarch.rpm RUN yum install -y python36-tabulate -# Add CVMFSEXEC -#RUN git clone https://github.com/cvmfs/cvmfsexec /cvmfsexec \ -# && cd /cvmfsexec \ -# && ./makedist osg \ -# # /cvmfs-cache and /cvmfs-logs is where the cache and logs will go; possibly bind-mounted. \ -# # Needs to be 1777 so the unpriv user can use it. \ -# # (Can't just chown, don't know the UID of the unpriv user.) 
\ -# && mkdir -p /cvmfs-cache /cvmfs-logs \ -# && chmod 1777 /cvmfs-cache /cvmfs-logs \ -# && rm -rf dist/var/lib/cvmfs log \ -# && ln -s /cvmfs-cache dist/var/lib/cvmfs \ -# && ln -s /cvmfs-logs log \ -# # tar up and delete the contents of /cvmfsexec so the unpriv user can extract it and own the files. \ -# && tar -czf /cvmfsexec.tar.gz ./* \ -# && rm -rf ./* \ -# # Again, needs to be 1777 so the unpriv user can extract into it. \ -# && chmod 1777 /cvmfsexec - COPY condor/*.conf /etc/condor/config.d/ COPY cron/* /etc/cron.d/ COPY supervisor/* /etc/supervisord.d/ COPY image-config/* /etc/osg/image-config.d/ COPY libexec/* /usr/local/libexec/ -COPY sysview-client/sysclient /bin/ -COPY sysview-client/client /usr/lib/python3.6/site-packages/sysview/client COPY scripts/condor_node_check.sh /usr/local/sbin/ COPY scripts/entrypoint.sh /bin/entrypoint.sh diff --git a/condor/01-ccb.conf b/condor/01-ccb.conf index 7ec2ab2..10daae4 100644 --- a/condor/01-ccb.conf +++ b/condor/01-ccb.conf @@ -1,2 +1,3 @@ CCB_ADDRESS = $(CONDOR_HOST) PRIVATE_NETWORK_NAME = $(UID_DOMAIN) +TRUST_DOMAIN = head01.af.uchicago.edu diff --git a/prometheus/exporter.py b/prometheus/exporter.py index 9c05a54..ba9087d 100644 --- a/prometheus/exporter.py +++ b/prometheus/exporter.py @@ -2,6 +2,7 @@ import os import time +from subprocess import check_call from prometheus_client import start_http_server, Gauge, Enum import requests @@ -12,7 +13,7 @@ class AppMetrics: application metrics into Prometheus metrics. 
""" - def __init__(self, paths=None, app_port=80, polling_interval_seconds=5): + def __init__(self, paths=None, app_port=80, polling_interval_seconds=30): self.app_port = app_port self.polling_interval_seconds = polling_interval_seconds if not paths: @@ -55,10 +56,17 @@ def fetch(self): #self.total_uptime.set(status_data["total_uptime"]) for p in self.paths: - if os.path.exists(p): - self.health.labels(path=p).state("healthy") - else: + try: + check_call(['test', '-e', p], timeout=10) + except: self.health.labels(path=p).state("unhealthy") + else: + self.health.labels(path=p).state("healthy") + # thread could stuck in D wait and result will be stale + #if os.path.exists(p): + # self.health.labels(path=p).state("healthy") + #else: + # self.health.labels(path=p).state("unhealthy") def main(): """Main entry point""" diff --git a/sysview-client/client/__init__.py b/sysview-client/client/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/sysview-client/client/client.py b/sysview-client/client/client.py deleted file mode 100644 index 3e20efb..0000000 --- a/sysview-client/client/client.py +++ /dev/null @@ -1,285 +0,0 @@ -import os -import pwd -import time -import logging -import argparse -import tabulate - -from sysview.common.utils import get_config -from sysview.common.utils import expand_hostlist -from sysview.common.utils import get_base_parser - -""" -Parser for systools -""" -def get_parser(): - parser = get_base_parser('sysclient') - - subparsers = parser.add_subparsers(help='Subcommand to run') - - parser_hoststatus = subparsers.add_parser('hoststatus', help='Get current manual status of a host list') - parser_hoststatus.add_argument('hostlist', help='Host(s) to query') - parser_hoststatus.set_defaults(func=hoststatus) - - parser_nodestatus = subparsers.add_parser('nodestatus', help='Get current machine status of a host list') - parser_nodestatus.add_argument('hostlist', help='Host(s) to query') - 
parser_nodestatus.set_defaults(func=nodestatus) - - parser_online = subparsers.add_parser('online', help='Mark a list of hosts "online" in the cache') - parser_online.add_argument('hostlist', help='Host(s) to mark online') - parser_online.set_defaults(func=online) - - parser_offline = subparsers.add_parser('offline', help='Mark a list of hosts "offline" in the cache with the reason "Reason"') - parser_offline.add_argument('hostlist', help='Host(s) to mark offline') - parser_offline.add_argument('-r', '--reason', help="Reason") - parser_offline.set_defaults(func=offline) - - parser_backfill = subparsers.add_parser('backfill', help='Mark a list of hosts "backfill" in the cache with the reason "Reason"') - parser_backfill.add_argument('hostlist', help='Host(s) to mark backfill') - parser_backfill.add_argument('-r', '--reason', help="Reason") - parser_backfill.set_defaults(func=backfill) - - parser_dumpsite = subparsers.add_parser('dump_site', help='Dump site information from the cache') - parser_dumpsite.add_argument('site', help='Site name') - parser_dumpsite.add_argument('--filename', help='Filename for output (write to stdout if unspecified)') - parser_dumpsite.set_defaults(func=dump_site) - - parser_loadsite = subparsers.add_parser('load_site', help='Load site information in dump_site format back into the cache') - parser_loadsite.add_argument('filename', help='Filename for input') - parser_loadsite.set_defaults(func=load_site) - - return parser - - -""" -Get current status of a host list -""" -def get_status(keynames, hostlist, cache): - logger = logging.getLogger(__name__) - logger.info('Running get_status') - data = [] - header = [ - 'Node', - 'TimeStamp', - 'Last updated by user on date', - 'State', - 'Reason'] - hosts = expand_hostlist(hostlist) - keys = ['%s.%s' % (host, keyname) for host in hosts for keyname in keynames.values()] - values = cache.get_multi(keys) - - logger.debug('Hostlist: %s' % hosts) - logger.debug('Keys: %s' % keynames) - - for host in 
hosts: - try: - timestamp = time.strftime( - "%F %T", - time.localtime( - int(values["%s.%s" % (host, keynames['timestamp'])]))) - except KeyError: - timestamp = 'UNDEF' - try: - mtimestamp = time.strftime( - "%F %T", - time.localtime( - int(values["%s.%s" % (host, keynames['mtimestamp'])]))) - except KeyError: - mtimestamp = 'UNDEF' - try: - status = values["%s.%s" % (host, keynames['status'])] - except KeyError: - status = 'UNDEF' - try: - reason = values["%s.%s" % (host, keynames['message'])] - except KeyError: - reason = 'UNDEF' - try: - user = values["%s.%s" % (host, keynames['user'])] - except KeyError: - user = 'UNDEF' - data.append((host, timestamp, ' '.join((user, mtimestamp)), status, reason)) - - print(tabulate.tabulate(data, headers=header, tablefmt='orgtbl')) - - -""" -Get current manual status of a host list -""" -def hoststatus(args, cache): - logger = logging.getLogger(__name__) - logger.info('Running hoststatus') - keynames = { - 'status': 'manualstatus', - 'message': 'manualreason', - 'user': 'manualuser', - 'mtimestamp': 'manualtimestamp', - 'timestamp': 'timestamp'} - get_status(keynames=keynames, hostlist=args.hostlist, cache=cache) - - -""" -Get current machine status of a host list -""" -def nodestatus(args, cache): - logger = logging.getLogger(__name__) - logger.info('Running nodestatus') - keynames = { - 'status': 'status', - 'message': 'message', - 'user': 'manualuser', - 'mtimestamp': 'manualtimestamp', - 'timestamp': 'timestamp'} - get_status(keynames=keynames, hostlist=args.hostlist, cache=cache) - - -""" -Update the status of a given hostname (or list of hostnames) -""" -def update_status(hostlist, status, reason, cache): - logger = logging.getLogger(__name__) - logger.info('Running update_status') - data = {} - hosts = expand_hostlist(hostlist) - mu = pwd.getpwuid(os.getuid()).pw_name - mts = int(time.time()) - - logger.debug('Hostlist: %s' % hosts) - logger.debug('Status: %s' % status) - logger.debug('Reason: %s' % reason) - 
logger.debug('User: %s' % mu) - logger.debug('Timestamp: %s' % mts) - - for host in hosts: - data['%s.manualstatus' % host] = status - data['%s.manualreason' % host] = reason - data['%s.manualuser' % host] = mu - data['%s.manualtimestamp' % host] = mts - data['%s.timestamp' % host] = mts - - cache.set_multi(data) - - -""" -Set nodes to online -""" -def online(args, cache): - logger = logging.getLogger(__name__) - logger.info('Running online') - reason = '' - update_status( - hostlist=args.hostlist, - status='online', - reason=reason, - cache=cache) - - -""" -Set nodes to offline -""" -def offline(args, cache): - logger = logging.getLogger(__name__) - logger.info('Running offline') - reason = args.reason or '*Reason Not Set By User*' - update_status( - hostlist=args.hostlist, - status='offline', - reason=reason, - cache=cache) - - -""" -Set nodes to backfill -""" -def backfill(args, cache): - logger = logging.getLogger(__name__) - logger.info('Running backfill') - reason = args.reason or '*Reason Not Set By User*' - update_status( - hostlist=args.hostlist, - status='backfill', - reason=reason, - cache=cache) - - -""" -Dump site information from the cache - -The dump is of the format Node:State:Reason:User:Timestamp - - Node Short name of the node (such as uct2-c267, iut2-c199, mwt2-c103 - State State of the node, online|backfill|offline - Reason Reason a node is not online - User User who updated the state - Timestamp Time the node state was last updated -""" -def dump_site(args, cache): - logger = logging.getLogger(__name__) - logger.info('Running dump_site') - - config = get_config(args.config_file) - keynames = ['manualstatus', 'manualreason', 'manualuser', 'manualtimestamp'] - hosts = [] - - if not config.has_section(args.site): - logger.critical("Configuration file is missing site '%s'" % args.site) - logger.critical("Valid sites: %s" % " ".join([c[0] for c in config.items('collectors')])) - exit(1) - - if args.filename: - f = open(args.filename, 'w') - - for 
hostlist in config.items(args.site): - hosts.extend(expand_hostlist(hostlist[0])) - - keys = ["%s.%s" % (host, keyname) for host in hosts for keyname in keynames] - - values = cache.get_multi(keys) - - for host in hosts: - try: - status = values['%s.manualstatus' % host] or values['%s.status' % host] - except KeyError: - status = 'Unknown' - try: - reason = values['%s.manualreason' % host] or values['%s.message' % host] - except KeyError: - reason = 'Unknown' - try: - user = values['%s.manualuser' % host] - except KeyError: - user = 'Unknown' - try: - mts = values['%s.manualtimestamp' % host] - except KeyError: - mts = 0 - - if args.filename: - f.write('%s:%s:%s:%s:%s\n' % (host, status, reason, user, mts)) - else: - print('%s:%s:%s:%s:%s' % (host, status, reason, user, mts)) - - if args.filename: - f.close() - - -""" -Load line in dump_site format back into memcache - -The dump is of the format Node:State:Reason - - Node Short name of the node (such as uct2-c267, iut2-c199, mwt2-c103 - State State of the node, online|backfill|offline - Reason Reason a node is not online - User User who updated the state - Timestamp Time the node state was last updated -""" -def load_site(args, cache): - logger = logging.getLogger(__name__) - logger.info('Running load_site') - - with open(args.filename, 'r') as f: - for line in f: - host, status, reason, user, timestamp = line.split(':') - update_status(host, status, reason, cache) - diff --git a/sysview-client/sysclient b/sysview-client/sysclient deleted file mode 100755 index 134adf7..0000000 --- a/sysview-client/sysclient +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env python3 - -import logging -from sysview.common.utils import get_config -from sysview.common.cache import get_cache -from sysview.client.client import get_parser - -if __name__ == '__main__': - parser = get_parser() - args = parser.parse_args() - - if args.verbose >= 2: - loglevel = logging.DEBUG - elif args.verbose == 1: - loglevel = logging.INFO - else: - loglevel 
= logging.WARNING - - logging.basicConfig( - level=loglevel, - format='%(levelname)s %(module)s.%(funcName)s(): %(message)s' - ) - logger = logging.getLogger(__name__) - - logger.info('Logger level: %d' % logger.getEffectiveLevel()) - - config = get_config(args.config_file) - cache = get_cache(args=args, config=config) - - try: - args.func(args=args, cache=cache) - except AttributeError as e: - print(e) - parser.print_help() -