From e82113015aebbffdf8e7817c8c8661433ba5e558 Mon Sep 17 00:00:00 2001
From: Mats Rynge
Date: Fri, 27 Oct 2023 10:10:23 -0700
Subject: [PATCH] Removed ospool-display container - old fifemon

---
 opensciencegrid/ospool-display/50-ospool.conf |   6 -
 opensciencegrid/ospool-display/Dockerfile     |  32 --
 .../fifemon-condor-probe/Dockerfile           |   7 -
 .../fifemon-condor-probe/LICENSE.txt          |  29 --
 .../fifemon-condor-probe/README.md            | 134 -------
 .../constraint_example.py                     |  49 ---
 .../fifemon-condor-probe/fifemon.cfg          |  64 ----
 .../fifemon-condor-probe/fifemon/__init__.py  |   2 -
 .../fifemon/condor/__init__.py                |  12 -
 .../fifemon/condor/jobs.py                    | 344 ------------------
 .../fifemon/condor/priorities.py              |  98 -----
 .../fifemon/condor/slots.py                   | 244 -------------
 .../fifemon/condor/status.py                  |  55 ---
 .../fifemon/condor_probe.py                   | 193 ----------
 .../fifemon-condor-probe/fifemon/graphite.py  |  61 ----
 .../fifemon-condor-probe/fifemon/influx.py    |  73 ----
 .../fifemon-condor-probe/fifemon/probe.py     | 127 -------
 .../fifemon-condor-probe/setup.py             |  20 -
 .../ospool-display/fifemon.cfg-development    |  65 ----
 .../ospool-display/fifemon.cfg-production     |  65 ----
 .../image-config.d/50-fifemon.sh              |  32 --
 .../supervisor.d/10-fifemon.conf              |   9 -
 22 files changed, 1721 deletions(-)
 delete mode 100644 opensciencegrid/ospool-display/50-ospool.conf
 delete mode 100644 opensciencegrid/ospool-display/Dockerfile
 delete mode 100644 opensciencegrid/ospool-display/fifemon-condor-probe/Dockerfile
 delete mode 100644 opensciencegrid/ospool-display/fifemon-condor-probe/LICENSE.txt
 delete mode 100644 opensciencegrid/ospool-display/fifemon-condor-probe/README.md
 delete mode 100644 opensciencegrid/ospool-display/fifemon-condor-probe/constraint_example.py
 delete mode 100644 opensciencegrid/ospool-display/fifemon-condor-probe/fifemon.cfg
 delete mode 100644 opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/__init__.py
 delete mode 100644 opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/condor/__init__.py
 delete mode 100755 opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/condor/jobs.py
 delete mode 100755 opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/condor/priorities.py
 delete mode 100755 opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/condor/slots.py
 delete mode 100755 opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/condor/status.py
 delete mode 100755 opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/condor_probe.py
 delete mode 100755 opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/graphite.py
 delete mode 100755 opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/influx.py
 delete mode 100755 opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/probe.py
 delete mode 100644 opensciencegrid/ospool-display/fifemon-condor-probe/setup.py
 delete mode 100644 opensciencegrid/ospool-display/fifemon.cfg-development
 delete mode 100644 opensciencegrid/ospool-display/fifemon.cfg-production
 delete mode 100755 opensciencegrid/ospool-display/image-config.d/50-fifemon.sh
 delete mode 100644 opensciencegrid/ospool-display/supervisor.d/10-fifemon.conf

diff --git a/opensciencegrid/ospool-display/50-ospool.conf b/opensciencegrid/ospool-display/50-ospool.conf
deleted file mode 100644
index f86555ba..00000000
--- a/opensciencegrid/ospool-display/50-ospool.conf
+++ /dev/null
@@ -1,6 +0,0 @@
-
-# Clients should be able to query without encryption + authentication
-SEC_CLIENT_AUTHENTICATION = OPTIONAL
-SEC_CLIENT_ENCRYPTION = OPTIONAL
-SEC_CLIENT_INTEGRITY = OPTIONAL
-
diff
--git a/opensciencegrid/ospool-display/Dockerfile b/opensciencegrid/ospool-display/Dockerfile deleted file mode 100644 index f86ffd6c..00000000 --- a/opensciencegrid/ospool-display/Dockerfile +++ /dev/null @@ -1,32 +0,0 @@ -ARG BASE_YUM_REPO=release -ARG BASE_OSG_SERIES=3.6 -ARG BASE_OS=el9 - -FROM opensciencegrid/software-base:$BASE_OSG_SERIES-$BASE_OS-$BASE_YUM_REPO - -LABEL maintainer OSG Software - -RUN yum -y --enablerepo=osg-upcoming install \ - condor \ - curl \ - git \ - python-devel \ - python-pip \ - python-setuptools \ - wget \ - && \ - yum clean all - -COPY fifemon-condor-probe /fifemon - -RUN pip install --upgrade setuptools==41.4.0 && \ - cd /fifemon && \ - python setup.py install - -COPY fifemon.cfg-development /etc/fifemon.cfg-development -COPY fifemon.cfg-production /etc/fifemon.cfg-production -COPY 50-ospool.conf /etc/condor/config.d/50-ospool.conf - -ADD image-config.d/* /etc/osg/image-config.d/ -ADD supervisor.d/* /etc/supervisord.d/ - diff --git a/opensciencegrid/ospool-display/fifemon-condor-probe/Dockerfile b/opensciencegrid/ospool-display/fifemon-condor-probe/Dockerfile deleted file mode 100644 index 9358a33c..00000000 --- a/opensciencegrid/ospool-display/fifemon-condor-probe/Dockerfile +++ /dev/null @@ -1,7 +0,0 @@ -FROM python:2.7 - -WORKDIR /fifemon -COPY . . -RUN python setup.py install - -CMD [ "fifemon", "/fifemon/fifemon.cfg" ] \ No newline at end of file diff --git a/opensciencegrid/ospool-display/fifemon-condor-probe/LICENSE.txt b/opensciencegrid/ospool-display/fifemon-condor-probe/LICENSE.txt deleted file mode 100644 index 4afcd57b..00000000 --- a/opensciencegrid/ospool-display/fifemon-condor-probe/LICENSE.txt +++ /dev/null @@ -1,29 +0,0 @@ -Copyright (c) 2015, FERMI NATIONAL ACCELERATOR LABORATORY -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. - * Neither the name of the FERMI NATIONAL ACCELERATOR LABORATORY, - nor the names of its contributors may be used to endorse or - promote products derived from this software without specific - prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/opensciencegrid/ospool-display/fifemon-condor-probe/README.md b/opensciencegrid/ospool-display/fifemon-condor-probe/README.md deleted file mode 100644 index 32ec14b3..00000000 --- a/opensciencegrid/ospool-display/fifemon-condor-probe/README.md +++ /dev/null @@ -1,134 +0,0 @@ -Fifemon -======= - -Collect HTCondor statistics and report into time-series database. All modules -support Graphite, and there is some support for InfluxDB. - -Note: this is a fork of the scripts used for monitoring the HTCondor pools -at Fermilab, and while generally intended to be "generic" for any pool still -may require some tweaking to work well for your pool. - -Copyright Fermi National Accelerator Laboratory (FNAL/Fermilab). See LICENSE.txt. - -Requirements ------------- - -* Python 2.7 or greater recommended. -* HTCondor libraries and Python bindings - * https://research.cs.wisc.edu/htcondor/downloads/ - * https://pypi.org/project/htcondor/ (setup.py will install this, see below) -* A running Graphite server (available in EPEL or via PIP) - * http://graphite.readthedocs.org/en/latest/ - -Installation ------------- - -Global install: - - python setup.py install - -Installing in a virtualenv: - - virtualenv --system-site-packages .venv - source .venv/bin/activate - python setup.py install - -Docker: - - docker build -t fifemon . - -Running -------- - - fifemon [options] config_file - -The setup script will install an executable script named `fifemon`. It -requires a configuration file passed as an argument (see below or -included `fifemon.cfg`). Some runtime options are supported on the -command line, run `fifemon --help` for info. - -Running with Docker -------------------- - - docker run fifemon - -By default the included `fifemon.cfg` configuration will be used, but -you can easily bind-mount in another: - - docker run -v myconfig.cfg:/fifemon/fifemon.cfg fifemon - - -Configuration -------------- - -Example probe config is in `fifemon.cfg`. A note on constraints: -constraints can either be a boolean, where `true` means no constraint, -a string to be passed directly to HTCondor, or a function, which will -be called at runtime with an `htcondor.Collector` argument and should -return a string constraint. See `constraint_example.py` for an example -of how this could be used. - - [probe] - # how often to send data in seconds - interval = 240 - # how many times to retry condor queries - retries = 2 - # seconds to wait beteeen retries - delay = 10 - # if true, data is output to stdout and not sent downstream - test = false - # run one time and exit, i.e. for running wtih cron - once = false - # enable promethus metrics server for monitoring the probe - publish_metrics = false - - [graphite] - # enable output to graphite - enable = true - # graphite host - host = localhost - # graphite pickle port - port = 2004 - # base namespace for metrics - namespace = test.condor - # namespace for probe monitoring metrics - meta_namespace = test.probes.condor - - [influxdb] - # enable output to influxdb (not fully supported) - enable = false - # influxdb host - host = localhost - # influxdb api port - port = 8086 - # influxdb database - db = test - # extra tags to include with all metrics (comma-separated key:value) - tags = cluster:test - - [condor] - # central manager/collector host - pool = localhost - # collect basic daemon (collector, negotiator, schedd) metrics? - post_pool_status = true - # collect machine/startd metrics? 
- post_pool_slots = true - # constraint to limit which startds metrics are collected for - slot_constraint = true - # collect glidein-specific startd metrics? - post_pool_glideins = false - # collect priorities and quotas? - post_pool_prio = false - # constraint to limit which negotiators are queried - negotiator_constraint = true - # If true, first try to get priorities and quotas from Accounting classads in collector, - # then fallback to negotiator. Opposite order if false. - prefer_accounting = false - # collect job metrics? - post_pool_jobs = false - # constraint to limit which schedds are queried - schedd_constraint = true - # Enable GSI (x509 certificate) authorization - use_gsi_auth = false - X509_USER_KEY = "" - X509_USER_CERT = "" diff --git a/opensciencegrid/ospool-display/fifemon-condor-probe/constraint_example.py b/opensciencegrid/ospool-display/fifemon-condor-probe/constraint_example.py deleted file mode 100644 index 900d278f..00000000 --- a/opensciencegrid/ospool-display/fifemon-condor-probe/constraint_example.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python -import logging -import pprint -import traceback - -import htcondor - -from fifemon.condor_probe import CondorProbe, get_options - -logger = logging.getLogger(__name__) - -def get_lpc_node_boundary(coll): - """ - query the negotiator for the boundary node number for the LPC-allocated nodes, - and return a constraint that limits the startds queried to those nodes. - """ - bound=1900 - try: - ads = coll.query(htcondor.AdTypes.Negotiator, - 'regexp("CMSLPC",Name)', - ['FERMIHTC_LPC_MAX_WN_NUM']) - assert len(ads) == 1 - assert 'FERMIHTC_LPC_MAX_WN_NUM' in ads[0] - bound = int(ads[0]['FERMIHTC_LPC_MAX_WN_NUM']) - except Exception as e: - logger.error('trouble getting LPC node boundary, using default %d:'%(bound)) - traceback.print_exc() - return 'FERMIHTC_NODE_NUMBER < %d && regexp("slot1",Name)' % (bound) - -def add_lpc_options(opts): - # by setting the constraint to a function it will get called by the probe at query time - opts['slot_constraint'] = get_lpc_node_boundary - opts['schedd_constraint'] = 'regexp("lpc",Name)' - opts['negotiator_constraint'] = 'regexp("CMSLPC",Name)' - return opts - -if __name__ == '__main__': - opts = add_lpc_options(get_options()) - if opts['test']: - loglevel = logging.DEBUG - else: - loglevel = logging.INFO - logging.basicConfig(level=loglevel, - format="%(asctime)s [%(levelname)s] %(name)s - %(message)s") - - logger.info('Probe configuraion: \n'+pprint.pformat(opts)) - - probe = CondorProbe(**opts) - probe.run() diff --git a/opensciencegrid/ospool-display/fifemon-condor-probe/fifemon.cfg b/opensciencegrid/ospool-display/fifemon-condor-probe/fifemon.cfg deleted file mode 100644 index c1271ace..00000000 --- a/opensciencegrid/ospool-display/fifemon-condor-probe/fifemon.cfg +++ /dev/null @@ -1,64 +0,0 @@ -[probe] -# how often to send data in seconds -interval = 240 -# how many times to retry condor queries -retries = 2 -# seconds to wait beteeen retries -delay = 10 -# if true, data is output to stdout and not sent downstream -test = false -# run one time and exit, i.e. 
for running wtih cron -once = false -# enable promethus metrics server for monitoring the probe -publish_metrics = false - -[graphite] -# enable output to graphite -enable = true -# graphite host -host = localhost -# graphite pickle port -port = 2004 -# base namespace for metrics -namespace = test.condor -# namespace for probe monitoring metrics -meta_namespace = test.probes.condor - -[influxdb] -# enable output to influxdb (not fully supported) -enable = false -# influxdb host -host = localhost -# influxdb api port -port = 8086 -# influxdb database -db = test -# extra tags to include with all metrics (comma-separated key:value) -tags = cluster:test - -[condor] -# central manager/collector host -pool = localhost -# collect basic daemon (collector, negotiator, schedd) metrics? -post_pool_status = true -# collect machine/startd metrics? -post_pool_slots = true -# constraint to limit which startds metrics are collected for -slot_constraint = true -# collect glidein-specific startd metrics? -post_pool_glideins = false -# collect priorities and quotas? -post_pool_prio = false -# constraint to limit which negotiators are queried -negotiator_constraint = true -# If true, first try to get priorities and quotas from Accounting classads in collector, -# then fallback to negotiator. Opposite order if false. -prefer_accounting = false -# collect job metrics? -post_pool_jobs = false -# constraint to limit which schedds are queried -schedd_constraint = true -# Enable GSI (x509 certificate) authorization -use_gsi_auth = false -X509_USER_KEY = "" -X509_USER_CERT = "" \ No newline at end of file diff --git a/opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/__init__.py b/opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/__init__.py deleted file mode 100644 index 9c92d281..00000000 --- a/opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .probe import Probe -from .graphite import Graphite diff --git a/opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/condor/__init__.py b/opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/condor/__init__.py deleted file mode 100644 index 63d861c2..00000000 --- a/opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/condor/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -import os -os.environ['_CONDOR_GSI_SKIP_HOST_CHECK'] = "true" - -from .status import get_pool_status -from .slots import get_pool_slots, get_pool_glidein_slots -from .priorities import get_pool_priorities -from .jobs import Jobs - -# disable debug logging, causes memory leak in long-running processes -import htcondor -htcondor.param['TOOL_LOG'] = '/dev/null' -htcondor.enable_log() diff --git a/opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/condor/jobs.py b/opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/condor/jobs.py deleted file mode 100755 index ac9cd531..00000000 --- a/opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/condor/jobs.py +++ /dev/null @@ -1,344 +0,0 @@ -#!/usr/bin/python -import time -from collections import defaultdict, deque -import logging -from threading import Thread -import traceback -import re -import urllib - -import classad -import htcondor - -logger = logging.getLogger(__name__) - - -def find_bin(value, bins): - for b in bins: - if value < b[0]: - return b[1] - return "longer" - - -def clean_metric_name(s): - """ - replaces any non-standard characters with _ - """ - if s is None: - return "None" - if len(s) == 0: - return "None" - # still have a string, 
url encode it - #return re.sub(r'[^a-zA-Z0-9_\-.]', '_', s) - return urllib.quote_plus(s) - - -def geteval(self, key, default=None): - """ - get an attribute from the classad, returning default if not found. - if the attribute is an expression, eval() it. - """ - r = self.get(key,default) - if isinstance(r, classad.ExprTree): - e = r.eval() - if isinstance(e, classad.Value): - # Undefined / Error - return default - return e - return r - -classad.ClassAd.geteval = geteval - - -def job_metrics(job_classad): - """ - Returns a list of base metrics for the given job. - """ - counters = [] - - try: - user_name = job_classad.get("Owner","unknown") - except: - user_name = "unknown" - try: - groups = re.findall(r'(?:group_)?(\w+)',job_classad.get("AccountingGroup","group_unknown")) - exp_name = groups[0] - subgroups = [] - if len(groups) > 1: - # sometimes each user has an accounting group, we don't want those - if groups[-1] == user_name: - subgroups = groups[1:len(groups)-1] - else: - subgroups = groups[1:] - except: - exp_name = "unknown" - subgroups = [] - - project = job_classad.geteval("ProjectName","Unknown") - try: - project = re.sub(r'^.*\.', '', project) - except: - pass - project = clean_metric_name(project) - - simg = job_classad.geteval("SingularityImage","None") - try: - simg = re.sub(r'^/cvmfs/[^/]+/', '', simg) - simg = re.sub(r'\.', '_', simg) - simg = re.sub(r'/', '__', simg) - simg = re.sub(r':', '__', simg) - except: - pass - simg = clean_metric_name(simg) - - if job_classad["JobUniverse"] == 7: - counters = [".dag.totals"] - elif job_classad["JobStatus"] == 1: - counters = [".idle.totals"] - if "DESIRED_usage_model" in job_classad: - models = set(job_classad["DESIRED_usage_model"].split(",")) - if "DESIRED_Sites" in job_classad: - sites = job_classad["DESIRED_Sites"].split(",") - for s in sites: - counters.append(".idle.sites."+s) - #if "Fermigrid" not in sites: - # models.discard("DEDICATED") - # models.discard("OPPORTUNISTIC") - models_sorted = list(models) - if len(models_sorted) == 0: - models_sorted = ["impossible"] - else: - models_sorted.sort() - counters.append(".idle.usage_models." + "_".join(models_sorted)) - else: - counters.append(".idle.usage_models.unknown") - elif job_classad["JobStatus"] == 2: - counters = [".running.totals"] - if "MATCH_EXP_JOBGLIDEIN_ResourceName" in job_classad: - site = job_classad["MATCH_EXP_JOBGLIDEIN_ResourceName"] - counters.append(".running.sites." 
+ site) - else: - counters.append(".running.sites.unknown") - elif job_classad["JobStatus"] == 5: - counters = [".held.totals"] - else: - counters = [".unknown.totals"] - - metrics = [] - for counter in counters: - metrics.append("totals"+counter) - metrics.append("experiments."+exp_name+".totals"+counter) - metrics.append("experiments."+exp_name+".users."+user_name+counter) - if len(subgroups) > 0: - metrics.append("experiments."+exp_name+".subgroups."+".".join(subgroups)+'.'+counter) - metrics.append("projects."+project+".totals"+counter) - metrics.append("projects."+project+".users."+user_name+counter) - metrics.append("requested_singularity."+simg+".totals"+counter) - metrics.append("requested_singularity."+simg+".users."+user_name+counter) - metrics.append("users."+user_name+counter) - return metrics - - -def get_jobs(job_q, schedd_ad, constraint=True, attrs=['ClusterId','ProcId','JobStatus'], retry_delay=30, max_retries=4): - retries=0 - while retries < max_retries: - try: - schedd = htcondor.Schedd(schedd_ad) - results = schedd.query(constraint, attrs) - except IOError: - traceback.print_exc() - retries += 1 - if retries < max_retries: - logger.warning("Trouble communicating with schedd {0}, retrying in {1}s.".format(schedd_ad['Name'],retry_delay)) - time.sleep(retry_delay) - else: - logger.warning("Trouble communicating with schedd {0}, giving up.".format(schedd_ad['Name'])) - continue - else: - for r in results: - job_q.append(r) - return - logger.error("Trouble communicating with schedd {0}, giving up.".format(schedd_ad['Name'])) - -def get_idle_jobs(job_q, schedd_ad, retry_delay=30, max_retries=4): - get_jobs(job_q, schedd_ad, constraint='JobStatus==1', retry_delay=retry_delay, max_retries=max_retries, - attrs=["ClusterId","ProcId","Owner", - "NumJobStarts", "NumShadowStarts", "NumHolds", "NumSystemHolds", - "AccountingGroup","ProjectName","JobStatus", - "SingularityImage", - "DESIRED_usage_model","DESIRED_Sites","JobUniverse", - "QDate","ServerTime", - "RequestMemory","RequestDisk","RequestCpus","RequestGpus"]) - -def get_running_jobs(job_q, schedd_ad, retry_delay=30, max_retries=4): - get_jobs(job_q, schedd_ad, constraint='JobStatus==2', retry_delay=retry_delay, max_retries=max_retries, - attrs=["ClusterId","ProcId","Owner", - "NumJobStarts", "NumShadowStarts", "NumHolds", "NumSystemHolds", - "MATCH_GLIDEIN_Site","MATCH_EXP_JOBGLIDEIN_ResourceName", - "AccountingGroup","ProjectName","JobStatus", - "SingularityImage", - "JobUniverse", - "ServerTime","JobCurrentStartDate","RemoteUserCpu", - "RequestMemory","ResidentSetSize_RAW", - "RequestDisk","DiskUsage_RAW","RequestCpus","RequestGpus"]) - -def get_held_jobs(job_q, schedd_ad, retry_delay=30, max_retries=4): - get_jobs(job_q, schedd_ad, constraint='JobStatus==5', retry_delay=retry_delay, max_retries=max_retries, - attrs=["ClusterId","ProcId","Owner", - "NumJobStarts", "NumShadowStarts", "NumHolds", "NumSystemHolds", - "AccountingGroup","ProjectName","JobStatus", - "SingularityImage", - "JobUniverse", - "ServerTime", - "EnteredCurrentStatus"]) - - -class Jobs(object): - def __init__(self, pool="localhost"): - self.pool = pool - self.collector = htcondor.Collector(pool) - self.bins=[(300, 'recent'), - (3600, 'one_hour'), - (3600*4, 'four_hours'), - (3600*8, 'eight_hours'), - (3600*24, 'one_day'), - (3600*24*2, 'two_days'), - (3600*24*7, 'one_week')] - - - def job_walltime(self, job_classad): - now = job_classad.get("ServerTime",0) - start = job_classad.get("JobCurrentStartDate",now) - return 
(now-start)*job_classad.geteval("RequestCpus",1) - - def job_cputime(self, job_classad): - return job_classad.get("RemoteUserCpu",0) - - def job_bin(self, job_classad): - bin = None - if job_classad["JobStatus"] == 1: - if "ServerTime" in job_classad and "QDate" in job_classad: - qage = job_classad["ServerTime"]-job_classad["QDate"] - bin = ".count_"+find_bin(qage, self.bins) - else: - bin = ".count_unknown" - elif job_classad["JobStatus"] == 2: - walltime = self.job_walltime(job_classad) - if walltime > 0: - bin = ".count_"+find_bin(walltime, self.bins) - else: - bin = ".count_unknown" - elif job_classad["JobStatus"] == 5: - if "ServerTime" in job_classad and "EnteredCurrentStatus" in job_classad: - holdage = job_classad["ServerTime"]-job_classad["EnteredCurrentStatus"] - bin = ".count_holdage_"+find_bin(holdage, self.bins) - else: - bin = ".count_holdage_unknown" - return bin - - - def get_job_count(self, retry_delay=30, max_retries=4, schedd_constraint=True): - if callable(schedd_constraint): - schedd_constraint=schedd_constraint(self.collector) - try: - ads = self.collector.query(htcondor.AdTypes.Schedd,schedd_constraint) - except: - logger.error("Trouble getting pool {0} schedds.".format(self.pool)) - return None - - job_q = deque() - ## spawn off workers to query each schedd and put metrics for each job in a queue - for a in ads: - get_idle_jobs(job_q,a,retry_delay,max_retries) - get_running_jobs(job_q,a,retry_delay,max_retries) - get_held_jobs(job_q,a,retry_delay,max_retries) - - logger.info("Processing jobs") - counts = defaultdict(int) - for r in job_q: - for m in job_metrics(r): - counts[m+".count"] += 1 - - bin = self.job_bin(r) - if bin is not None: - counts[m+bin] += 1 - - walltime = self.job_walltime(r) - cputime = self.job_cputime(r) - if walltime > 0 and cputime > 0: - try: - counts[m+".walltime"] += walltime - counts[m+".cputime"] += cputime - counts[m+".efficiency"] = max(min(counts[m+".cputime"]/counts[m+".walltime"]*100,100),0) - counts[m+".wastetime"] = counts[m+".walltime"]-counts[m+".cputime"] - if walltime > counts[m+".walltime_max"]: - counts[m+".walltime_max"] = walltime - if counts[m+".count"] > 0: - counts[m+".walltime_avg"] = counts[m+".walltime"]/counts[m+".count"] - counts[m+".wastetime_avg"] = counts[m+".wastetime"]/counts[m+".count"] - except: - logger.warning("Problem with walltime: {0} {1} {2}".format(walltime, counts[m+".walltime"], counts[m+".count"])) - - if "NumJobStarts" in r: - starts = r.eval("NumJobStarts") - try: - counts[m+".job_starts"] += starts - except: - pass - if "NumShadowStarts" in r: - starts = r.eval("NumShadowStarts") - try: - counts[m+".shadow_starts"] += starts - except: - pass - if "NumHolds" in r: - holds = r.eval("NumHolds") - try: - counts[m+".holds"] += holds - except: - pass - if "NumSystemHolds" in r: - holds = r.eval("NumSystemHolds") - try: - counts[m+".holds_system"] += holds - except: - pass - - ## one standard slot == 1 cpu and 2000 MB of memory (undefined amount of disk) - std_slots = 1 - if "RequestCpus" in r: - cpus = r.eval("RequestCpus") - try: - counts[m+".cpu_request"] += cpus - std_slots = max(std_slots,cpus) - except: - pass - if "RequestGpus" in r: - gpus = r.eval("RequestGpus") - try: - if int(gpus) == gpus: - counts[m+".gpu_request"] += gpus - std_slots = max(std_slots,gpus) - except: - pass - if "RequestMemory" in r: - mem = r.eval("RequestMemory") - try: - counts[m+".memory_request_b"] += mem*1024.0*1024.0 - std_slots = max(std_slots,mem/2000.0) - except: - pass - if "RequestDisk" in r: - try: - 
counts[m+".disk_request_b"] += r.eval("RequestDisk")*1024 - except: - pass - counts[m+".std_slots"] += std_slots - - if r["JobStatus"] == 2: - if "ResidentSetSize_RAW" in r: - counts[m+".memory_usage_b"] += r.eval("ResidentSetSize_RAW")*1024 - if "DiskUsage_RAW" in r: - counts[m+".disk_usage_b"] += r.eval("DiskUsage_RAW")*1024 - - return counts diff --git a/opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/condor/priorities.py b/opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/condor/priorities.py deleted file mode 100755 index 87a34dff..00000000 --- a/opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/condor/priorities.py +++ /dev/null @@ -1,98 +0,0 @@ -#!/usr/bin/python -import logging -import time - -import classad -import htcondor - -logger = logging.getLogger(__name__) - -def query_prio_from_neg(coll, negotiator_constraint): - ads = coll.query(htcondor.AdTypes.Negotiator, negotiator_constraint) - if len(ads) == 0: - raise Exception('no negotiator classads found') - elif len(ads) > 1: - logger.warning('multiple negotiators found, using %s'%(ads[0]['Name'])) - n = htcondor.Negotiator(ads[0]) - return n.getPriorities() - -def get_pool_priorities(pool, retry_delay=30, max_retries=4, last_data={}, negotiator_constraint=True, prefer_accounting=False): - coll = htcondor.Collector(pool) - if callable(negotiator_constraint): - negotiator_constraint = negotiator_constraint(coll) - prio = None - retries = 0 - while retries < max_retries: - if prefer_accounting: - prio = coll.query(constraint='MyType=="Accounting"') - if len(prio) > 0: - break - else: - logger.info("No Accounting ads found in collector {0}, trying negotiator".format(pool)) - try: - prio = query_prio_from_neg(coll, negotiator_constraint) - except Exception as e: - logger.info("Error querying negotiator: {0}".format(e)) - else: - break - else: - try: - prio = query_prio_from_neg(coll, negotiator_constraint) - except Exception as e: - logger.info("Trouble communicating with pool {0} negotiator, trying to get Accounting ads from collector: {1}".format(pool,e)) - prio = coll.query(constraint='MyType=="Accounting"') - if len(prio) > 0: - break - else: - logger.info("No Accounting ads found in collector {0}".format(pool)) - else: - break - logger.warning("Unable to collect pool {0} priorities, retrying in {1}s.".format(pool,retry_delay)) - retries += 1 - prio = None - time.sleep(retry_delay) - - if prio is None: - logger.error("Unable to collect pool {0} priorities, giving up.".format(pool)) - return {} - - data = last_data - for p in prio: - parts = p['Name'].split('@') - name = parts[0] - if name.startswith('group_'): - name = name[6:] - if len(parts) > 1: - domain = "".join(parts[1:]) - else: - domain="unknown" - if 'NegotiatorName' in p: - domain=p['NegotiatorName'] - if p.get('IsAccountingGroup',False) and p.get('ConfigQuota',0) > 0: - basename = "quotas.{0}.{1}".format( - domain.replace(".","_").replace("@","_"), - name) - for metric in ["EffectiveQuota", - "ConfigQuota", - "SubtreeQuota", - "Requested"]: - if metric in p: - data[basename+"."+metric] = p[metric] - else: - user=name.split('.')[-1] - group="_".join(name.split('.')[:-1]) - if group == '': - group = 'none' - basename = "priorities.{0}.{1}.{2}".format( - domain.replace(".","_").replace("@","_"), - group, - user) - for metric in ["ResourcesUsed", - "AccumulatedUsage", - "WeightedAccumulatedUsage", - "Priority", - "WeightedResourcesUsed", - "PriorityFactor"]: - if metric in p: - data[basename+"."+metric] = p[metric] - return data diff --git 
a/opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/condor/slots.py b/opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/condor/slots.py deleted file mode 100755 index 1ae54c5d..00000000 --- a/opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/condor/slots.py +++ /dev/null @@ -1,244 +0,0 @@ -#!/usr/bin/python -from collections import defaultdict -import re -import logging -import time -import math - -import classad -import htcondor - -logger = logging.getLogger(__name__) - -def sanitize(key): - if key is None: - return None - return key.replace(".","_").replace("@","-").replace(" ","_") - -def slot_weight(cpus, memory, disk): - """ - Calculate slot weight, where one standard slot is 1 CPU and 2000 MB. - Weight is returned as float of fractional slots. Examples: - slot_weight(cpus=1,memory=3000) => 1.5 - slot_weight(cpus=2,memory=3000) => 2.0 - """ - return max(float(cpus),float(memory)/2000) - -def unclaimed_slot_weight(cpus, memory, disk): - """ - Calculate slot weight, where one standard slot is 1 CPU and 2000 MB. - Weight is returned as min of integral slots. Examples: - unclaimed_slot_weight(cpus=1,memory=3000) => 1.0 - unclaimed_slot_weight(cpus=2,memory=3000) => 1.0 - """ - return math.floor(min(float(cpus),float(memory)/2000)) - -def get_pool_resource_utilization(pool, retry_delay=30, max_retries=4, schedd_constraint=True): - coll = htcondor.Collector(pool) - retries = 0 - while retries < max_retries: - try: - schedd_ads = coll.query(htcondor.AdTypes.Schedd,schedd_constraint) - except: - logger.warning("trouble getting pool {0} schedds, retrying in {1}s.".format(pool,retry_delay)) - retries += 1 - schedd_ads = None - time.sleep(retry_delay) - else: - break - - if schedd_ads is None: - logger.error("trouble getting pool {0} schedds, giving up.".format(pool)) - return {} - - memory_usage = 0 - disk_usage = 0 - for ad in schedd_ads: - try: - schedd = htcondor.Schedd(ad) - results = schedd.query('jobstatus==2',['ResidentSetSize_RAW','DiskUsage_RAW']) - except Exception as e: - logger.error(e) - else: - for r in results: - memory_usage += r.get('ResidentSetSize_RAW',0) - disk_usage += r.get('DiskUsage_RAW',0) - return { - "MemoryUsage":memory_usage/1024, - "DiskUsage":disk_usage, - } - - -def get_pool_slots(pool, retry_delay=30, max_retries=4, totals_only=False, job_resources=True, constraint=True, schedd_constraint=True): - coll = htcondor.Collector(pool) - if callable(constraint): - constraint=constraint(coll) - if callable(schedd_constraint): - schedd_constraint = schedd_constraint(coll) - - retries = 0 - while retries < max_retries: - try: - #startd_ads = coll.locateAll(htcondor.DaemonTypes.Startd) - startd_ads = coll.query(htcondor.AdTypes.Startd, constraint, - ['SlotType','State','Name','SlotWeight', - 'Cpus','TotalSlotCpus','TotalCpus', - 'Gpus','TotalSlotGpus','TotalGpus', - 'Disk','TotalSlotDisk','TotalDisk', - 'Memory','TotalSlotMemory','TotalMemory', - 'LoadAvg','TotalCondorLoadAvg','TotalLoadAvg', - 'AccountingGroup','RemoteGroup','RemoteOwner', - 'kflops','IS_GLIDEIN']) - except: - logger.warning("trouble getting pool {0} startds, retrying in {1}s.".format(pool,retry_delay)) - retries += 1 - startd_ads = None - time.sleep(retry_delay) - else: - break - - if startd_ads is None: - logger.error("trouble getting pool {0} startds, giving up.".format(pool)) - return {} - - data = defaultdict(int) - load = defaultdict(float) - for a in startd_ads: - slot_type = a.get("SlotType", "Static") - if a.get('IS_GLIDEIN',False): - slot_type += 'Glidein' - state = 
a.get("State", "Unknown") - kflops = int(a.get("kflops", 0)) - - if slot_type in ["Partitionable","PartitionableGlidein"]: - for k in ["TotalDisk", "TotalSlotDisk", "Disk", - "TotalMemory", "TotalSlotMemory", "Memory", - "TotalCpus", "TotalSlotCpus", "Cpus", - "TotalGpus", "TotalSlotGpus", "Gpus", - "TotalLoadAvg", "LoadAvg", "TotalCondorLoadAvg"]: - metric = ".".join([slot_type, "totals", k]) - data[metric] += a.get(k,0) - metric = ".".join([slot_type, state, k]) - data[metric] += a.get(k,0) - metric = ".".join([slot_type, "totals", "NumSlots"]) - data[metric] += 1 - metric = ".".join([slot_type, "totals", "Mflops"]) - data[metric] += long(int(a.get("Cpus",1)) * kflops / 1024) - metric = ".".join([slot_type, "totals", "StdSlots"]) - data[metric] += unclaimed_slot_weight(a.get("Cpus",1),a.get("Memory",0),a.get("Disk",0)) - if a.get("Cpus",0) == 0 or a.get("Gpus",0) == 0 or a.get("Memory",0) < 2000 or a.get("Disk",0) < 1048576: - # slot is effectively fully utilized - for k in ["Disk", "Memory", "Cpus", "Gpus"]: - metric = ".".join([slot_type, "unusable", k]) - data[metric] += a.get(k,0) - elif state == "Claimed": - (group,owner) = ("Unknown","Unknown") - if "AccountingGroup" in a: - try: - m = re.match(r'group_(\S+)\.(\S+)@\S+$',a.eval("AccountingGroup")) - except: - pass - else: - if m: - group,owner = m.groups() - if group == "Unknown" and "RemoteGroup" in a: - group = a["RemoteGroup"] - if group == "": - group = "None" - if owner == "Unknown" and "RemoteOwner" in a: - owner = a["RemoteOwner"].split("@")[0] - - for k in ["Disk", "Memory", "Cpus", "Gpus", "LoadAvg"]: - if not totals_only: - metric = ".".join([slot_type, state, sanitize(group), sanitize(owner), k]) - data[metric] += a.get(k,0) - metric = ".".join([slot_type, "totals", k]) - data[metric] += a.get(k,0) - metric = ".".join([slot_type, "totals", "Mflops"]) - data[metric] += long(int(a.get("Cpus",1)) * kflops / 1024) - metric = ".".join([slot_type, state, sanitize(group), sanitize(owner), "Weighted"]) - data[metric] += a.eval("SlotWeight") - metric = ".".join([slot_type, state, sanitize(group), sanitize(owner), "NumSlots"]) - data[metric] += 1 - metric = ".".join([slot_type, state, sanitize(group), sanitize(owner), "StdSlots"]) - data[metric] += slot_weight(a.get("Cpus",1),a.get("Memory",0),a.get("Disk",0)) - else: - for k in ["Disk", "Memory", "Cpus", "Gpus"]: - metric = ".".join([slot_type, state, k]) - data[metric] += a.get(k,0) - metric = ".".join([slot_type, "totals", k]) - data[metric] += a.get(k,0) - metric = ".".join([slot_type, "totals", "Mflops"]) - data[metric] += int(int(a.get("Cpus",1)) * kflops / 1024) - metric = ".".join([slot_type, state, "NumSlots"]) - data[metric] += 1 - if job_resources: - for k,v in get_pool_resource_utilization(pool, retry_delay, max_retries, schedd_constraint).iteritems(): - metric = ".".join(["jobs", "totals", k]) - data[metric] = v - - return data - -def get_pool_glidein_slots(pool, retry_delay=30, max_retries=4): - coll = htcondor.Collector(pool) - retries = 0 - while retries < max_retries: - try: - startd_ads = coll.query(htcondor.AdTypes.Startd, 'is_glidein==True', - ['GLIDEIN_Site','GLIDEIN_Resource_Name','GLIDEIN_ResourceName','GLIDEIN_Entry_Name','State', - 'DaemonStartTime','Disk','Memory','Cpus','Gpus']) - except: - logger.warning("trouble getting pool {0} startds, retrying in {1}s.".format(pool,retry_delay)) - retries += 1 - startd_ads = None - time.sleep(retry_delay) - else: - break - - if startd_ads is None: - logger.error("trouble getting pool {0} startds, giving 
up.".format(pool)) - return {} - - data = defaultdict(int) - load = defaultdict(float) - for a in startd_ads: - site = sanitize(a.get("GLIDEIN_Site", "Unknown")) - resource = sanitize(a.get("GLIDEIN_Resource_Name",a.get("GLIDEIN_ResourceName","Unknown"))) - entry = sanitize(a.get("GLIDEIN_Entry_Name", "Unknown")) - state = sanitize(a.get("State", "Unknown")) - if (time.time() - a.get("DaemonStartTime",time.time())) < 300: - state = "New" - - metrics = [".".join(["glideins", "totals", "NumSlots"]), - ".".join(["glideins", state, "totals", "NumSlots"]), - ".".join(["glideins", state, "sites", site, "totals", "NumSlots"]), - ".".join(["glideins", state, "sites", site, "resources", resource, "NumSlots"]), - ".".join(["glideins", state, "sites", site, "entries", entry, "NumSlots"])] - for m in metrics: - data[m] += 1 - - metrics = [".".join(["glideins", "totals", "StdSlots"]), - ".".join(["glideins", state, "totals", "StdSlots"]), - ".".join(["glideins", state, "sites", site, "totals", "StdSlots"]), - ".".join(["glideins", state, "sites", site, "resources", resource, "StdSlots"]), - ".".join(["glideins", state, "sites", site, "entries", entry, "StdSlots"])] - for m in metrics: - if state=="Unclaimed": - data[m] += unclaimed_slot_weight(a.get("Cpus",1),a.get("Memory",0),a.get("Disk",0)) - else: - data[m] += slot_weight(a.get("Cpus",1),a.get("Memory",0),a.get("Disk",0)) - - for k in ["Disk", "Memory", "Cpus", "Gpus"]: - metrics = [".".join(["glideins", "totals", k]), - ".".join(["glideins", state, "totals", k]), - ".".join(["glideins", state, "sites", site, "totals", k]), - ".".join(["glideins", state, "sites", site, "resources", resource, k]), - ".".join(["glideins", state, "sites", site, "entries", entry, k])] - for m in metrics: - data[m] += a.get(k,0) - - return data - -if __name__ == "__main__": - import pprint - pprint.pprint(dict(get_pool_slots("cmssrv221"))) diff --git a/opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/condor/status.py b/opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/condor/status.py deleted file mode 100755 index e1925a21..00000000 --- a/opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/condor/status.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/python -import logging -import time - -import classad -import htcondor - -logger = logging.getLogger(__name__) - - -def get_pool_status(pool, retry_delay=30, max_retries=4, schedd_constraint=True, negotiator_constraint=True): - coll = htcondor.Collector(pool) - if callable(schedd_constraint): - schedd_constraint = schedd_constraint(coll) - if callable(negotiator_constraint): - negotiator_constraint = negotiator_constraint(coll) - - daemons = {"schedds": htcondor.DaemonTypes.Schedd, - "collectors": htcondor.DaemonTypes.Collector, - "negotiators": htcondor.DaemonTypes.Negotiator} - - data = { - "schema": "daemon.name.measurement", - "metrics": {}, - } - for daemon_type, daemon in daemons.iteritems(): - retries = 0 - while retries < max_retries: - try: - if daemon_type == "schedds": - ads = coll.query(htcondor.AdTypes.Schedd,schedd_constraint) - elif daemon_type == 'negotiators': - ads = coll.query(htcondor.AdTypes.Negotiator, negotiator_constraint) - else: - ads = coll.locateAll(daemon) - except Exception as e: - logger.warning("trouble getting pool {0} {1} status, retrying in {2}s: {3}".format(pool,daemon_type,retry_delay,e)) - ads = None - retries += 1 - time.sleep(retry_delay) - else: - break - if ads is None: - logger.error("trouble getting pool {0} {1} status, giving 
up.".format(pool,daemon_type)) - else: - for ad in ads: - # quick hack to skip schedds starting up on worker nodes - if ad['Name'].startswith('fnpc'): - logger.info('skipping worker node {}'.format(ad['Name'])) - continue - for k in ad: - if type(ad[k]) in [int,long,float]: - metric = ".".join([daemon_type, ad["Name"].replace(".","_").replace("@","-").replace(" ","_"), k]) - data["metrics"][metric] = ad[k] - return [data] diff --git a/opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/condor_probe.py b/opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/condor_probe.py deleted file mode 100755 index 89167d50..00000000 --- a/opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/condor_probe.py +++ /dev/null @@ -1,193 +0,0 @@ -#!/usr/bin/python -from collections import defaultdict -import re -from optparse import OptionParser -import logging -import time -import os -import ConfigParser -import pprint - -from fifemon import Probe, condor - -logger = logging.getLogger(__name__) - -class CondorProbe(Probe): - """ - Query HTCondor pool and post statistics to Graphite. - - Options: - post_pool_status: collect main daemon (schedd, collector, - negotiator) statistics - post_pool_slots: collect & aggregate slot (startd) status - slot_constraint: optional constraint to filter slots (bool, str, or func) - post_pool_glideins: collect & aggregate glidein slot status - post_pool_prio: collect user priorities - negotiator_constraint: optional constraint to filter negotiators queried (bool, str, or func) - prefer_accounting: prefer accounting classads from collector over directly querying negotiator - (in either case the other will be tried if there's a problem with the first) - post_pool_jobs: collect & aggregate user job status - schedd_constraint: optional constraint to filter schedds queried (bool, str, or func) - """ - - def __init__(self, *args, **kwargs): - self.pool = kwargs.pop('pool', 'localhost') - self.post_pool_status = kwargs.pop('post_pool_status',True) - self.post_pool_slots = kwargs.pop('post_pool_slots',True) - self.slot_constraint = kwargs.pop('slot_constraint',True) - self.post_pool_glideins = kwargs.pop('post_pool_glideins',False) - self.post_pool_prio = kwargs.pop('post_pool_prio',True) - self.negotiator_constraint = kwargs.pop('negotiator_constraint',True) - self.prefer_accounting = kwargs.pop('prefer_accounting', False) - self.post_pool_jobs = kwargs.pop('post_pool_jobs',False) - self.schedd_constraint = kwargs.pop('schedd_constraint',True) - self.use_gsi_auth = kwargs.pop('use_gsi_auth',False) - self.x509_user_key = kwargs.pop('x509_user_key',"") - self.x509_user_cert = kwargs.pop('x509_user_cert',"") - - if self.post_pool_jobs: - self.jobs = condor.Jobs(self.pool) - - self.last_prio={} - - super(CondorProbe, self).__init__(*args, **kwargs) - - def post(self): - if self.use_gsi_auth: - save_key = os.environ.get('X509_USER_KEY') - os.environ['X509_USER_KEY'] = self.x509_user_key - save_cert = os.environ.get('X509_USER_CERT') - os.environ['X509_USER_CERT'] = self.x509_user_cert - - if self.post_pool_status: - logger.info('querying pool {0} status'.format(self.pool)) - data = condor.get_pool_status(self.pool, self.delay, self.retries, - schedd_constraint=self.schedd_constraint, - negotiator_constraint=self.negotiator_constraint) - for dataset in data: - if self.use_graphite: - self.graphite.send_dict(self.namespace, - dataset["metrics"], - send_data=(not self.test)) - if self.use_influxdb: - self.influxdb.send_dict(dataset["metrics"], - send_data=(not 
self.test), - schema=dataset["schema"], - tags=self.influxdb_tags) - if self.post_pool_slots not in [False, 'false', 'False']: - if self.post_pool_slots == "totals": - logger.info('querying pool {0} slots (totals only)'.format(self.pool)) - data = condor.get_pool_slots(self.pool, self.delay, self.retries, - totals_only=True, job_resources=False, - constraint=self.slot_constraint, - schedd_constraint=self.schedd_constraint) - else: - logger.info('querying pool {0} slots'.format(self.pool)) - data = condor.get_pool_slots(self.pool, self.delay, self.retries, - constraint=self.slot_constraint, - schedd_constraint=self.schedd_constraint) - if self.use_graphite: - self.graphite.send_dict(self.namespace+".slots", data, send_data=(not self.test)) - if self.post_pool_glideins: - logger.info('querying pool {0} glidein slots'.format(self.pool)) - data = condor.get_pool_glidein_slots(self.pool, self.delay, self.retries) - if self.use_graphite: - self.graphite.send_dict(self.namespace+".slots", data, send_data=(not self.test)) - if self.post_pool_prio: - logger.info('querying pool {0} priorities'.format(self.pool)) - data = condor.get_pool_priorities(self.pool, self.delay, self.retries, self.last_prio, - negotiator_constraint=self.negotiator_constraint, - prefer_accounting=self.prefer_accounting) - self.last_prio = data - if self.use_graphite: - self.graphite.send_dict(self.namespace, data, send_data=(not self.test)) - if self.post_pool_jobs: - logger.info('querying pool {0} jobs'.format(self.pool)) - data = self.jobs.get_job_count(self.delay, self.retries, - schedd_constraint=self.schedd_constraint) - if self.use_graphite: - self.graphite.send_dict(self.namespace+".jobs", data, send_data=(not self.test)) - - if self.use_gsi_auth: - if save_key is None: - del os.environ['X509_USER_KEY'] - else: - os.environ['X509_USER_KEY'] = save_key - if save_cert is None: - del os.environ['X509_USER_CERT'] - else: - os.environ['X509_USER_CERT'] = save_cert - -def get_options(): - parser = OptionParser(usage="usage: %prog [options] [config file(s)]") - parser.add_option('-t','--test',action="store_true", - help="output data to stdout, don't send to graphite (implies --once)") - parser.add_option('-1','--once',action="store_true", - help="run once and exit") - parser.add_option('-p','--port',type='int',default=8100, - help="port on which to publish HTTP metrics") - (cmd_opts,args) = parser.parse_args() - - config = ConfigParser.SafeConfigParser() - config.read(args) - - def parse_tags(tags): - if tags is None or tags == "": - return None - r = {} - for k,v in [kv.split(":") for kv in tags.split(",")]: - r[k] = v - return r - - - opts = { - 'pool': config.get("condor", "pool"), - 'post_pool_status': config.getboolean("condor", "post_pool_status"), - 'post_pool_slots': config.get("condor", "post_pool_slots"), - 'slot_constraint': config.get("condor", "slot_constraint"), - 'post_pool_glideins':config.getboolean("condor", "post_pool_glideins"), - 'post_pool_prio': config.getboolean("condor", "post_pool_prio"), - 'negotiator_constraint': config.get("condor", "negotiator_constraint"), - 'prefer_accounting': config.getboolean("condor", "prefer_accounting"), - 'post_pool_jobs': config.getboolean("condor", "post_pool_jobs"), - 'schedd_constraint': config.get("condor", "schedd_constraint"), - 'use_gsi_auth': config.getboolean("condor", "use_gsi_auth"), - 'x509_user_key': config.get("condor", "X509_USER_KEY"), - 'x509_user_cert': config.get("condor", "X509_USER_CERT"), - 'use_graphite': config.getboolean("graphite", "enable"), - 
'namespace': config.get("graphite", "namespace"), - 'meta_namespace': config.get("graphite", "meta_namespace"), - 'graphite_host': config.get("graphite", "host"), - 'graphite_pickle_port': config.getint("graphite", "port"), - 'use_influxdb': config.getboolean("influxdb", "enable"), - 'influxdb_host': config.get("influxdb", "host"), - 'influxdb_port': config.get("influxdb", "port"), - 'influxdb_db': config.get("influxdb", "db"), - 'influxdb_tags': parse_tags(config.get("influxdb", "tags")), - 'test': cmd_opts.test or config.getboolean("probe", "test"), - 'once': cmd_opts.once or config.getboolean("probe", "once"), - 'interval': config.getint("probe", "interval"), - 'delay': config.getint("probe", "delay"), - 'retries': config.getint("probe", "retries"), - 'publish_metrics': config.getboolean("probe", "publish_metrics"), - 'metrics_port': cmd_opts.port, - } - - return opts - -def main(): - opts = get_options() - if opts['test']: - loglevel = logging.DEBUG - else: - loglevel = logging.INFO - logging.basicConfig(level=loglevel, - format="%(asctime)s [%(levelname)s] %(name)s - %(message)s") - - logger.info('Probe configuraion: \n'+pprint.pformat(opts)) - - probe = CondorProbe(**opts) - probe.run() - -if __name__ == '__main__': - main() diff --git a/opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/graphite.py b/opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/graphite.py deleted file mode 100755 index 404db657..00000000 --- a/opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/graphite.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/python -import logging -import time -import cPickle -import struct -import socket -import sys - -logger = logging.getLogger(__name__) - -def sanitize_key(key): - if key is None: - return key - replacements = { - ".": "_", - " ": "_", - } - for old,new in replacements.iteritems(): - key = key.replace(old, new) - return key - -class Graphite(object): - def __init__(self,host="localhost",pickle_port=2004): - self.graphite_host = host - self.graphite_pickle_port = pickle_port - - def send_dict(self,namespace, data, send_data=True, timestamp=None, batch_size=1000): - """send data contained in dictionary as {k: v} to graphite dataset - $namespace.k with current timestamp""" - if data is None: - logger.warning("send_dict called with no data") - return - if timestamp is None: - timestamp=time.time() - post_data=[] - # turning data dict into [('$path.$key',($timestamp,$value)),...]] - for k,v in data.iteritems(): - t = (namespace+"."+k, (timestamp, v)) - post_data.append(t) - logger.debug(str(t)) - for i in xrange(len(post_data)//batch_size + 1): - # pickle data - payload = cPickle.dumps(post_data[i*batch_size:(i+1)*batch_size], protocol=2) - header = struct.pack("!L", len(payload)) - message = header + payload - # throw data at graphite - if send_data: - s=socket.socket() - try: - s.connect( (self.graphite_host, self.graphite_pickle_port) ) - s.sendall(message) - except socket.error as e: - logger.error("unable to send data to graphite at %s:%d\n" % (self.graphite_host,self.graphite_pickle_port)) - finally: - s.close() - -if __name__ == "__main__": - logging.basicConfig(level=logging.DEBUG) - data = {'count1': 5, 'count2': 0.5} - g = Graphite() - g.send_dict('test',data,send_data=False) diff --git a/opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/influx.py b/opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/influx.py deleted file mode 100755 index 7f28640b..00000000 --- 
a/opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/influx.py +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/python -import logging -import os - -from influxdb import InfluxDBClient - -logger = logging.getLogger(__name__) - -class Influxdb(object): - def __init__(self, host="localhost", port=8086, db="test", username=None, password=None): - self.host = host - self.port = port - if username is None: - username=os.getenv('INFLUXDB_USERNAME') - if password is None: - password=os.getenv('INFLUXDB_PASSWORD') - - self.client = InfluxDBClient(host, port, username, password, db) - - def send_metric(self, measurement, value, tags={}, timestamp=None, field="value"): - point = { - "measurement": measurement, - "tags": tags, - "fields": {field: value}, - } - if timestamp is not None: - point['time'] = timestamp - self.client.write_points([point]) - - def send_metrics(self, data, tags={}): - self.client.write_points(data,tags=tags) - - def send_dict(self, data, send_data=True, timestamp=None, schema=None, field="value", tags={}): - if data is None or len(data) == 0: - logger.warning("send_dict called with no data") - return - points = [] - if schema is None: - logger.warning("no schema provided, sending complete metric as measurement") - for k,v in data.iteritems(): - points.append({ - "measurement": k, - "fields": {field: v}, - }) - else: - schema_parts = schema.split(".") - for k,v in data.iteritems(): - parts = k.split(".") - if len(parts) != len(schema_parts): - logger.error("metric '{metric}' does not match schema '{schema}', skipping".format( - metric=k, - schema=schema)) - point = { - "measurement": None, - "tags": {}, - "fields": {field: v}, - } - for i in xrange(len(parts)): - if schema_parts[i] == "measurement": - point["measurement"] = parts[i] - elif schema_parts[i] != "_": - point["tags"][schema_parts[i]] = parts[i] - logger.debug("sending point %s"%point) - points.append(point) - logger.debug("sending points with tags %s"%tags) - if send_data: - self.client.write_points(points,tags=tags) - - -if __name__ == "__main__": - logging.basicConfig(level=logging.DEBUG) - g = Influxdb() - g.send_metric('jobs', 2345, tags={'cluster':'fifebatch','user':'bob'}) diff --git a/opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/probe.py b/opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/probe.py deleted file mode 100755 index 84c040d0..00000000 --- a/opensciencegrid/ospool-display/fifemon-condor-probe/fifemon/probe.py +++ /dev/null @@ -1,127 +0,0 @@ -#!/usr/bin/python -import logging -import time - -logger = logging.getLogger(__name__) - -class Probe(object): - def __init__(self, *args, **kwargs): - self.interval = kwargs.pop('interval', 240) - self.retries = kwargs.pop('retries', 10) - self.delay = kwargs.pop('delay', 30) - self.test = kwargs.pop('test',True) - self.once = self.test or kwargs.pop('once',False) - - self.use_graphite = kwargs.pop('use_graphite',True) - self.graphite_host = kwargs.pop('graphite_host','localhost') - self.graphite_pickle_port = kwargs.pop('graphite_pickle_port',2004) - self.namespace = kwargs.pop('namespace', 'test') - self.meta_namespace = kwargs.pop('meta_namespace', 'probes.test') - - self.use_influxdb = kwargs.pop('use_influxdb',False) - self.influxdb_host = kwargs.pop('influxdb_host','localhost') - self.influxdb_port = kwargs.pop('influxdb_port',8086) - self.influxdb_db = kwargs.pop('influxdb_db','test') - self.influxdb_tags = kwargs.pop('influxdb_tags',{}) - - self.publish_metrics = kwargs.pop('publish_metrics',True) - self.metrics_port = 
kwargs.pop('metrics_port',8100) - - if self.test: - logger.setLevel(logging.DEBUG) - else: - logger.setLevel(logging.INFO) - - if self.use_graphite: - from graphite import Graphite - self.graphite = Graphite(self.graphite_host, self.graphite_pickle_port) - if self.use_influxdb: - from influx import Influxdb - self.influxdb = Influxdb(self.influxdb_host, self.influxdb_port, self.influxdb_db) - if self.publish_metrics: - import prometheus_client as prom - self.last_runtime_metric = prom.Gauge('probe_runtime_seconds', - 'Probe last runtime') - self.interval_metric = prom.Gauge('probe_interval_seconds', - 'Probe run interval') - self.interval_metric.set(self.interval) - self.status_metric = prom.Gauge('probe_status', - 'Probe is running (1) or sleeping (0)') - prom.start_http_server(self.metrics_port) - - def __unicode__(self): - return """ -namespace: %s -meta_namespace: %s -interval: %d s -retries: %d -delay: %d s -test: %s -once: %s - -use_graphite: %s -graphite_host: %s -graphite_pickle_port: %s - -use_influxdb: %s -influxdb_host: %s -influxdb_port: %d -influxdb_db: %d - -publish_metrics: %s -metrics_port: %d - -""" % (self.namespace, - self.meta_namespace, - self.interval, - self.retries, - self.delay, - self.test, - self.once, - self.use_graphite, - self.graphite_host, - self.graphite_pickle_port, - self.use_influxdb, - self.influxdb_host, - self.influxdb_port, - self.influxdb_db, - self.publish_metrics, - self.metrics_port) - - def __str__(self): - return self.__unicode__() - - def post(self): - pass - - def run(self): - while True: - start = time.time() - if self.publish_metrics: - self.status_metric.set(1) - self.post() - duration = time.time()-start - if self.publish_metrics: - self.status_metric.set(0) - self.last_runtime_metric.set(duration) - logger.info("({0}) posted data in {1} s".format(self.namespace, duration)) - meta_data = { - "update_time": duration, - "update_interval": self.interval, - "duty_cycle": duration/self.interval, - } - if self.use_graphite: - self.graphite.send_dict(self.meta_namespace, meta_data, send_data = (not self.test)) - if self.use_influxdb: - pass - sleep = max(self.interval-duration-10,0) - logger.info("({0}) sleeping {1} s".format(self.namespace,sleep)) - if self.test or self.once: - return - time.sleep(sleep) - -if __name__ == "__main__": - logging.basicConfig(level=logging.DEBUG) - p = Probe(test=True) - logger.debug(p) - p.run() diff --git a/opensciencegrid/ospool-display/fifemon-condor-probe/setup.py b/opensciencegrid/ospool-display/fifemon-condor-probe/setup.py deleted file mode 100644 index 914d8e9c..00000000 --- a/opensciencegrid/ospool-display/fifemon-condor-probe/setup.py +++ /dev/null @@ -1,20 +0,0 @@ -from setuptools import setup - -setup( - name='fifemon-condor-probe', - version='1.0', - packages=['fifemon','fifemon.condor'], - include_package_data=True, - install_requires=[ - 'htcondor', - 'prometheus-client==0.10.1', - 'elasticsearch==7.17.6', - 'influxdb', - 'requests==2.27.1', - 'certifi==2021.10.8', - ], - entry_points=''' - [console_scripts] - fifemon=fifemon.condor_probe:main - ''', -) diff --git a/opensciencegrid/ospool-display/fifemon.cfg-development b/opensciencegrid/ospool-display/fifemon.cfg-development deleted file mode 100644 index ad2946c5..00000000 --- a/opensciencegrid/ospool-display/fifemon.cfg-development +++ /dev/null @@ -1,65 +0,0 @@ -[probe] -# how often to send data in seconds -interval = 240 -# how many times to retry condor queries -retries = 1 -# seconds to wait beteeen retries -delay = 5 -# if true, data is 
output to stdout and not sent downstream -test = false -# run one time and exit, i.e. for running wtih cron -once = false -# enable promethus metrics server for monitoring the probe -publish_metrics = true - -[graphite] -# enable output to graphite -enable = true -# graphite host -host = graphite.grid.uchicago.edu -# graphite pickle port -port = 2004 -# base namespace for metrics -namespace = test.osg-ospool -# namespace for probe monitoring metrics -meta_namespace = test.osg-ospool.probes - -[influxdb] -# enable output to influxdb (not fully supported) -enable = false -# influxdb host -host = localhost -# influxdb api port -port = 8086 -# influxdb database -db = test -# extra tags to include with all metrics (comma-separated key:value) -tags = cluster:test - -[condor] -# central manager/collector host -pool = cm-1.ospool-itb.osg-htc.org -# collect basic daemon (collector, negotiator, schedd) metrics? -post_pool_status = true -# collect machine/startd metrics? -post_pool_slots = true -# constraint to limit which startds metrics are collected for -slot_constraint = true -# collect glidein-specific startd metrics? -post_pool_glideins = true -# collect priorities and quotas? -post_pool_prio = true -# constraint to limit which negotiators are queried -negotiator_constraint = true -# If true, first try to get priorities and quotas from Accounting classads in collector, -# then fallback to negotiator. Opposite order if false. -prefer_accounting = false -# collect job metrics? -post_pool_jobs = true -# constraint to limit which schedds are queried -schedd_constraint = regexp("jupyter-notebook", Name) != True && regexp("cyverse", Name) != True -# Enable GSI (x509 certificate) authorization -use_gsi_auth = false -X509_USER_KEY = "/etc/condor/flock-key.pem" -X509_USER_CERT = "/etc/condor/flock-cert.pem" - diff --git a/opensciencegrid/ospool-display/fifemon.cfg-production b/opensciencegrid/ospool-display/fifemon.cfg-production deleted file mode 100644 index b0560187..00000000 --- a/opensciencegrid/ospool-display/fifemon.cfg-production +++ /dev/null @@ -1,65 +0,0 @@ -[probe] -# how often to send data in seconds -interval = 240 -# how many times to retry condor queries -retries = 1 -# seconds to wait beteeen retries -delay = 5 -# if true, data is output to stdout and not sent downstream -test = false -# run one time and exit, i.e. for running wtih cron -once = false -# enable promethus metrics server for monitoring the probe -publish_metrics = true - -[graphite] -# enable output to graphite -enable = true -# graphite host -host = graphite.grid.uchicago.edu -# graphite pickle port -port = 2004 -# base namespace for metrics -namespace = osg-ospool -# namespace for probe monitoring metrics -meta_namespace = osg-ospool.probes - -[influxdb] -# enable output to influxdb (not fully supported) -enable = false -# influxdb host -host = localhost -# influxdb api port -port = 8086 -# influxdb database -db = test -# extra tags to include with all metrics (comma-separated key:value) -tags = cluster:test - -[condor] -# central manager/collector host -pool = cm-1.ospool.osg-htc.org -# collect basic daemon (collector, negotiator, schedd) metrics? -post_pool_status = true -# collect machine/startd metrics? -post_pool_slots = true -# constraint to limit which startds metrics are collected for -slot_constraint = true -# collect glidein-specific startd metrics? -post_pool_glideins = true -# collect priorities and quotas? 
-post_pool_prio = true -# constraint to limit which negotiators are queried -negotiator_constraint = true -# If true, first try to get priorities and quotas from Accounting classads in collector, -# then fallback to negotiator. Opposite order if false. -prefer_accounting = false -# collect job metrics? -post_pool_jobs = true -# constraint to limit which schedds are queried -schedd_constraint = regexp("jupyter-notebook", Name) != True && regexp("cyverse", Name) != True -# Enable GSI (x509 certificate) authorization -use_gsi_auth = false -X509_USER_KEY = "/etc/condor/flock-key.pem" -X509_USER_CERT = "/etc/condor/flock-cert.pem" - diff --git a/opensciencegrid/ospool-display/image-config.d/50-fifemon.sh b/opensciencegrid/ospool-display/image-config.d/50-fifemon.sh deleted file mode 100755 index c018f1bb..00000000 --- a/opensciencegrid/ospool-display/image-config.d/50-fifemon.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -set -e - -# set up credentials -if [ ! -e /etc/ospool-creds/idkeys.d ]; then - echo "Please mount /etc/ospool-creds/idkeys.d/" - exit 1 -fi -if [ ! -e /etc/ospool-creds/idtokens.d ]; then - echo "Please mount /etc/ospool-creds/idtokens.d/" - exit 1 -fi - -echo "Installing HTCondor credentials..." -cd /etc/ospool-creds/idkeys.d -for FILE in *; do - install -o root -g root -m 0600 $FILE /etc/condor/passwords.d/$FILE -done -cd /etc/ospool-creds/idtokens.d -for FILE in *; do - install -o condor -g condor -m 0600 $FILE /etc/condor/tokens.d/$FILE -done - -# put the right config in place -rm -f /etc/fifemon.cfg -if [ "X$DEPLOYMENT_ENV" = "Xproduction" ]; then - ln -s /etc/fifemon.cfg-production /etc/fifemon.cfg -else - ln -s /etc/fifemon.cfg-development /etc/fifemon.cfg -fi - diff --git a/opensciencegrid/ospool-display/supervisor.d/10-fifemon.conf b/opensciencegrid/ospool-display/supervisor.d/10-fifemon.conf deleted file mode 100644 index a951d458..00000000 --- a/opensciencegrid/ospool-display/supervisor.d/10-fifemon.conf +++ /dev/null @@ -1,9 +0,0 @@ -[program:fifemon] -command=/usr/bin/fifemon /etc/fifemon.cfg -user=root -autorestart=true -startsecs=30 -stdout_logfile=/dev/stdout -stdout_logfile_maxbytes=0 -redirect_stderr=true -