Skip to content

Commit

Permalink
Merge pull request #164 from rynge/ospool-adjustments
Browse files Browse the repository at this point in the history
OSPool: Updated prometheus and prio scripts
  • Loading branch information
brianhlin authored Nov 15, 2023
2 parents 4a2dee9 + e87707a commit 5a8e83b
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 10 deletions.
26 changes: 18 additions & 8 deletions opensciencegrid/ospool-cm/opt/ospool/update-prios
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import re
import htcondor
import pprint

def update_user(negotiator, ad):
current_factor = int(ad["PriorityFactor"])
Expand All @@ -18,13 +19,22 @@ def update_user(negotiator, ad):
negotiator.setFactor(user, new_factor)


negotiator = htcondor.Negotiator()
prios = negotiator.getPriorities()
coll = htcondor.Collector()
negotiator_ads = coll.locateAll(htcondor.DaemonTypes.Negotiator)

for ad in negotiator.getPriorities():
try:
update_user(negotiator, ad)
except:
# ignore individual exceptions - we want to try all users
pass
# Apply the priority-factor update to the users of every negotiator returned
# by the collector, skipping any negotiator whose Name contains "ALLOCATED".
for negotiator_ad in negotiator_ads:

    if "ALLOCATED" in negotiator_ad["Name"]:
        continue

    negotiator = htcondor.Negotiator(negotiator_ad)

    # Query the priorities exactly once per negotiator; the result was
    # previously fetched twice, with the first copy never used.
    for ad in negotiator.getPriorities():
        pprint.pprint(ad)
        try:
            update_user(negotiator, ad)
        except Exception as err:
            # ignore individual exceptions - we want to try all users.
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt / SystemExit can still stop the script,
            # and report the failure instead of dropping it silently.
            print("update_user failed:", err)

23 changes: 21 additions & 2 deletions opensciencegrid/ospool-cm/prometheus.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@
ospool_idle_diskstarvation_cpus_count = Gauge("ospool_idle_diskstarvation_cpus_count", "Idle CPUs due to disk starvation", ["resource_name"])
ospool_idle_other_cpus_count = Gauge("ospool_idle_other_cpus_count", "Idle CPUs due to other reasons", ["resource_name"])

# gpus
ospool_total_gpus_count = Gauge("ospool_total_gpus_count", "Total GPUs", ["resource_name"])
ospool_claimed_gpus_count = Gauge("ospool_claimed_gpus_count", "Claimed GPUs", ["resource_name"])
ospool_idle_gpus_count = Gauge("ospool_idle_gpus_count", "Idle GPUs", ["resource_name"])

# submitter metrics
ospool_submitter_idle_jobs_count = Gauge("ospool_submitter_idle_jobs_count", "Submitter idle jobs", ["submitter", "schedd"])
ospool_submitter_running_jobs_count = Gauge("ospool_submitter_running_jobs_count", "Submitter running jobs", ["submitter", "schedd"])
Expand All @@ -40,7 +45,7 @@ def cm_resources_info(collector):
# iterate over all resources
ads = collector.query(ad_type=htcondor.AdTypes.Startd,
constraint="!isUndefined(GLIDEIN_ResourceName)",
projection=["GLIDEIN_ResourceName", "CPUs", "State"])
projection=["GLIDEIN_ResourceName", "CPUs", "GPUs", "State"])
for ad in ads:
if ad["GLIDEIN_ResourceName"] not in resources:
resources[ad["GLIDEIN_ResourceName"]] = {
Expand All @@ -49,13 +54,23 @@ def cm_resources_info(collector):
"idle_retirement_cpus": 0,
"idle_memstarvation_cpus": 0,
"idle_diskstarvation_cpus": 0,
"idle_other_cpus": 0
"idle_other_cpus": 0,
"total_gpus": 0,
"claimed_gpus": 0,
"idle_gpus": 0,
}
r = resources[ad["GLIDEIN_ResourceName"]]
r["total_cpus"] += int(ad["CPUs"])
if ad["State"] != "Unclaimed":
r["claimed_cpus"] += int(ad["CPUs"])

# gpus
if "GPUs" in ad:
r["total_gpus"] += int(ad["GPUs"])
if ad["State"] != "Unclaimed":
r["claimed_gpus"] += int(ad["GPUs"])
r["idle_gpus"] = r["total_gpus"] - r["claimed_gpus"]

# classify idle CPUs
ads = collector.query(ad_type=htcondor.AdTypes.Startd,
constraint="!isUndefined(GLIDEIN_ResourceName) && PartitionableSlot == true && CPUs >= 1",
Expand Down Expand Up @@ -89,6 +104,10 @@ def cm_resources_info(collector):
- data["idle_diskstarvation_cpus"]
ospool_idle_other_cpus_count.labels(resource).set(idle_other_cpus)

ospool_total_gpus_count.labels(resource).set(data["total_gpus"])
ospool_claimed_gpus_count.labels(resource).set(data["claimed_gpus"])
ospool_idle_gpus_count.labels(resource).set(data["idle_gpus"])


def cm_submitters_info(collector):
'''
Expand Down

0 comments on commit 5a8e83b

Please sign in to comment.