Skip to content

Commit

Permalink
Prometheus metrics for infrastructure (#1135)
Browse files Browse the repository at this point in the history
This PR adds export of fine-grained metrics for infrastructure elements to Prometheus. The metrics are defined as module-level Counters/Gauges and can be labelled where necessary. The following metrics are available:

PathDB:

Total # and size of segments (can be labelled by type)
# of segments added/removed (can be labelled by type)
RevCache:

Total # and size of revocations
# of revocations added/removed
RequestHandler:

# of pending requests (can be labelled by type)
Path server:

Total # of path requests
# of pending path requests
Length of segments/revocations to ZK dict
Length of segments to master/prop dict
Beacon server:

# of beacons propagated
# of segments registered
# of revocations issued
Certificate server:

# of requests received (can be labelled by type)
  • Loading branch information
shitz authored Jun 28, 2017
1 parent 936f2d3 commit f0c774b
Show file tree
Hide file tree
Showing 23 changed files with 351 additions and 84 deletions.
2 changes: 1 addition & 1 deletion .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ jobs:

build:
docker:
- image: kormat/scion_base@sha256:4b6bf6e422fc4e77d18ba952066cfa6a3b2624d1825178e1b5fe4f7a37f34500
- image: kormat/scion_base@sha256:f441df20a67b8968ba4a38a86eb47b80b835d68d3bd94d767af3b18408ece0b4
<<: *job
steps:
- checkout
Expand Down
1 change: 1 addition & 0 deletions env/pip3/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@ cffi==1.10.0 --hash=sha256:b3b02911eb1f6ada203b0763ba924234629b51586f72a21faacc6
pyparsing==2.2.0 --hash=sha256:fee43f17a9c4087e7ed1605bd6df994c6173c1e977d7ade7b651292fab2bd010
appdirs==1.4.3 --hash=sha256:d8b24664561d0d34ddfaec54636d502d7cea6e29c3eaf68f3df6180863e2166e
pycparser==2.17 --hash=sha256:0aac31e917c24cb3357f5a4d5566f2cc91a19ca41862f6c3c22dc60a629673b6
prometheus-client==0.0.19 --hash=sha256:ce4ddcb89a870ee771ca5427df123029bf5344ea84f535ded4a1787e29a22a3f
29 changes: 25 additions & 4 deletions python/beacon_server/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

# External packages
from external.expiring_dict import ExpiringDict
from prometheus_client import Counter

# SCION
from beacon_server.if_state import InterfaceState
Expand Down Expand Up @@ -84,6 +85,15 @@
from scion_elem.scion_elem import SCIONElement


# Exported metrics.
# All three are module-level Prometheus counters; the call sites in this
# module only touch them when self._labels is truthy, and supply the
# "server_id"/"isd_as" labels by unpacking self._labels.

# Incremented by the number of PCBs sent in one propagation pass; the "type"
# label is "down" for downstream propagation and "core" for core propagation.
BEACONS_PROPAGATED = Counter("bs_beacons_propagated_total", "# of propagated beacons",
["server_id", "isd_as", "type"])
# Incremented in _log_registrations by the total number of registered PCBs;
# the "type" label carries the segment type passed to that method.
SEGMENTS_REGISTERED = Counter("bs_segments_registered_total", "# of registered segments",
["server_id", "isd_as", "type"])
# Incremented by one each time _issue_revocation issues a revocation.
REVOCATIONS_ISSUED = Counter("bs_revocations_issued_total", "# of issued revocations",
["server_id", "isd_as"])


class BeaconServer(SCIONElement, metaclass=ABCMeta):
"""
The SCION PathConstructionBeacon Server.
Expand All @@ -104,12 +114,13 @@ class BeaconServer(SCIONElement, metaclass=ABCMeta):
# Interval to check for timed out interfaces.
IF_TIMEOUT_INTERVAL = 1

def __init__(self, server_id, conf_dir):
def __init__(self, server_id, conf_dir, prom_export=None):
"""
:param str server_id: server identifier.
:param str conf_dir: configuration directory.
:param str prom_export: prometheus export address.
"""
super().__init__(server_id, conf_dir)
super().__init__(server_id, conf_dir, prom_export=prom_export)
# TODO: add 2 policies
self.path_policy = PathPolicy.from_file(
os.path.join(conf_dir, PATH_POLICY_FILE))
Expand Down Expand Up @@ -180,6 +191,7 @@ def propagate_downstream_pcb(self, pcb):
:type pcb: PathSegment
"""
propagated_pcbs = defaultdict(list)
prop_cnt = 0
for intf in self.topology.child_interfaces:
if not intf.to_if_id:
continue
Expand All @@ -189,6 +201,9 @@ def propagate_downstream_pcb(self, pcb):
continue
self.send_meta(new_pcb, meta)
propagated_pcbs[(intf.isd_as, intf.if_id)].append(pcb.short_id())
prop_cnt += 1
if self._labels:
BEACONS_PROPAGATED.labels(**self._labels, type="down").inc(prop_cnt)
return propagated_pcbs

def _mk_prop_pcb_meta(self, pcb, dst_ia, egress_if):
Expand Down Expand Up @@ -233,8 +248,8 @@ def handle_pcbs_propagation(self):

def _log_propagations(self, propagated_pcbs):
for (isd_as, if_id), pcbs in propagated_pcbs.items():
logging.debug("Propagated %d PCBs to %s via %s (%s)", len(pcbs), isd_as,
if_id, ", ".join(pcbs))
logging.debug("Propagated %d PCBs to %s via %s (%s)", len(pcbs),
isd_as, if_id, ", ".join(pcbs))

def _handle_pcbs_from_zk(self, pcbs):
"""
Expand Down Expand Up @@ -315,9 +330,13 @@ def register_segments(self):
raise NotImplementedError

def _log_registrations(self, registrations, seg_type):
reg_cnt = 0
for (dst_meta, dst_type), pcbs in registrations.items():
reg_cnt += len(pcbs)
logging.debug("Registered %d %s-segments @ %s:%s (%s)", len(pcbs),
seg_type, dst_type.upper(), dst_meta, ", ".join(pcbs))
if self._labels:
SEGMENTS_REGISTERED.labels(**self._labels, type=seg_type).inc(reg_cnt)

def _create_asm(self, in_if, out_if, ts, prev_hof):
pcbms = list(self._create_pcbms(in_if, out_if, ts, prev_hof))
Expand Down Expand Up @@ -568,6 +587,8 @@ def _issue_revocation(self, if_id):
return
rev_info = self._get_ht_proof(if_id)
logging.info("Issuing revocation: %s", rev_info.short_desc())
if self._labels:
REVOCATIONS_ISSUED.labels(**self._labels).inc()
# Issue revocation to all BRs.
info = IFStateInfo.from_values(if_id, False, rev_info)
pld = IFStatePayload.from_values([info])
Expand Down
11 changes: 8 additions & 3 deletions python/beacon_server/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from collections import defaultdict

# SCION
from beacon_server.base import BeaconServer
from beacon_server.base import BeaconServer, BEACONS_PROPAGATED
from lib.defines import PATH_SERVICE, SIBRA_SERVICE
from lib.errors import SCIONServiceLookupError
from lib.packet.opaque_field import InfoOpaqueField
Expand All @@ -38,12 +38,13 @@ class CoreBeaconServer(BeaconServer):
Starts broadcasting beacons down-stream within an ISD and across ISDs
towards other core beacon servers.
"""
def __init__(self, server_id, conf_dir):
def __init__(self, server_id, conf_dir, prom_export=None):
"""
:param str server_id: server identifier.
:param str conf_dir: configuration directory.
:param str prom_export: prometheus export address.
"""
super().__init__(server_id, conf_dir)
super().__init__(server_id, conf_dir, prom_export=prom_export)
# Sanity check that we should indeed be a core beacon server.
assert self.topology.is_core_as, "This shouldn't be a local BS!"
self.core_beacons = defaultdict(self._ps_factory)
Expand All @@ -61,6 +62,7 @@ def propagate_core_pcb(self, pcb):
Propagates the core beacons to other core ASes.
"""
propagated_pcbs = defaultdict(list)
prop_cnt = 0
for intf in self.topology.core_interfaces:
dst_ia = intf.isd_as
if not self._filter_pcb(pcb, dst_ia=dst_ia):
Expand All @@ -71,6 +73,9 @@ def propagate_core_pcb(self, pcb):
continue
self.send_meta(new_pcb, meta)
propagated_pcbs[(intf.isd_as, intf.if_id)].append(pcb.short_id())
prop_cnt += 1
if self._labels:
BEACONS_PROPAGATED.labels(**self._labels, type="core").inc(prop_cnt)
return propagated_pcbs

def handle_pcbs_propagation(self):
Expand Down
5 changes: 3 additions & 2 deletions python/beacon_server/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,13 @@ class LocalBeaconServer(BeaconServer):
servers.
"""

def __init__(self, server_id, conf_dir):
def __init__(self, server_id, conf_dir, prom_export=None):
"""
:param str server_id: server identifier.
:param str conf_dir: configuration directory.
:param str prom_export: prometheus export address.
"""
super().__init__(server_id, conf_dir)
super().__init__(server_id, conf_dir, prom_export)
# Sanity check that we should indeed be a local beacon server.
assert not self.topology.is_core_as, "This shouldn't be a core BS!"
self.beacons = PathStore(self.path_policy)
Expand Down
21 changes: 18 additions & 3 deletions python/cert_server/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,11 @@
import logging
import os
import threading
import time

# External packages
from nacl.exceptions import CryptoError
import time
from prometheus_client import Counter

# SCION
import lib.app.sciond as lib_sciond
Expand Down Expand Up @@ -76,6 +77,10 @@
from sciond.sciond import SCIOND_API_SOCKDIR
from scion_elem.scion_elem import SCIONElement


# Exported metrics.
# Counts incoming requests; the "type" label is "cc", "trc" or "drkey"
# depending on which process_*_request handler received the request.
# NOTE(review): the handlers visible here call
# REQS_TOTAL.labels(**self._labels, ...) without first checking self._labels,
# while __init__ treats self._labels as possibly falsy -- confirm it can never
# be None at those call sites, otherwise the ** unpacking raises TypeError.
REQS_TOTAL = Counter("cs_requests_total", "# of total requests", ["server_id", "isd_as", "type"])

# Timeout for API path requests
API_TOUT = 1
# Max amount of DRKey secret values. 1 current, 1 prefetch, 1 buffer.
Expand All @@ -99,20 +104,27 @@ class CertServer(SCIONElement):
ZK_TRC_CACHE_PATH = "trc_cache"
ZK_DRKEY_PATH = "drkey_cache"

def __init__(self, server_id, conf_dir):
def __init__(self, server_id, conf_dir, prom_export=None):
"""
:param str server_id: server identifier.
:param str conf_dir: configuration directory.
:param str prom_export: prometheus export address.
"""
super().__init__(server_id, conf_dir)
super().__init__(server_id, conf_dir, prom_export=prom_export)
cc_labels = {**self._labels, "type": "cc"} if self._labels else None
trc_labels = {**self._labels, "type": "trc"} if self._labels else None
drkey_labels = {**self._labels, "type": "drkey"} if self._labels else None
self.cc_requests = RequestHandler.start(
"CC Requests", self._check_cc, self._fetch_cc, self._reply_cc,
labels=cc_labels,
)
self.trc_requests = RequestHandler.start(
"TRC Requests", self._check_trc, self._fetch_trc, self._reply_trc,
labels=trc_labels,
)
self.drkey_protocol_requests = RequestHandler.start(
"DRKey Requests", self._check_drkey, self._fetch_drkey, self._reply_proto_drkey,
labels=drkey_labels,
)

self.CTRL_PLD_CLASS_MAP = {
Expand Down Expand Up @@ -229,6 +241,7 @@ def process_cert_chain_request(self, req, meta):
assert isinstance(req, CertChainRequest)
key = req.isd_as(), req.p.version
logging.info("Cert chain request received for %sv%s from %s", *key, meta)
REQS_TOTAL.labels(**self._labels, type="cc").inc()
local = meta.ia == self.addr.isd_as
if not self._check_cc(key):
if not local:
Expand Down Expand Up @@ -291,6 +304,7 @@ def process_trc_request(self, req, meta):
assert isinstance(req, TRCRequest)
key = req.isd_as()[0], req.p.version
logging.info("TRC request received for %sv%s from %s", *key, meta)
REQS_TOTAL.labels(**self._labels, type="trc").inc()
local = meta.ia == self.addr.isd_as
if not self._check_trc(key):
if not local:
Expand Down Expand Up @@ -362,6 +376,7 @@ def process_drkey_request(self, req, meta):
"""
assert isinstance(req, DRKeyRequest)
logging.info("DRKeyRequest received from %s: %s", meta, req.short_desc())
REQS_TOTAL.labels(**self._labels, type="drkey").inc()
try:
cert = self._verify_drkey_request(req, meta)
except SCIONVerificationError as e:
Expand Down
5 changes: 3 additions & 2 deletions python/dns_server/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,13 +63,14 @@ class SCIONDnsServer(SCIONElement):
SRV_TYPES = (BEACON_SERVICE, CERTIFICATE_SERVICE,
DNS_SERVICE, PATH_SERVICE, SIBRA_SERVICE)

def __init__(self, server_id, conf_dir, setup=False): # pragma: no cover
def __init__(self, server_id, conf_dir, setup=False, prom_export=None): # pragma: no cover
"""
:param str server_id: server identifier.
:param str conf_dir: configuration directory.
:param str prom_export: prometheus export address.
:param bool setup: should setup() be called?
"""
super().__init__(server_id, conf_dir)
super().__init__(server_id, conf_dir, prom_export=prom_export)
self.domain = DNSLabel(self.topology.dns_domain)
self.lock = threading.Lock()
self.services = {}
Expand Down
7 changes: 4 additions & 3 deletions python/lib/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ def main_default(type_, local_type=None, trace_=False, **kwargs):
"""
handle_signals()
parser = argparse.ArgumentParser()
parser.add_argument('--prom', type=str, help='Address to export prometheus metrics on')
parser.add_argument('server_id', help='Server identifier')
parser.add_argument('conf_dir', nargs='?', default='.',
help='Configuration directory (Default: ./)')
Expand All @@ -67,14 +68,14 @@ def main_default(type_, local_type=None, trace_=False, **kwargs):
init_logging(os.path.join(args.log_dir, args.server_id))

if local_type is None:
inst = type_(args.server_id, args.conf_dir, **kwargs)
inst = type_(args.server_id, args.conf_dir, prom_export=args.prom, **kwargs)
else:
# Load the topology to check if this is a core AD or not
topo = Topology.from_file(os.path.join(args.conf_dir, TOPO_FILE))
if topo.is_core_as:
inst = type_(args.server_id, args.conf_dir, **kwargs)
inst = type_(args.server_id, args.conf_dir, prom_export=args.prom, **kwargs)
else:
inst = local_type(args.server_id, args.conf_dir, **kwargs)
inst = local_type(args.server_id, args.conf_dir, prom_export=args.prom, **kwargs)
if trace_:
trace(inst.id)
logging.info("Started %s", args.server_id)
Expand Down
2 changes: 1 addition & 1 deletion python/lib/packet/packet_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def __bool__(self):
return True

def __len__(self):
raise NotImplementedError
return self.p.total_size.word_count * 8

def copy(self):
return type(self)(self.p.copy())
Expand Down
4 changes: 4 additions & 0 deletions python/lib/packet/path_mgmt/seg_recs.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,10 @@ def iter_rev_infos(self, start=0):
for i in range(start, len(self.p.revInfos)):
yield self.rev_info(i)

def num_segs(self):
"""Returns the total number of path segments."""
return len(self.p.recs)

def __str__(self):
s = []
s.append("%s:" % self.NAME)
Expand Down
Loading

0 comments on commit f0c774b

Please sign in to comment.