router: Improve the accuracy and stability of the benchmark.
Summary:

* In the router, prevent the Go runtime from trying to use more cores than
  the specified number of processor threads. Reasons:
  * Be consistent with the default, where there are as many processor threads as
    there are cores.
  * In non-default cases, we want to be able to support multiple routers on the
    same host. When doing that, Go's assumption that all cores are fair game
    distorts the performance, as all router instances compete for the same
    cores and so incur far more preemption than they would in a normal deployment.
* In the test harness, configure the number of cores allowed to each router
  based on what's available on the host running the test (a worked example
  follows this list).
* In the end2end binary, terminate the pong drain task quickly instead of
  waiting up to 10s. This allows reporting the end-time of the test accurately,
  thereby removing a lot of variability in the results (before, the metrics
  capture window could overlap with the end of the active phase of the test).
* Because the test goes faster, increase the number of packets to 1.5M, so we
  are sure we have a reliable 10s metrics capture window in the middle.
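
For illustration of the core distribution (the 8-core host below is a hypothetical, not part of the commit): the benchmark topology has two center routers, two child routers, and one far router. On an 8-core host, the new logic in acceptance/router_benchmark/test.py first reserves one core for each child router and the far router (8 - 3 = 5), gives each center router int(5 / 2) = 2 cores, and hands the single leftover core to the far router, for a total of 2 + 2 + 1 + 1 + 2 = 8.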
jiceatscion committed Nov 9, 2023
1 parent dbcf003 commit 25b6f74
Showing 3 changed files with 59 additions and 9 deletions.
46 changes: 39 additions & 7 deletions acceptance/router_benchmark/test.py
@@ -19,8 +19,9 @@
from http.client import HTTPConnection
from urllib.parse import urlencode
from plumbum import cli
from plumbum.cmd import cat, grep, wc

from acceptance.common import base, docker
from acceptance.common import base, docker, scion

logger = logging.getLogger(__name__)

@@ -54,6 +55,38 @@ class Test(base.TestTopogen):
help="Do extra checks for CI",
)

def setup_prepare(self):
super().setup_prepare()

# The expected topology for this test is well-known: see router_bm.topo
# This test is configured to match.

        # Distribute available cores among routers (keep one free for the nitty-gritty).
availCores = int((cat['/proc/cpuinfo'] | grep['processor\\s:'] | wc['-l'])())
childRouterCores = 1
farRouterCores = 1
centerRouterCores = 1

if availCores >= 7:
availCores -= (2 * childRouterCores + farRouterCores)
centerRouterCores = int(availCores / 2)
availCores -= (2 * centerRouterCores)
if availCores > 0:
# There is one left. Give it to the far router
farRouterCores += 1

coreCountUpdates = {
self.artifacts / 'gen' / 'ASff00_0_110' / 'br1-ff00_0_110-1.toml': centerRouterCores,
self.artifacts / 'gen' / 'ASff00_0_110' / 'br1-ff00_0_110-2.toml': centerRouterCores,
self.artifacts / 'gen' / 'ASff00_0_111' / 'br1-ff00_0_111-1.toml': childRouterCores,
self.artifacts / 'gen' / 'ASff00_0_112' / 'br1-ff00_0_112-1.toml': childRouterCores,
self.artifacts / 'gen' / 'ASff00_0_120' / 'br2-ff00_0_120-1.toml': farRouterCores,
}

# Edit all the configuration files of br services with the required core allowance.
for configFile, coreCnt in coreCountUpdates.items():
scion.update_toml({"router.num_processors": coreCnt}, [configFile])

def setup(self):
super().setup()
self.monitoring_dc = docker.Compose(compose_file=self.artifacts / "gen/monitoring-dc.yml")
Expand All @@ -80,7 +113,7 @@ def _run(self):
"-outDir", self.artifacts,
"-name", "router_benchmark",
"-game", "packetflood",
"-attempts", 1000000,
"-attempts", 1500000,
"-parallelism", 100,
"-subset", "noncore#core#remoteISD"
].run_tee()
@@ -150,7 +183,7 @@ def _run(self):
"-outDir", self.artifacts,
"-name", "router_benchmark",
"-game", "packetflood",
"-attempts", 1000000,
"-attempts", 1500000,
"-parallelism", 100,
"-subset", "noncore#noncore#remoteAS"
].run_tee()
@@ -201,7 +234,7 @@ def _run(self):
# modeling later.
logger.info('==> Collecting number of cores...')
promQuery = urlencode({
'query': 'avg by (job) (go_sched_maxprocs_threads{job="BR"})'
'query': 'go_sched_maxprocs_threads{job="BR"}'
})

conn = HTTPConnection("localhost:9090")
@@ -212,11 +245,10 @@

pld = json.loads(resp.read().decode('utf-8'))
results = pld['data']['result']
numCores = 0
for result in results:
instance = result['metric']['instance']
_, val = result['value']
numCores = int(float(val))
logger.info(f'Router Cores: {numCores}')
logger.info(f'Router Cores for {instance}: {int(val)}')

# Log and check the performance...
# If this is used as a CI test. Make sure that the performance is within the expected
14 changes: 14 additions & 0 deletions router/config/config.go
@@ -71,8 +71,22 @@ func (cfg *RouterConfig) Validate() error {
}

func (cfg *RouterConfig) InitDefaults() {
// NumProcessors is the number of goroutines used to handle the processing queue.
	// By default, there is one of them per core that Go is allowed to use, and other
	// goroutines displace the packet processors only sporadically.
if cfg.NumProcessors == 0 {
// Use everything (and then some :-) ).
cfg.NumProcessors = runtime.GOMAXPROCS(0)
} else {
		// Otherwise, prevent Go from using more cores than configured. The packet processors
		// may still use them all; other goroutines only borrow cores sporadically. The goal is
		// truly to avoid using all the cores, which is typically necessary when testing multiple
		// routers on one machine, to avoid measuring the effect of them competing for CPUs.
realMax := runtime.GOMAXPROCS(0)
if cfg.NumProcessors > realMax {
cfg.NumProcessors = realMax
}
runtime.GOMAXPROCS(cfg.NumProcessors)
}
if cfg.NumSlowPathProcessors == 0 {
cfg.NumSlowPathProcessors = 1
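For context on the InitDefaults change above: runtime.GOMAXPROCS doubles as a getter and a setter, which is what lets the router both read the current limit and cap it. A minimal standalone sketch (not part of the commit) showing that behavior:

```go
package main

import (
	"fmt"
	"runtime"
)

func main() {
	// An argument of 0 only queries the current setting; nothing changes.
	current := runtime.GOMAXPROCS(0)
	fmt.Println("current GOMAXPROCS:", current)

	// A positive argument sets the new limit and returns the previous one.
	// This is how the router caps itself to the configured processor count.
	previous := runtime.GOMAXPROCS(2)
	fmt.Println("was:", previous, "now:", runtime.GOMAXPROCS(0))
}
```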
8 changes: 6 additions & 2 deletions tools/end2end/main.go
@@ -347,6 +347,10 @@ func (c *client) run() int {

// We return a "number of failures". So 0 means everything is fine.
ping_result := integration.RepeatUntilFail("End2End", c.blindPing)

// Stop drainPongs, so we're not stuck here for up to 10s.
c.conn.Close()

pong_result := <-pong_out
if pong_result != 0 {
log.Info("Never got a single pong")
@@ -591,7 +595,7 @@ func (c *client) getRemote(ctx context.Context, n int) (snet.Path, error) {
return path, nil
}

func (c *client) pong(ctx context.Context, log_ok bool) error {
func (c *client) pong(ctx context.Context, logIfOk bool) error {
if err := c.conn.SetReadDeadline(getDeadline(ctx)); err != nil {
return serrors.WrapStr("setting read deadline", err)
}
@@ -619,7 +623,7 @@ func (c *client) pong(ctx context.Context, log_ok bool) error {
if pld.Client != expected.Client || pld.Server != expected.Server || pld.Message != pong {
return serrors.New("unexpected contents received", "data", pld, "expected", expected)
}
if log_ok {
if logIfOk {
log.Info("Received pong", "server", p.Source)
}
return nil
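Regarding the drainPongs change in tools/end2end/main.go above: closing the connection is what makes a reader goroutine blocked on the socket return immediately, instead of sitting in its read until the up-to-10s deadline expires. A minimal sketch of that pattern using a plain net.UDPConn (the real client uses snet connections, so the names here are illustrative only):

```go
package main

import (
	"fmt"
	"net"
	"time"
)

// drain mimics a pong-draining task: it blocks on reads until the
// connection is closed, then returns right away.
func drain(conn *net.UDPConn, done chan<- struct{}) {
	defer close(done)
	buf := make([]byte, 1500)
	for {
		if _, _, err := conn.ReadFromUDP(buf); err != nil {
			// Closing the connection makes the blocked read fail,
			// so we exit promptly instead of waiting for a deadline.
			return
		}
	}
}

func main() {
	conn, err := net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv4(127, 0, 0, 1)})
	if err != nil {
		panic(err)
	}
	done := make(chan struct{})
	go drain(conn, done)

	time.Sleep(100 * time.Millisecond) // pretend the active phase just ended
	conn.Close()                       // unblocks the drain goroutine immediately
	<-done
	fmt.Println("drain goroutine terminated promptly")
}
```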
