router: Improve the accuracy and stability of the benchmark.
Summary:

* In the router, prevent the Go runtime from trying to use more cores than
  the specified number of processor threads. Reasons:
  * Be consistent with the default, where there are as many processor threads as
    there are cores.
  * In non-default cases, we want to be able to support multiple routers on the
    same host. When doing that, Go's assumption that all cores are fair game
    distorts the performance, as all router instances compete for the same
    cores and so incur far more preemption than they would in a normal deployment.
* In the test harness, configure the number of cores allowed to each router
  based on what's available on the host running the test (a worked example
  follows this list).
* In the end2end binary, terminate the pong drain task quickly instead of
  waiting up to 10s. This allows reporting the end-time of the test accurately,
  thereby removing a lot of variability in the results (before, the metrics
  capture window could overlap with the end of the active phase of the test).
* Because the test goes faster, increase the number of packets to 1.5M, so we
  are sure we have a reliable 10s metrics capture window in the middle.
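
For illustration of the core distribution (the 8-core host below is a hypothetical, not part of the commit): the benchmark topology has two center routers, two child routers, and one far router. On an 8-core host, the new logic in acceptance/router_benchmark/test.py first reserves one core for each child router and the far router (8 - 3 = 5), gives each center router int(5 / 2) = 2 cores, and hands the single leftover core to the far router, for a total of 2 + 2 + 1 + 1 + 2 = 8.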
jiceatscion committed Nov 9, 2023
1 parent dbcf003 commit 25b6f74
Showing 3 changed files with 59 additions and 9 deletions.
46 changes: 39 additions & 7 deletions acceptance/router_benchmark/test.py
@@ -19,8 +19,9 @@
from http.client import HTTPConnection
from urllib.parse import urlencode
from plumbum import cli
from plumbum.cmd import cat, grep, wc

from acceptance.common import base, docker
from acceptance.common import base, docker, scion

logger = logging.getLogger(__name__)

@@ -54,6 +55,38 @@ class Test(base.TestTopogen):
help="Do extra checks for CI",
)

def setup_prepare(self):
super().setup_prepare()

# The expected topology for this test is well-known: see router_bm.topo
# This test is configured to match.

        # Distribute available cores among routers (keep one free for the nitty-gritty).
availCores = int((cat['/proc/cpuinfo'] | grep['processor\\s:'] | wc['-l'])())
childRouterCores = 1
farRouterCores = 1
centerRouterCores = 1

if availCores >= 7:
availCores -= (2 * childRouterCores + farRouterCores)
centerRouterCores = int(availCores / 2)
availCores -= (2 * centerRouterCores)
if availCores > 0:
# There is one left. Give it to the far router
farRouterCores += 1

coreCountUpdates = {
self.artifacts / 'gen' / 'ASff00_0_110' / 'br1-ff00_0_110-1.toml': centerRouterCores,
self.artifacts / 'gen' / 'ASff00_0_110' / 'br1-ff00_0_110-2.toml': centerRouterCores,
self.artifacts / 'gen' / 'ASff00_0_111' / 'br1-ff00_0_111-1.toml': childRouterCores,
self.artifacts / 'gen' / 'ASff00_0_112' / 'br1-ff00_0_112-1.toml': childRouterCores,
self.artifacts / 'gen' / 'ASff00_0_120' / 'br2-ff00_0_120-1.toml': farRouterCores,
}

# Edit all the configuration files of br services with the required core allowance.
for configFile, coreCnt in coreCountUpdates.items():
scion.update_toml({"router.num_processors": coreCnt}, [configFile])

def setup(self):
super().setup()
self.monitoring_dc = docker.Compose(compose_file=self.artifacts / "gen/monitoring-dc.yml")
Expand All @@ -80,7 +113,7 @@ def _run(self):
"-outDir", self.artifacts,
"-name", "router_benchmark",
"-game", "packetflood",
"-attempts", 1000000,
"-attempts", 1500000,
"-parallelism", 100,
"-subset", "noncore#core#remoteISD"
].run_tee()
@@ -150,7 +183,7 @@ def _run(self):
"-outDir", self.artifacts,
"-name", "router_benchmark",
"-game", "packetflood",
"-attempts", 1000000,
"-attempts", 1500000,
"-parallelism", 100,
"-subset", "noncore#noncore#remoteAS"
].run_tee()
@@ -201,7 +234,7 @@ def _run(self):
# modeling later.
logger.info('==> Collecting number of cores...')
promQuery = urlencode({
'query': 'avg by (job) (go_sched_maxprocs_threads{job="BR"})'
'query': 'go_sched_maxprocs_threads{job="BR"}'
})

conn = HTTPConnection("localhost:9090")
@@ -212,11 +245,10 @@

pld = json.loads(resp.read().decode('utf-8'))
results = pld['data']['result']
numCores = 0
for result in results:
instance = result['metric']['instance']
_, val = result['value']
numCores = int(float(val))
logger.info(f'Router Cores: {numCores}')
logger.info(f'Router Cores for {instance}: {int(val)}')

# Log and check the performance...
# If this is used as a CI test. Make sure that the performance is within the expected
14 changes: 14 additions & 0 deletions router/config/config.go
@@ -71,8 +71,22 @@ func (cfg *RouterConfig) Validate() error {
}

func (cfg *RouterConfig) InitDefaults() {
// NumProcessors is the number of goroutines used to handle the processing queue.
	// By default, there is one of them per core that Go is allowed to use, and other
	// goroutines displace the packet processors only sporadically.
if cfg.NumProcessors == 0 {
// Use everything (and then some :-) ).
cfg.NumProcessors = runtime.GOMAXPROCS(0)
} else {
		// Otherwise, prevent Go from using more cores than configured. The packet processors
		// may still use them all; other goroutines only borrow cores sporadically. The goal is
		// truly to avoid using all the cores, which is typically necessary when testing multiple
		// routers on one machine, to avoid measuring the effect of them competing for CPUs.
realMax := runtime.GOMAXPROCS(0)
if cfg.NumProcessors > realMax {
cfg.NumProcessors = realMax
}
runtime.GOMAXPROCS(cfg.NumProcessors)
}
if cfg.NumSlowPathProcessors == 0 {
cfg.NumSlowPathProcessors = 1
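For context on the InitDefaults change above: runtime.GOMAXPROCS doubles as a getter and a setter, which is what lets the router both read the current limit and cap it. A minimal standalone sketch (not part of the commit) showing that behavior:

```go
package main

import (
	"fmt"
	"runtime"
)

func main() {
	// An argument of 0 only queries the current setting; nothing changes.
	current := runtime.GOMAXPROCS(0)
	fmt.Println("current GOMAXPROCS:", current)

	// A positive argument sets the new limit and returns the previous one.
	// This is how the router caps itself to the configured processor count.
	previous := runtime.GOMAXPROCS(2)
	fmt.Println("was:", previous, "now:", runtime.GOMAXPROCS(0))
}
```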
8 changes: 6 additions & 2 deletions tools/end2end/main.go
@@ -347,6 +347,10 @@ func (c *client) run() int {

// We return a "number of failures". So 0 means everything is fine.
ping_result := integration.RepeatUntilFail("End2End", c.blindPing)

// Stop drainPongs, so we're not stuck here for up to 10s.
c.conn.Close()

pong_result := <-pong_out
if pong_result != 0 {
log.Info("Never got a single pong")
@@ -591,7 +595,7 @@ func (c *client) getRemote(ctx context.Context, n int) (snet.Path, error) {
return path, nil
}

func (c *client) pong(ctx context.Context, log_ok bool) error {
func (c *client) pong(ctx context.Context, logIfOk bool) error {
if err := c.conn.SetReadDeadline(getDeadline(ctx)); err != nil {
return serrors.WrapStr("setting read deadline", err)
}
@@ -619,7 +623,7 @@ func (c *client) pong(ctx context.Context, log_ok bool) error {
if pld.Client != expected.Client || pld.Server != expected.Server || pld.Message != pong {
return serrors.New("unexpected contents received", "data", pld, "expected", expected)
}
if log_ok {
if logIfOk {
log.Info("Received pong", "server", p.Source)
}
return nil
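Regarding the drainPongs change in tools/end2end/main.go above: closing the connection is what makes a reader goroutine blocked on the socket return immediately, instead of sitting in its read until the up-to-10s deadline expires. A minimal sketch of that pattern using a plain net.UDPConn (the real client uses snet connections, so the names here are illustrative only):

```go
package main

import (
	"fmt"
	"net"
	"time"
)

// drain mimics a pong-draining task: it blocks on reads until the
// connection is closed, then returns right away.
func drain(conn *net.UDPConn, done chan<- struct{}) {
	defer close(done)
	buf := make([]byte, 1500)
	for {
		if _, _, err := conn.ReadFromUDP(buf); err != nil {
			// Closing the connection makes the blocked read fail,
			// so we exit promptly instead of waiting for a deadline.
			return
		}
	}
}

func main() {
	conn, err := net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv4(127, 0, 0, 1)})
	if err != nil {
		panic(err)
	}
	done := make(chan struct{})
	go drain(conn, done)

	time.Sleep(100 * time.Millisecond) // pretend the active phase just ended
	conn.Close()                       // unblocks the drain goroutine immediately
	<-done
	fmt.Println("drain goroutine terminated promptly")
}
```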
