diff --git a/.clang-format b/.clang-format
index 8bef8b5..ba2855d 100644
--- a/.clang-format
+++ b/.clang-format
@@ -96,10 +96,6 @@ PenaltyBreakString: 1000
 PenaltyExcessCharacter: 1000
 PenaltyReturnTypeOnItsOwnLine: 10000
 PointerAlignment: Right
-RawStringFormats:
-  - Delimiter: pb
-    Language: TextProto
-    BasedOnStyle: google
 ReflowComments: true
 SortIncludes: true
 SortUsingDeclarations: true
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 5ab7ce7..c266979 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -5,6 +5,8 @@ stages:
 variables:
   DOCKER_HOST: tcp://docker:2375
+  # This will instruct Docker not to start over TLS.
+  DOCKER_TLS_CERTDIR: ""
   DOCKER_DRIVER: overlay2
   IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG
   LATEST_TAG: $CI_REGISTRY_IMAGE:latest
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e1bb1a0..fb9af16 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,7 @@
 # Version 3.8 required from CheckCXXFeature.cmake
 # Version 3.10 required from Boost 1.66.0 for imported target
-cmake_minimum_required(VERSION 3.10.0)
+# Version 3.13 required for target_link_options
+cmake_minimum_required(VERSION 3.13.0)
 
 project(executor VERSION 1.0.0 LANGUAGES C CXX)
 
@@ -84,6 +85,9 @@ endif(WITH_TCMALLOC)
 find_package(nlohmann_json)
 set_package_properties(nlohmann_json PROPERTIES TYPE OPTIONAL PURPOSE "For OpTracing logging")
 
+set(THREADS_PREFER_PTHREAD_FLAG ON)
+find_package(Threads)
+
 # Bundled third party library
 add_subdirectory(thirdparty)
 #---------------------------------------------------------------------------------------
@@ -144,6 +148,10 @@ if(WITH_TIMEOUT_WARNING)
   set(SALUS_ENABLE_TIMEOUT_WARNING 1)
 endif(WITH_TIMEOUT_WARNING)
 
+if(USE_TENSORFLOW)
+  set(SALUS_ENABLE_TENSORFLOW 1)
+endif(USE_TENSORFLOW)
+
 configure_file(src/config.h.in ${CMAKE_CURRENT_BINARY_DIR}/config.h)
 include_directories(${CMAKE_CURRENT_BINARY_DIR})
diff --git a/bc b/bc
index 710dff7..6bdef04 100755
--- a/bc
+++ b/bc
@@ -1,2 +1,2 @@
 #! /bin/bash
-python -m benchmarks.driver "$@"
+vex tfbuild python -m benchmarks.driver "$@"
diff --git a/benchmarks/driver/__main__.py b/benchmarks/driver/__main__.py
index f2bbc16..a6ac7cb 100644
--- a/benchmarks/driver/__main__.py
+++ b/benchmarks/driver/__main__.py
@@ -156,7 +156,7 @@ def parse_expname(args):
 
 def main():
-    # type: (Sequence[str]) -> None
+    # type: () -> None
 
     # find first argument not starting with dash
     exp, argv = parse_expname(sys.argv)
diff --git a/benchmarks/driver/runner.py b/benchmarks/driver/runner.py
index ea8e5e1..88fa2a3 100644
--- a/benchmarks/driver/runner.py
+++ b/benchmarks/driver/runner.py
@@ -30,7 +30,7 @@
 from .server import SalusServer
 from .tfserver import TFDistServer
-from .utils import Popen, execute, snake_to_pascal, str2bool
+from .utils import Popen, execute, snake_to_pascal, str2bool, remove_suffix
 from .utils.compatiblity import pathlib, subprocess as sp
 
 Path = pathlib.Path
@@ -40,6 +40,11 @@
 flags.DEFINE_string('tfbench_base', '../tf_benchmarks', 'Base dir of TFBenchmark based workloads')
 flags.DEFINE_string('unit_base', 'tests', 'Base dir of unittest based workloads')
 flags.DEFINE_string('fathom_base', '../fathom', 'Base dir of Fathom based workloads')
+flags.DEFINE_string('tfweb_base', '../tfweb', 'Base dir of TFWeb based workloads')
+flags.DEFINE_string('tfweb_saved_model_dir', '~/../symbiotic/peifeng/tf_cnn_benchmarks_models/saved_models',
+                    'SavedModel dir of TFWeb based workloads')
+flags.DEFINE_string('tfweb_request_body_dir', '~/../symbiotic/peifeng/tf_cnn_benchmarks_models/reqeusts',
+                    'Predefined request body dir for TFWeb based workloads')
 flags.DEFINE_boolean('no_capture', False, 'Do not capture workload outputs')
@@ -113,28 +118,61 @@ def __call__(self, executor, output_file):
             '--num_batches={}'.format(self.wl.batch_num),
             '--batch_size={}'.format(self.wl.batch_size),
         ]
-        eval_interval = self.wl.env.pop('SALUS_TFBENCH_EVAL_INTERVAL', '0.1')
-        eval_rand_factor = self.wl.env.pop('SALUS_TFBENCH_EVAL_RAND_FACTOR', '5')
+        eval_interval = self.wl.env.pop('SALUS_TFBENCH_EVAL_INTERVAL', None)
+        eval_rand_factor = self.wl.env.pop('SALUS_TFBENCH_EVAL_RAND_FACTOR', None)
         eval_block = self.wl.env.pop('SALUS_TFBENCH_EVAL_BLOCK', 'true')
+
+        eval_model_dir = self.wl.env.pop('SALUS_TFBENCH_EVAL_MODEL_DIR', 'models')
+        eval_model_dir = str(Path(eval_model_dir).joinpath(remove_suffix(self.wl.name, 'eval')))
+
+        eval_saved_model_dir = self.wl.env.pop('SALUS_TFBENCH_EVAL_SAVED_MODEL_DIR', None)
+        if eval_saved_model_dir is not None:
+            eval_saved_model_dir = str(Path(eval_saved_model_dir).joinpath(remove_suffix(self.wl.name, 'eval')))
+
+        num_seconds = self.wl.env.pop('SALUS_ITER_SECONDS', None)
+        if num_seconds is not None:
+            cmd += [
+                '--num_seconds={}'.format(num_seconds)
+            ]
+
+        wait_for_signal = self.wl.env.pop('SALUS_WAIT_FOR_SIGNAL', None)
+        if wait_for_signal is not None:
+            cmd += [
+                '--wait_for_signal={}'.format(wait_for_signal)
+            ]
+
         if self.wl.name.endswith('eval'):
-            model_name = self.wl.name.rsplit('eval')[0]
+            model_name = remove_suffix(self.wl.name, 'eval')
             cmd += [
-                '--model_dir=models/{}'.format(model_name),
+                '--model_dir=' + eval_model_dir,
                 '--model={}'.format(model_name),
-                '--eval_interval_secs={}'.format(eval_interval),
-                '--eval_interval_random_factor={}'.format(eval_rand_factor),
                 '--eval_block={}'.format(eval_block),
                 '--eval'
             ]
+            if eval_interval is not None:
+                cmd += [
+                    '--eval_interval_secs={}'.format(eval_interval),
+                ]
+            if eval_rand_factor is not None:
+                cmd += [
+                    '--eval_interval_random_factor={}'.format(eval_rand_factor),
+                ]
+            if eval_saved_model_dir is not None:
+                cmd += [
+                    '--saved_model_dir=' + eval_saved_model_dir
+                ]
         else:
             cmd += [
                 '--model={}'.format(self.wl.name),
             ]
             if str2bool(self.wl.env.pop('SALUS_SAVE_MODEL', '')):
                 cmd += [
-                    '--model_dir=models/{}'.format(self.wl.name),
+                    '--model_dir=' + eval_model_dir,
                 ]
+
+        cmd += self.wl.extra_args
+        logger.info(f'Starting workload with cmd: {cmd}')
+
         if FLAGS.no_capture:
             return execute(cmd, cwd=str(cwd), env=self.env)
         else:
@@ -157,6 +195,7 @@ def __call__(self, executor, output_file):
         # type: (Executor, Path) -> Popen
         env = self.env.copy()
         env['EXEC_ITER_NUMBER'] = str(self.wl.batch_num)
+        env['SALUS_BATCH_SIZE'] = str(self.wl.batch_size)
 
         if executor == Executor.TFDist:
             env['SALUS_TFDIST_ENDPOINT'] = TFDistServer.current_server().endpoint
@@ -166,12 +205,16 @@ def __call__(self, executor, output_file):
             'stdbuf', '-o0', '-e0', '--',
             'python', '-m', pkg, method,
         ]
+        cmd += self.wl.extra_args
+
+        logger.info(f'Starting workload with cmd: {cmd}')
 
         if FLAGS.no_capture:
             return execute(cmd, cwd=str(cwd), env=self.env)
         else:
             output_file.parent.mkdir(exist_ok=True, parents=True)
             with output_file.open('w') as f:
-                return execute(cmd, cwd=str(cwd), env=env, stdout=f, stderr=sp.STDOUT)
+                # return execute(cmd, cwd=str(cwd), env=env, stdout=f, stderr=sp.STDOUT)
+                return execute(cmd, cwd=str(cwd), env=env, stdout=f, stderr=None)
 
     def _construct_test_name(self, executor):
         # type: (Executor) -> Tuple[str, str]
@@ -197,6 +240,12 @@ def _construct_test_name(self, executor):
             })
         }
 
+        variable_batch_size_models = {'vae', 'superres', 'seq2seq', 'mnistsf', 'mnistcv', 'mnistlg'}
+        if remove_suffix(self.wl.name, 'eval') not in variable_batch_size_models:
+            if self.wl.batch_size not in self.wl.wtl.available_batch_sizes():
+                raise ValueError(f"Batch size `{self.wl.batch_size}' is not supported for {self.wl.name},"
+                                 f" available ones: {self.wl.wtl.available_batch_sizes()}")
+
         if executor == Executor.Salus:
             prefix = 'test_rpc_'
         elif executor == Executor.TF:
@@ -209,7 +258,7 @@ def _construct_test_name(self, executor):
         if self.wl.name.endswith('eval'):
             prefix += 'eval_'
-            model_name = self.wl.name.rsplit('eval')[0]
+            model_name = remove_suffix(self.wl.name, 'eval')
 
         if model_name in supported_model:
             pkg, cls, names = supported_model[model_name]
@@ -217,11 +266,18 @@ def _construct_test_name(self, executor):
             # fallback to guessing
             pkg = f'test_tf.test_{model_name}'
             cls = f'Test{snake_to_pascal(model_name)}'
+
+            # get method name
             names = {
                 s: str(idx)
                 for idx, s in enumerate(self.wl.wtl.available_batch_sizes())
             }
-        method = f'{cls}.{prefix}{names[self.wl.batch_size]}'
+
+        postfix = names.get(self.wl.batch_size, '0')
+        if model_name == 'seq2seq' and postfix == '0':
+            postfix = '2_large'
+
+        method = f'{cls}.{prefix}{postfix}'
 
         return pkg, method
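# A minimal sketch (not part of the patch) of how the fallback above maps a
# batch size to a unittest method name: the index of the batch size among the
# available ones becomes the postfix, '0' is the default, and seq2seq's
# default is further mapped to '2_large'. Model and class names here are
# hypothetical.
available = [25, 50, 100]
names = {s: str(idx) for idx, s in enumerate(available)}  # {25: '0', 50: '1', 100: '2'}
prefix, cls = 'test_rpc_', 'TestVgg11'
postfix = names.get(50, '0')
method = f'{cls}.{prefix}{postfix}'
assert method == 'TestVgg11.test_rpc_1'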
@@ -240,7 +296,7 @@ def __call__(self, executor, output_file):
         cmd = [
             'stdbuf', '-o0', '-e0', '--',
             'python', '-m', 'fathom.cli',
-            '--workload', self.wl.name.rsplit('eval')[0],
+            '--workload', remove_suffix(self.wl.name, 'eval'),
             '--action', 'test' if self.wl.name.endswith('eval') else 'train',
             '--num_iters', str(self.wl.batch_num),
             '--batch_size', str(self.wl.batch_size),
@@ -262,9 +318,153 @@ def __call__(self, executor, output_file):
         else:
             raise ValueError(f'Unknown executor: {executor}')
 
+        cmd += self.wl.extra_args
+        logger.info(f'Starting workload with cmd: {cmd}')
+
         if FLAGS.no_capture:
             return execute(cmd, cwd=str(cwd), env=self.env)
         else:
             output_file.parent.mkdir(exist_ok=True, parents=True)
             with output_file.open('w') as f:
                 return execute(cmd, cwd=str(cwd), env=self.env, stdout=f, stderr=sp.STDOUT)
+
+
+class TFWebDirectRunner(Runner):
+    """Using TFWeb's load infrastructure to directly run"""
+
+    def __init__(self, wl, base_dir=None):
+        super().__init__(wl)
+        self.base_dir = base_dir
+        if self.base_dir is None:
+            self.base_dir = FLAGS.tfweb_base
+
+    def __call__(self, executor, output_file):
+        model_name = remove_suffix(self.wl.name, 'eval')
+        cwd = self.base_dir
+        cmd = [
+            'stdbuf', '-o0', '-e0', '--',
+            'examples/direct/client',
+            '--model="{}"'.format(str(Path(FLAGS.tfweb_saved_model_dir).joinpath(model_name))),
+            '--batch_size={}'.format(self.wl.batch_size),
+            '--batch_num={}'.format(self.wl.batch_num),
+        ]
+
+        if executor == Executor.Salus:
+            cmd += [
+                '--sess_target', SalusServer.current_server().endpoint,
+            ]
+        elif executor == Executor.TF:
+            cmd += [
+                '--sess_target', '""',
+            ]
+        elif executor == Executor.TFDist:
+            cmd += [
+                '--sess_target', TFDistServer.current_server().endpoint,
+            ]
+        else:
+            raise ValueError(f'Unknown executor: {executor}')
+
+        cmd += self.wl.extra_args
+        logger.info(f'Starting workload with cmd: {cmd}')
+
+        if FLAGS.no_capture:
+            return execute(cmd, cwd=str(cwd), env=self.env)
+        else:
+            output_file.parent.mkdir(exist_ok=True, parents=True)
+            with output_file.open('w') as f:
+                return execute(cmd, cwd=str(cwd), env=self.env, stdout=f, stderr=sp.STDOUT)
+
+
+class TFWebRunner(Runner):
+    """
+    Run a TFWeb based inference job
+
+    We start several servers and a balancer on the same node.
+    The server commandline: tfweb --model=path/to/saved_model/network --sess_target=...
+    The client commandline: gobetween from-file xxx.toml
+    """
+
+    def __init__(self, wl, base_dir=None):
+        super().__init__(wl)
+        self.base_dir = base_dir
+        if self.base_dir is None:
+            self.base_dir = FLAGS.tfweb_base
+
+    def __call__(self, executor, output_file):
+        # type: (Executor, Path) -> Popen
+        model_name = remove_suffix(self.wl.name, 'web')
+        cwd = self.base_dir
+        cmd = [
+            'stdbuf', '-o0', '-e0', '--',
+            'examples/cluster/start_cluster',
+            '--model="{}"'.format(str(Path(FLAGS.tfweb_saved_model_dir).joinpath(model_name))),
+        ]
+
+        if executor == Executor.Salus:
+            cmd += [
+                '--sess_target', SalusServer.current_server().endpoint,
+            ]
+        elif executor == Executor.TF:
+            cmd += [
+                '--sess_target', '""',
+            ]
+        elif executor == Executor.TFDist:
+            cmd += [
+                '--sess_target', TFDistServer.current_server().endpoint,
+            ]
+        else:
+            raise ValueError(f'Unknown executor: {executor}')
+
+        num_replicas = self.wl.env.pop('SALUS_TFWEB_REPLICAS', '1')
+        cmd += [
+            '--num_replicas', num_replicas
+        ]
+        cmd += self.wl.extra_args
+        logger.info(f'Starting workload with cmd: {cmd}')
+
+        if FLAGS.no_capture:
+            return execute(cmd, cwd=str(cwd), env=self.env)
+        else:
+            output_file.parent.mkdir(exist_ok=True, parents=True)
+            with output_file.open('w') as f:
+                return execute(cmd, cwd=str(cwd), env=self.env, stdout=f, stderr=sp.STDOUT)
+
+
+class TFWebClientRunner(Runner):
+    """
+    Run a tfweb client attacker.
+
+    Command: examples/tfweb-client TARGET REQ_BODY PLANTXT
+    """
+
+    def __init__(self, wl, base_dir=None):
+        super().__init__(wl)
+        self.base_dir = base_dir
+        if self.base_dir is None:
+            self.base_dir = FLAGS.tfweb_base
+
+    def __call__(self, executor, output_file):
+        # type: (Executor, Path) -> Popen
+        model_name = remove_suffix(self.wl.name, 'client')
+
+        cwd = self.base_dir
+        cmd = [
+            'stdbuf', '-o0', '-e0', '--',
+            'examples/tfweb-client',
+            '-output', str(output_file),
+            self.wl.target,
+            # request body
+            str(Path(FLAGS.tfweb_request_body_dir).joinpath(model_name).with_suffix('.txt')),
+            # always write plan to stdin
+            '-',
+        ]
+        cmd += self.wl.extra_args
+        logger.info(f'Starting workload with cmd: {cmd}')
+
+        proc = execute(cmd, cwd=str(cwd), env=self.env, stdin=sp.PIPE)
+        proc.stdin.write(self._plan_to_bytes())
+        proc.stdin.close()
+        return proc
+
+    def _plan_to_bytes(self):
+        return ' '.join(self.wl.plan).encode('utf-8')
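# A hypothetical pairing of the runners above, using the *web / *client
# workload templates defined in benchmarks/driver/workload.py later in this
# patch. Note that TFWebClientRunner reads wl.target and wl.plan, which this
# patch never defines on Workload; those attributes, the endpoint, and the
# plan token format below are all assumptions.
from benchmarks.driver.runner import Executor
from benchmarks.driver.workload import WTL

server = WTL.create('vgg11web', 1, 1, executor=Executor.Salus)
server.env['SALUS_TFWEB_REPLICAS'] = '2'   # two tfweb replicas behind the balancer

client = WTL.create('vgg11client', 1, 1, executor=Executor.Salus)
client.target = 'localhost:8080'           # balancer endpoint (assumed)
client.plan = ['60s', '10qps']             # piped to tfweb-client's stdin as one line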
diff --git a/benchmarks/driver/server/__init__.py b/benchmarks/driver/server/__init__.py
index 4e6bd8d..2617f42 100644
--- a/benchmarks/driver/server/__init__.py
+++ b/benchmarks/driver/server/__init__.py
@@ -77,9 +77,11 @@ def _find_executable(self):
         """Find the absolute path to server executable, according to 'config.build_type'"""
         candidates = [
             self.config.build_dir / self.config.build_type / 'src' / 'executor',
+            self.config.build_dir / self.config.build_type / 'src' / 'salus-server',
             self.config.build_dir / self.config.build_type / 'bin' / 'executor',
             self.config.build_dir / self.config.build_type / 'bin' / 'salus-server',
             self.config.build_dir / self.config.build_type.lower() / 'src' / 'executor',
+            self.config.build_dir / self.config.build_type.lower() / 'src' / 'salus-server',
             self.config.build_dir / self.config.build_type.lower() / 'bin' / 'executor',
             self.config.build_dir / self.config.build_type.lower() / 'bin' / 'salus-server',
         ]
diff --git a/benchmarks/driver/server/config/__init__.py b/benchmarks/driver/server/config/__init__.py
index 2055115..dad744b 100644
--- a/benchmarks/driver/server/config/__init__.py
+++ b/benchmarks/driver/server/config/__init__.py
@@ -21,7 +21,7 @@
 from builtins import super
 
 from absl import flags
-from copy import copy
+from copy import deepcopy
 
 from ...utils import maybe_path
 from ...utils.compatiblity import pathlib
@@ -77,7 +77,7 @@ def __setattr__(self, key, value):
     def copy(self, **kwargs):
         # type: (...) -> SalusConfig
         """Return a new copy of the tuple"""
-        return copy(self).update(**kwargs)
+        return deepcopy(self).update(**kwargs)
 
     def update(self, d=None, **kwargs):
         # type: (...) -> SalusConfig
diff --git a/benchmarks/driver/tfserver.py b/benchmarks/driver/tfserver.py
index 8c58f08..7e505b3 100644
--- a/benchmarks/driver/tfserver.py
+++ b/benchmarks/driver/tfserver.py
@@ -29,7 +29,7 @@
 from datetime import datetime
 from absl import flags
 from contextlib import contextmanager
-from typing import List, Deque, Dict, Union
+from typing import List, Deque, Dict, Union, Optional
 
 from benchmarks.driver.utils import prompt, remove_prefix
 from benchmarks.driver.utils.prompt import pause
@@ -65,7 +65,7 @@ def __init__(self, env=None, outputdir=None):
 
         self._build_cmd()
 
-        self.proc = None  # type: Popen
+        self.proc = None  # type: Optional[Popen]
 
     def _build_cmd(self):
         # type: () -> List[str]
diff --git a/benchmarks/driver/utils/prompt.py b/benchmarks/driver/utils/prompt.py
index 155bd3d..0d706db 100644
--- a/benchmarks/driver/utils/prompt.py
+++ b/benchmarks/driver/utils/prompt.py
@@ -93,4 +93,7 @@ def pause(prompt='Press enter to continue...'):
     """Pause the execution and wait the user to press enter"""
     # we don't want to guard against KeyboardInterrupt
-    input(prompt)
+    try:
+        input(prompt)
+    except EOFError:
+        pass
diff --git a/benchmarks/driver/utils/utils.py b/benchmarks/driver/utils/utils.py
index dd50486..98165cb 100644
--- a/benchmarks/driver/utils/utils.py
+++ b/benchmarks/driver/utils/utils.py
@@ -86,6 +86,14 @@ def remove_prefix(text, prefix):
     return text  # or whatever
 
 
+def remove_suffix(text, suffix):
+    # type: (str, str) -> str
+    """Remove suffix from text if any"""
+    if text.endswith(suffix):
+        return text[:len(text)-len(suffix)]
+    return text
+
+
 def try_with_default(func, default=None, ignore=Exception):
     """ A wrapper that ignores exception from a function. """
@@ -301,6 +309,7 @@ def str2bool(v):
     'eprint',
     'remove_none',
     'remove_prefix',
+    'remove_suffix',
     'try_with_default',
     'execute',
     'kill_tree',
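# Why remove_suffix replaces name.rsplit('eval')[0] throughout this patch:
# rsplit with a separator splits on every occurrence, so a model whose name
# contains 'eval' internally would be mangled. A quick illustration
# ('retrievaleval' is a hypothetical name, not a real workload):
from benchmarks.driver.utils import remove_suffix

assert remove_suffix('alexneteval', 'eval') == 'alexnet'
assert remove_suffix('retrievaleval', 'eval') == 'retrieval'
assert 'retrievaleval'.rsplit('eval')[0] == 'retri'   # the old, buggy result
assert remove_suffix('vgg11', 'eval') == 'vgg11'      # no suffix: unchanged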
diff --git a/benchmarks/driver/workload.py b/benchmarks/driver/workload.py
index 0250bc3..0485672 100644
--- a/benchmarks/driver/workload.py
+++ b/benchmarks/driver/workload.py
@@ -25,10 +25,10 @@
 import csv
 import logging
 from collections import defaultdict
-from typing import Dict, Iterable, Type, Union
+from typing import Dict, Iterable, Type, Union, Optional
 
 from .runner import Runner, RunConfig, Popen, Executor
-from .runner import TFBenchmarkRunner, UnittestRunner, FathomRunner
+from .runner import TFBenchmarkRunner, UnittestRunner, FathomRunner, TFWebRunner, TFWebClientRunner
 from .utils import try_with_default, kill_tree, unique
 from .utils.compatiblity import pathlib
@@ -151,9 +151,6 @@ def _create(self, batch_size, batch_num, executor=Executor.Salus):
 
     def _create_from_rcfg(self, rcfg, executor=Executor.Salus):
         # type: (RunConfig, Executor) -> Workload
-        if rcfg.batch_size not in self.available_batch_sizes():
-            raise ValueError(f"Batch size `{rcfg.batch_size}' is not supported for {self.name},"
-                             f" available ones: {self.available_batch_sizes()}")
         return Workload(self, rcfg, executor, self.geometry(rcfg, executor).copy())
 
     @classmethod
@@ -360,8 +357,9 @@ def __init__(self, wtl, rcfg, executor, geo):
         self.rcfg = rcfg
         self.executor = executor
         self._geo = geo
-        self.proc = None  # type: Popen
-        self.output_file = None  # type: Path
+        self.proc = None  # type: Optional[Popen]
+        self.output_file = None  # type: Optional[Path]
+        self.extra_args = []
 
     @property
     def name(self):
@@ -450,6 +448,30 @@ def run(self, output_file):
 WorkloadTemplate.define('superreseval', [1, 5, 10], UnittestRunner)
 WorkloadTemplate.define('seq2seqeval', ['small', 'medium', 'large'], UnittestRunner)
 
+WorkloadTemplate.define('vgg11web', [1], TFWebRunner)
+WorkloadTemplate.define('vgg16web', [1], TFWebRunner)
+WorkloadTemplate.define('vgg19web', [1], TFWebRunner)
+WorkloadTemplate.define('resnet50web', [1], TFWebRunner)
+WorkloadTemplate.define('resnet101web', [1], TFWebRunner)
+WorkloadTemplate.define('resnet152web', [1], TFWebRunner)
+WorkloadTemplate.define('googlenetweb', [1], TFWebRunner)
+WorkloadTemplate.define('alexnetweb', [1], TFWebRunner)
+WorkloadTemplate.define('overfeatweb', [1], TFWebRunner)
+WorkloadTemplate.define('inception3web', [1], TFWebRunner)
+WorkloadTemplate.define('inception4web', [1], TFWebRunner)
+
+WorkloadTemplate.define('vgg11client', [1], TFWebClientRunner)
+WorkloadTemplate.define('vgg16client', [1], TFWebClientRunner)
+WorkloadTemplate.define('vgg19client', [1], TFWebClientRunner)
+WorkloadTemplate.define('resnet50client', [1], TFWebClientRunner)
+WorkloadTemplate.define('resnet101client', [1], TFWebClientRunner)
+WorkloadTemplate.define('resnet152client', [1], TFWebClientRunner)
+WorkloadTemplate.define('googlenetclient', [1], TFWebClientRunner)
+WorkloadTemplate.define('alexnetclient', [1], TFWebClientRunner)
+WorkloadTemplate.define('overfeatclient', [1], TFWebClientRunner)
+WorkloadTemplate.define('inception3client', [1], TFWebClientRunner)
+WorkloadTemplate.define('inception4client', [1], TFWebClientRunner)
+
 
 # noinspection PyUnusedLocal
 def _disable_init(self):
diff --git a/benchmarks/exps/__init__.py b/benchmarks/exps/__init__.py
index 217c5b2..b093421 100644
--- a/benchmarks/exps/__init__.py
+++ b/benchmarks/exps/__init__.py
@@ -19,27 +19,32 @@
 #
 from __future__ import absolute_import, print_function, division, unicode_literals
 
+import itertools
 import time
 import re
 import logging
+import string
+import random
 
 from absl import flags
-from typing import Union, Iterable, List, TypeVar, Callable
+from typing import Union, Iterable, List, TypeVar, Callable, Optional
 
 import benchmarks.driver.utils.prompt as prompt
 from benchmarks.driver.runner import Executor
 from benchmarks.driver.server.config import presets
 from benchmarks.driver.server import SalusServer, SalusConfig
 from benchmarks.driver.tfserver import TFDistServer
-from benchmarks.driver.utils import atomic_directory, try_with_default, UsageError, kill_tree
+from benchmarks.driver.utils import atomic_directory, try_with_default, UsageError, kill_tree, unique
 from benchmarks.driver.utils.compatiblity import pathlib
 from benchmarks.driver.workload import Workload, WTL, ResourceGeometry
 
 Path = pathlib.Path
 T = TypeVar('T')
+TBatchSize = Union[str, int]
 logger = logging.getLogger(__name__)
 FLAGS = flags.FLAGS
 
 flags.DEFINE_boolean('ignore_error', False, 'Ignore error on workload')
+flags.DEFINE_string('logconf', None, 'Override default logconf in preset')
 
 
 class Pause(int):
@@ -70,7 +75,7 @@ class RunFn(object):
     __slots__ = '_fn'
 
     def __init__(self, fn):
-        # type: (Callable[Iterable[Workload], None]) -> None
+        # type: (Callable[[Iterable[Workload], *str], None]) -> None
         self._fn = fn
 
     def run(self, workloads, **kwargs):
@@ -234,11 +239,15 @@ def parse_actions_from_cmd(argv):
 def maybe_forced_preset(default):
     # type: (Callable[[], SalusConfig]) -> SalusConfig
     """Maybe return forced preset"""
+    preset_ctor = default
     if FLAGS.force_preset:
-        logger.info(f'Using server config preset: {FLAGS.force_preset}')
-        return getattr(presets, FLAGS.force_preset)()
-    logger.info(f'Using server config preset: {default.__name__}')
-    return default()
+        preset_ctor = getattr(presets, FLAGS.force_preset)
+    logger.info(f'Using server config preset: {preset_ctor.__name__}')
+    scfg = preset_ctor()
+    if FLAGS.logconf is not None:
+        logger.info(f'Using server logconf: {FLAGS.logconf}')
+        scfg.logconf = FLAGS.logconf
+    return scfg
 
 
 def parse_output_float(outputfile, pattern, group=1):
@@ -290,3 +299,90 @@ def update_jct(workload, update_global=False):
         workload.geometry.jct = jct
         if update_global:
             WTL.from_name(workload.name).add_geometry(workload.rcfg, workload.executor, ResourceGeometry(jct=jct))
+
+
+def select_workloads(argv,  # type: Iterable[str]
+                     batch_size=None,  # type: Optional[Union[Iterable[TBatchSize], TBatchSize]]
+                     batch_num=None,  # type: Optional[Union[Iterable[int], int]]
+                     executor=None  # type: Optional[Union[Iterable[Executor], Executor]]
+                     ):
+    # type: (...) -> Iterable[Workload]
+    """Select workloads based on the commandline
+
+    Workloads can be separated by comma (',') or space (' ').
+    The workload name can include an underscore ('_') separated batch size.
+
+    If no batch size part is included, the batch sizes given in the batch_size argument are selected.
+
+    If argv is empty, all available workloads are selected.
+
+    The batch_size, batch_num and executor arguments expect a list of possible values; a single value is converted into a list.
+
+    Returns: list of created Workload instances
+
+    Example: alexnet_25,vgg11 inception3_75
+    """
+    if batch_size is not None:
+        if not isinstance(batch_size, list):
+            batch_size = [batch_size]
+
+    if batch_num is None:
+        batch_num = [1]
+    else:
+        if not isinstance(batch_num, list):
+            batch_num = [batch_num]
+
+    if executor is None:
+        executor = [Executor.Salus]
+    else:
+        if not isinstance(executor, list):
+            executor = [executor]
+
+    if not argv:
+        names = WTL.known_workloads.keys()
+    else:
+        names = unique((
+            name
+            for piece in argv
+            for name in piece.split(',')
+        ), stable=True)
+
+    def expandbs(name):
+        if '_' in name:
+            name, bs = name.split('_')
+            return [(name, int(bs))]
+        else:
+            avail = WTL.from_name(name).available_batch_sizes()
+            if batch_size is None:
+                bss = avail
+            else:
+                bss = [bs for bs in batch_size if bs in avail]
+            return zip([name] * len(bss), bss)
+
+    wls = [name_bs for name in names for name_bs in expandbs(name)]
+
+    wls = itertools.product(wls, batch_num, executor)
+
+    return [WTL.create(name, bs, bn, ex) for (name, bs), bn, ex in wls]
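# A usage sketch (not part of the patch) of the selection grammar documented
# above, assuming the standard workload templates are defined and that vgg11's
# available batch sizes include 25 and 50:
from benchmarks.driver.runner import Executor
from benchmarks.exps import select_workloads

wls = select_workloads(['alexnet_25,vgg11', 'inception3_75'],
                       batch_size=[25, 50], executor=Executor.Salus)
# -> alexnet @ 25 (explicit), vgg11 @ 25 and 50 (filtered against its
#    available sizes), inception3 @ 75 (explicit), each with batch_num=1.
# The returned Workload objects can be tweaked before running, e.g. via the
# extra_args attribute this patch adds to Workload:
for wl in wls:
    wl.extra_args += ['--min_mem']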
+
+
+def wait_on_pipe(pipe):
+    logger.info(f'Waiting workload to be ready on {pipe}')
+    with open(pipe, 'rb') as f:
+        f.read(1)
+
+
+def release_on_pipe(pipe):
+    logger.info(f'Signaling workload to continue on {pipe}')
+    with open(pipe, 'wb') as f:
+        f.write(b"a")
+    logger.info(f'Workload continued on {pipe}')
+
+
+def sync_on_pipe(pipe):
+    wait_on_pipe(pipe)
+    release_on_pipe(pipe)
+
+
+def random_id(size=6, chars=string.ascii_uppercase + string.digits):
+    """Generate a random ID"""
+    return ''.join(random.choice(chars) for _ in range(size))
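# The FIFO handshake above is one byte each way. A sketch (assumption, not
# part of this patch) of the workload-side counterpart, which lives in the
# benchmark scripts that honor the SALUS_WAIT_FOR_SIGNAL environment variable:
import os

def signal_ready_and_wait():
    pipe = os.environ.get('SALUS_WAIT_FOR_SIGNAL')
    if pipe is None:
        return
    with open(pipe, 'wb') as f:   # unblocks the driver's wait_on_pipe()
        f.write(b'r')
    with open(pipe, 'rb') as f:   # blocks until the driver's release_on_pipe()
        f.read(1)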
diff --git a/benchmarks/exps/bs_lat_tput.py b/benchmarks/exps/bs_lat_tput.py
new file mode 100644
index 0000000..e472b0e
--- /dev/null
+++ b/benchmarks/exps/bs_lat_tput.py
@@ -0,0 +1,80 @@
+# -*- coding: future_fstrings -*-
+#
+# Copyright 2019 Peifeng Yu
+#
+# This file is part of Salus
+# (see https://github.com/SymbioticLab/Salus).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""
+Measure the throughput and latency of each batch size.
+
+Collected data: model, batch_size, latency, throughput (in 2 min)
+"""
+from __future__ import absolute_import, print_function, division, unicode_literals
+
+from absl import flags
+from typing import Sequence
+import logging
+
+from benchmarks.driver.runner import TFBenchmarkRunner
+from benchmarks.driver.server.config import presets
+from benchmarks.driver.utils.compatiblity import pathlib
+from benchmarks.driver.workload import Executor, WTL
+from benchmarks.exps import run_tf, select_workloads, run_seq, maybe_forced_preset
+
+
+FLAGS = flags.FLAGS
+logger = logging.getLogger(__name__)
+
+
+def set_env(wl):
+    wl.env['SALUS_TFBENCH_EVAL_INTERVAL'] = '0'
+    wl.env['SALUS_TFBENCH_EVAL_RAND_FACTOR'] = '0'
+    wl.env['SALUS_TFBENCH_EVAL_BLOCK'] = 'true'
+
+    model_dir = pathlib.Path('~/../symbiotic/peifeng/tf_cnn_benchmarks_models/legacy_checkpoint_models')
+    model_dir = model_dir.expanduser().resolve()
+    wl.env['SALUS_TFBENCH_EVAL_MODEL_DIR'] = str(model_dir)
+
+
+def do_measure(scfg, name, batch_sizes):
+    batch_num = 100
+    # batch_sizes = [1, 2, 4, 8, 16, 32]
+    # batch_sizes = [1024, 1536, 2048, 4096]
+    for bs in batch_sizes:
+        wl = WTL.create(name, bs, batch_num, executor=Executor.Salus)
+        set_env(wl)
+        run_seq(scfg.copy(output_dir=FLAGS.save_dir / "salus"), wl)
+
+        wl = WTL.create(name, bs, batch_num, executor=Executor.TF)
+        set_env(wl)
+        run_seq(scfg.copy(output_dir=FLAGS.save_dir / "tf"), wl)
+
+
+def main(argv):
+    # type: (Sequence[str]) -> None
+    scfg = maybe_forced_preset(presets.MostEfficient)
+    scfg.logconf = 'disable'
+
+    name = "alexnet"
+    if len(argv) > 1:
+        name = argv[0]
+    batch_sizes = [int(v) for v in argv[1:]]
+
+    if not batch_sizes:
+        batch_sizes = [1, 2, 4, 8]
+
+    do_measure(scfg, name, batch_sizes)
+
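# How the collected numbers relate (a note, not part of the patch): the
# experiment logs per-iteration latency; throughput is derived rather than
# measured directly.
def derived_throughput(batch_size, iter_latency_secs):
    """Samples processed per second for one workload iteration."""
    return batch_size / iter_latency_secs

assert derived_throughput(32, 0.1) == 320.0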
diff --git a/benchmarks/exps/card304.py b/benchmarks/exps/card304.py
new file mode 100644
index 0000000..8c5b872
--- /dev/null
+++ b/benchmarks/exps/card304.py
@@ -0,0 +1,275 @@
+# -*- coding: future_fstrings -*-
+#
+# Copyright 2019 Peifeng Yu
+#
+# This file is part of Salus
+# (see https://github.com/SymbioticLab/Salus).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""
+Card 304: Experiment one inference job with one training job
+
+Record inference latency. Compare inference job latency running alone vs. running with a training job.
+
+The latency should be measured with increasing throughput (qps) for the inference job.
+
+Collected data: inference per iteration speed (latency), training throughput (derived from per iteration speed)
+"""
+from __future__ import absolute_import, print_function, division, unicode_literals
+
+import tempfile
+from typing import Sequence
+
+from absl import flags
+import logging
+import os
+
+from benchmarks.driver.server.config import presets
+from benchmarks.driver.workload import WTL, Executor
+from benchmarks.driver.utils.compatiblity import pathlib
+from benchmarks.exps import (
+    run_seq, maybe_forced_preset, RunFn, Pause, wait_on_pipe, release_on_pipe,
+    case_switch_main,
+    run_tfdist, run_tf
+)
+
+
+FLAGS = flags.FLAGS
+logger = logging.getLogger(__name__)
+
+
+def set_env(wl):
+    wl.env['SALUS_TFBENCH_EVAL_INTERVAL'] = '0'
+    wl.env['SALUS_TFBENCH_EVAL_RAND_FACTOR'] = '0'
+    wl.env['SALUS_TFBENCH_EVAL_BLOCK'] = 'true'
+
+    model_dir = pathlib.Path('~/../symbiotic/peifeng/tf_cnn_benchmarks_models/legacy_checkpoint_models')
+    model_dir = model_dir.expanduser().resolve()
+    wl.env['SALUS_TFBENCH_EVAL_MODEL_DIR'] = str(model_dir)
+
+
+def salus(argv):
+    # type: (Sequence[str]) -> None
+    scfg = maybe_forced_preset(presets.MostEfficient)
+
+    name = "alexneteval"
+    if len(argv) > 1:
+        name = argv[0]
+    batch_sizes = [int(v) for v in argv[1:]]
+
+    if not batch_sizes:
+        batch_sizes = [1, 2, 4, 8]
+
+    batch_num = 300
+    # batch_sizes = [1, 2, 4, 8, 16, 32]
+    # batch_sizes = [1024, 1536, 2048, 4096]
+    for idx, bs in enumerate(batch_sizes):
+        with tempfile.TemporaryDirectory() as td:
+            # create a background training job, the batch number has no effect here,
+            # only used to distinguish different runs
+            trainWl = WTL.create('inception4', 50, 100 + idx, executor=Executor.Salus)
+            # make sure it runs long enough
+            trainWl.env['SALUS_ITER_SECONDS'] = '300'
+
+            # create a pipe to signal trainWl
+            pipetrain = str(pathlib.Path(td).joinpath('fifotrain'))
+            os.mkfifo(pipetrain)
+            trainWl.env['SALUS_WAIT_FOR_SIGNAL'] = pipetrain
+
+            # create the foreground inference job
+            wl = WTL.create(name, bs, batch_num, executor=Executor.Salus)
+            set_env(wl)
+            wl.env['SALUS_ITER_SECONDS'] = '150'
+
+            pipe = str(pathlib.Path(td).joinpath('fifo'))
+            os.mkfifo(pipe)
+            wl.env['SALUS_WAIT_FOR_SIGNAL'] = pipe
+
+            run_seq(scfg.copy(output_dir=FLAGS.save_dir / "salus" / (name + "-inception4")),
+                    trainWl,  # start the background job
+                    wl,  # start the foreground job
+                    # wait for both jobs to be ready
+                    RunFn(lambda *args, **kwargs: wait_on_pipe(pipetrain)),
+                    RunFn(lambda *args, **kwargs: wait_on_pipe(pipe)),
+                    # start train job
+                    RunFn(lambda *args, **kwargs: release_on_pipe(pipetrain)),
+                    # wait 10 seconds
+                    Pause(10),
+                    # release inference job
+                    RunFn(lambda *args, **kwargs: release_on_pipe(pipe)),
+                    # run_seq automatically join all jobs at the end of the sequence
+                    )
+
+
+def tfdist(argv):
+    # type: (Sequence[str]) -> None
+    name = "alexneteval"
+    if len(argv) > 1:
+        name = argv[0]
+    batch_sizes = [int(v) for v in argv[1:]]
+
+    if not batch_sizes:
+        batch_sizes = [1, 2, 4, 8]
+
+    batch_num = 300
+    # batch_sizes = [1, 2, 4, 8, 16, 32]
+    # batch_sizes = [1024, 1536, 2048, 4096]
+    for idx, bs in enumerate(batch_sizes):
+        with tempfile.TemporaryDirectory() as td:
+            # create a background training job, the batch number has no effect here,
+            # only used to distinguish different runs
+            trainWl = WTL.create('inception4', 50, 100 + idx, executor=Executor.TFDist)
+            # make sure it runs long enough
+            trainWl.env['SALUS_ITER_SECONDS'] = '300'
+
+            # create a pipe to signal trainWl
+            pipetrain = str(pathlib.Path(td).joinpath('fifotrain'))
+            os.mkfifo(pipetrain)
+            trainWl.env['SALUS_WAIT_FOR_SIGNAL'] = pipetrain
+
+            # create the foreground inference job
+            wl = WTL.create(name, bs, batch_num, executor=Executor.TFDist)
+            set_env(wl)
+            wl.env['SALUS_ITER_SECONDS'] = '150'
+
+            pipe = str(pathlib.Path(td).joinpath('fifo'))
+            os.mkfifo(pipe)
+            wl.env['SALUS_WAIT_FOR_SIGNAL'] = pipe
+
+            run_tfdist(FLAGS.save_dir / "tfdist" / (name + "-inception4"),
+                       trainWl,  # start the background job
+                       wl,  # start the foreground job
+                       # wait for both jobs to be ready
+                       RunFn(lambda *args, **kwargs: wait_on_pipe(pipetrain)),
+                       RunFn(lambda *args, **kwargs: wait_on_pipe(pipe)),
+                       # start train job
+                       RunFn(lambda *args, **kwargs: release_on_pipe(pipetrain)),
+                       # wait 10 seconds
+                       Pause(10),
+                       # release inference job
+                       RunFn(lambda *args, **kwargs: release_on_pipe(pipe)),
+                       # run_seq automatically join all jobs at the end of the sequence
+                       )
+
+
+def tfmps(argv):
+    # type: (Sequence[str]) -> None
+    name = "alexneteval"
+    if len(argv) > 1:
+        name = argv[0]
+    batch_sizes = [int(v) for v in argv[1:]]
+
+    if not batch_sizes:
+        batch_sizes = [1, 2, 4, 8]
+
+    batch_num = 300
+    # batch_sizes = [1, 2, 4, 8, 16, 32]
+    # batch_sizes = [1024, 1536, 2048, 4096]
+    for idx, bs in enumerate(batch_sizes):
+        with tempfile.TemporaryDirectory() as td:
+            # create a background training job, the batch number has no effect here,
+            # only used to distinguish different runs
+            trainWl = WTL.create('inception4', 50, 100 + idx, executor=Executor.TF)
+            # make sure it runs long enough
+            trainWl.env['SALUS_ITER_SECONDS'] = '300'
+            trainWl.extra_args += ['--min_mem']
+
+            # create a pipe to signal trainWl
+            pipetrain = str(pathlib.Path(td).joinpath('fifotrain'))
+            os.mkfifo(pipetrain)
+            trainWl.env['SALUS_WAIT_FOR_SIGNAL'] = pipetrain
+
+            # create the foreground inference job
+            wl = WTL.create(name, bs, batch_num, executor=Executor.TF)
+            set_env(wl)
+            wl.env['SALUS_ITER_SECONDS'] = '150'
+            wl.extra_args += ['--min_mem']
+
+            pipe = str(pathlib.Path(td).joinpath('fifo'))
+            os.mkfifo(pipe)
+            wl.env['SALUS_WAIT_FOR_SIGNAL'] = pipe
+
+            run_tf(FLAGS.save_dir / "tfmps" / (name + "-inception4"),
+                   trainWl,  # start the background job
+                   wl,  # start the foreground job
+                   # wait for both jobs to be ready
+                   RunFn(lambda *args, **kwargs: wait_on_pipe(pipetrain)),
+                   RunFn(lambda *args, **kwargs: wait_on_pipe(pipe)),
+                   # start train job
+                   RunFn(lambda *args, **kwargs: release_on_pipe(pipetrain)),
+                   # wait 10 seconds
+                   Pause(10),
+                   # release inference job
+                   RunFn(lambda *args, **kwargs: release_on_pipe(pipe)),
+                   # run_seq automatically join all jobs at the end of the sequence
+                   )
+
+
+def tfmps2(argv):
+    # type: (Sequence[str]) -> None
+    name = "alexneteval"
+    if len(argv) > 1:
+        name = argv[0]
+    batch_sizes = [int(v) for v in argv[1:]]
+
+    if not batch_sizes:
+        batch_sizes = [1, 2, 4, 8]
+
+    batch_num = 300
+    # batch_sizes = [1, 2, 4, 8, 16, 32]
+    # batch_sizes = [1024, 1536, 2048, 4096]
+    for idx, bs in enumerate(batch_sizes):
+        with tempfile.TemporaryDirectory() as td:
+            # create a background training job, the batch number has no effect here,
+            # only used to distinguish different runs
+            trainWl = WTL.create('inception4', 50, 100 + idx, executor=Executor.TF)
+            # make sure it runs long enough
+            trainWl.env['SALUS_ITER_SECONDS'] = '300'
+            trainWl.extra_args += ['--min_mem']
+
+            # create a pipe to signal trainWl
+            pipetrain = str(pathlib.Path(td).joinpath('fifotrain'))
+            os.mkfifo(pipetrain)
+            trainWl.env['SALUS_WAIT_FOR_SIGNAL'] = pipetrain
+
+            # create the foreground inference job
+            wl = WTL.create(name, bs, batch_num, executor=Executor.TF)
+            set_env(wl)
+            wl.env['SALUS_ITER_SECONDS'] = '150'
+            wl.extra_args += ['--min_mem']
+
+            pipe = str(pathlib.Path(td).joinpath('fifo'))
+            os.mkfifo(pipe)
+            wl.env['SALUS_WAIT_FOR_SIGNAL'] = pipe
+
+            run_tf(FLAGS.save_dir / "tfmps2" / (name + "-inception4"),
+                   wl,  # start the foreground job
+                   Pause(20),
+                   trainWl,  # start the background job
+                   # wait for both jobs to be ready
+                   RunFn(lambda *args, **kwargs: wait_on_pipe(pipetrain)),
+                   RunFn(lambda *args, **kwargs: wait_on_pipe(pipe)),
+                   # start train job
+                   RunFn(lambda *args, **kwargs: release_on_pipe(pipetrain)),
+                   # wait 10 seconds
+                   Pause(10),
+                   # release inference job
+                   RunFn(lambda *args, **kwargs: release_on_pipe(pipe)),
+                   # run_seq automatically join all jobs at the end of the sequence
+                   )
+
+
+@case_switch_main
+def main():
+    return salus, tfdist, tfmps, tfmps2
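# case_switch_main is pre-existing (it is not defined in this patch). A rough
# sketch of the behavior the decorated main() above assumes: the first CLI
# argument selects one of the returned sub-experiments by function name.
def case_switch_main(fn):
    cases = {f.__name__: f for f in fn()}

    def dispatcher(argv):
        which, rest = (argv[0], argv[1:]) if argv else (next(iter(cases)), [])
        return cases[which](rest)

    return dispatcher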
+# +""" +Card 308: Make inference job accepts requests using tfweb + +Export SavedModel for existing tf_cnn_benchmark models + +Collected data: SavedModel +""" +from __future__ import absolute_import, print_function, division, unicode_literals + +from absl import flags +from typing import Sequence +import logging + +from benchmarks.driver.runner import TFBenchmarkRunner +from benchmarks.driver.utils.compatiblity import pathlib +from benchmarks.driver.workload import Executor +from benchmarks.exps import run_tf, select_workloads + +FLAGS = flags.FLAGS +logger = logging.getLogger(__name__) + + +def main(argv): + # type: (Sequence[str]) -> None + + model_dir = pathlib.Path('~/../symbiotic/peifeng/tf_cnn_benchmarks_models/legacy_checkpoint_models') + model_dir = model_dir.expanduser().resolve() + + saved_model_dir = pathlib.Path('~/../symbiotic/peifeng/tf_cnn_benchmarks_models/saved_models') + saved_model_dir = saved_model_dir.expanduser().resolve() + + for wl in select_workloads(argv, batch_size=1, batch_num=1, executor=Executor.TF): + if wl.wtl.runnerCls is not TFBenchmarkRunner: + logger.info(f'Skipping {wl.name}') + continue + if not wl.name.endswith('eval'): + logger.info(f'Skipping {wl.name}') + continue + + logger.info(f"**** Saving SavedModel: {wl.canonical_name}") + logger.info(f"**** Location: {FLAGS.save_dir}") + + wl.env['SALUS_TFBENCH_EVAL_MODEL_DIR'] = str(model_dir) + wl.env['SALUS_TFBENCH_EVAL_SAVED_MODEL_DIR'] = str(saved_model_dir) + run_tf(FLAGS.save_dir, wl) diff --git a/benchmarks/exps/card309.py b/benchmarks/exps/card309.py new file mode 100644 index 0000000..bfa3347 --- /dev/null +++ b/benchmarks/exps/card309.py @@ -0,0 +1,147 @@ +# -*- coding: future_fstrings -*- +# +# Copyright 2019 Peifeng Yu +# +# This file is part of Salus +# (see https://github.com/SymbioticLab/Salus). +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Card 309: Experiment multiple inference jobs together + +Record inference latency. Compare inference job latency running along vs. running with others. + +The latency should be measured with increasing throughput (qps) for the inference job. 
+ +Collected data: + - inference per iteration speed (latency) + - training throughput (derived from per iteration speed and batch size) +""" +from __future__ import absolute_import, print_function, division, unicode_literals + +import os +import tempfile +from typing import Sequence + +from absl import flags +import logging + +from benchmarks.driver.server.config import presets +from benchmarks.driver.workload import WTL, Executor +from benchmarks.driver.utils.compatiblity import pathlib +from benchmarks.exps import ( + run_seq, maybe_forced_preset, case_switch_main, RunFn, sync_on_pipe, wait_on_pipe, + release_on_pipe, +) + + +FLAGS = flags.FLAGS +logger = logging.getLogger(__name__) + +flags.DEFINE_integer('num_replicas', 1, 'Number of replicas to run concurrently') + + +def set_env(wl): + wl.env['SALUS_TFBENCH_EVAL_INTERVAL'] = '0' + wl.env['SALUS_TFBENCH_EVAL_RAND_FACTOR'] = '0' + wl.env['SALUS_TFBENCH_EVAL_BLOCK'] = 'true' + + model_dir = pathlib.Path('~/../symbiotic/peifeng/tf_cnn_benchmarks_models/legacy_checkpoint_models') + model_dir = model_dir.expanduser().resolve() + wl.env['SALUS_TFBENCH_EVAL_MODEL_DIR'] = model_dir + + # run for 60 seconds, ignoring BATCH_NUM + wl.env['SALUS_ITER_SECONDS'] = '120' + + +def do_inferences(scfg, names, batch_sizes): + batch_num = 100 + # batch_sizes = [1, 2, 4, 8, 16, 32] + # batch_sizes = [1024, 1536, 2048, 4096] + for bs in batch_sizes: + wls = [] + for name in names: + if not name.endswith('eval'): + raise ValueError('Not an inference workload!!!') + wl = WTL.create(name, bs, batch_num, executor=Executor.Salus) + set_env(wl) + wls.append(wl) + + run_seq(scfg, *wls) + + +def same(argv): + # type: (Sequence[str]) -> None + scfg = maybe_forced_preset(presets.MostEfficient) + scfg.logconf = 'disable' + + name = "alexneteval" + if len(argv) > 1: + name = argv[0] + batch_sizes = [int(v) for v in argv[1:]] + + if not batch_sizes: + batch_sizes = [1, 2, 4, 8] + + do_inferences(scfg.copy(output_dir=FLAGS.save_dir / str(FLAGS.num_replicas)), + [name] * FLAGS.num_replicas, + batch_sizes) + + +def diff(argv): + # type: (Sequence[str]) -> None + scfg = maybe_forced_preset(presets.MostEfficient) + scfg.logconf = 'disable' + + # all non-integer argv are treated as names + names = [] + batch_sizes = [] + for arg in argv: + try: + batch_sizes.append(int(arg)) + except ValueError: + names.append(arg) + + # create jobs + batch_num = 100 + # batch_sizes = [1, 2, 4, 8, 16, 32] + # batch_sizes = [1024, 1536, 2048, 4096] + for bs in batch_sizes: + with tempfile.TemporaryDirectory() as td: + wls = [] + pipes = [] + for name in names: + if not name.endswith('eval'): + raise ValueError('Not an inference workload!!!') + wl = WTL.create(name, bs, batch_num, executor=Executor.Salus) + set_env(wl) + wls.append(wl) + + # also add a small pause to make sure every job starts + pipe = str(pathlib.Path(td).joinpath(wl.canonical_name).with_suffix('.pipe')) + os.mkfifo(pipe) + pipes.append(pipes) + + # wait all jobs to be ready + wls.append(RunFn(lambda workloads, **kwargs: [wait_on_pipe(pipe) for pipe in pipes] and None)) + # signal all jobs to start + wls.append(RunFn(lambda workloads, **kwargs: [release_on_pipe(pipe) for pipe in pipes] and None)) + + run_seq(scfg.copy(output_dir=FLAGS.save_dir / '-'.join(names)), + *wls) + + +@case_switch_main +def main(): + return same, diff diff --git a/benchmarks/exps/old/__init__.py b/benchmarks/exps/old/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/exps/bigrun.py b/benchmarks/exps/old/bigrun.py 
similarity index 100% rename from benchmarks/exps/bigrun.py rename to benchmarks/exps/old/bigrun.py diff --git a/benchmarks/exps/card13.py b/benchmarks/exps/old/card13.py similarity index 100% rename from benchmarks/exps/card13.py rename to benchmarks/exps/old/card13.py diff --git a/benchmarks/exps/card185.py b/benchmarks/exps/old/card185.py similarity index 100% rename from benchmarks/exps/card185.py rename to benchmarks/exps/old/card185.py diff --git a/benchmarks/exps/card186.py b/benchmarks/exps/old/card186.py similarity index 100% rename from benchmarks/exps/card186.py rename to benchmarks/exps/old/card186.py diff --git a/benchmarks/exps/card188.py b/benchmarks/exps/old/card188.py similarity index 100% rename from benchmarks/exps/card188.py rename to benchmarks/exps/old/card188.py diff --git a/benchmarks/exps/card189.py b/benchmarks/exps/old/card189.py similarity index 100% rename from benchmarks/exps/card189.py rename to benchmarks/exps/old/card189.py diff --git a/benchmarks/exps/card194.py b/benchmarks/exps/old/card194.py similarity index 100% rename from benchmarks/exps/card194.py rename to benchmarks/exps/old/card194.py diff --git a/benchmarks/exps/card195.py b/benchmarks/exps/old/card195.py similarity index 100% rename from benchmarks/exps/card195.py rename to benchmarks/exps/old/card195.py diff --git a/benchmarks/exps/card198.py b/benchmarks/exps/old/card198.py similarity index 100% rename from benchmarks/exps/card198.py rename to benchmarks/exps/old/card198.py diff --git a/benchmarks/exps/card203.py b/benchmarks/exps/old/card203.py similarity index 100% rename from benchmarks/exps/card203.py rename to benchmarks/exps/old/card203.py diff --git a/benchmarks/exps/card206.py b/benchmarks/exps/old/card206.py similarity index 100% rename from benchmarks/exps/card206.py rename to benchmarks/exps/old/card206.py diff --git a/benchmarks/exps/card207.py b/benchmarks/exps/old/card207.py similarity index 100% rename from benchmarks/exps/card207.py rename to benchmarks/exps/old/card207.py diff --git a/benchmarks/exps/card210.py b/benchmarks/exps/old/card210.py similarity index 100% rename from benchmarks/exps/card210.py rename to benchmarks/exps/old/card210.py diff --git a/benchmarks/exps/card211.py b/benchmarks/exps/old/card211.py similarity index 100% rename from benchmarks/exps/card211.py rename to benchmarks/exps/old/card211.py diff --git a/benchmarks/exps/card212.py b/benchmarks/exps/old/card212.py similarity index 100% rename from benchmarks/exps/card212.py rename to benchmarks/exps/old/card212.py diff --git a/benchmarks/exps/card214.py b/benchmarks/exps/old/card214.py similarity index 100% rename from benchmarks/exps/card214.py rename to benchmarks/exps/old/card214.py diff --git a/benchmarks/exps/card218.py b/benchmarks/exps/old/card218.py similarity index 100% rename from benchmarks/exps/card218.py rename to benchmarks/exps/old/card218.py diff --git a/benchmarks/exps/card219.py b/benchmarks/exps/old/card219.py similarity index 100% rename from benchmarks/exps/card219.py rename to benchmarks/exps/old/card219.py diff --git a/benchmarks/exps/card233.py b/benchmarks/exps/old/card233.py similarity index 100% rename from benchmarks/exps/card233.py rename to benchmarks/exps/old/card233.py diff --git a/benchmarks/exps/card234.py b/benchmarks/exps/old/card234.py similarity index 100% rename from benchmarks/exps/card234.py rename to benchmarks/exps/old/card234.py diff --git a/benchmarks/exps/card235.py b/benchmarks/exps/old/card235.py similarity index 100% rename from benchmarks/exps/card235.py 
rename to benchmarks/exps/old/card235.py diff --git a/benchmarks/exps/card236.py b/benchmarks/exps/old/card236.py similarity index 100% rename from benchmarks/exps/card236.py rename to benchmarks/exps/old/card236.py diff --git a/benchmarks/exps/card240.py b/benchmarks/exps/old/card240.py similarity index 100% rename from benchmarks/exps/card240.py rename to benchmarks/exps/old/card240.py diff --git a/benchmarks/exps/card241.py b/benchmarks/exps/old/card241.py similarity index 100% rename from benchmarks/exps/card241.py rename to benchmarks/exps/old/card241.py diff --git a/benchmarks/exps/card249.py b/benchmarks/exps/old/card249.py similarity index 100% rename from benchmarks/exps/card249.py rename to benchmarks/exps/old/card249.py diff --git a/benchmarks/exps/card250.py b/benchmarks/exps/old/card250.py similarity index 100% rename from benchmarks/exps/card250.py rename to benchmarks/exps/old/card250.py diff --git a/benchmarks/exps/card251.py b/benchmarks/exps/old/card251.py similarity index 100% rename from benchmarks/exps/card251.py rename to benchmarks/exps/old/card251.py diff --git a/benchmarks/exps/card252.py b/benchmarks/exps/old/card252.py similarity index 100% rename from benchmarks/exps/card252.py rename to benchmarks/exps/old/card252.py diff --git a/benchmarks/exps/card259.py b/benchmarks/exps/old/card259.py similarity index 100% rename from benchmarks/exps/card259.py rename to benchmarks/exps/old/card259.py diff --git a/benchmarks/exps/card262.py b/benchmarks/exps/old/card262.py similarity index 100% rename from benchmarks/exps/card262.py rename to benchmarks/exps/old/card262.py diff --git a/benchmarks/exps/card266.py b/benchmarks/exps/old/card266.py similarity index 100% rename from benchmarks/exps/card266.py rename to benchmarks/exps/old/card266.py diff --git a/benchmarks/exps/card270.py b/benchmarks/exps/old/card270.py similarity index 100% rename from benchmarks/exps/card270.py rename to benchmarks/exps/old/card270.py diff --git a/benchmarks/exps/card271.py b/benchmarks/exps/old/card271.py similarity index 100% rename from benchmarks/exps/card271.py rename to benchmarks/exps/old/card271.py diff --git a/benchmarks/exps/card272.py b/benchmarks/exps/old/card272.py similarity index 100% rename from benchmarks/exps/card272.py rename to benchmarks/exps/old/card272.py diff --git a/benchmarks/exps/card275.py b/benchmarks/exps/old/card275.py similarity index 100% rename from benchmarks/exps/card275.py rename to benchmarks/exps/old/card275.py diff --git a/benchmarks/exps/exp10.py b/benchmarks/exps/old/exp10.py similarity index 100% rename from benchmarks/exps/exp10.py rename to benchmarks/exps/old/exp10.py diff --git a/benchmarks/exps/exp11.py b/benchmarks/exps/old/exp11.py similarity index 100% rename from benchmarks/exps/exp11.py rename to benchmarks/exps/old/exp11.py diff --git a/benchmarks/exps/exp12.py b/benchmarks/exps/old/exp12.py similarity index 100% rename from benchmarks/exps/exp12.py rename to benchmarks/exps/old/exp12.py diff --git a/benchmarks/exps/exp13.py b/benchmarks/exps/old/exp13.py similarity index 100% rename from benchmarks/exps/exp13.py rename to benchmarks/exps/old/exp13.py diff --git a/benchmarks/exps/exp14.py b/benchmarks/exps/old/exp14.py similarity index 100% rename from benchmarks/exps/exp14.py rename to benchmarks/exps/old/exp14.py diff --git a/benchmarks/exps/exp15.py b/benchmarks/exps/old/exp15.py similarity index 100% rename from benchmarks/exps/exp15.py rename to benchmarks/exps/old/exp15.py diff --git a/benchmarks/exps/exp18.py 
b/benchmarks/exps/old/exp18.py similarity index 100% rename from benchmarks/exps/exp18.py rename to benchmarks/exps/old/exp18.py diff --git a/benchmarks/exps/exp19.py b/benchmarks/exps/old/exp19.py similarity index 100% rename from benchmarks/exps/exp19.py rename to benchmarks/exps/old/exp19.py diff --git a/benchmarks/exps/exp3.py b/benchmarks/exps/old/exp3.py similarity index 100% rename from benchmarks/exps/exp3.py rename to benchmarks/exps/old/exp3.py diff --git a/benchmarks/exps/exp6_2.py b/benchmarks/exps/old/exp6_2.py similarity index 100% rename from benchmarks/exps/exp6_2.py rename to benchmarks/exps/old/exp6_2.py diff --git a/benchmarks/exps/exp9.py b/benchmarks/exps/old/exp9.py similarity index 100% rename from benchmarks/exps/exp9.py rename to benchmarks/exps/old/exp9.py diff --git a/benchmarks/exps/fairnessjct.py b/benchmarks/exps/old/fairnessjct.py similarity index 100% rename from benchmarks/exps/fairnessjct.py rename to benchmarks/exps/old/fairnessjct.py diff --git a/benchmarks/exps/gperf.py b/benchmarks/exps/old/gperf.py similarity index 100% rename from benchmarks/exps/gperf.py rename to benchmarks/exps/old/gperf.py diff --git a/benchmarks/exps/jct.py b/benchmarks/exps/old/jct.py similarity index 100% rename from benchmarks/exps/jct.py rename to benchmarks/exps/old/jct.py diff --git a/benchmarks/exps/mem.py b/benchmarks/exps/old/mem.py similarity index 100% rename from benchmarks/exps/mem.py rename to benchmarks/exps/old/mem.py diff --git a/benchmarks/exps/memop.py b/benchmarks/exps/old/memop.py similarity index 100% rename from benchmarks/exps/memop.py rename to benchmarks/exps/old/memop.py diff --git a/benchmarks/exps/mnist.py b/benchmarks/exps/old/mnist.py similarity index 100% rename from benchmarks/exps/mnist.py rename to benchmarks/exps/old/mnist.py diff --git a/benchmarks/exps/mnistnv.py b/benchmarks/exps/old/mnistnv.py similarity index 100% rename from benchmarks/exps/mnistnv.py rename to benchmarks/exps/old/mnistnv.py diff --git a/benchmarks/exps/one.py b/benchmarks/exps/old/one.py similarity index 100% rename from benchmarks/exps/one.py rename to benchmarks/exps/old/one.py diff --git a/benchmarks/exps/onetf.py b/benchmarks/exps/old/onetf.py similarity index 100% rename from benchmarks/exps/onetf.py rename to benchmarks/exps/old/onetf.py diff --git a/benchmarks/exps/optracing.py b/benchmarks/exps/old/optracing.py similarity index 100% rename from benchmarks/exps/optracing.py rename to benchmarks/exps/old/optracing.py diff --git a/benchmarks/exps/paral.py b/benchmarks/exps/old/paral.py similarity index 100% rename from benchmarks/exps/paral.py rename to benchmarks/exps/old/paral.py diff --git a/benchmarks/exps/paral2.py b/benchmarks/exps/old/paral2.py similarity index 100% rename from benchmarks/exps/paral2.py rename to benchmarks/exps/old/paral2.py diff --git a/benchmarks/exps/paraltf.py b/benchmarks/exps/old/paraltf.py similarity index 100% rename from benchmarks/exps/paraltf.py rename to benchmarks/exps/old/paraltf.py diff --git a/benchmarks/exps/run5.py b/benchmarks/exps/old/run5.py similarity index 100% rename from benchmarks/exps/run5.py rename to benchmarks/exps/old/run5.py diff --git a/benchmarks/exps/save_train_dir.py b/benchmarks/exps/old/save_train_dir.py similarity index 87% rename from benchmarks/exps/save_train_dir.py rename to benchmarks/exps/old/save_train_dir.py index c3ed2fd..96398af 100644 --- a/benchmarks/exps/save_train_dir.py +++ b/benchmarks/exps/old/save_train_dir.py @@ -77,20 +77,18 @@ def do_mem(logdir, network, batch_size): 
logger.info(f'Saving model checkpoint for {network}_{batch_size} for {batch_num} iter') - final_dst = logdir / 'tf' / WTL.from_name(network).canonical_name(RunConfig(batch_size, batch_num, None)) + final_dst = logdir / WTL.from_name(network).canonical_name(RunConfig(batch_size, batch_num, None)) with atomic_directory(final_dst) as outputdir: logger.info(' Running on TF') wl = WTL.create(network, batch_size, batch_num, Executor.TF) wl.env['SALUS_SAVE_MODEL'] = '1' + + model_dir = pathlib.Path('~/../symbiotic/peifeng/tf_cnn_benchmarks_models/legacy_checkpoint_models') + model_dir = model_dir.expanduser().resolve() + wl.env['SALUS_TFBENCH_EVAL_MODEL_DIR'] = str(model_dir) + run_tf(outputdir, wl) - # filter and move file to a more convinent name - for f in pathlib.Path(outputdir).iterdir(): - with f.with_name('alloc.output').open('w') as file: - grep = execute(['egrep', r"] (\+|-)", f.name], stdout=file, cwd=str(f.parent)) - grep.wait() - f.unlink() - break return final_dst diff --git a/benchmarks/exps/serialmany.py b/benchmarks/exps/old/serialmany.py similarity index 100% rename from benchmarks/exps/serialmany.py rename to benchmarks/exps/old/serialmany.py diff --git a/benchmarks/exps/two.py b/benchmarks/exps/old/two.py similarity index 100% rename from benchmarks/exps/two.py rename to benchmarks/exps/old/two.py diff --git a/benchmarks/exps/smtracing.py b/benchmarks/exps/smtracing.py new file mode 100644 index 0000000..e1e6d19 --- /dev/null +++ b/benchmarks/exps/smtracing.py @@ -0,0 +1,200 @@ +# -*- coding: future_fstrings -*- +# +# Copyright 2019 Peifeng Yu +# +# This file is part of Salus +# (see https://github.com/SymbioticLab/Salus). +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +SM Tracing: Experiment one inference job with one training job + +Almost the same as Card 304, but with proper SM tracing implemented. + +The propurse of this experiment is to tune and debug the SM tracing pipeline. + +- reduce the inference latency, and see if the tail latency for training reduces + +Record inference latency. Compare inference job latency running along vs. running with a training job. + +The latency should be measured with increasing throughput (qps) for the inference job. 
+from __future__ import absolute_import, print_function, division, unicode_literals
+
+import tempfile
+from typing import Sequence
+
+from absl import flags
+import logging
+import os
+
+from benchmarks.driver.server.config import presets
+from benchmarks.driver.workload import WTL, Executor
+from benchmarks.driver.utils.compatiblity import pathlib
+from benchmarks.exps import (
+    run_seq, maybe_forced_preset, RunFn, Pause, wait_on_pipe, release_on_pipe,
+    case_switch_main,
+    run_tfdist, run_tf
+)
+
+
+FLAGS = flags.FLAGS
+logger = logging.getLogger(__name__)
+
+
+def set_env(wl):
+    wl.env['SALUS_TFBENCH_EVAL_BLOCK'] = 'true'
+
+    model_dir = pathlib.Path('~/../symbiotic/peifeng/tf_cnn_benchmarks_models/legacy_checkpoint_models')
+    model_dir = model_dir.expanduser().resolve()
+    wl.env['SALUS_TFBENCH_EVAL_MODEL_DIR'] = str(model_dir)
+
+
+def create_train(executor, idx, td):
+    # the batch number has no effect here, only used to distinguish different runs
+    train_wl = WTL.create('inception4', 50, 100 + idx, executor=executor)
+    # make sure it runs long enough
+    train_wl.env['SALUS_ITER_SECONDS'] = '300'
+
+    # create a pipe to signal train_wl
+    pipetrain = str(pathlib.Path(td).joinpath('fifotrain'))
+    os.mkfifo(pipetrain)
+    train_wl.env['SALUS_WAIT_FOR_SIGNAL'] = pipetrain
+    return train_wl, pipetrain
+
+
+def create_infer(executor, name, bs, batch_num, td):
+    wl = WTL.create(name, bs, batch_num, executor=executor)
+    set_env(wl)
+    wl.env['SALUS_ITER_SECONDS'] = '150'
+    wl.extra_args += [
+        '--eval_interval_secs=0.02',
+        # '--eval_interval_random_factor=5'
+    ]
+
+    pipe = str(pathlib.Path(td).joinpath('fifo'))
+    os.mkfifo(pipe)
+    wl.env['SALUS_WAIT_FOR_SIGNAL'] = pipe
+
+    return wl, pipe
+
+
+def salus(argv):
+    # type: (Sequence[str]) -> None
+    scfg = maybe_forced_preset(presets.MostEfficient)
+
+    name = "alexneteval"
+    if argv:
+        name = argv[0]
+    batch_sizes = [int(v) for v in argv[1:]]
+
+    if not batch_sizes:
+        batch_sizes = [1, 2, 4, 8]
+
+    batch_num = 300
+    # batch_sizes = [1, 2, 4, 8, 16, 32]
+    # batch_sizes = [1024, 1536, 2048, 4096]
+    for idx, bs in enumerate(batch_sizes):
+        with tempfile.TemporaryDirectory() as td:
+            # create a background training job
+            train_wl, pipetrain = create_train(Executor.Salus, idx, td)
+
+            # create the foreground inference job
+            wl, pipe = create_infer(Executor.Salus, name, bs, batch_num, td)
+
+            run_seq(scfg.copy(output_dir=FLAGS.save_dir / "salus" / (name + "-inception4")),
+                    train_wl,  # start the background job
+                    wl,  # start the foreground job
+                    # wait for both jobs to be ready
+                    RunFn(lambda *args, **kwargs: wait_on_pipe(pipetrain)),
+                    RunFn(lambda *args, **kwargs: wait_on_pipe(pipe)),
+                    # start train job
+                    RunFn(lambda *args, **kwargs: release_on_pipe(pipetrain)),
+                    # wait 10 seconds
+                    Pause(10),
+                    # release inference job
+                    RunFn(lambda *args, **kwargs: release_on_pipe(pipe)),
+                    # run_seq automatically join all jobs at the end of the sequence
+                    )
+
+
+def tfmps(argv):
+    # type: (Sequence[str]) -> None
+    name = "alexneteval"
+    if argv:
+        name = argv[0]
+    batch_sizes = [int(v) for v in argv[1:]]
+
+    if not batch_sizes:
+        batch_sizes = [1, 2, 4, 8]
+
+    batch_num = 300
+    # batch_sizes = [1, 2, 4, 8, 16, 32]
+    # batch_sizes = [1024, 1536, 2048, 4096]
+    for idx, bs in enumerate(batch_sizes):
+        with tempfile.TemporaryDirectory() as td:
+            # create a background training job
+            train_wl, pipetrain = create_train(Executor.TF, idx, td)
+            train_wl.extra_args += ['--min_mem']
+
+            # create the foreground inference job
+            wl, pipe = create_infer(Executor.TF, name, bs, batch_num, td)
+            wl.extra_args += ['--min_mem']
+
+            run_tf(FLAGS.save_dir / "tfmps" / (name + "-inception4"),
+                   train_wl,  # start the background job
+                   wl,  # start the foreground job
+                   # wait for both jobs to be ready
+                   RunFn(lambda *args, **kwargs: wait_on_pipe(pipetrain)),
+                   RunFn(lambda *args, **kwargs: wait_on_pipe(pipe)),
+                   # start train job
+                   RunFn(lambda *args, **kwargs: release_on_pipe(pipetrain)),
+                   # wait 10 seconds
+                   Pause(10),
+                   # release inference job
+                   RunFn(lambda *args, **kwargs: release_on_pipe(pipe)),
+                   # run_seq automatically join all jobs at the end of the sequence
+                   )
+
+
+def train_alone(argv):
+    """Run the training workload alone and take note of SM usage"""
+    sm_factors = [float(v) for v in argv]
+    if not sm_factors:
+        sm_factors = [1.0, 1.5, 2.0, 2.5, 3.0]
+
+    logger.info(f"Running Salus with sm factors: {sm_factors}")
+
+    # run salus
+    for factor in sm_factors:
+        with tempfile.TemporaryDirectory() as td:
+            scfg = maybe_forced_preset(presets.OpTracing)
+            scfg.logconf = 'smtracing'
+            scfg.extra_args += [
+                '--sm-factor', f'{factor:.2f}'
+            ]
+            logger.info(f"Running Salus with sm factor: {factor}")
+            # the background training job
+            wl, pipe = create_train(Executor.Salus, 0, td)
+            run_seq(scfg.copy(output_dir=FLAGS.save_dir / "alone" / f"{factor:.2f}"),
+                    wl,
+                    RunFn(lambda *args, **kwargs: wait_on_pipe(pipe)),
+                    RunFn(lambda *args, **kwargs: release_on_pipe(pipe)))
+
+
+@case_switch_main
+def main():
+    return salus, tfmps, train_alone
diff --git a/benchmarks/exps/tune_pending.py b/benchmarks/exps/tune_pending.py
new file mode 100644
index 0000000..7371654
--- /dev/null
+++ b/benchmarks/exps/tune_pending.py
@@ -0,0 +1,389 @@
+# -*- coding: future_fstrings -*-
+#
+# Copyright 2019 Peifeng Yu
+#
+# This file is part of Salus
+# (see https://github.com/SymbioticLab/Salus).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""
+Tune pending: Experiment with one inference job and one training job
+
+Almost the same as Card 304, but tries to tune the pending parameter.
+
+The purpose of this experiment is to tune and debug the SM tracing pipeline.
+
+- reduce the inference latency, and see if the tail latency for training reduces
+
+Record inference latency. Compare inference job latency running alone vs. running with a training job.
+
+The latency should be measured with increasing throughput (qps) for the inference job.
+
+Collected data: inference per iteration speed (latency), training throughput (derived from per iteration speed)
+"""
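+# Editor's note, a usage sketch only: the subcommand names come from
+# case_switch_main over the functions returned by main() below, and the exact
+# invocation syntax via the `bc` driver wrapper is an assumption:
+#
+#     ./bc tune_pending/salus 1.0 1.5 2.0    # sweep --sm-factor on Salus
+#     ./bc tune_pending/tfmps 1 2 4 8        # TF + MPS baseline over batch sizes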
+from __future__ import absolute_import, print_function, division, unicode_literals
+
+import tempfile
+from typing import Sequence
+
+from absl import flags
+import logging
+import os
+
+from benchmarks.driver.server.config import presets
+from benchmarks.driver.workload import WTL, Executor
+from benchmarks.driver.utils.compatiblity import pathlib
+from benchmarks.exps import (
+    run_seq, maybe_forced_preset, RunFn, Pause, wait_on_pipe, release_on_pipe,
+    case_switch_main,
+    run_tfdist, run_tf,
+    random_id,
+)
+
+
+FLAGS = flags.FLAGS
+logger = logging.getLogger(__name__)
+
+
+def set_env(wl):
+    wl.env['SALUS_TFBENCH_EVAL_BLOCK'] = 'true'
+
+    model_dir = pathlib.Path('~/../symbiotic/peifeng/tf_cnn_benchmarks_models/legacy_checkpoint_models')
+    model_dir = model_dir.expanduser().resolve()
+    wl.env['SALUS_TFBENCH_EVAL_MODEL_DIR'] = str(model_dir)
+
+
+def create_train(executor, idx, td=None):
+    # the batch number has no effect here, only used to distinguish different runs
+    train_wl = WTL.create('inception4', 50, 100 + idx, executor=executor)
+    # make sure it runs long enough
+    train_wl.env['SALUS_ITER_SECONDS'] = '300'
+
+    if td is not None:
+        # create a pipe to signal train_wl
+        pipetrain = str(pathlib.Path(td) / f'{train_wl.canonical_name}-{random_id()}-fifo')
+        os.mkfifo(pipetrain)
+        train_wl.env['SALUS_WAIT_FOR_SIGNAL'] = pipetrain
+        return train_wl, pipetrain
+    return train_wl
+
+
+def create_infer(executor, bs, td=None):
+    wl = WTL.create('vgg11eval', bs, 300, executor=executor)
+    set_env(wl)
+    wl.env['SALUS_ITER_SECONDS'] = '150'
+    wl.extra_args += [
+        # '--eval_interval_secs=0.02',
+        # '--eval_interval_random_factor=5'
+    ]
+
+    if td is not None:
+        pipe = str(pathlib.Path(td) / f'{wl.canonical_name}-{random_id()}-fifo')
+        os.mkfifo(pipe)
+        wl.env['SALUS_WAIT_FOR_SIGNAL'] = pipe
+        return wl, pipe
+
+    return wl
+
+
+def alone_tf(_argv):
+    # run tf
+    # the foreground inference job
+    wl = create_infer(Executor.TF, 10)
+    wl.extra_args += ['--min_mem']
+    run_tf(FLAGS.save_dir / "alone", wl)
+
+    # the background training job
+    wl = create_train(Executor.TF, 0)
+    wl.extra_args += ['--min_mem']
+    run_tf(FLAGS.save_dir / "alone", wl)
+
+
+def alone(argv):
+    """Run each workload alone for reference"""
+    sm_factors = [float(v) for v in argv]
+    if not sm_factors:
+        sm_factors = [1.0, 1.5, 2.0, 2.5, 3.0]
+
+    logger.info(f"Running Salus with sm factors: {sm_factors}")
+
+    # run salus
+    for factor in sm_factors:
+        scfg = maybe_forced_preset(presets.MostEfficient)
+        scfg.extra_args += [
+            '--sm-factor', f'{factor:.2f}'
+        ]
+        logger.info(f"Running Salus with sm factor: {factor}")
+        wl = create_infer(Executor.Salus, 10)
+        run_seq(scfg.copy(output_dir=FLAGS.save_dir / "alone" / f"{factor:.2f}"), wl)
+
+        # the background training job
+        wl = create_train(Executor.Salus, 0)
+        run_seq(scfg.copy(output_dir=FLAGS.save_dir / "alone" / f"{factor:.2f}"), wl)
+
+
+def salus(argv):
+    # type: (Sequence[str]) -> None
+    base_cfg = maybe_forced_preset(presets.MostEfficient)
+
+    sm_factors = [float(v) for v in argv]
+    if not sm_factors:
+        sm_factors = [1.0, 1.5, 2.0, 2.5, 3.0]
+
+    for idx, factor in enumerate(sm_factors):
+        scfg = base_cfg.copy(output_dir=FLAGS.save_dir / "salus" / f"{factor:.2f}")
+        scfg.extra_args += [
+            '--sm-factor', f'{factor:.2f}'
+        ]
+        with tempfile.TemporaryDirectory() as td:
+            # create a background training job
+            train_wl, pipetrain = create_train(Executor.Salus, 0, td)
+
+            # create the foreground inference job
+            wl, pipe = create_infer(Executor.Salus, 10, td)
+
+            run_seq(scfg,
+                    train_wl,  # start the background job
+                    wl,  # start the foreground job
+                    # wait for both jobs to be ready
+                    RunFn(lambda *args, **kwargs: wait_on_pipe(pipetrain)),
+                    RunFn(lambda *args, **kwargs: wait_on_pipe(pipe)),
+                    # start train job
+                    RunFn(lambda *args, **kwargs: release_on_pipe(pipetrain)),
+                    # wait 10 seconds
+                    Pause(10),
+                    # release inference job
+                    RunFn(lambda *args, **kwargs: release_on_pipe(pipe)),
+                    # run_seq automatically join all jobs at the end of the sequence
+                    )
+
+
+def inverse_salus(argv):
+    # type: (Sequence[str]) -> None
+    """Inverted priority for training and inference"""
+    base_cfg = maybe_forced_preset(presets.MostEfficient)
+
+    sm_factors = [float(v) for v in argv]
+    if not sm_factors:
+        sm_factors = [1.0, 1.5, 2.0, 2.5, 3.0]
+
+    for idx, factor in enumerate(sm_factors):
+        scfg = base_cfg.copy(output_dir=FLAGS.save_dir / "inverse" / f"{factor:.2f}")
+        scfg.extra_args += [
+            '--sm-factor', f'{factor:.2f}'
+        ]
+        with tempfile.TemporaryDirectory() as td:
+            # create a background training job
+            train_wl, pipetrain = create_train(Executor.Salus, 0, td)
+
+            # create the foreground inference job
+            wl, pipe = create_infer(Executor.Salus, 10, td)
+            wl.extra_args += [
+                '--eval_sched_priority', '40'
+            ]
+
+            run_seq(scfg,
+                    train_wl,  # start the background job
+                    wl,  # start the foreground job
+                    # wait for both jobs to be ready
+                    RunFn(lambda *args, **kwargs: wait_on_pipe(pipetrain)),
+                    RunFn(lambda *args, **kwargs: wait_on_pipe(pipe)),
+                    # start train job
+                    RunFn(lambda *args, **kwargs: release_on_pipe(pipetrain)),
+                    # wait 10 seconds
+                    Pause(10),
+                    # release inference job
+                    RunFn(lambda *args, **kwargs: release_on_pipe(pipe)),
+                    # run_seq automatically join all jobs at the end of the sequence
+                    )
+
+
+def same_pri_salus(argv):
+    # type: (Sequence[str]) -> None
+    """Same priority for training and inference"""
+    base_cfg = maybe_forced_preset(presets.MostEfficient)
+
+    sm_factors = [float(v) for v in argv]
+    if not sm_factors:
+        sm_factors = [1.0, 1.5, 2.0, 2.5, 3.0]
+
+    for idx, factor in enumerate(sm_factors):
+        scfg = base_cfg.copy(output_dir=FLAGS.save_dir / "same_pri" / f"{factor:.2f}")
+        scfg.extra_args += [
+            '--sm-factor', f'{factor:.2f}'
+        ]
+        with tempfile.TemporaryDirectory() as td:
+            # create a background training job
+            train_wl, pipetrain = create_train(Executor.Salus, 0, td)
+
+            # create the foreground inference job
+            wl, pipe = create_infer(Executor.Salus, 10, td)
+            wl.extra_args += [
+                '--eval_sched_priority', '20'
+            ]
+
+            run_seq(scfg,
+                    train_wl,  # start the background job
+                    wl,  # start the foreground job
+                    # wait for both jobs to be ready
+                    RunFn(lambda *args, **kwargs: wait_on_pipe(pipetrain)),
+                    RunFn(lambda *args, **kwargs: wait_on_pipe(pipe)),
+                    # start train job
+                    RunFn(lambda *args, **kwargs: release_on_pipe(pipetrain)),
+                    # wait 10 seconds
+                    Pause(10),
+                    # release inference job
+                    RunFn(lambda *args, **kwargs: release_on_pipe(pipe)),
+                    # run_seq automatically join all jobs at the end of the sequence
+                    )
+
+
+def tfmps(argv):
+    # type: (Sequence[str]) -> None
+    batch_sizes = [int(v) for v in argv[1:]]
+
+    if not batch_sizes:
+        batch_sizes = [1, 2, 4, 8]
+
+    for idx, bs in enumerate(batch_sizes):
+        with tempfile.TemporaryDirectory() as td:
+            # create a background training job
+            train_wl, pipetrain = create_train(Executor.TF, idx, td)
+            train_wl.extra_args += ['--min_mem']
+
+            # create the foreground inference job
+            wl, pipe = create_infer(Executor.TF, bs, td)
+            wl.extra_args += ['--min_mem']
+
+            run_tf(FLAGS.save_dir / "tfmps",
+                   train_wl,  # start the background job
+                   wl,  # start the foreground job
+                   # wait for both jobs to be ready
+                   RunFn(lambda *args, **kwargs: wait_on_pipe(pipetrain)),
+                   RunFn(lambda *args, **kwargs: wait_on_pipe(pipe)),
+                   # start train job
+                   RunFn(lambda *args, **kwargs: release_on_pipe(pipetrain)),
+                   # wait 10 seconds
+                   Pause(10),
+                   # release inference job
+                   RunFn(lambda *args, **kwargs: release_on_pipe(pipe)),
+                   # run_seq automatically join all jobs at the end of the sequence
+                   )
+
+
+def twoinfer_tfmps(argv):
+    # type: (Sequence[str]) -> None
+    batch_sizes = [int(v) for v in argv]
+
+    if not batch_sizes:
+        batch_sizes = [1, 2, 4, 8]
+
+    for idx, bs in enumerate(batch_sizes):
+        with tempfile.TemporaryDirectory() as td:
+            # create the first inference job
+            wl1, pipe1 = create_infer(Executor.TF, bs, td)
+            wl1.extra_args += ['--min_mem']
+            # create the second inference job
+            wl2, pipe2 = create_infer(Executor.TF, bs, td)
+            wl2.extra_args += ['--min_mem']
+
+            run_tf(FLAGS.save_dir / "twoinfer" / "tfmps",
+                   wl1,  # start the first job
+                   wl2,  # start the second job
+                   # wait for both jobs to be ready
+                   RunFn(lambda *args, **kwargs: wait_on_pipe(pipe1)),
+                   RunFn(lambda *args, **kwargs: wait_on_pipe(pipe2)),
+                   # release the first job
+                   RunFn(lambda *args, **kwargs: release_on_pipe(pipe1)),
+                   # release the second job
+                   RunFn(lambda *args, **kwargs: release_on_pipe(pipe2)),
+                   # run_seq automatically join all jobs at the end of the sequence
+                   )
+
+
+def twoinfer(argv):
+    # type: (Sequence[str]) -> None
+    base_cfg = maybe_forced_preset(presets.MostEfficient)
+
+    sm_factors = [float(v) for v in argv]
+    if not sm_factors:
+        sm_factors = [1.0, 1.5, 2.0, 2.5, 3.0]
+
+    for idx, factor in enumerate(sm_factors):
+        scfg = base_cfg.copy(output_dir=FLAGS.save_dir / "twoinfer" / "salus" / f"{factor:.2f}")
+        scfg.extra_args += [
+            '--sm-factor', f'{factor:.2f}'
+        ]
+        with tempfile.TemporaryDirectory() as td:
+            # create the first inference job
+            wl1, pipe1 = create_infer(Executor.Salus, 10, td)
+
+            # create the second inference job
+            wl2, pipe2 = create_infer(Executor.Salus, 10, td)
+
+            run_seq(scfg,
+                    wl1,  # start the first job
+                    wl2,  # start the second job
+                    # wait for both jobs to be ready
+                    RunFn(lambda *args, **kwargs: wait_on_pipe(pipe1)),
+                    RunFn(lambda *args, **kwargs: wait_on_pipe(pipe2)),
+                    # start 1st job
+                    RunFn(lambda *args, **kwargs: release_on_pipe(pipe1)),
+                    # release 2nd job
+                    RunFn(lambda *args, **kwargs: release_on_pipe(pipe2)),
+                    # run_seq automatically join all jobs at the end of the sequence
+                    )
+
+
+def twoinfer_pri(argv):
+    # type: (Sequence[str]) -> None
+    """Two inference jobs with different priorities"""
+    base_cfg = maybe_forced_preset(presets.MostEfficient)
+
+    sm_factors = [float(v) for v in argv]
+    if not sm_factors:
+        sm_factors = [1.0, 1.5, 2.0, 2.5, 3.0]
+
+    for idx, factor in enumerate(sm_factors):
+        scfg = base_cfg.copy(output_dir=FLAGS.save_dir / "twoinfer_pri" / "salus" / f"{factor:.2f}")
+        scfg.extra_args += [
+            '--sm-factor', f'{factor:.2f}'
+        ]
+        with tempfile.TemporaryDirectory() as td:
+            # create the foreground inference job
+            wl1, pipe1 = create_infer(Executor.Salus, 10, td)
+
+            # create the background inference job
+            wl2, pipe2 = create_infer(Executor.Salus, 10, td)
+            wl2.extra_args += [
+                '--eval_sched_priority', '20'
+            ]
+
+            run_seq(scfg,
+                    wl1,  # start the first job
+                    wl2,  # start the second job
+                    # wait for both jobs to be ready
+                    RunFn(lambda *args,
**kwargs: wait_on_pipe(pipe1)), + RunFn(lambda *args, **kwargs: wait_on_pipe(pipe2)), + # start 1st job + RunFn(lambda *args, **kwargs: release_on_pipe(pipe1)), + # release 2nd job + RunFn(lambda *args, **kwargs: release_on_pipe(pipe2)), + # run_seq automatically join all jobs at the end of the sequence + ) + + +@case_switch_main +def main(): + return alone, salus, tfmps, twoinfer, twoinfer_tfmps, inverse_salus, same_pri_salus, twoinfer_pri diff --git a/cmake/Defaults.cmake b/cmake/Defaults.cmake index 2335e10..99b553c 100644 --- a/cmake/Defaults.cmake +++ b/cmake/Defaults.cmake @@ -12,7 +12,7 @@ endif() list(APPEND CMAKE_PREFIX_PATH spack-packages) list(REMOVE_DUPLICATES CMAKE_PREFIX_PATH) -# Use C++14 standard +# Use C++17 standard set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) diff --git a/scripts/Note b/scripts/Note index 39b1651..3b949d4 100644 --- a/scripts/Note +++ b/scripts/Note @@ -9,3 +9,13 @@ Without aggregate mode, metrics shows up in CUPTI_ACTIVITY_KIND_METRIC table, ot The value field of these table is blob, which can be parsed as double tail -f server.output | egrep -e 'OpItem ExecTask' | egrep --color=always -e 'failures=[[:digit:]]+' -e '[[:digit:]]+ ms' + +bazel test $(bazel query --keep_going 'let base = //tensorflow/python/kernel_tests/...:all in kind(test, $base) intersect attr(tags, "cuda-py-test", $base)') + +nvprof --aggregate-mode off --metrics sm_efficiency --csv + +nvprof --aggregate-mode off --metrics sm_efficiency python test.py +nvprof --print-gpu-trace python test.py + +All tests: +bazel query --keep_going 'let base = //tensorflow/python/kernel_tests/...:all in kind(test, $base) intersect attr(tags, "cuda-py-test", $base)' | sed 's#//#bazel-bin/#;s#:#/#' diff --git a/scripts/parse_card250.py b/scripts/parse_card250.py index 3b33080..f347c47 100644 --- a/scripts/parse_card250.py +++ b/scripts/parse_card250.py @@ -1,16 +1,16 @@ #!/usr/bin/env python3 # # Copyright 2019 Peifeng Yu -# +# # This file is part of Salus # (see https://github.com/SymbioticLab/Salus). -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -127,12 +127,11 @@ def plot_speeds(df, total_kws=None, **kwargs): ax.legend().remove() ax.set_xlabel('Time (s)') - ax.set_ylabel('Images per second') + ax.set_ylabel('Images\nper second') return ax -path = 'logs/nsdi19' -def prepare_paper(path): +def prepare_paper(path='logs/nsdi19'): path = Path(path) df = load_speeds(path/'card250'/'case1') @@ -166,6 +165,6 @@ def prepare_paper(path): total_kws={'marker': 'None', 'zorder': -1, 'linewidth': 1}) fig.tight_layout() - fig.set_size_inches(3.25, 2.35, forward=True) - fig.savefig('/tmp/workspace/card250.pdf', dpi=300) + fig.set_size_inches(3.25, 1.5, forward=True) + fig.savefig('/tmp/workspace/card250.pdf', dpi=300, bbox_inches='tight') plt.close() diff --git a/scripts/parse_card260.py b/scripts/parse_card260.py index 317f0bd..a466f66 100644 --- a/scripts/parse_card260.py +++ b/scripts/parse_card260.py @@ -1,16 +1,16 @@ #!/usr/bin/env python3 # # Copyright 2019 Peifeng Yu -# +# # This file is part of Salus # (see https://github.com/SymbioticLab/Salus). 
-# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -112,14 +112,14 @@ def plot_ratio(df, **kwargs): try: path except NameError: - path = '/tmp/workspace' + path = 'logs/nsdi19' -def prepare_paper(path): +def prepare_paper(path='logs/nsdi19'): with plt.style.context(['seaborn-paper', 'mypaper', 'gray']): df = load_data(path) fig, ax = plt.subplots() - fig.set_size_inches(3.25, 1.5, forward=True) + fig.set_size_inches(3.25, 1.2, forward=True) #plot_eval_pit_vs_speed(df, ax=ax) #ax.set_xlabel('Time (s)') @@ -132,4 +132,4 @@ def prepare_paper(path): fig.tight_layout() fig.savefig('/tmp/workspace/card260.pdf', dpi=300, bbox_inches='tight', pad_inches = .015) - plt.close() \ No newline at end of file + plt.close() diff --git a/scripts/parse_card271.py b/scripts/parse_card271.py index 03999be..1764e8b 100644 --- a/scripts/parse_card271.py +++ b/scripts/parse_card271.py @@ -1,16 +1,16 @@ #!/usr/bin/env python3 # # Copyright 2019 Peifeng Yu -# +# # This file is part of Salus # (see https://github.com/SymbioticLab/Salus). -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -129,15 +129,14 @@ def plot_makespan(df, **kwargs): return ax -path = 'logs/nsdi19' -def prepare_paper(path): +def prepare_paper(path='logs/nsdi19'): path = Path(path) with plt.style.context(['seaborn-paper', 'mypaper', 'color3']): # fifo = ju.load_trace(path/'card266'/'salus'/'trace.csv') df = load_data(path/'card271') fig, ax = plt.subplots() - fig.set_size_inches(3.25, 1.85, forward=True) + fig.set_size_inches(3.25, 1.3, forward=True) # set col order df = df[['Network', 'Salus', 'TF']] @@ -150,4 +149,4 @@ def prepare_paper(path): fig.tight_layout() fig.savefig('/tmp/workspace/card271.pdf', dpi=300) plt.close() - return df \ No newline at end of file + return df diff --git a/scripts/parse_card272.py b/scripts/parse_card272.py index 523b9e0..1ded432 100644 --- a/scripts/parse_card272.py +++ b/scripts/parse_card272.py @@ -1,16 +1,16 @@ #!/usr/bin/env python3 # # Copyright 2019 Peifeng Yu -# +# # This file is part of Salus # (see https://github.com/SymbioticLab/Salus). -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -116,7 +116,7 @@ def do_timelines(path): path = 'logs/nsdi19' -def prepare_paper(path): +def prepare_paper(path='logs/nsdi19'): path = Path(path) with plt.style.context(['seaborn-paper', 'mypaper', 'line12']): # also use color @@ -131,7 +131,7 @@ def prepare_paper(path): pack = load_data(path/'card272'/'case1'/'salus', 'case1.output') fig, ax = plt.subplots() - fig.set_size_inches(3.25, 1.85, forward=True) + fig.set_size_inches(3.25, 1.5, forward=True) jcts = pd.DataFrame({'FIFO': fifo.JCT, 'SRTF': srtf.JCT, 'PACK': pack.JCT, 'FAIR': fair.JCT}) plot_jcts(jcts, ax=ax, markevery=0.1, markersize=4, linewidth=1) @@ -141,4 +141,4 @@ def prepare_paper(path): fig.savefig('/tmp/workspace/card272-jct.pdf', dpi=300) plt.close() - return fifo, srtf, srtf_refine, fair, pack \ No newline at end of file + return fifo, srtf, srtf_refine, fair, pack diff --git a/scripts/parse_card274.py b/scripts/parse_card274.py index fa6584c..4b2fb60 100644 --- a/scripts/parse_card274.py +++ b/scripts/parse_card274.py @@ -1,16 +1,16 @@ #!/usr/bin/env python3 # # Copyright 2019 Peifeng Yu -# +# # This file is part of Salus # (see https://github.com/SymbioticLab/Salus). -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -35,6 +35,7 @@ #import seaborn as sns import numpy as np import matplotlib.pyplot as plt +import matplotlib.patches as mpatches from matplotlib import cycler import plotutils as pu @@ -94,11 +95,17 @@ def do_srtf2(path): plot_offset=-st_sec ) ax.set_xlim([0, ed_sec-st_sec]) + + # add a legend + ax.legend(handles=[ + mpatches.Patch(color='#b6b6b6', label='Queuing'), + mpatches.Patch(color='black', label='Active') + ], bbox_to_anchor=(0.85, 0.03), loc='lower right') ax.set_ylabel('Job #') ax.yaxis.set_ticks([0, 1, 2, 3, 4, 5]) - fig.set_size_inches(4.875, 2, forward=True) + fig.set_size_inches(4.875, 1.5, forward=True) fig.savefig('/tmp/workspace/card274-srtf-compute.pdf', dpi=300, bbox_inches='tight', pad_inches = .015) plt.close() @@ -146,7 +153,7 @@ def do_srtf3(path): ax.legend().remove() #fig.tight_layout() - fig.set_size_inches(1.625, 2, forward=True) + fig.set_size_inches(1.625, 1.5, forward=True) fig.savefig('/tmp/workspace/card274-srtf-mem.pdf', dpi=300, bbox_inches='tight', pad_inches = .015) plt.close() diff --git a/scripts/parse_exp17.py b/scripts/parse_exp17.py index e050424..1b4d4bb 100644 --- a/scripts/parse_exp17.py +++ b/scripts/parse_exp17.py @@ -1,16 +1,16 @@ #!/usr/bin/env python3 # # Copyright 2019 Peifeng Yu -# +# # This file is part of Salus # (see https://github.com/SymbioticLab/Salus). -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -73,20 +73,25 @@ def load_exp17(path): pits = pits.drop(['index', 'BatchSize', 'Network'], axis=1) pits = pits.rename(columns={'Model': 'Network'}).set_index('Network') +old_vae = pits.at['vae', 'Salus'] +pits.loc['vae', 'Salus'] = 1.2 +old_superres = pits.at['superres', 'Salus'] +pits.loc['superres', 'Salus'] = 1.2 + with plt.style.context(['seaborn-paper', 'mypaper', 'color3']): ax = pits.plot.bar(legend=None) pu.axhlines(1.0, ax=ax, color='k', linestyle='--', linewidth=1) - pu.bar_show_data(ax, pits.index.get_loc('superres'), pits.at['superres', 'Salus']) - pu.bar_show_data(ax, pits.index.get_loc('vae'), pits.at['vae', 'Salus']) + pu.bar_show_data(ax, pits.index.get_loc('superres'), 1.15, data_y=old_superres, fmt='{:.2f}') + pu.bar_show_data(ax, pits.index.get_loc('vae'), 1.13, data_y=old_vae, fmt='{:.2f}') - ax.set_ylim(0.9, 1.9) + ax.set_ylim(0.9, 1.15) ax.set_xlabel('Workloads') - ax.set_ylabel('Normalized Per Iteration\nTraining Time') + ax.set_ylabel('Normalized\nPer Iteration\nTraining Time') # ax.legend() ax.tick_params(axis='x', labelsize=7) - ax.figure.set_size_inches(3.25, 2.35, forward=True) + ax.figure.set_size_inches(3.25, 1.8, forward=True) ax.figure.tight_layout() - ax.figure.savefig('/tmp/workspace/exp17.pdf', dpi=300, bbox_inches='tight', pad_inches = .015) + ax.figure.savefig('/tmp/workspace/exp17.pdf', dpi=300, bbox_inches='tight', pad_inches=.015) plt.close() diff --git a/scripts/plotutils.py b/scripts/plotutils.py index cb4c45c..41da8e8 100644 --- a/scripts/plotutils.py +++ b/scripts/plotutils.py @@ -1,15 +1,15 @@ # # Copyright 2019 Peifeng Yu -# +# # This file is part of Salus # (see https://github.com/SymbioticLab/Salus). -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -435,7 +435,7 @@ def bar(df, width=0.8, ax=None, **kwargs): return ax -def bar_show_data(ax, x, y, fmt='{:.1f}', **kwargs): +def bar_show_data(ax, x, y, data_y=None, fmt='{:.1f}', **kwargs): kws = { 'xytext': [0, 7], 'textcoords': 'offset points', @@ -443,7 +443,9 @@ def bar_show_data(ax, x, y, fmt='{:.1f}', **kwargs): 'horizontalalignment': 'center', 'verticalalignment': 'top' } - ax.annotate(fmt.format(y), + if data_y is None: + data_y = y + ax.annotate(fmt.format(data_y), xy=[x, y], **{**kws, **kwargs}) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c4a350d..df96701 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -62,19 +62,21 @@ if(USE_TENSORFLOW) "oplibraries/tensorflow/v3/sigraphmgr.cpp" "oplibraries/tensorflow/v3/tf_executor.cpp" + "oplibraries/tensorflow/v3/smblocker.cpp" "oplibraries/tensorflow/device/shadowdevices.cpp" "oplibraries/tensorflow/device/salusdevices.cpp" "oplibraries/tensorflow/device/cpu.cpp" "oplibraries/tensorflow/device/gpu/gpu.cpp" + "oplibraries/tensorflow/device/gpu/smeventpoller.cpp" "oplibraries/tensorflow/device/gpu/lane/lanemgr.cpp" "oplibraries/tensorflow/device/gpu/sessiondevice.cpp" "oplibraries/tensorflow/device/sessionallocator.cpp" ) endif(USE_TENSORFLOW) -add_executable(salus-server ${SRC_LIST}) -target_link_libraries(salus-server +add_executable(salus-server-exec ${SRC_LIST}) +target_link_libraries(salus-server-exec protos_gen platform @@ -87,37 +89,65 @@ target_link_libraries(salus-server ) if(USE_TENSORFLOW) - target_link_libraries(salus-server + target_link_libraries(salus-server-exec tensorflow::kernels ) - target_compile_definitions(salus-server + target_compile_definitions(salus-server-exec PRIVATE GOOGLE_CUDA=1 ) endif(USE_TENSORFLOW) +target_link_options(salus-server-exec + PRIVATE + "LINKER:--dynamic-list=${CMAKE_CURRENT_SOURCE_DIR}/salus-server.list" +) +set_target_properties(salus-server-exec PROPERTIES + LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/salus-server.list +) + #--------------------------------------------------------------------------------------- # Instrucment #--------------------------------------------------------------------------------------- if(WITH_GPERFTOOLS AND WITH_TCMALLOC) - target_link_libraries(salus-server gperftools::tcmalloc_and_profiler) + target_link_libraries(salus-server-exec gperftools::tcmalloc_and_profiler) elseif(WITH_GPERFTOOLS) - target_link_libraries(salus-server gperftools::profiler) + target_link_libraries(salus-server-exec gperftools::profiler) elseif(WITH_TCMALLOC) - target_link_libraries(salus-server gperftools::tcmalloc) + target_link_libraries(salus-server-exec gperftools::tcmalloc) endif() +#--------------------------------------------------------------------------------------- +# CUDA Hooker +#--------------------------------------------------------------------------------------- +add_subdirectory(cudahook) + +#--------------------------------------------------------------------------------------- +# Exec Wrapper +#--------------------------------------------------------------------------------------- +add_subdirectory(wrapper) +file(COPY ${CMAKE_CURRENT_BINARY_DIR}/wrapper/salus-server + DESTINATION ${CMAKE_CURRENT_BINARY_DIR} + FILE_PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE + GROUP_READ GROUP_WRITE GROUP_EXECUTE + WORLD_EXECUTE +) +install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/salus-server + DESTINATION ${CMAKE_INSTALL_PREFIX}/bin +) + #--------------------------------------------------------------------------------------- # Installation 
#--------------------------------------------------------------------------------------- -#set_target_properties(salus-server PROPERTIES +#set_target_properties(salus-server-exec PROPERTIES # INSTALL_RPATH "${SALUS_DEPS_PATH}/lib" #) -install(TARGETS salus-server +install(TARGETS salus-server-exec RUNTIME DESTINATION bin ) + # also install dependencies in spack-packages #if (DEFINED SALUS_DEPS_PATH) # if (EXISTS ${SALUS_DEPS_PATH}/lib) diff --git a/src/config.h.in b/src/config.h.in index 16f5518..60bca31 100644 --- a/src/config.h.in +++ b/src/config.h.in @@ -12,6 +12,7 @@ #cmakedefine SALUS_ENABLE_EXCLUSIVE_ITER #cmakedefine SALUS_ENABLE_TIMEOUT_WARNING #cmakedefine SALUS_ENABLE_JSON_LOG +#cmakedefine SALUS_ENABLE_TENSORFLOW #define SALUS_BUILD_TYPE "@CMAKE_BUILD_TYPE@" diff --git a/src/cudahook/CMakeLists.txt b/src/cudahook/CMakeLists.txt new file mode 100644 index 0000000..a83369a --- /dev/null +++ b/src/cudahook/CMakeLists.txt @@ -0,0 +1,32 @@ +set(SRC_LIST + cudahook.cpp + realdlsym.cpp + kernellaunches.cpp) + +add_library(cudahook SHARED ${SRC_LIST}) + +set_target_properties(cudahook PROPERTIES + VISIBILITY_INLINES_HIDDEN 1 + CXX_VISIBILITY_PRESET hidden +) + +include(GenerateExportHeader) +generate_export_header(cudahook) +target_include_directories(cudahook + PUBLIC + ${CMAKE_CURRENT_BINARY_DIR} +) + +target_compile_definitions(cudahook + PRIVATE + _GNU_SOURCE=1 +) + +target_link_libraries(cudahook + PRIVATE + ${CMAKE_DL_LIBS} +) + +install(TARGETS cudahook + LIBRARY DESTINATION lib +) diff --git a/src/cudahook/cudahook.cpp b/src/cudahook/cudahook.cpp new file mode 100644 index 0000000..f0b2f68 --- /dev/null +++ b/src/cudahook/cudahook.cpp @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2019, peifeng + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include "cudahook.h"
+#include "cudahook_export.h"
+
+#include "realdlsym.h"
+
+#include <dlfcn.h>
+
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+
+using salus::real_dlsym;
+
+/*
+ * We need to give the pre-processor a chance to replace a function, such as:
+ * cuMemAlloc => cuMemAlloc_v2
+ */
+#define STRINGIFY(x) #x
+#define CUDA_SYMBOL_STRING(x) STRINGIFY(x)
+
+namespace salus {
+
+CudaHook::CudaHook(const char *dl)
+{
+    // Load the libcuda.so library with RTLD_GLOBAL so we can hook the function calls
+    m_handle = dlopen(dl, RTLD_LAZY | RTLD_GLOBAL);
+    if (!m_handle) {
+        std::cerr << "Failed to open library " << dl << ": " << dlerror() << std::endl;
+        std::exit(-1);
+    }
+
+#define USE_FUNC(funcname, ret, params, ...) \
+    m_orig.funcname = func_cast<Fn_##funcname *>(real_dlsym(m_handle, CUDA_SYMBOL_STRING(funcname))); \
+    if (!m_orig.funcname) { \
+        std::cerr << "Failed to find symbol " CUDA_SYMBOL_STRING(funcname) ": " << dlerror() << std::endl; \
+        std::exit(-2); \
+    }
+#include "functions.def"
+
+    auto envDebug = std::getenv("CUDA_HOOK_DEBUG");
+    if (envDebug && envDebug[0] == '1') {
+        m_debugging = true;
+        std::cerr << "CUDA HOOK Library loaded." << std::endl;
+    }
+}
+
+CudaHook &CudaHook::instance()
+{
+    static CudaHook hook("libcuda.so");
+    return hook;
+}
+
+CudaHook::~CudaHook() {
+    if (m_handle) {
+        dlclose(m_handle);
+    }
+}
+
+struct HookAccessor
+{
+    const salus::CudaHook &hook;
+
+    bool debugging() const
+    {
+        return hook.m_debugging;
+    }
+
+    const auto &orig() const
+    {
+        return hook.m_orig;
+    }
+
+    const auto &pre() const
+    {
+        return hook.m_pre;
+    }
+
+    const auto &post() const
+    {
+        return hook.m_post;
+    }
+};
+
+} // namespace salus
+
+
+/*
+ * Interposed Functions
+ */
+extern "C" {
+
+/*
+ * Other interposed functions
+ */
+#define USE_FUNC(funcname, ret, params, ...) \
+    CUDAHOOK_EXPORT int funcname params \
+    { \
+        const salus::HookAccessor hook{salus::CudaHook::instance()}; \
+        if (hook.pre().funcname) { \
+            hook.pre().funcname(__VA_ARGS__); \
+        } \
+        if (hook.debugging()) { \
+            std::cerr << "Hooked function " CUDA_SYMBOL_STRING(funcname) " is called\n";\
+        } \
+        auto res = hook.orig().funcname(__VA_ARGS__); \
+        if (hook.post().funcname) { \
+            hook.post().funcname(__VA_ARGS__); \
+        } \
+        return res; \
+    }
+#include "functions.def"
+
+/*
+ * We need to interpose dlsym since anyone using dlopen+dlsym to get the CUDA driver symbols will bypass
+ * the hooking mechanism (this includes the CUDA runtime). It's tricky though, since if we replace the
+ * real dlsym with ours, we can't dlsym() the real dlsym. To get around that, call the 'private'
+ * libc interface called __libc_dlsym to get the real dlsym.
+ */
+CUDAHOOK_EXPORT void* dlsym(void *handle, const char *symbol) noexcept
+{
+    // Early out if not a CUDA driver symbol
+    if (strncmp(symbol, "cu", 2) != 0) {
+        return real_dlsym(handle, symbol);
+    }
+
+    const salus::HookAccessor hook{salus::CudaHook::instance()};
+    if (hook.debugging()) {
+        std::cerr << "Hooked dlsym: requesting " << symbol << "\n";
+    }
+
+#define USE_FUNC(funcname, ret, params, ...) \
+    if (strcmp(symbol, CUDA_SYMBOL_STRING(funcname)) == 0) { \
+        return reinterpret_cast<void *>(&funcname);\
+    }
+#include "functions.def"
+
+    return real_dlsym(handle, symbol);
+}
+
+} // extern "C"
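+
+// Editor's note, a usage sketch rather than part of this commit: the hook is
+// built as a shared library (libcudahook.so), so the intended way to activate
+// it is to inject it into a CUDA process through the dynamic loader, e.g.:
+//
+//     LD_PRELOAD=/path/to/libcudahook.so CUDA_HOOK_DEBUG=1 python train.py
+//
+// Once preloaded, kernel launches reach the interposers above whether the
+// process links libcuda directly or resolves its symbols via dlopen+dlsym.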
diff --git a/src/cudahook/cudahook.h b/src/cudahook/cudahook.h
new file mode 100644
index 0000000..c6cf2b7
--- /dev/null
+++ b/src/cudahook/cudahook.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019, peifeng
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SALUS_CUDAHOOK_H
+#define SALUS_CUDAHOOK_H
+
+#include <functional>
+
+namespace salus {
+
+struct HookAccessor;
+
+/*
+ * Typedefs of function types
+ */
+#define USE_FUNC(funcname, ret, params, ...) using Fn_##funcname = ret params;
+#include "functions.def"
+
+/**
+ * @brief Callback structure for each of the hooked functions
+ */
+struct HookedFunctions
+{
+#define USE_FUNC(funcname, ret, params, ...) \
+    std::function<Fn_##funcname> funcname = nullptr;
+#include "functions.def"
+};
+
+class CudaHook
+{
+    HookedFunctions m_orig;
+
+    HookedFunctions m_pre;
+    HookedFunctions m_post;
+
+    void *m_handle = nullptr;
+
+    bool m_debugging = false;
+
+    explicit CudaHook(const char *dl);
+
+    friend struct HookAccessor;
+
+public:
+    ~CudaHook();
+
+    static CudaHook &instance();
+
+    HookedFunctions &post()
+    {
+        return m_post;
+    }
+
+    HookedFunctions &pre()
+    {
+        return m_pre;
+    }
+};
+
+} // salus
+
+#endif // SALUS_CUDAHOOK_H
diff --git a/src/cudahook/functions.def b/src/cudahook/functions.def
new file mode 100644
index 0000000..027256b
--- /dev/null
+++ b/src/cudahook/functions.def
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2019, peifeng
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef USE_FUNC
+#error USE_FUNC(funcname, ret, params, ...) should be defined before including functions.def
+#endif
+
+USE_FUNC(cuLaunch, int, (void* f), f)
+USE_FUNC(cuLaunchGrid, int, (void* f, int grid_width, int grid_height), f, grid_width, grid_height)
+USE_FUNC(cuLaunchGridAsync, int, (void* f, int grid_width, int grid_height, void* stream), f, grid_width, grid_height, stream)
+USE_FUNC(cuFuncSetBlockShape, int, (void* f, int x, int y, int z), f, x, y, z)
+USE_FUNC(cuFuncSetSharedSize, int, (void* f, unsigned int bytes), f, bytes)
+
+USE_FUNC(cuLaunchKernel, int, (void* f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
+                               unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
+                               unsigned int sharedMemBytes, void* hStream, void** kernelParams, void** extra),
+         f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ,
+         sharedMemBytes, hStream, kernelParams, extra)
+
+#undef USE_FUNC
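+
+// Editor's note, an illustration only: functions.def is an X-macro list, so a
+// line like USE_FUNC(cuLaunch, int, (void* f), f) expands differently at each
+// inclusion site, roughly to:
+//
+//     using Fn_cuLaunch = int (void* f);              // typedef in cudahook.h
+//     std::function<Fn_cuLaunch> cuLaunch = nullptr;  // HookedFunctions member
+//     CUDAHOOK_EXPORT int cuLaunch(void* f) {...}     // interposer in cudahook.cpp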
diff --git a/src/cudahook/kernellaunches.cpp b/src/cudahook/kernellaunches.cpp
new file mode 100644
index 0000000..bb4481a
--- /dev/null
+++ b/src/cudahook/kernellaunches.cpp
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2019, peifeng
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "kernellaunches.h"
+
+#include "realdlsym.h"
+#include "cudahook.h"
+
+#include <dlfcn.h>
+
+#include <cstdlib>
+#include <iostream>
+
+namespace {
+} // namespace
+
+namespace salus {
+
+KernelLaunches kl [[maybe_unused]];
+
+KernelLaunches::KernelLaunches() noexcept
+{
+    // get callback function
+    m_selfHandle = dlopen(nullptr, RTLD_LAZY);
+    if (!m_selfHandle) {
+        std::cerr << "Failed to get handle to self executable: " << dlerror() << std::endl;
+        std::exit(-4);
+    }
+    m_kernelLaunchCallback = func_cast<FnKernelLaunchCallback *>(real_dlsym(m_selfHandle, KernelLaunchCallbackFunctionName));
+    if (!m_kernelLaunchCallback) {
+        std::cerr << "Failed to find symbol " << KernelLaunchCallbackFunctionName << ": " << dlerror() << std::endl;
+        std::exit(-5);
+    }
+    DetectorCuLaunchKernel::setCallback(m_kernelLaunchCallback);
+    DetectorCuLaunch::setCallback(m_kernelLaunchCallback);
+
+    // install hooks to detect kernel launches
+    DetectorCuLaunchKernel::installHooks();
+    DetectorCuLaunch::installHooks();
+
+    // debug
+    auto envDebug = std::getenv("CUDA_HOOK_DEBUG");
+    if (envDebug && envDebug[0] == '1') {
+        m_debugging = true;
+        std::cerr << "CUDA Kernel launch recording started." << std::endl;
+    }
+}
+
+// ---------------------------------------------------------------------------------------------------------------------
+// cuLaunchKernel detector
+// ---------------------------------------------------------------------------------------------------------------------
+
+FnKernelLaunchCallback *DetectorCuLaunchKernel::m_callback = nullptr;
+
+void DetectorCuLaunchKernel::installHooks()
+{
+    CudaHook::instance().pre().cuLaunchKernel = [](auto, auto gridX, auto gridY, auto gridZ,
+                                                   auto blkX, auto blkY, auto blkZ,
+                                                   auto shdMem, auto stream, auto, auto) {
+        auto &detector = localInstance();
+        detector.onCuLaunchKernel({gridX, gridY, gridZ, blkX, blkY, blkZ, shdMem, stream});
+        return 0;
+    };
+}
+
+DetectorCuLaunchKernel &DetectorCuLaunchKernel::localInstance()
+{
+    static thread_local DetectorCuLaunchKernel detector;
+    return detector;
+}
+
+void DetectorCuLaunchKernel::onCuLaunchKernel(details::KernelParams params)
+{
+    m_kernelParams = params;
+    m_state = State::Found;
+    fire();
+}
+
+void DetectorCuLaunchKernel::fire()
+{
+    if (m_state != State::Found) {
+        return;
+    }
+    if (m_callback) {
+        m_callback(m_kernelParams.gridX, m_kernelParams.gridY, m_kernelParams.gridZ,
+                   m_kernelParams.blkX, m_kernelParams.blkY, m_kernelParams.blkZ,
+                   m_kernelParams.shdMem, m_kernelParams.stream);
+    }
+    m_state = State::Idle;
+}
+
+// ---------------------------------------------------------------------------------------------------------------------
+// cuLaunch detector
+// ---------------------------------------------------------------------------------------------------------------------
+
+FnKernelLaunchCallback *DetectorCuLaunch::m_callback = nullptr;
+
+void DetectorCuLaunch::installHooks()
+{
+    CudaHook::instance().pre().cuFuncSetBlockShape = [](auto f, auto x, auto y, auto z) {
+        localInstance().onCuFuncSetBlockShape(f, x, y, z);
+        return 0;
+    };
+    CudaHook::instance().pre().cuFuncSetSharedSize = [](auto f, auto size) {
+        localInstance().onCuFuncSetSharedSize(f, size);
+        return 0;
+    };
+    CudaHook::instance().pre().cuLaunch = [](auto f) {
+        localInstance().onCuLaunch(f);
+        return 0;
+    };
+    CudaHook::instance().pre().cuLaunchGrid = [](auto f, auto w, auto h) {
+        localInstance().onCuLaunchGrid(f, w, h);
+        return 0;
+    };
+    CudaHook::instance().pre().cuLaunchGridAsync = [](auto f, auto w, auto h, auto stream) {
+        localInstance().onCuLaunchGridAsync(f, w, h, stream);
+        return 0;
+    };
+}
+
+DetectorCuLaunch &DetectorCuLaunch::localInstance()
+{
+    static thread_local DetectorCuLaunch detector;
+    return detector;
+}
+
+void DetectorCuLaunch::onCuFuncSetBlockShape(void *f, int x, int y, int z)
+{
+    auto &params = ensureParams(f);
+    params.blkX = x;
+    params.blkY = y;
+    params.blkZ = z;
+}
+
+void DetectorCuLaunch::onCuFuncSetSharedSize(void *f, unsigned int bytes)
+{
+    auto &params = ensureParams(f);
+    params.shdMem = bytes;
+}
+
+void DetectorCuLaunch::onCuLaunch(void *func)
+{
+    auto &params = ensureParams(func);
+    params.gridX = params.gridY = params.gridZ = 1;
+    fire(params);
+    m_params.erase(func);
+}
+
+void DetectorCuLaunch::onCuLaunchGrid(void *f, int grid_width, int grid_height)
+{
+    auto &params = ensureParams(f);
+    params.gridX = grid_width;
+    params.gridY = grid_height;
+    params.gridZ = 1;
+    fire(params);
+    m_params.erase(f);
+}
+
+void DetectorCuLaunch::onCuLaunchGridAsync(void *f, int grid_width, int grid_height, void *stream)
+{
+    auto &params = ensureParams(f);
+    params.gridX = grid_width;
+    params.gridY = grid_height;
+    params.gridZ = 1;
+    params.stream = stream;
+    fire(params);
+    m_params.erase(f);
+}
+
+details::KernelParams &DetectorCuLaunch::ensureParams(void *func)
+{
+    auto it = m_params.try_emplace(func, details::KernelParams{1, 1, 1, 1, 1, 1, 0, nullptr}).first;
+    return it->second;
+}
+
+void DetectorCuLaunch::fire(const details::KernelParams &params)
+{
+    if (m_callback) {
+        m_callback(params.gridX, params.gridY, params.gridZ,
+                   params.blkX, params.blkY, params.blkZ,
+                   params.shdMem, params.stream);
+    }
+}
+
+} // namespace salus
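+
+// Editor's note, an illustration only: with the pre-hooks installed above, a
+// legacy driver-API launch sequence such as
+//
+//     cuFuncSetBlockShape(f, 256, 1, 1);
+//     cuFuncSetSharedSize(f, 4096);
+//     cuLaunchGrid(f, 32, 32);
+//
+// accumulates into ensureParams(f) and finally fires a single callback with
+// grid = (32, 32, 1), block = (256, 1, 1) and 4096 bytes of shared memory.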
diff --git a/src/cudahook/kernellaunches.h b/src/cudahook/kernellaunches.h
new file mode 100644
index 0000000..772b6ca
--- /dev/null
+++ b/src/cudahook/kernellaunches.h
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2019, peifeng
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SALUS_KERNELLAUNCHES_H
+#define SALUS_KERNELLAUNCHES_H
+
+#include <cstdint>
+#include <unordered_map>
+
+namespace salus {
+
+constexpr auto KernelLaunchCallbackFunctionName = "salus_kernel_launch_callback";
+
+using FnKernelLaunchCallback = void (unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
+                                     unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
+                                     unsigned int sharedMemBytes, void *hStream);
+
+namespace details {
+
+struct KernelParams
+{
+    uint32_t gridX = 0;
+    uint32_t gridY = 0;
+    uint32_t gridZ = 0;
+    uint32_t blkX = 0;
+    uint32_t blkY = 0;
+    uint32_t blkZ = 0;
+    uint32_t shdMem = 0;
+    void *stream = nullptr;
+};
+
+} // namespace details
+
+class DetectorCuLaunchKernel
+{
+    static FnKernelLaunchCallback *m_callback;
+
+public:
+    static void setCallback(FnKernelLaunchCallback *callback)
+    {
+        m_callback = callback;
+    }
+
+    static void installHooks();
+
+    static DetectorCuLaunchKernel &localInstance();
+
+    /*
+     * Actual detector logic below
+     */
+private:
+    enum class State {
+        Idle, Found
+    };
+    State m_state = State::Idle;
+
+    details::KernelParams m_kernelParams;
+
+    void fire();
+
+public:
+    DetectorCuLaunchKernel() = default;
+
+    void onCuLaunchKernel(details::KernelParams params);
+};
+
+class DetectorCuLaunch
+{
+    static FnKernelLaunchCallback *m_callback;
+public:
+    static void setCallback(FnKernelLaunchCallback *callback)
+    {
+        m_callback = callback;
+    }
+
+    static void installHooks();
+
+    static DetectorCuLaunch &localInstance();
+
+    // Actual detector logic below
+private:
+    enum class State {
+        Idle
+    };
+    State m_state = State::Idle;
+
+    std::unordered_map<void *, details::KernelParams> m_params;
+
+    details::KernelParams &ensureParams(void *func);
+
+    void fire(const details::KernelParams &params);
+
+public:
+    void onCuFuncSetBlockShape(void* f, int x, int y, int z);
+    void onCuFuncSetSharedSize(void* f, unsigned int bytes);
+    void onCuLaunch(void *func);
+    void onCuLaunchGrid(void* f, int grid_width, int grid_height);
+    void onCuLaunchGridAsync(void* f, int grid_width, int grid_height, void* stream);
+};
+
+class KernelLaunches
+{
+    void *m_selfHandle = nullptr;
+    FnKernelLaunchCallback *m_kernelLaunchCallback = nullptr;
+
+    bool m_debugging = false;
+
+public:
+    KernelLaunches() noexcept;
+
+    bool debugging() const { return m_debugging; }
+};
+
+} // namespace salus
+
+#endif // SALUS_KERNELLAUNCHES_H
diff --git a/src/cudahook/realdlsym.cpp b/src/cudahook/realdlsym.cpp
new file mode 100644
index 0000000..1a1b21c
--- /dev/null
+++ b/src/cudahook/realdlsym.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2019, peifeng
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "realdlsym.h"
+
+#include <dlfcn.h>
+
+extern "C" {
+// For interposing dlsym(). See elf/dl-libc.c for the internal dlsym interface function
+void* __libc_dlsym (void *map, const char *name);
+}
+
+namespace salus {
+
+using FnDlsym = void *(void*, const char*);
+
+void* real_dlsym(void *handle, const char* symbol) noexcept
+{
+    static auto internal_dlsym = func_cast<FnDlsym *>(__libc_dlsym(dlopen("libdl.so.2", RTLD_LAZY), "dlsym"));
+    return (*internal_dlsym)(handle, symbol);
+}
+
+} // namespace salus
diff --git a/src/cudahook/realdlsym.h b/src/cudahook/realdlsym.h
new file mode 100644
index 0000000..3e87c24
--- /dev/null
+++ b/src/cudahook/realdlsym.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2019, peifeng
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/src/cudahook/realdlsym.h b/src/cudahook/realdlsym.h
new file mode 100644
index 0000000..3e87c24
--- /dev/null
+++ b/src/cudahook/realdlsym.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2019, peifeng
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SALUS_CUDAHOOK_REALDLSYM_H
+#define SALUS_CUDAHOOK_REALDLSYM_H
+
+#include <cstdint>
+
+namespace salus {
+
+void *real_dlsym(void *handle, const char *symbol) noexcept;
+
+template<typename Fn>
+constexpr auto func_cast(void *ptr) noexcept {
+    return reinterpret_cast<Fn>(reinterpret_cast<std::intptr_t>(ptr));
+}
+
+} // namespace salus
+
+#endif // SALUS_CUDAHOOK_REALDLSYM_H
diff --git a/src/execution/engine/taskexecutor.cpp b/src/execution/engine/taskexecutor.cpp
index 13faf31..e8ad0bc 100644
--- a/src/execution/engine/taskexecutor.cpp
+++ b/src/execution/engine/taskexecutor.cpp
@@ -27,6 +27,7 @@
 #include "execution/scheduler/basescheduler.h"
 #include "execution/scheduler/operationitem.h"
 #include "utils/date.h"
+#include "platform/thread_annotations.h"
 
 using std::chrono::duration_cast;
 using std::chrono::microseconds;
@@ -134,6 +135,8 @@ void TaskExecutor::queueTask(POpItem &&opItem)
 
 void TaskExecutor::scheduleLoop()
 {
+    threading::set_thread_name("TaskExecutor");
+
     auto scheduler = SchedulerRegistary::instance().create(m_schedParam.scheduler, *this);
     DCHECK(scheduler);
     VLOG(2) << "Using scheduler: " << scheduler;
diff --git a/src/execution/executionengine.cpp b/src/execution/executionengine.cpp
index 2041fd4..28b8f5d 100644
--- a/src/execution/executionengine.cpp
+++ b/src/execution/executionengine.cpp
@@ -25,6 +25,7 @@
 #include "execution/engine/resourcecontext.h"
 #include "execution/iterationtask.h"
 #include "platform/logging.h"
+#include "platform/thread_annotations.h"
 #include "utils/containerutils.h"
 #include "utils/date.h"
 #include "utils/debugging.h"
@@ -125,6 +126,7 @@ void ExecutionEngine::maybeWaitForWork(size_t pending, size_t scheduled)
 void ExecutionEngine::scheduleLoop()
 {
     LOG(INFO) << "ExecutionEngine scheduling thread started";
+    threading::set_thread_name("ExecutionEngine");
 
     // a map of lane id to thread local queues.
     std::unordered_map queues;
diff --git a/src/execution/threadpool/nonblockingthreadpool.cpp b/src/execution/threadpool/nonblockingthreadpool.cpp
index 43f53a4..39a24b7 100644
--- a/src/execution/threadpool/nonblockingthreadpool.cpp
+++ b/src/execution/threadpool/nonblockingthreadpool.cpp
@@ -26,6 +26,7 @@
 #include "EventCount.h"
 #include "utils/fixed_function.hpp"
 #include "RunQueue.h"
+#include "platform/thread_annotations.h"
 
 #include
 #include
 
@@ -329,6 +330,12 @@ int ThreadPoolPrivate::nonEmptyQueueIndex()
 
 void ThreadPoolPrivate::workerLoop(int thread_id)
 {
+    if (m_options.workerName.empty()) {
+        salus::threading::set_thread_name("ThreadPoolWorker");
+    } else {
+        salus::threading::set_thread_name(m_options.workerName);
+    }
+
     const auto numThreads = m_options.numThreads;
     const auto spinCount = m_options.spinCount;
     const auto allowSpinning = m_options.allowSpinning;
diff --git a/src/execution/threadpool/threadpool.h b/src/execution/threadpool/threadpool.h
index 7f358f0..c22143f 100644
--- a/src/execution/threadpool/threadpool.h
+++ b/src/execution/threadpool/threadpool.h
@@ -33,17 +33,46 @@ struct ThreadPoolOptions
      */
     size_t numThreads = 0;
 
+    ThreadPoolOptions &setNumThreads(size_t num)
+    {
+        numThreads = num;
+        return *this;
+    }
+
     /**
      * Whether allow spinning wait in worker threads for lower latency
     */
     bool allowSpinning = true;
 
+    ThreadPoolOptions &setAllowSpinning(bool allow)
+    {
+        allowSpinning = allow;
+        return *this;
+    }
+
    /**
     * Times of tries for spin wait before go to wait.
     * Use -1 for default value, which is 5000 / numThreads
     */
    int spinCount = -1;
 
+    ThreadPoolOptions &setSpinCount(int count)
+    {
+        spinCount = count;
+        return *this;
+    }
+
+    /**
+     * @brief Optional worker thread name, truncated to 15 characters
+     * (the pthread limit of 16 includes the terminating NUL).
+     */
+    std::string workerName = "";
+
+    ThreadPoolOptions &setWorkerName(const std::string &name)
+    {
+        workerName = name;
+        return *this;
+    }
+
     ThreadPoolOptions();
     ThreadPoolOptions(const ThreadPoolOptions &) = default;
     ThreadPoolOptions(ThreadPoolOptions &&) = default;
diff --git a/src/main.cpp b/src/main.cpp
index 03c966e..ab67d14 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -17,13 +17,18 @@
  * limitations under the License.
  */
 
+#include "utils/macros.h"
+
+#ifdef SALUS_ENABLE_TENSORFLOW
+#include "oplibraries/tensorflow/v3/smblocker.h"
+#endif
+
 #include "execution/executionengine.h"
 #include "resources/resources.h"
 #include "platform/logging.h"
 #include "platform/signals.h"
 #include "platform/profiler.h"
 #include "rpcserver/zmqserver.h"
-#include "utils/macros.h"
 #include "utils/envutils.h"
 
 #include
 
@@ -45,6 +50,7 @@
 const static auto listen = "--listen";
 const static auto maxHolWaiting = "--max-hol-waiting";
 const static auto disableFairness = "--disable-fairness";
 const static auto disableWorkConservative = "--disable-wc";
+const static auto smFactor = "--sm-factor";
 const static auto scheduler = "--sched";
 const static auto logConf = "--logconf";
@@ -77,6 +83,7 @@ Salus: Fine-Grained GPU Sharing for DNN.
                            fairness is on.
     --max-hol-waiting=     Maximum number of task allowed go before queue head in scheduling.
                            [default: 50]
+    --sm-factor=           Scale factor for # of SMs. [default: 1]
     -c , --logconf=        Path to log configuration file. Note that settings in this file takes
                            precedence over other command line arguments.
@@ -214,6 +221,17 @@ void configureExecution(std::map<std::string, docopt::value> &args)
     salus::ExecutionEngine::instance().setSchedulingParam({maxQueueHeadWaiting, !disableWorkConservative, sched});
 }
 
+void configureSMBlocker(std::map<std::string, docopt::value> &args)
+{
+#ifdef SALUS_ENABLE_TENSORFLOW
+    // docopt doesn't parse floating point numbers,
+    // so we get the value as a string and do the conversion ourselves
+    auto scale = std::atof(value_or(args[flags::smFactor], "1.0"s).c_str());
+
+    salus::oplib::tensorflow::SMBlocker::setScaleFactorSM(scale);
+#endif
+}
+
 void printConfiguration(std::map<std::string, docopt::value> &)
 {
     LOG(INFO) << "Running build type: " << SALUS_BUILD_TYPE;
@@ -237,6 +255,11 @@
     LOG(INFO) << "    Policy: " << param.scheduler;
     LOG(INFO) << "    MaxQueueHeadWaiting: " << param.maxHolWaiting;
     LOG(INFO) << "    WorkConservative: " << (param.workConservative ? "on" : "off");
+
+#ifdef SALUS_ENABLE_TENSORFLOW
+    LOG(INFO) << "GPU execution:";
+    LOG(INFO) << "    SM scale factor: " << salus::oplib::tensorflow::SMBlocker::scaleFactorSM();
+#endif
 }
 
 int main(int argc, char **argv)
@@ -250,6 +273,8 @@ int main(int argc, char **argv)
 
     configureExecution(args);
 
+    configureSMBlocker(args);
+
     printConfiguration(args);
 
     ScopedProfiling sp(value_or(args[flags::gperf], false));
diff --git a/src/oplibraries/tensorflow/device/gpu/gpu.cpp b/src/oplibraries/tensorflow/device/gpu/gpu.cpp
index e5d54d6..465c552 100644
--- a/src/oplibraries/tensorflow/device/gpu/gpu.cpp
+++ b/src/oplibraries/tensorflow/device/gpu/gpu.cpp
@@ -23,6 +23,7 @@
 
 #include "execution/engine/resourcecontext.h"
 #include "oplibraries/tensorflow/device/gpu/sessiondevice.h"
+#include "oplibraries/tensorflow/v3/smblocker.h"
 #include "utils/threadutils.h"
 
 #include
 
@@ -37,7 +38,11 @@ SalusGPUDevice::SalusGPUDevice(const tf::SessionOptions &options, const std::str
                    false /* sync every op */, max_streams)
     , m_streamUsed(static_cast<size_t>(max_streams), false)
     , m_cudaHostAlloc(cuda_host_alloc)
+    , m_SMPoller(nullptr)
 {
+    auto executor_status = tf::GPUMachineManager()->ExecutorForDevice(gpu_id);
+
+    m_SMPoller = std::make_unique<SMEventPoller>(executor_status.ValueOrDie());
 }
 
 tf::Allocator *SalusGPUDevice::GetAllocator(tf::AllocatorAttributes attr)
@@ -55,6 +60,24 @@
     return gpu_allocator_;
 }
 
+void SalusGPUDevice::Compute(tf::OpKernel *op_kernel, tf::OpKernelContext *context)
+{
+    BaseGPUDevice::Compute(op_kernel, context);
+
+    m_SMPoller->thenReleaseSM(context->op_device_context()->stream(),
+                              SMBlocker::instance().currentThreadSMHolding());
+}
+
+void SalusGPUDevice::ComputeAsync(tf::AsyncOpKernel *op_kernel, tf::OpKernelContext *context,
+                                  tf::AsyncOpKernel::DoneCallback done)
+{
+    BaseGPUDevice::ComputeAsync(op_kernel, context, [this, context, done = std::move(done)]() {
+        m_SMPoller->thenReleaseSM(context->op_device_context()->stream(),
+                                  SMBlocker::instance().currentThreadSMHolding());
+        done();
+    });
+}
+
 Status SalusGPUDevice::Sync()
 {
     return BaseGPUDevice::Sync();
diff --git a/src/oplibraries/tensorflow/device/gpu/gpu.h b/src/oplibraries/tensorflow/device/gpu/gpu.h
index ef930a8..3458d3f 100644
--- a/src/oplibraries/tensorflow/device/gpu/gpu.h
+++ b/src/oplibraries/tensorflow/device/gpu/gpu.h
@@ -22,6 +22,7 @@
 
 #include "oplibraries/tensorflow/tensorflow_headers.h"
 #include "oplibraries/tensorflow/device/salusdevices.h"
+#include "oplibraries/tensorflow/device/gpu/smeventpoller.h"
 #include "utils/objectpool.h"
 
 #include
 
@@ -50,6 +51,10 @@ class SalusGPUDevice : public ISalusDevice, public tf::BaseGPUDevice
     Status FillContextMap(const tf::Graph *graph, std::vector<tf::DeviceContext *> *device_context_map) override;
 
+    void Compute(tf::OpKernel *op_kernel, tf::OpKernelContext *context) override;
+    void ComputeAsync(tf::AsyncOpKernel *op_kernel, tf::OpKernelContext *context,
+                      tf::AsyncOpKernel::DoneCallback done) override;
+
     void flushCacheFor(sstl::not_null graph) override;
 
     std::shared_ptr createPerTaskDevice(sstl::not_null graph,
@@ -99,6 +104,7 @@
     std::mutex m_muStream;
     std::vector<bool> m_streamUsed;
     tf::Allocator *m_cudaHostAlloc;
+    std::unique_ptr<SMEventPoller> m_SMPoller;
 };
 
 class SalusGPUDeviceFactory : public tf::BaseGPUDeviceFactory
diff --git a/src/oplibraries/tensorflow/device/gpu/lane/lanemgr.cpp b/src/oplibraries/tensorflow/device/gpu/lane/lanemgr.cpp
index 3bfadc7..75e4c7a 100644
--- a/src/oplibraries/tensorflow/device/gpu/lane/lanemgr.cpp
+++ b/src/oplibraries/tensorflow/device/gpu/lane/lanemgr.cpp
@@ -26,6 +26,9 @@
 #include "utils/envutils.h"
 #include "utils/threadutils.h"
 
+#include <algorithm>
+#include <numeric>
+
 namespace tfgpu = perftools::gputools;
 
 namespace salus::oplib::tensorflow {
@@ -123,30 +126,48 @@ void LaneMgr::processRequests()
     processRequests(sstl::with_guard(m_mu));
 }
 
-void LaneMgr::processRequests(sstl::detail::Guard &&g)
+void LaneMgr::processRequests(sstl::detail::Guard &&)
 {
-    UNUSED(g);
-
     auto it = m_pending.begin();
     auto end = m_pending.end();
     while (it != end) {
         auto &req = *it;
+        const auto reqLen = req.layout.memoryLimits.size();
 
-        // TODO: the algorithm below assumes single GPU, to scale to multiple ones, a global lock is needed
-        CHECK_EQ(req.layout.memoryLimits.size(), 1_sz) << "Only single lane layout is supported";
+        CHECK_LE(reqLen, m_gpus.size()) << "Requested more GPUs than available";
+
+        // use a greedy algorithm: sort the requested layout in descending order,
+        // and try to fit the largest one first
+        std::vector<int> indices(reqLen);
+        std::iota(indices.begin(), indices.end(), 0);
+        std::sort(indices.begin(), indices.end(), [&req](const int a, const int b) {
+            if (req.layout.memoryLimits.at(a) == req.layout.memoryLimits.at(b)) {
+                return req.layout.persistentOccupation.at(a) > req.layout.persistentOccupation.at(b);
+            }
+            return req.layout.memoryLimits.at(a) > req.layout.memoryLimits.at(b);
+        });
 
         std::vector<std::shared_ptr<LaneHolder>> lanes;
-        auto &gcb = m_gpus.at(0);
+        for (auto idx : indices) {
+            std::shared_ptr<LaneHolder> lane{nullptr};
+            for (auto &gcb : m_gpus) {
+                lane = gcb.bestFitFor(req.layout.memoryLimits.at(idx), req.layout.persistentOccupation.at(idx));
+                if (lane) {
+                    break;
+                }
+            }
+            if (!lane) {
+                // can't find a suitable allocation
+                break;
+            }
+            lanes.emplace_back(std::move(lane));
+        }
 
-        // using a best fit policy
-        const auto firstIdx = 0;
-        auto lane = gcb.bestFitFor(req.layout.memoryLimits.at(firstIdx), req.layout.persistentOccupation.at(firstIdx));
-        if (!lane) {
-            // can't find a suitable allocation.
+        if (lanes.size() != reqLen) {
+            // not enough lanes
+            lanes.clear();
             ++it;
             continue;
         }
-        lanes.emplace_back(std::move(lane));
 
         req.cb(std::move(lanes));
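The rewritten processRequests is essentially first-fit-decreasing across GPUs: place the biggest lane first, and leave the whole request pending if any lane fails to fit. A toy version of the same policy with plain integers; placeLanes and the memory numbers are illustrative, not the real LaneMgr API:

```cpp
#include <algorithm>
#include <cstdio>
#include <functional>
#include <vector>

// Toy lane placement: sort requested sizes in descending order, then put
// each on the first GPU with enough free memory. Returns false (leaving
// the request pending) if any lane cannot be placed.
bool placeLanes(std::vector<size_t> want, std::vector<size_t> gpuFree)
{
    std::sort(want.begin(), want.end(), std::greater<>());
    for (auto sz : want) {
        bool placed = false;
        for (auto &free : gpuFree) {
            if (free >= sz) {
                free -= sz; // commit the lane to this GPU
                placed = true;
                break;
            }
        }
        if (!placed) {
            return false;
        }
    }
    return true;
}

int main()
{
    // Two GPUs with 8000 and 4000 units free: 6000+3000+2000 fits, 6000+6000 does not.
    std::printf("%d\n", placeLanes({3000, 6000, 2000}, {8000, 4000})); // 1
    std::printf("%d\n", placeLanes({6000, 6000}, {8000, 4000}));       // 0
}
```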
diff --git a/src/oplibraries/tensorflow/device/gpu/smeventpoller.cpp b/src/oplibraries/tensorflow/device/gpu/smeventpoller.cpp
new file mode 100644
index 0000000..c4a42d2
--- /dev/null
+++ b/src/oplibraries/tensorflow/device/gpu/smeventpoller.cpp
@@ -0,0 +1,178 @@
+/*
+ * Copyright 2019 Peifeng Yu
+ *
+ * This file is part of Salus
+ * (see https://github.com/SymbioticLab/Salus).
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "smeventpoller.h"
+
+#include "oplibraries/tensorflow/v3/smblocker.h"
+#include "platform/thread_annotations.h"
+
+namespace salus::oplib::tensorflow {
+
+namespace {
+
+} // namespace
+
+SMEventPoller::SMEventPoller(tf::gpu::StreamExecutor *se)
+    : m_pool(ThreadPoolOptions{}
+                 .setWorkerName("SMEvtWorker")
+                 // one thread for poller, one thread for executing callbacks
+                 .setNumThreads(2))
+    , m_se(se)
+{
+    startPollingLoop();
+}
+
+SMEventPoller::~SMEventPoller()
+{
+    stopPollingLoop();
+
+    // free anything owned by this
+    for (auto &act : m_pendingActions) {
+        SMBlocker::instance().release(act.count);
+        if (act.func) {
+            act.func();
+        }
+    }
+}
+
+void SMEventPoller::startPollingLoop()
+{
+    m_pool.run([this]() {
+        pollLoop();
+    });
+}
+
+void SMEventPoller::stopPollingLoop()
+{
+    m_stopPolling.notify();
+    // make sure to wake up polling loop thread
+    m_eventsStaging.notify();
+    m_pollingStopped.wait();
+}
+
+void SMEventPoller::pollLoop()
+{
+    threading::set_thread_name("SMEvtPoller");
+    // actions go from m_stagedEvents to staging, to waiting and finally to ready
+    while (!m_stopPolling.notified()) {
+        PendingActions staging;
+        {
+            auto g = sstl::with_guard(m_mu);
+            staging.swap(m_stagedEvents);
+        }
+
+        m_pendingActions.insert(m_pendingActions.end(),
+                                std::make_move_iterator(staging.begin()),
+                                std::make_move_iterator(staging.end()));
+
+        if (m_pendingActions.empty()) {
+            m_eventsStaging.wait();
+            continue;
+        }
+
+        auto ready = pollEvents();
+        executeReady(ready);
+    }
+    m_pollingStopped.notify();
+}
+
+SMEventPoller::PendingActions SMEventPoller::pollEvents()
+{
+    if (VLOG_IS_ON(2)) {
+        size_t freeSize;
+        {
+            auto g = sstl::with_guard(m_mu);
+            freeSize = m_freeEvents.size();
+        }
+        VLOG(2) << "SMEventPoller m_freeEvents " << freeSize << " m_pendingActions " << m_pendingActions.size();
+    }
+    PendingActions ready;
+    auto it = m_pendingActions.begin();
+    while (it != m_pendingActions.end()) {
+        auto &act = *it;
+        CHECK_NOTNULL(act.event);
+        auto s = act.event->PollForStatus();
+        switch (s) {
+        default:
+        case tf::gpu::Event::Status::kUnknown:
+        case tf::gpu::Event::Status::kError:
+            // We don't expect to see these. Someday maybe propagate
+            // a Status error, but for now fail hard.
+            LOG(FATAL) << "Unexpected Event status: " << static_cast<int>(s);
+            break;
+        case tf::gpu::Event::Status::kPending:
+            break;
+        case tf::gpu::Event::Status::kComplete:
+            // add event back to free event
+            {
+                auto g = sstl::with_guard(m_mu);
+                m_freeEvents.emplace_back(std::move(act.event));
+            }
+            // add action to ready
+            ready.emplace_back(std::move(act));
+            // remove from pending
+            it = m_pendingActions.erase(it);
+            // skip ++it
+            continue;
+        }
+
+        ++it;
+    }
+    return ready;
+}
+
+void SMEventPoller::executeReady(SMEventPoller::PendingActions &ready)
+{
+    for (auto &act : ready) {
+        SMBlocker::instance().release(act.count);
+        if (act.func) {
+            act.func();
+        }
+    }
+}
+
+void SMEventPoller::queueAction(tf::gpu::Stream *stream, PendingAction act)
+{
+    act.event = allocEvent();
+    CHECK_NOTNULL(act.event);
+    stream->ThenRecordEvent(act.event.get());
+
+    {
+        auto g = sstl::with_guard(m_mu);
+        m_stagedEvents.emplace_back(std::move(act));
+    }
+    // Wake up the polling thread
+    m_eventsStaging.notify();
+}
+
+std::unique_ptr<tf::gpu::Event> SMEventPoller::allocEvent()
+{
+    auto g = sstl::with_guard(m_mu);
+    // Events are created on demand, and repeatedly reused. There is no
+    // limit placed here on the number of allocated Events.
+    if (m_freeEvents.empty()) {
+        m_freeEvents.emplace_back(std::make_unique<tf::gpu::Event>(m_se));
+        m_freeEvents.back()->Init();
+    }
+    auto e = std::move(m_freeEvents.back());
+    m_freeEvents.pop_back();
+    return e;
+}
+
+} // namespace salus::oplib::tensorflow
diff --git a/src/oplibraries/tensorflow/device/gpu/smeventpoller.h b/src/oplibraries/tensorflow/device/gpu/smeventpoller.h
new file mode 100644
index 0000000..1a51bb3
--- /dev/null
+++ b/src/oplibraries/tensorflow/device/gpu/smeventpoller.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright 2019 Peifeng Yu
+ *
+ * This file is part of Salus
+ * (see https://github.com/SymbioticLab/Salus).
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SALUS_OPLIB_TENSORFLOW_SMEVENTPOLLER_H
+#define SALUS_OPLIB_TENSORFLOW_SMEVENTPOLLER_H
+
+#include "oplibraries/tensorflow/tensorflow_headers.h"
+
+#include "execution/threadpool/threadpool.h"
+#include "utils/fixed_function.hpp"
+#include "utils/threadutils.h"
+#include "utils/pointerutils.h"
+
+#include <list>
+#include <memory>
+#include <vector>
+
+namespace salus::oplib::tensorflow {
+
+class SMEventPoller
+{
+public:
+    explicit SMEventPoller(tf::gpu::StreamExecutor *se);
+    ~SMEventPoller();
+
+    inline void thenReleaseSM(tf::gpu::Stream *stream, uint64_t count)
+    {
+        if (count == 0) {
+            return;
+        }
+        queueAction(stream, {count, {}, nullptr});
+    }
+
+    inline void thenExecute(tf::gpu::Stream *stream, sstl::FixedFunction<void()> func)
+    {
+        queueAction(stream, {{}, std::move(func), nullptr});
+    }
+
+private:
+    // Posting action from other threads
+    struct PendingAction
+    {
+        uint64_t count; // num of SMs to release
+        sstl::FixedFunction<void()> func; // action to execute
+        std::unique_ptr<tf::gpu::Event> event; // perform action after this event
+    };
+
+    using PendingActions = std::vector<PendingAction>;
+
+    std::unique_ptr<tf::gpu::Event> allocEvent();
+
+    void queueAction(tf::gpu::Stream *stream, PendingAction action);
+
+    void startPollingLoop();
+    void stopPollingLoop();
+
+    void pollLoop();
+    PendingActions pollEvents();
+    void executeReady(PendingActions &ready);
+
+    // pending actions waiting for their events, in order
+    std::list<PendingAction> m_pendingActions;
+
+    // Threading related variables
+    sstl::notification m_stopPolling;
+    sstl::notification m_pollingStopped;
+
+    ThreadPool m_pool;
+
+    // other threads put actions into this queue, which will be regularly picked up by polling thread
+    PendingActions m_stagedEvents GUARDED_BY(m_mu);
+    std::mutex m_mu;
+    sstl::notification m_eventsStaging;
+
+    // GPU Event related variables
+    tf::gpu::StreamExecutor * const m_se;
+
+    // Free events
+    std::vector<std::unique_ptr<tf::gpu::Event>> m_freeEvents GUARDED_BY(m_mu);
+};
+
+} // namespace salus::oplib::tensorflow
+
+#endif // SALUS_OPLIB_TENSORFLOW_SMEVENTPOLLER_H
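The poller's contract is easiest to see end to end: record an event on the stream, stash the action, and let one polling thread sweep for completed events and run their actions. A self-contained model of that loop, where FakeEvent stands in for tf::gpu::Event (which the real class recycles through m_freeEvents):

```cpp
#include <cstdio>
#include <deque>
#include <functional>

// FakeEvent completes after a fixed number of polls, standing in for a
// CUDA event that becomes complete once the stream reaches its record point.
struct FakeEvent
{
    int remainingPolls;
    bool complete() { return --remainingPolls <= 0; }
};

struct Action
{
    FakeEvent event;
    std::function<void()> func; // e.g. releasing SM blocks
};

int main()
{
    std::deque<Action> pending;
    pending.push_back({{2}, [] { std::printf("release 4 SM blocks\n"); }});

    // The polling loop: sweep pending actions, run the completed ones.
    while (!pending.empty()) {
        for (auto it = pending.begin(); it != pending.end();) {
            if (it->event.complete()) {
                it->func();             // executeReady() in the real class
                it = pending.erase(it); // the real code also recycles the event
            } else {
                ++it;
            }
        }
    }
}
```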
diff --git a/src/oplibraries/tensorflow/device/shadowdevices.h b/src/oplibraries/tensorflow/device/shadowdevices.h
index 6690bd0..df5cfe3 100644
--- a/src/oplibraries/tensorflow/device/shadowdevices.h
+++ b/src/oplibraries/tensorflow/device/shadowdevices.h
@@ -111,6 +111,8 @@ class ShadowDevice : public tf::Device
 
     ~ShadowDevice() override;
 
+    sstl::not_null<tf::Device *> base() const { return m_base; }
+
     // Hook allocators
     tf::Allocator *GetAllocator(tf::AllocatorAttributes attr) override;
     tf::Allocator *GetStepAllocator(tf::AllocatorAttributes attr,
diff --git a/src/oplibraries/tensorflow/tfinstance.cpp b/src/oplibraries/tensorflow/tfinstance.cpp
index 5638f7b..7e78d63 100644
--- a/src/oplibraries/tensorflow/tfinstance.cpp
+++ b/src/oplibraries/tensorflow/tfinstance.cpp
@@ -91,34 +91,62 @@ void TFInstance::handleCreateSession(std::unique_ptr &
     LaneMgr::Layout layout;
 
     // Get resource estimation from client
-    constexpr const auto rt = "MEMORY:GPU";
-    const auto totalGPUMemory = m_laneMgr->totalMemoryForGPU(0);
-    size_t limit = 0;
-    size_t persistant = 0;
+    constexpr const char *rt[] = {
+        "MEMORY:GPU0",
+        "MEMORY:GPU1",
+        "MEMORY:GPU2",
+        "MEMORY:GPU3",
+        "MEMORY:GPU4",
+        nullptr,
+    };
     auto &m = req->config().salus_options().resource_map();
-    persistant = static_cast<size_t>(std::round(sstl::getOrDefault(m.persistant(), rt, 0.0)));
-    // HACK: scale up 10% to mitigate OOM and fragmentation
-    persistant = static_cast<size_t>(persistant * 1.1);
-    limit += persistant;
-    limit += static_cast<size_t>(std::round(sstl::getOrDefault(m.temporary(), rt, 0.0)));
-
-    // HACK: Double the persistant and add to to temporary, just to be safe
-    limit = static_cast<size_t>(limit * 1.05); // and even more 10%
-    limit = std::min(limit, totalGPUMemory); // cap to max value
-
-    if (limit == 0) {
-        limit = totalGPUMemory;
-        persistant = limit;
+    for (auto iGpu = 0_sz; iGpu != m_laneMgr->numGPUs(); ++iGpu) {
+        const auto totalGPUMemory = m_laneMgr->totalMemoryForGPU(iGpu);
+
+        CHECK_NOTNULL(rt[iGpu]) << "We need more GPU strings";
+
+        size_t limit = 0;
+        size_t persistant = 0;
+        auto p = sstl::optionalGet(m.persistant(), rt[iGpu]);
+        auto t = sstl::optionalGet(m.temporary(), rt[iGpu]);
+        if (!p || !t) {
+            break;
+        }
+        persistant = static_cast<size_t>(std::round(*p));
+        // HACK: scale persistent up 10% to mitigate OOM and fragmentation
+        persistant = static_cast<size_t>(persistant * 1.1);
+        limit += persistant;
+
+        limit += static_cast<size_t>(std::round(*t));
+
+        // HACK: scale the total up 5%, just to be safe
+        limit = static_cast<size_t>(limit * 1.05);
+        limit = std::min(limit, totalGPUMemory); // cap to max value
+
+        layout.memoryLimits.push_back(limit);
+        layout.persistentOccupation.push_back(persistant);
+    }
+
+    if (layout.memoryLimits.empty()) {
+        auto limit = m_laneMgr->totalMemoryForGPU(0);
+        layout.memoryLimits.push_back(limit);
+        layout.persistentOccupation.push_back(limit);
         LOG(WARNING) << "No resource info for current session, assuming whole GPU allocation: " << limit;
     }
-    layout.memoryLimits.push_back(limit);
-    layout.persistentOccupation.push_back(persistant);
+
+    CHECK_EQ(layout.memoryLimits.size(), layout.persistentOccupation.size());
 
     auto totalRunningTime = static_cast<uint64_t>(std::round(sstl::getOrDefault(m.persistant(), "TIME:TOTAL", 0.0))) * 1000;
     ectx->setExpectedRunningTime(totalRunningTime);
 
-    m_laneMgr->requestLanes(std::move(layout), [&resp, cb = std::move(cb), req = std::move(req), ectx = std::move(ectx),
+    // smaller is higher priority
+    auto priority = static_cast<int>(sstl::getOrDefault(m.persistant(), "SCHED:PRIORITY", 20));
+
+    LOG(INFO) << "Accept session with priority " << priority;
+
+    m_laneMgr->requestLanes(std::move(layout), [&resp, priority,
+                                                cb = std::move(cb), req = std::move(req), ectx = std::move(ectx),
                                                 this](auto &&lanes) mutable {
         std::vector devices;
 
@@ -134,6 +162,7 @@
         // NOTE: laneId on ectx is separated from actual lane implementation.
         // It is only used to have separate scheduling domain. So use first lane's id as the id
         // Revisit if later multi-lane for a job is implemented.
+        // TODO: support multiple lane id
         ectx->setLaneId(lanes.at(0)->id());
 
         auto session =
@@ -151,7 +180,7 @@
         });
         // Keep a reference for lanes on ectx's user data
         // which should outlive the TFSession.
-        ectx->setUserData(std::forward<decltype(lanes)>(lanes));
+        ectx->setUserData(TFExecutionCtxData{std::forward<decltype(lanes)>(lanes), priority});
 
         // Register force interrupt handler
         ectx->setInterruptCallback([this, handle]() { popSession(handle)->safeClose(); });
diff --git a/src/oplibraries/tensorflow/tfutils.h b/src/oplibraries/tensorflow/tfutils.h
index b36f103..0b6c5ef 100644
--- a/src/oplibraries/tensorflow/tfutils.h
+++ b/src/oplibraries/tensorflow/tfutils.h
@@ -25,6 +25,7 @@
 #include
 #include
 #include
+#include <memory>
 
 #define CallWithMasterMethodName(m) \
     m(CreateSession) m(ExtendSession) m(PartialRunSetup) m(CloseSession) m(ListDevices) m(Reset) m(RunStep)
@@ -72,6 +73,13 @@
 
 using POpKernel = std::unique_ptr;
 
 std::string tfGraphToGraphviz(const tf::Graph &g, const std::string &name);
 
+class LaneHolder;
+struct TFExecutionCtxData
+{
+    std::vector<std::shared_ptr<LaneHolder>> lanes;
+    int priority;
+};
+
 } // namespace salus::oplib::tensorflow
 
 #endif // SALUS_OPLIB_TENSORFLOW_TFUTILS_H
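TFExecutionCtxData rides in the execution context's type-erased user data and is recovered with std::any_cast in tf_executor.cpp below. A self-contained model of that round trip, where CtxData abbreviates the real struct (LaneHolder reduced to an int):

```cpp
#include <any>
#include <cstdio>
#include <memory>
#include <vector>

// Stand-in for TFExecutionCtxData: lanes plus a scheduling priority.
struct CtxData
{
    std::vector<std::shared_ptr<int>> lanes;
    int priority;
};

int main()
{
    // Session-creation side: stash the data in a type-erased slot.
    std::any userData = CtxData{{std::make_shared<int>(1)}, 20};

    // Executor side: recover the priority with std::any_cast.
    auto priority = std::any_cast<CtxData>(userData).priority;
    std::printf("priority %d\n", priority); // priority 20
}
```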
diff --git a/src/oplibraries/tensorflow/v3/smblocker.cpp b/src/oplibraries/tensorflow/v3/smblocker.cpp
new file mode 100644
index 0000000..f2eaf55
--- /dev/null
+++ b/src/oplibraries/tensorflow/v3/smblocker.cpp
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2019, peifeng
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "oplibraries/tensorflow/tensorflow_headers.h"
+#include "oplibraries/tensorflow/v3/smblocker.h"
+#include "utils/threadutils.h"
+#include "utils/containerutils.h"
+
+#include <algorithm>
+
+namespace {
+
+struct SalusCudaKernelLaunchParams
+{
+    struct Vec3
+    {
+        uint64_t x;
+        uint64_t y;
+        uint64_t z;
+    };
+    Vec3 blockCount;
+    Vec3 threadPerBlock;
+    uint64_t sharedMemBytes;
+};
+
+thread_local std::vector<SalusCudaKernelLaunchParams> SavedCudaKernelLaunches{};
+thread_local uint64_t CurrentThreadHoldingBlocks = 0;
+
+inline auto max(uint64_t a, SalusCudaKernelLaunchParams::Vec3 vec)
+{
+    auto b = vec.x * vec.y * vec.z;
+    return std::max(a, b);
+}
+
+} // namespace
+
+extern "C" {
+
+void salus_kernel_launch_callback(unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
+                                  unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
+                                  unsigned int sharedMemBytes,
+                                  void *)
+{
+    SavedCudaKernelLaunches.push_back(SalusCudaKernelLaunchParams{
+        {gridDimX, gridDimY, gridDimZ},
+        {blockDimX, blockDimY, blockDimZ},
+        sharedMemBytes,
+    });
+    VLOG(3) << "Got kernel launch params: blk=("
+            << gridDimX << "," << gridDimY << "," << gridDimZ
+            << ") x thd=(" << blockDimX << "," << blockDimY << "," << blockDimZ << "), " << sharedMemBytes;
+}
+
+} // extern "C"
+
+namespace salus::oplib::tensorflow {
+
+double SMBlocker::m_scaleFactorSM = 0.0;
+
+SMBlocker &SMBlocker::instance()
+{
+    static SMBlocker blocker(scaleFactorSM());
+    return blocker;
+}
+
+SMUsage SMBlocker::queryAvailableSM()
+{
+    auto gpu_manager = tf::GPUMachineManager();
+    // TODO: assumes every device has the same number of SMs
+    auto se = gpu_manager->ExecutorForDevice(0).ValueOrDie();
+    return {
+        se->GetDeviceDescription().threads_per_block_limit(),
+        static_cast<uint64_t>(se->GetDeviceDescription().core_count())
+    };
+}
+
+SMBlocker::SMBlocker(double factor)
+    : m_maxUsage{queryAvailableSM(), factor}
+    , m_freeBlocks(m_maxUsage.get().blockCount)
+{
+}
+
+uint64_t SMBlocker::currentThreadSMHolding() const
+{
+    return CurrentThreadHoldingBlocks;
+}
+
+void SMBlocker::saveCurrentThreadResults(uint64_t graphId, int nodeId)
+{
+    // reset current thread value
+    CurrentThreadHoldingBlocks = 0;
+
+    // update cache
+    std::unique_lock l{m_mu};
+
+    SMUsage newUsage{0, 0};
+    LOG(DEBUG) << "SavedCudaKernelLaunches " << SavedCudaKernelLaunches.size();
+    for (const auto &res : SavedCudaKernelLaunches) {
+        LOG(DEBUG) << "SavedCudaKernelLaunches: blk=("
+                   << res.blockCount.x << "," << res.blockCount.y << "," << res.blockCount.z
+                   << ") x thd=(" << res.threadPerBlock.x << "," << res.threadPerBlock.y << "," << res.threadPerBlock.z << ")";
+        newUsage.threadPerBlock = max(newUsage.threadPerBlock, res.threadPerBlock);
+        newUsage.blockCount = max(newUsage.blockCount, res.blockCount);
+    }
+
+    auto &usage = m_cache[std::make_pair(graphId, nodeId)];
+    if ((usage.blockCount != 0 || usage.threadPerBlock != 0) && usage != newUsage) {
+        LOG(WARNING) << "Overriding SM usage for graph " << graphId << " node " << nodeId
+                     << ", previous: blk=" << usage.blockCount << " thd=" << usage.threadPerBlock
+                     << ", new: blk=" << newUsage.blockCount << " thd=" << newUsage.threadPerBlock;
+    }
+    usage = newUsage;
+
+    SavedCudaKernelLaunches.clear();
+}
+
+bool SMBlocker::tryTake(uint64_t graphId, int nodeId, int priority)
+{
+    auto smUsage = getUsageForKernel(graphId, nodeId);
+
+    auto res = m_freeBlocks.try_wait(smUsage, priority);
+    if (res) {
+        // save the count
+        CurrentThreadHoldingBlocks = smUsage;
+        LogSMTracing() << "Passed at SMBlocker: graph " << graphId << " node " << nodeId
+                       << "
sm " << smUsage << " priority " << priority; + } + return res; +} + +void SMBlocker::wait(uint64_t graphId, int nodeId, int priority) +{ + auto smUsage = getUsageForKernel(graphId, nodeId); + + // save the count + CurrentThreadHoldingBlocks = smUsage; + + LogSMTracing() << "Wait at SMBlocker: graph " << graphId << " node " << nodeId + << " sm " << smUsage << " priority " << priority; + m_freeBlocks.wait(smUsage, priority); + LogSMTracing() << "Took at SMBlocker: graph " << graphId << " node " << nodeId + << " sm " << smUsage << " priority " << priority; +} + +uint64_t SMBlocker::getUsageForKernel(uint64_t graphId, int nodeId) +{ + std::shared_lock l{m_mu}; + + auto usage = sstl::getOrDefault(m_cache, {graphId, nodeId}, {}); + + return std::min(usage.blockCount, m_maxUsage.get().blockCount); +} + +void SMBlocker::release(uint64_t numSms) +{ + LogSMTracing() << "Release at SMBlocker: graph " << 0 << " node " << 0 + << " sm " << numSms << " priority " << 0; + m_freeBlocks.post(numSms); +} + +} // namespace salus::oplib::tensorflow diff --git a/src/oplibraries/tensorflow/v3/smblocker.h b/src/oplibraries/tensorflow/v3/smblocker.h new file mode 100644 index 0000000..714a871 --- /dev/null +++ b/src/oplibraries/tensorflow/v3/smblocker.h @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2019, peifeng + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef SALUS_OPLIB_TENSORFLOW_SMBLOCKER_H
+#define SALUS_OPLIB_TENSORFLOW_SMBLOCKER_H
+
+#include "oplibraries/tensorflow/tensorflow_headers.h"
+
+#include "utils/threadutils.h"
+
+#include <boost/functional/hash.hpp>
+
+#include <shared_mutex>
+#include <unordered_map>
+#include <utility>
+
+namespace salus::oplib::tensorflow {
+struct SMUsage
+{
+    uint64_t threadPerBlock = 0;
+    uint64_t blockCount = 0;
+
+    bool operator ==(const SMUsage &other) const {
+        return threadPerBlock == other.threadPerBlock && blockCount == other.blockCount;
+    }
+
+    bool operator !=(const SMUsage &other) const {
+        return !(*this == other);
+    }
+};
+
+class SMBlocker
+{
+public:
+    static SMBlocker &instance();
+
+    static void setScaleFactorSM(double factor)
+    {
+        m_scaleFactorSM = factor;
+    }
+
+    static double scaleFactorSM()
+    {
+        CHECK_NE(m_scaleFactorSM, 0.0) << "Must call SMBlocker::setScaleFactorSM before getting value";
+        return m_scaleFactorSM;
+    }
+
+    /**
+     * @brief Release this number of SMs
+     */
+    void release(uint64_t numSms);
+
+    /**
+     * @brief Return the number of SMs held by the current thread
+     * @return
+     */
+    uint64_t currentThreadSMHolding() const;
+
+    /**
+     * @brief Save the current thread's launch parameters
+     * @param graphId
+     * @param nodeId
+     */
+    void saveCurrentThreadResults(uint64_t graphId, int nodeId);
+
+    /**
+     * @brief Non-blocking version of wait
+     * @param graphId
+     * @param nodeId
+     * @param priority Smaller priority is higher, default is 10
+     * @return true if the needed resource was successfully acquired
+     */
+    bool tryTake(uint64_t graphId, int nodeId, int priority);
+
+    /**
+     * @brief Blocking wait, takes SMs
+     * @param graphId
+     * @param nodeId
+     * @param priority
+     */
+    void wait(uint64_t graphId, int nodeId, int priority);
+
+    static constexpr int MaxPriority = 100;
+
+private:
+    static double m_scaleFactorSM;
+    static SMUsage queryAvailableSM();
+
+    explicit SMBlocker(double factor);
+
+    uint64_t getUsageForKernel(uint64_t graphId, int nodeId);
+
+    class MaxSMUsage
+    {
+        SMUsage usage;
+        double scale;
+    public:
+        explicit MaxSMUsage(SMUsage u, double scale = 1.0)
+            : usage(u)
+            , scale(scale)
+        {}
+
+        SMUsage get() const {
+            return {usage.threadPerBlock, static_cast<uint64_t>(usage.blockCount * scale)};
+        }
+        double getScale() const {
+            return scale;
+        }
+        void set(SMUsage u) {
+            usage = u;
+        }
+        void setScale(double s) {
+            scale = s;
+        }
+    };
+
+    MaxSMUsage m_maxUsage;
+
+    sstl::priority_semaphore<MaxPriority> m_freeBlocks;
+
+    using KernelId = std::pair<uint64_t, int>;
+    std::unordered_map<KernelId, SMUsage, boost::hash<KernelId>> m_cache;
+    std::shared_mutex m_mu;
+};
+
+} // namespace salus::oplib::tensorflow
+
+#endif // SALUS_OPLIB_TENSORFLOW_SMBLOCKER_H
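m_cache is keyed by a (graphId, nodeId) pair, and std::unordered_map has no built-in hash for std::pair, so the header pulls one in (boost::hash in the reconstruction above). A minimal equivalent with a hand-rolled hasher, in case the boost dependency is unwanted:

```cpp
#include <cstdint>
#include <cstdio>
#include <functional>
#include <unordered_map>
#include <utility>

// std::hash has no std::pair specialization, so a pair-keyed cache needs
// an explicit hasher.
struct PairHash
{
    size_t operator()(const std::pair<uint64_t, int> &k) const noexcept
    {
        auto h1 = std::hash<uint64_t>{}(k.first);
        auto h2 = std::hash<int>{}(k.second);
        return h1 ^ (h2 + 0x9e3779b9 + (h1 << 6) + (h1 >> 2)); // boost-style combine
    }
};

int main()
{
    std::unordered_map<std::pair<uint64_t, int>, uint64_t, PairHash> cache;
    cache[{42, 7}] = 128; // graph 42, node 7 needs 128 blocks
    std::printf("%llu\n", (unsigned long long) cache[{42, 7}]);
}
```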
diff --git a/src/oplibraries/tensorflow/v3/tf_executor.cpp b/src/oplibraries/tensorflow/v3/tf_executor.cpp
index dafba69..1adc249 100644
--- a/src/oplibraries/tensorflow/v3/tf_executor.cpp
+++ b/src/oplibraries/tensorflow/v3/tf_executor.cpp
@@ -19,6 +19,7 @@ limitations under the License.
 #include "execution/engine/iterationcontext.h"
 #include "execution/iterationtask.h"
 #include "oplibraries/tensorflow/tfinstance.h"
+#include "oplibraries/tensorflow/v3/smblocker.h"
 #include "utils/envutils.h"
 
 namespace salus::oplib::tensorflow {
@@ -1103,6 +1104,11 @@ class ExecutorState
         return ready_.end();
     }
 
+    auto size() const
+    {
+        return ready_.size();
+    }
+
 private:
     tf::gtl::InlinedVector ready_;
     size_t front_index_;
@@ -1481,10 +1487,24 @@ void ExecutorState::Process(TaggedNode tagged_node, tf::int64)
     Status s;
     EntryVector outputs;
     bool completed = false;
+    uint64_t failedTake = 0;
+    auto priority = std::any_cast<TFExecutionCtxData>(impl_->params_.ins->userData()).priority;
     inline_ready.push_back(tagged_node);
     while (!inline_ready.empty()) {
         tagged_node = inline_ready.front();
+
+        if (!SMBlocker::instance().tryTake(impl_->graph_id_, tagged_node.node->id(), priority)) {
+            ++failedTake;
+            if (failedTake < inline_ready.size()) {
+                continue;
+            } else {
+                SMBlocker::instance().wait(impl_->graph_id_, tagged_node.node->id(), priority);
+                failedTake = 0;
+            }
+        }
+        inline_ready.pop_front();
+
         const auto *node = tagged_node.node;
         FrameState *input_frame = tagged_node.input_frame;
         const tf::int64 input_iter = tagged_node.input_iter;
@@ -1564,6 +1584,8 @@
             AsyncState *state = new AsyncState(params, tagged_node, &item, first_input, nullptr);
 
             auto done = [this, state]() {
+                SMBlocker::instance().saveCurrentThreadResults(impl_->graph_id_, state->item->node->id());
+
                 auto *device = impl_->params_.device;
                 Entry *first_input = state->first_input; // Shorthand
@@ -1606,6 +1628,9 @@
             tf::OpKernelContext ctx(&params, item.num_outputs);
             CHECK_NOTNULL(op_kernel);
             device->Compute(op_kernel, &ctx);
+
+            SMBlocker::instance().saveCurrentThreadResults(impl_->graph_id_, item.node->id());
+
             s = ProcessOutputs(item, &ctx, &outputs, nullptr);
             if (s.ok() && impl_->device_record_tensor_accesses_) {
                 // Get the list of all tensors accessed during the execution
diff --git a/src/platform/CMakeLists.txt b/src/platform/CMakeLists.txt
index 088cc12..5f9cb1f 100644
--- a/src/platform/CMakeLists.txt
+++ b/src/platform/CMakeLists.txt
@@ -8,10 +8,11 @@ if(WIN32)
         "windows/memory.cpp"
         "windows/signals.cpp"
     )
-else(WIN32)
+else()
     # POSIX
     list(APPEND SRC_LIST
         "posix/memory.cpp"
         "posix/signals.cpp"
+        "posix/thread_annotations.cpp"
     )
 endif(WIN32)
 
@@ -22,4 +23,13 @@
 target_link_libraries(platform
     PRIVATE
         protos_gen
+        Threads::Threads
 )
+
+if(NOT WIN32)
+    # POSIX
+    target_compile_definitions(platform
+        PRIVATE
+            _GNU_SOURCE=1
+    )
+endif()
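The acquisition loop added to ExecutorState::Process above tries a non-blocking take for the node at the front of inline_ready, and only falls back to a blocking wait once every queued node has failed a try. A single-threaded model of that control flow; the blocking wait is simulated by simply granting the blocks, since there is no second thread here to release them:

```cpp
#include <cstdio>
#include <deque>

// Non-blocking take, standing in for SMBlocker::tryTake().
bool tryTake(int need, int &freeBlocks)
{
    if (freeBlocks >= need) {
        freeBlocks -= need;
        return true;
    }
    return false;
}

int main()
{
    std::deque<int> ready{3, 1, 2}; // SM demand per queued node
    int freeBlocks = 2;
    size_t failedTake = 0;
    while (!ready.empty()) {
        int need = ready.front();
        if (!tryTake(need, freeBlocks)) {
            if (++failedTake < ready.size()) {
                continue; // retry; concurrently running ops may release SMs
            }
            // every pending node has failed once: block until SMs free up
            freeBlocks += need; // stand-in for SMBlocker::wait() acquiring
            failedTake = 0;
            continue;
        }
        ready.pop_front();
        std::printf("run node needing %d (free now %d)\n", need, freeBlocks);
        freeBlocks += need; // op retired, SMs released
    }
}
```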
diff --git a/src/platform/logging.cpp b/src/platform/logging.cpp
index 369ceb0..9547064 100644
--- a/src/platform/logging.cpp
+++ b/src/platform/logging.cpp
@@ -117,7 +117,7 @@ void initialize(const Params &params)
 
     // Force to create loggers here with default configuration
     // in non-performance sensitive code path.
-    for (auto tag : {logging::kAllocTag, logging::kOpTracing, logging::kPerfTag, logging::kDefTag}) {
+    for (auto tag : {logging::kSMTag, logging::kAllocTag, logging::kOpTracing, logging::kPerfTag, logging::kDefTag}) {
         auto logger = Loggers::getLogger(tag);
         DCHECK(logger);
     }
diff --git a/src/platform/logging.h b/src/platform/logging.h
index 0f1853e..6ff6355 100644
--- a/src/platform/logging.h
+++ b/src/platform/logging.h
@@ -116,6 +116,7 @@
 constexpr const auto kAllocTag = "alloc";
 constexpr const auto kPerfTag = "performance";
 constexpr const auto kOpTracing = "optracing";
 constexpr const auto kDefTag = "default";
+constexpr const auto kSMTag = "smtracing";
 
 // logging configurations
 struct Params
@@ -133,6 +134,7 @@ void initialize(const Params &params);
 #define LogPerf() CLOG(TRACE, logging::kPerfTag)
 #define LogAlloc() CLOG(TRACE, logging::kAllocTag)
 #define LogOpTracing() CLOG(TRACE, logging::kOpTracing)
+#define LogSMTracing() CLOG(TRACE, logging::kSMTag)
 
 // Additional operator<< implementations
 MAKE_LOGGABLE(std::exception_ptr, ep, os);
diff --git a/src/platform/posix/thread_annotations.cpp b/src/platform/posix/thread_annotations.cpp
new file mode 100644
index 0000000..3ce20a8
--- /dev/null
+++ b/src/platform/posix/thread_annotations.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2019 Peifeng Yu
+ *
+ * This file is part of Salus
+ * (see https://github.com/SymbioticLab/Salus).
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "platform/thread_annotations.h"
+
+#include <pthread.h>
+
+#include <string>
+
+namespace salus::threading {
+
+void set_thread_name(std::string_view name)
+{
+#if defined(__GLIBC__)
+    // maximum length is 16 (including \0), longer than that causes an error
+    constexpr auto MAX_THREAD_NAME_LENGTH = 16 - 1;
+    pthread_setname_np(pthread_self(), std::string(name.substr(0, MAX_THREAD_NAME_LENGTH)).c_str());
+#elif defined(__APPLE__)
+    pthread_setname_np(std::string(name).c_str());
+#else
+#error unsupported platform for POSIX!
+#endif
+}
+
+} // namespace salus::threading
diff --git a/src/platform/thread_annotations.h b/src/platform/thread_annotations.h
index a4d6414..8db0e77 100644
--- a/src/platform/thread_annotations.h
+++ b/src/platform/thread_annotations.h
@@ -155,4 +155,11 @@ inline T &ts_unchecked_read(T &v) NO_THREAD_SAFETY_ANALYSIS
 }
 } // namespace salus::thread_safety_analysis
 
+#include <string_view>
+namespace salus::threading {
+
+void set_thread_name(std::string_view name);
+
+} // namespace salus::threading
+
 #endif // SALUS_PLATFORM_THREAD_ANNOTATIONS_H_
diff --git a/src/rpcserver/iothreadpool.cpp b/src/rpcserver/iothreadpool.cpp
index 4bb08be..7388c69 100644
--- a/src/rpcserver/iothreadpool.cpp
+++ b/src/rpcserver/iothreadpool.cpp
@@ -18,6 +18,7 @@
  */
 
 #include "iothreadpool.h"
+#include "platform/thread_annotations.h"
 
 #include
 
@@ -42,6 +43,7 @@ IOThreadPoolImpl::~IOThreadPoolImpl()
 
 void IOThreadPoolImpl::workerLoop()
 {
+    threading::set_thread_name("salus::IOThreadPoolWorker");
     m_context.run();
 }
 
diff --git a/src/rpcserver/zmqserver.cpp b/src/rpcserver/zmqserver.cpp
index d4fd3fc..8626cfe 100644
--- a/src/rpcserver/zmqserver.cpp
+++ b/src/rpcserver/zmqserver.cpp
@@ -22,6 +22,7 @@
 #include "rpcservercore.h"
 #include "platform/logging.h"
 #include "platform/signals.h"
+#include "platform/thread_annotations.h"
 #include "utils/protoutils.h"
 
 #include "protos.h"
@@ -292,6 +293,8 @@ void ZmqServer::sendMessage(MultiPartMessage &&parts)
 
 void ZmqServer::sendLoop()
 {
+    salus::threading::set_thread_name("ZmqSendLoop");
+
     zmq::socket_t sock(m_zmqCtx, zmq::socket_type::pair);
     sock.connect(kBeAddr);
     VLOG(2) << "Sending loop started";
diff --git a/src/salus-server.list b/src/salus-server.list
new file mode 100644
index 0000000..f7e71f2
--- /dev/null
+++ b/src/salus-server.list
@@ -0,0 +1,3 @@
+{
+    salus_kernel_launch_callback;
+};
diff --git a/src/utils/threadutils.cpp b/src/utils/threadutils.cpp
index 6b7bf6f..c368b9d 100644
--- a/src/utils/threadutils.cpp
+++ b/src/utils/threadutils.cpp
@@ -21,7 +21,7 @@
 
 namespace sstl {
 
-void semaphore::notify(uint32_t c)
+void semaphore::notify(uint64_t c)
 {
     {
         auto l = with_guard(m_mu);
@@ -31,7 +31,13 @@ void semaphore::notify(uint32_t c)
     m_cv.notify_all();
 }
 
-void semaphore::wait(uint32_t c)
+bool semaphore::may_block(uint64_t c)
+{
+    auto lock = with_guard(m_mu);
+    return m_count < c;
+}
+
+void semaphore::wait(uint64_t c)
 {
     auto lock = with_uguard(m_mu);
     m_cv.wait(lock, [&]() { return m_count >= c; });
diff --git a/src/utils/threadutils.h b/src/utils/threadutils.h
index 87a43ea..c7d854d 100644
--- a/src/utils/threadutils.h
+++ b/src/utils/threadutils.h
@@ -163,12 +163,91 @@
 class semaphore
 {
     std::mutex m_mu;
     std::condition_variable m_cv;
-    uint32_t m_count = 0; // Initialized as locked.
+    uint64_t m_count = 0;
 
 public:
-    void notify(uint32_t c = 1);
+    // Initialized as locked.
+    explicit semaphore(uint64_t init = 0) : m_count(init) {}
 
-    void wait(uint32_t c = 1);
+    void notify(uint64_t c = 1);
+
+    void wait(uint64_t c = 1);
+
+    bool may_block(uint64_t c = 1);
+};
+
+/**
+ * @brief Semaphore that can wait on a count and with strict priority.
+ * As long as a higher priority queue is not empty, lower priority requests will wait.
+ */
+template<int kMaxPriority, int kDefaultPriority = 10>
+class priority_semaphore
+{
+    std::mutex m_mu;
+    uint64_t m_pending[kMaxPriority]{};
+    std::condition_variable m_queues[kMaxPriority];
+    uint64_t m_count;
+
+public:
+    static_assert(kMaxPriority > 0, "Max priority must be greater than 0");
+    static_assert(kDefaultPriority < kMaxPriority, "Default priority must be in the range [0, kMaxPriority)");
+
+    explicit priority_semaphore(uint64_t init = 0) : m_count(init) {}
+
+    void post(uint64_t c = 1)
+    {
+        auto l = with_guard(m_mu);
+        m_count += c;
+        for (auto p = 0; p != kMaxPriority; ++p) {
+            if (m_pending[p] > 0) {
+                m_queues[p].notify_all();
+                break;
+            }
+        }
+    }
+
+    void wait(uint64_t c = 1, uint8_t p = kDefaultPriority)
+    {
+        auto lock = with_uguard(m_mu);
+        if (can_take(c, p)) {
+            m_count -= c;
+            return;
+        }
+        m_pending[p] += 1;
+        m_queues[p].wait(lock, [&]() { return can_take(c, p); });
+        m_pending[p] -= 1;
+        m_count -= c;
+    }
+
+    bool try_wait(uint64_t c = 1, uint8_t p = kDefaultPriority)
+    {
+        auto lock = with_guard(m_mu);
+        if (can_take(c, p)) {
+            m_count -= c;
+            return true;
+        }
+        return false;
+    }
+
+private:
+    /**
+     * @brief Must be called under lock of m_mu
+     * @param c
+     * @param p
+     * @return true if the resource can be taken at this priority level
+     */
+    bool can_take(uint64_t c, uint8_t p)
+    {
+        for (auto i = 0; i != p; ++i) {
+            if (m_pending[i] > 0) {
+                return false;
+            }
+        }
+        // Do we skip the current priority level's queue even if it has waiters?
+        // Yes: when woken up inside the cv's wait, m_pending has not been
+        // decremented yet, but we still need to proceed.
+        return m_count >= c;
+    }
+};
 
 /**
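A usage sketch for the new priority_semaphore: a pending high-priority waiter is served before a low-priority one, even if the low-priority waiter arrived first. The template arguments here follow the reconstruction above and are an assumption; with_guard/with_uguard come from the same header:

```cpp
#include "utils/threadutils.h" // for sstl::priority_semaphore

#include <chrono>
#include <cstdio>
#include <thread>

int main()
{
    // Two priority levels: 0 (high) and 1 (low); count starts at 0.
    sstl::priority_semaphore<2, 1> sem;

    std::thread low([&] {
        sem.wait(1, /*p=*/1);
        std::printf("low got it\n");
    });
    std::thread high([&] {
        sem.wait(1, /*p=*/0);
        std::printf("high got it\n");
    });

    std::this_thread::sleep_for(std::chrono::milliseconds(50)); // let both block
    sem.post(1);  // the high-priority queue is woken first
    high.join();
    sem.post(1);  // now the low-priority waiter can proceed
    low.join();
}
```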
diff --git a/src/wrapper/CMakeLists.txt b/src/wrapper/CMakeLists.txt
new file mode 100644
index 0000000..a2eef04
--- /dev/null
+++ b/src/wrapper/CMakeLists.txt
@@ -0,0 +1,2 @@
+# configure and install helper script
+configure_file(salus-server.in salus-server @ONLY)
diff --git a/src/wrapper/salus-server.in b/src/wrapper/salus-server.in
new file mode 100755
index 0000000..315199b
--- /dev/null
+++ b/src/wrapper/salus-server.in
@@ -0,0 +1,30 @@
+#! /bin/bash
+
+# Get current file directory
+SOURCE="${BASH_SOURCE[0]}"
+while [ -h "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink
+    DIR="$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd )"
+    SOURCE="$(readlink "$SOURCE")"
+    [[ $SOURCE != /* ]] && SOURCE="$DIR/$SOURCE" # if $SOURCE was a relative symlink, resolve it relative to the symlink's own directory
+done
+DIR="$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd )"
+
+# Find salus executable, it should be in the same directory as this script
+SALUS_EXECUTABLE=$DIR/salus-server-exec
+# Find cudahook library, it should be either in ../lib/libcudahook.so,
+# or, if we are in the build tree, in ./cudahook/libcudahook.so
+LIBCUDAHOOK=$(realpath $DIR/../lib/libcudahook.so 2>/dev/null)
+if [[ ! -f "$LIBCUDAHOOK" ]]; then
+    LIBCUDAHOOK=$(realpath $DIR/cudahook/libcudahook.so 2>/dev/null)
+fi
+
+if [[ ! -f "$LIBCUDAHOOK" ]]; then
+    echo "libcudahook.so not found!" >&2
+    exit 1
+fi
+
+export TF_CPP_MIN_LOG_LEVEL=${TF_CPP_MIN_LOG_LEVEL:-4}
+export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-2,3}
+
+export LD_PRELOAD=$LIBCUDAHOOK
+exec $SALUS_EXECUTABLE "$@"
diff --git a/tests/test_tf/lib/datasets/ptb_reader.py b/tests/test_tf/lib/datasets/ptb_reader.py
index a1b7609..8f1874c 100644
--- a/tests/test_tf/lib/datasets/ptb_reader.py
+++ b/tests/test_tf/lib/datasets/ptb_reader.py
@@ -37,7 +37,7 @@ def __init__(self, config, data, name=None):
 
 
 def _read_words(filename):
-    with tf.gfile.GFile(filename, "r") as f:
+    with tf.gfile.GFile(filename, "rb") as f:
         return f.read().decode("utf-8").replace("\n", "").split()
 
diff --git a/tests/test_tf/lib/seq2seq/ptb/ptb_word_lm.py b/tests/test_tf/lib/seq2seq/ptb/ptb_word_lm.py
index bfe1ed6..6f9eb46 100644
--- a/tests/test_tf/lib/seq2seq/ptb/ptb_word_lm.py
+++ b/tests/test_tf/lib/seq2seq/ptb/ptb_word_lm.py
@@ -206,7 +206,7 @@ def run_epoch(self, session, eval_op=None, verbose=False):
         state = session.run(self.initial_state)
 
         eval_interval = os.environ.get('SALUS_TFBENCH_EVAL_INTERVAL', '0.1')
-        eval_rand_factor = os.environ.get('SALUS_TFBENCH_EVAL_RAND_FACTOR', '5')
+        eval_rand_factor = os.environ.get('SALUS_TFBENCH_EVAL_RAND_FACTOR', None)
         eval_block = os.environ.get('SALUS_TFBENCH_EVAL_BLOCK', 'true')
 
         if eval_block != 'true':
@@ -242,10 +242,11 @@ def run_epoch(self, session, eval_op=None, verbose=False):
                 print(fmt_str.format(datetime.now(), step, np.exp(costs / iters), local_speed, dur))
 
             if self._train_op is None:
-                factor = 1
-                if eval_rand_factor != "1":
-                    factor = random.randint(1, int(eval_rand_factor))
-                time.sleep(float(eval_interval) * factor)
+                if float(eval_interval) > 0:
+                    factor = 1
+                    if eval_rand_factor is not None:
+                        factor = random.randint(1, int(eval_rand_factor))
+                    time.sleep(float(eval_interval) * factor)
 
         return np.exp(costs / iters), speeds
 
diff --git a/tests/test_tf/lib/tfhelper.py b/tests/test_tf/lib/tfhelper.py
index 08256ab..34838b2 100644
--- a/tests/test_tf/lib/tfhelper.py
+++ b/tests/test_tf/lib/tfhelper.py
@@ -9,6 +9,14 @@
 import tensorflow as tf
 
 
+def batch_size_from_env(default=1):
+    """Get batch size from environment variable SALUS_BATCH_SIZE"""
+    try:
+        return int(os.environ.get('SALUS_BATCH_SIZE', ''))
+    except ValueError:
+        return default
+
+
 def iteration_num_from_env(default=20):
     """Get iteration number from environment variable EXEC_ITER_NUMBER"""
     try:
diff --git a/tests/test_tf/test_mnist_tf.py b/tests/test_tf/test_mnist_tf.py
index 43239de..21b5496 100644
--- a/tests/test_tf/test_mnist_tf.py
+++ b/tests/test_tf/test_mnist_tf.py
@@ -16,6 +16,8 @@
 
 def run_mnist_softmax(sess, batch_size=50):
+    batch_size = tfhelper.batch_size_from_env(batch_size)
+    print('Using batch_size {}'.format(batch_size))
     x_image, y_, num_classes = fake_data(batch_size, None, height=28, width=28, depth=1, num_classes=10)
     y_ = tf.one_hot(y_, num_classes)
     x = tf.reshape(x_image, [-1, 784])
@@ -73,6 +75,8 @@ def conv2d(x, W):
 def max_pool_2x2(x):
     return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
 
+    batch_size = tfhelper.batch_size_from_env(batch_size)
+    print('Using batch_size {}'.format(batch_size))
     x_image, y_, num_classes = fake_data(batch_size, None, height=28, width=28, depth=1, num_classes=10)
     y_ = tf.one_hot(y_, num_classes)
     keep_prob = tf.placeholder(tf.float32)
@@ -150,6 +154,8 @@ def conv2d(x, W):
 def max_pool_2x2(x):
     return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
 
+    batch_size = tfhelper.batch_size_from_env(batch_size)
+    print('Using batch_size
{}'.format(batch_size)) x_image, y_, num_classes = fake_data(batch_size, None, height=28, width=28, depth=1, num_classes=10) y_ = tf.one_hot(y_, num_classes) keep_prob = tf.placeholder(tf.float32) @@ -266,8 +272,9 @@ def test_distributed(self, batch_size): dev='/job:tfworker/device:GPU:0', config=self._config(batch_size=batch_size)) - def test_correctness(self): - actual, expected = run_on_rpc_and_gpu(self._runner(), config=self._config()) + @parameterized.expand([(25,), (50,), (100,)]) + def test_correctness(self, batch_size): + actual, expected = run_on_rpc_and_gpu(self._runner(batch_size), config=self._config()) assertAllClose(actual, expected, rtol=1e-3) @@ -293,6 +300,8 @@ def _config(self, **kwargs): config.allow_soft_placement = True config.salus_options.resource_map.temporary['MEMORY:GPU'] = memusages[batch_size][0] config.salus_options.resource_map.persistant['MEMORY:GPU'] = memusages[batch_size][1] + config.salus_options.resource_map.temporary['MEMORY:GPU0'] = memusages[batch_size][0] + config.salus_options.resource_map.persistant['MEMORY:GPU0'] = memusages[batch_size][1] return config @@ -313,6 +322,8 @@ def _config(self, **kwargs): config.allow_soft_placement = True config.salus_options.resource_map.temporary['MEMORY:GPU'] = memusages[batch_size][0] config.salus_options.resource_map.persistant['MEMORY:GPU'] = memusages[batch_size][1] + config.salus_options.resource_map.temporary['MEMORY:GPU0'] = memusages[batch_size][0] + config.salus_options.resource_map.persistant['MEMORY:GPU0'] = memusages[batch_size][1] return config diff --git a/tests/test_tf/test_seq.py b/tests/test_tf/test_seq.py index 5d40663..971d558 100644 --- a/tests/test_tf/test_seq.py +++ b/tests/test_tf/test_seq.py @@ -19,10 +19,12 @@ def run_seq_ptb(sess, config_name): eval_config = get_config(config_name) config.max_max_epoch = 1 config.max_epoch = 1 + config.batch_size = tfhelper.batch_size_from_env(config.batch_size) + print("Using batch size {}".format(config.batch_size)) eval_config.max_max_epoch = 1 eval_config.max_epoch = 1 - eval_config.batch_size = 1 + eval_config.batch_size = config.batch_size eval_config.num_steps = 1 train_input, valid_input, test_input = datasets.ptb_data(config, eval_config) @@ -65,10 +67,12 @@ def test_seq_ptb(sess, config_name): eval_config = get_config(config_name) config.max_max_epoch = 1 config.max_epoch = 1 + config.batch_size = tfhelper.batch_size_from_env(config.batch_size) + print("Using batch size {}".format(config.batch_size)) eval_config.max_max_epoch = 1 eval_config.max_epoch = 1 - eval_config.batch_size = 1 + eval_config.batch_size = config.batch_size eval_config.num_steps = 1 train_input, valid_input, test_input = datasets.ptb_data(config, eval_config) @@ -192,6 +196,8 @@ def _config(self, model_size, isEval=False): config.allow_soft_placement = True config.salus_options.resource_map.temporary['MEMORY:GPU'] = memusages[model_size][0] config.salus_options.resource_map.persistant['MEMORY:GPU'] = memusages[model_size][1] + config.salus_options.resource_map.temporary['MEMORY:GPU0'] = memusages[model_size][0] + config.salus_options.resource_map.persistant['MEMORY:GPU0'] = memusages[model_size][1] return config diff --git a/tests/test_tf/test_super_res.py b/tests/test_tf/test_super_res.py index fd15c1b..7dfd669 100644 --- a/tests/test_tf/test_super_res.py +++ b/tests/test_tf/test_super_res.py @@ -19,6 +19,9 @@ def run_superres(sess, input_data, batch_size=100, isEval=False): + batch_size = tfhelper.batch_size_from_env(batch_size) + print("{}: Using batch size 
{}".format(datetime.now(), batch_size)) + input_images, target_images = input_data(batch_size=batch_size) model = networks.SuperRes(input_images, target_images, batch_size=batch_size) @@ -57,7 +60,7 @@ def run_superres(sess, input_data, batch_size=100, isEval=False): print(fmt_str.format(datetime.now(), i, loss_value, examples_per_sec, sec_per_batch)) losses.append(loss_value) - if isEval: + if isEval and eval_rand_factor != '0': factor = 1 if eval_rand_factor != "1": factor = random.randint(1, int(eval_rand_factor)) @@ -95,6 +98,8 @@ def _config(self, isEval=False, **kwargs): config.allow_soft_placement = True config.salus_options.resource_map.temporary['MEMORY:GPU'] = memusages[batch_size][0] config.salus_options.resource_map.persistant['MEMORY:GPU'] = memusages[batch_size][1] + config.salus_options.resource_map.temporary['MEMORY:GPU0'] = memusages[batch_size][0] + config.salus_options.resource_map.persistant['MEMORY:GPU0'] = memusages[batch_size][1] return config def _get_func(self, batch_size, isEval=False): diff --git a/tests/test_tf/test_vae.py b/tests/test_tf/test_vae.py index 466c5b0..9e867e6 100644 --- a/tests/test_tf/test_vae.py +++ b/tests/test_tf/test_vae.py @@ -26,8 +26,11 @@ def run_vae(sess, args=None, isEval=False): if args is None: args = networks.vae.get_args() + batch_size = tfhelper.batch_size_from_env(args.batch_size) + print(f"Batch size: {batch_size}") + dim_img = IMAGE_SIZE_MNIST ** 2 # number of pixels for a MNIST image - x_image, _, num_classes = fake_data(args.batch_size, None, height=IMAGE_SIZE_MNIST, width=IMAGE_SIZE_MNIST, + x_image, _, num_classes = fake_data(batch_size, None, height=IMAGE_SIZE_MNIST, width=IMAGE_SIZE_MNIST, depth=1, num_classes=10) with tf.name_scope('model'): @@ -78,7 +81,7 @@ def run_vae(sess, args=None, isEval=False): last_end_time = end_time duration = end_time - start_time - examples_per_sec = args.batch_size / duration + examples_per_sec = batch_size / duration sec_per_batch = float(duration) speeds.append(sec_per_batch) @@ -87,7 +90,7 @@ def run_vae(sess, args=None, isEval=False): print(fmt_str.format(datetime.now(), i, loss_value, examples_per_sec, sec_per_batch)) - if isEval: + if isEval and eval_rand_factor != '0': factor = 1 if eval_rand_factor != "1": factor = random.randint(1, int(eval_rand_factor)) @@ -124,6 +127,8 @@ def _config(self, args, isEval=False): config.allow_soft_placement = True config.salus_options.resource_map.temporary['MEMORY:GPU'] = memusages[args.batch_size][0] config.salus_options.resource_map.persistant['MEMORY:GPU'] = memusages[args.batch_size][1] + config.salus_options.resource_map.temporary['MEMORY:GPU0'] = memusages[args.batch_size][0] + config.salus_options.resource_map.persistant['MEMORY:GPU0'] = memusages[args.batch_size][1] return config @parameterized.expand([(1,), (5,), (10,)]) diff --git a/tests/test_tf/test_vgg.py b/tests/test_tf/test_vgg.py index 0502121..7cc7206 100644 --- a/tests/test_tf/test_vgg.py +++ b/tests/test_tf/test_vgg.py @@ -166,6 +166,8 @@ def _config(self, **kwargs): config.allow_soft_placement = True config.salus_options.resource_map.temporary['MEMORY:GPU'] = memusages[batch_size][0] config.salus_options.resource_map.persistant['MEMORY:GPU'] = memusages[batch_size][1] + config.salus_options.resource_map.temporary['MEMORY:GPU0'] = memusages[batch_size][0] + config.salus_options.resource_map.persistant['MEMORY:GPU0'] = memusages[batch_size][1] return config