Run one tool per engine for remotehost

- engine naming needs to be resolved to allow multiple remotehost endpoints
perftool-incubator · Nov 21, 2023 · 64e17dc · 64e17dc
1 parent 3125528
commit 64e17dc
Show file tree

Hide file tree

Showing 4 changed files with 133 additions and 23 deletions.
diff --git a/endpoints/remotehost/remotehost b/endpoints/remotehost/remotehost
@@ -47,6 +47,7 @@ image_cache_size=3
 osruntime[default]="chroot"
 host_mounts=""
 hypervisor_host="none" # Default is no hypervisor
+new_remotehost_followers=""
 
 function endpoint_remotehost_engine_init() {
     echo "Running endpoint_engine_init"
@@ -160,6 +161,18 @@ function cleanup_osruntime() {
     do_ssh $user@$host podman mount
     echo
 
+    # Clean up tool engines
+
+    echo "Going to remove a pod for each tool"
+    local tools=`awk -F: '{print $1}' $config_dir/tool-cmds/profiler/start`
+    tool_count=1
+    for tool in $tools; do
+        engine_label=profiler-$tool_count
+        container_name="${endpoint_label}_${run_id}_${engine_label}_${os_runtime}"
+        do_ssh $user@$host podman rm ${container_name}
+        let tool_count=$tool_count+1
+    done
+
     for this_cs_label in ${clients[@]} ${servers[@]} ${collectors[@]}; do
         set_osruntime_numanode_cpupart ${this_cs_label}
 
@@ -282,6 +295,70 @@ function remotehost_req_check() {
     fi
 }
 
+# Run one profiler tool in its own detached, privileged podman container on the remote host.
+#   $1  engine label, "profiler-<id>"; <id> is passed to get_image
+#   $2  tool name, given to the engine as tool_name through its env file
+# Exits with status 1 if the env file cannot be staged or the ssh command fails.
+function exec_pod() {
+        local engine_label=$1; shift
+        local tool=$1; shift
+        local os_runtime="pod"
+        local env_tool_file="${engine_label}_env.txt"
+        local engine_id cs_rb_opt arg value tool_cs_cmd fs extra_mounts
+        # Take the image id from the label, not from the caller's tool_count global
+        engine_id=$(echo $engine_label | awk -F- '{print $2}')
+        get_image profiler $engine_id this_image
+
+        {
+            echo "rickshaw_host=$controller_ipaddr"
+            echo "endpoint_run_dir=$endpoint_run_dir"
+            echo "cs_label=$engine_label"
+            echo "tool_name=$tool"
+            echo "base_run_dir=$base_run_dir"
+            echo "endpoint=remotehost"
+            echo "max_rb_attempts=$max_rb_attempts"
+            echo "ssh_id=${ssh_id}"
+            # roadblock opts like redis server
+            for cs_rb_opt in $cs_rb_opts; do
+                arg=$(echo $cs_rb_opt | awk -F'=' '{print $1}')
+                value=$(echo $cs_rb_opt | awk -F'=' '{print $2}')
+                arg=$(echo ${arg} | sed -e 's/^--//' -e 's/-/_/g' )
+                echo "${arg}=${value}"
+            done
+        } >> "${endpoint_run_dir}/${env_tool_file}"
+
+        if pushd ${endpoint_run_dir} >/dev/null; then
+            echo "Copying ${endpoint_run_dir}/${env_tool_file} to ${user}@${host}:${remote_cfg_dir}"
+            do_scp "" "${env_tool_file}" "${user}@${host}" "${remote_cfg_dir}"
+            popd >/dev/null
+        else
+            echo "Failed to pushd to ${endpoint_run_dir} to scp env file"
+            exit 1
+        fi
+
+        tool_cs_cmd="podman run"
+        tool_cs_cmd+=" --detach=true"
+        tool_cs_cmd+=" --name=${endpoint_label}_${run_id}_${engine_label}_podman"
+        tool_cs_cmd+=" --env-file ${remote_cfg_dir}/${env_tool_file}"
+        tool_cs_cmd+=" --privileged --ipc=host --pid=host --net=host --security-opt=label=disable"
+        tool_cs_cmd+=" --mount=type=bind,source=${remote_data_dir},destination=/tmp"
+        tool_cs_cmd+=" --mount=type=bind,source=/lib/firmware,destination=/lib/firmware"
+        tool_cs_cmd+=" --mount=type=bind,source=/lib/modules,destination=/lib/modules"
+        tool_cs_cmd+=" --mount=type=bind,source=/usr/src,destination=/usr/src"
+        # Split on spaces only; IFS changes just for this read, so there is nothing to restore
+        IFS=" " read -r -a extra_mounts <<< "$host_mounts"
+        for fs in "${extra_mounts[@]}"; do
+            tool_cs_cmd+=" --mount=type=bind,source=$fs,destination=$fs"
+        done
+        tool_cs_cmd+=" ${this_image}"
+        echo -e "About to run:\ndo_ssh $user@$host ${tool_cs_cmd}\n"
+        # Note: this never really captures non-zero exits
+        if ! do_ssh $user@$host "${tool_cs_cmd}"; then
+            echo "running ${os_runtime} failed"
+            exit 1
+        fi
+}
+
 function launch_osruntime() {
     local this_cs_label this_cs_log_file base_cmd cs_cmd cs_rb_env env_file
     local env_opts existing_container container_id container_mount container_name fs
@@ -304,30 +381,40 @@ function launch_osruntime() {
 
     set_total_cpupart
 
+    # Launch the tool engines (support only pods)
+    local tools=`awk -F: '{print $1}' $config_dir/tool-cmds/profiler/start`
+    echo "Creating a pod for each of these tools: $tools"
+    tool_count=1
+    echo "Going to create a pod for each tool"
+    for tool in $tools; do
+        # This label will not work with multiple remotehost endpoints!
+        engine_label=profiler-$tool_count
+        exec_pod $engine_label $tool
+        new_remotehost_followers+=" $engine_label"
+        let tool_count=$tool_count+1
+    done
+
     # For each client and server launch the actual script which will run it.
     count=1
+    ssh_id=$(sed -z 's/\n/\\n/g' ${config_dir}/rickshaw_id.rsa)
     for this_cs_label in ${clients[@]} ${servers[@]} ${collectors[@]}; do
+
         this_cs_type=`echo $this_cs_label | awk -F- '{print $1}'`
         this_cs_id=`echo $this_cs_label | awk -F- '{print $2}'`
         get_image $this_cs_type $this_cs_id this_image
         set_osruntime_numanode_cpupart $this_cs_label
 
+        echo "Preparing to launch $this_cs_label"
         container_name="${endpoint_label}_${run_id}_${this_cs_label}_${os_runtime}"
         existing_container=`do_ssh $user@$host podman ps --all --format "{{.Names}}" | grep ^$container_name$`
         if [ ! -z "$existing_container" ]; then
             echo "WARNING: found existing container '$existing_container', deleting"
+            do_ssh $user@$host podman stop $container_name
+            do_ssh $user@$host podman kill $container_name
             do_ssh $user@$host podman rm $container_name
         fi
         this_cs_log_file="$this_cs_label.txt"
 
-        if [ $count -gt 1 ]; then
-            # Only the first client/server needs to run tools
-            echo "Skipping tools execution on $this_cs_label because a previous client/server is running tools on this host"
-            this_disable_tools="1"
-        else
-            this_disable_tools="$disable_tools"
-        fi
-
         if [ "${os_runtime}" == "chroot" ]; then
             echo "using chroot"
 
@@ -387,7 +474,9 @@ function launch_osruntime() {
             base_cmd+=" --cpu-partition-index=${count}"
             base_cmd+=" --cpu-partitioning=$cpu_partitioning"
             base_cmd+=" --engine-script-start-timeout=$engine_script_start_timeout"
-            base_cmd+=" --disable-tools=$this_disable_tools"
+            #base_cmd+=" --disable-tools=$this_disable_tools"
+            # For one-tool-per-engine, client and server engines never run tools
+            base_cmd+=" --disable-tools=1"
             if [ $numa_node -gt -1 ]; then
                 base_cmd="numactl -N $numa_node -m $numa_node $base_cmd"
             fi
@@ -412,7 +501,9 @@ function launch_osruntime() {
             echo "max_sample_failures=$max_sample_failures" >> ${endpoint_run_dir}/${env_file}
             echo "max_rb_attempts=$max_rb_attempts"         >> ${endpoint_run_dir}/${env_file}
             echo "ssh_id=${ssh_id}"                         >> ${endpoint_run_dir}/${env_file}
-            echo "disable_tools=$this_disable_tools"        >> ${endpoint_run_dir}/${env_file}
+            #echo "disable_tools=$this_disable_tools"        >> ${endpoint_run_dir}/${env_file}
+            # For one-tool-per-engine, client and server engines never run tools
+            echo "disable_tools=1"        >> ${endpoint_run_dir}/${env_file}
 
             for cs_rb_opt in $cs_rb_opts; do
                 arg=$(echo $cs_rb_opt | awk -F'=' '{print $1}')
@@ -457,7 +548,7 @@ function launch_osruntime() {
             cs_cmd+=" ${this_image}"
         fi
 
-        echo -e "About to run:\n${cs_cmd}\n"
+        echo -e "About to run:\ndo_ssh $user@$host ${cs_cmd}\n"
         do_ssh $user@$host "${cs_cmd}"
         ssh_rc=$?
         if [ ${ssh_rc} -gt 0 ]; then
@@ -557,4 +648,5 @@ fi
 ssh_id=$(sed -z 's/\n/\\n/g' ${config_dir}/rickshaw_id.rsa)
 base_req_check
 launch_osruntime
-process_roadblocks remotehost
+echo "about to call: process_roadblocks remotehost $new_remotehost_followers"
+process_roadblocks remotehost $new_remotehost_followers
diff --git a/engine/engine-script b/engine/engine-script
@@ -113,19 +113,26 @@ do_roadblock start-tools-begin ${default_timeout}
 roadblock_rc=$?
 roadblock_exit_on_error ${roadblock_rc}
 
-start_tools
+start_stop_tools_opt=""
+if [ "$cs_type" == "profiler" ]; then
+    if [ -z "$tool_name" ]; then
+        echo "env:"
+        env
+        exit_error "tool_name not defined [$tool_name], exiting"
+    fi
+start_stop_tools_opt=$tool_name
+fi
+
+start_tools $start_stop_tools_opt
 
 do_roadblock start-tools-end ${default_timeout}
 roadblock_rc=$?
 roadblock_exit_on_error ${roadblock_rc}
 
-
 process_bench_roadblocks
 
-
 do_roadblock stop-tools-begin ${default_timeout}
-do_roadblock stop-tools-end ${default_timeout} wait-for "/usr/local/bin/engine-script-library stop_tools '$(pwd)' '${tool_stop_cmds}' '${disable_tools}'"
-
+do_roadblock stop-tools-end ${default_timeout} wait-for "/usr/local/bin/engine-script-library stop_tools '$(pwd)' '${tool_stop_cmds}' '${disable_tools}' '${start_stop_tools_opt}'"
 
 do_roadblock send-data-begin ${default_timeout}
 do_roadblock send-data-end ${default_timeout} wait-for "/usr/local/bin/engine-script-library send_data '${ssh_id_file}' '${cs_dir}' '${rickshaw_host}' '${archives_dir}/${cs_label}-data.tgz'"

diff --git a/engine/engine-script-library b/engine/engine-script-library
@@ -302,10 +302,11 @@ function validate_core_env() {
     if [ -z "$cs_label" ]; then
         exit_error "The client/server label (--cs-label) was not defined"
     fi
-    if echo $cs_label | grep -q -P '^(\w+)-\d+$'; then
+    regex='^\w+-\d+(-\w+){0,1}$'
+    if echo $cs_label | grep -q -P $regex; then
         echo "engine-label \"$cs_label\" is valid"
     else
-        exit_error 'cs_label "'$cs_label'" does not adhere to regex /^(\w+)-\d+$/'
+        exit_error 'cs_label "'$cs_label'" does not adhere to regex '$regex
     fi
 
     if [ -z "${max_rb_attempts}" ]; then
@@ -357,7 +358,7 @@ function setup_core_env() {
 
 function get_data() {
     # Get files required to run benchmark and tools
-    if [ $cs_type == "client" -o $cs_type == "server" -o $cs_type == "profiler" ]; then
+    if [ $cs_type == "client" -o $cs_type == "server" ]; then
         cs_files_list="$cs_type-$cs_id-files-list"
     else # worker and master do not get id-specific files-list, at least not yet
         cs_files_list="$cs_type-files-list"
@@ -443,6 +444,7 @@ function collect_sysinfo() {
 }
 
 function start_tools() {
+    local one_tool=$1; shift
     local tool_name tool_cmd tool_cmd_rc total_tools
 
     echo "running start_tools()"
@@ -462,6 +464,10 @@ function start_tools() {
             while read -u 9 line; do
                 tool_name=`echo $line | awk -F: '{print $1}'`
                 tool_cmd=`echo $line | sed -e s/^$tool_name://`
+                if [ ! -z "$one_tool" -a "$one_tool" != "$tool_name" ]; then
+                    echo "Not starting $tool_name because this engine only runs one tool, $one_tool"
+                    continue
+                fi
                 (( total_tools += 1 ))
                 /bin/mkdir -p $tool_name
                 if pushd $tool_name >/dev/null; then
@@ -1080,6 +1086,7 @@ function stop_tools() {
     working_directory=${1}; shift
     tool_stop_cmds_file=${1}; shift
     tools_disabled=${1}; shift
+    local one_tool=$1; shift
 
     echo "running stop_tools()"
     echo "pwd: `/bin/pwd`"
@@ -1093,6 +1100,10 @@ function stop_tools() {
             while read -u 9 line; do
                 tool_name=`echo $line | awk -F: '{print $1}'`
                 tool_cmd=`echo $line | sed -e s/^$tool_name://`
+                if [ ! -z "$one_tool" -a "$one_tool" != "$tool_name" ]; then
+                    echo "Not starting $tool_name because this engine only runs one tool, $one_tool"
+                    continue
+                fi
                 (( total_tools += 1 ))
                 if pushd $tool_name >/dev/null; then
                     echo "Stopping tool '${tool_name}' with command '${tool_cmd}'"

diff --git a/rickshaw-run b/rickshaw-run
@@ -34,7 +34,7 @@ use toolbox::logging;
 use toolbox::run;
 use toolbox::jsonsettings;
 
-$toolbox::logging::debug = 0;
+$toolbox::logging::debug = 1;
 
 my $ug = Data::UUID->new;
 my %defaults = ( "num-samples" => 1, "tool-group" => "default", "test-order" => "s",
@@ -896,7 +896,7 @@ sub source_container_image {
         if ($count == 0) {
             $userenv_arg = " --userenv " . $rickshaw_project_dir . "/userenvs/" . $userenv . ".json";
             $req_arg = "";
-            $skip_update = "false";
+            $skip_update = "true";
         } else {
             $req_arg = shift(@requirements);
             $skip_update = "true";
@@ -2053,7 +2053,7 @@ sub prepare_bench_tool_engines() {
     # The "engine-script" will first scp the list (client-files-list or server-files-list).
     # then it will read this list to know what other files to copy over)
     foreach my $cs_type (keys %clients_servers, @all_collector_types) {
-        if ($cs_type =~ /^client|server|profiler?/) {
+        if ($cs_type =~ /^client|server?/) {
             foreach my $cs_ref (@{ $clients_servers{$cs_type} }) {
 	        if (! defined $$cs_ref{'id'}) {
 		    printf "cs_type: [%s] cs_ref{'id'} not defined, skipping\n", $cs_type;