fix(sgx): change entrypoint directory

bytedance · Mar 19, 2024 · 546b5b3 · 546b5b3
1 parent 141736e
commit 546b5b3
Show file tree

Hide file tree

Showing 9 changed files with 70 additions and 31 deletions.
diff --git a/deploy/scripts/sgx/enclave_env.sh b/deploy/scripts/sgx/enclave_env.sh
@@ -14,11 +14,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+EXEC_DIR=/app/exec_dir
+
 # Print one field of the SGX token report for python.sig.
 # Pipes the output of gramine-sgx-get-token through grep and awk, printing the
 # second ":"-separated field of every line that matches the pattern; xargs
 # trims surrounding whitespace and joins multiple matches with single spaces.
 # Arguments: $1 - grep pattern naming the field to extract.
 # Returns:   1 (printing nothing on stdout) when no pattern is given.
 function get_env() {
     if [ -z "$1" ]; then
         echo "get_env: missing field pattern" >&2
         return 1
     fi
     # Quote the pattern and end option parsing so a pattern containing spaces
     # or starting with "-" is not split into extra grep arguments.
     gramine-sgx-get-token -s python.sig -o /dev/null | grep -- "$1" | awk -F ":" '{print $2}' | xargs
 }
 
 function make_custom_env() {
+    cd $EXEC_DIR
+
     export DEBUG=0
     export CUDA_VISIBLE_DEVICES=""
     export DNNL_VERBOSE=0
@@ -59,17 +63,20 @@ function make_custom_env() {
     # need meituan's
     jq --arg mr_enclave "$PEER_MR_ENCLAVE" --arg mr_signer "$PEER_MR_SIGNER" \
         '.sgx_mrs[0].mr_enclave = $mr_enclave | .sgx_mrs[0].mr_signer = $mr_signer' \
-        $GRPC_PATH/examples/dynamic_config.json > ./dynamic_config.json
+        $GRPC_PATH/examples/dynamic_config.json > $EXEC_DIR/dynamic_config.json
+
+    cd -
 }
 
 # Run the token generation script and stage its artifacts for the enclave.
 # Changes into /gramine/CI-Examples/generate-token/, runs ./generate.sh, then
 # copies tensorflow_io.py plus python.sig, python.manifest.sgx, python.token
 # and python.manifest (presumably produced by generate.sh - confirm) into the
 # target directory. This hunk moves the target from /app/sgx/token/ to
 # $EXEC_DIR and adds the tensorflow_io.py copy.
 # NOTE(review): neither cd is checked, so if the first cd fails ./generate.sh
 # runs from the caller's directory; "cd -" also prints the directory it
 # returns to on stdout.
 function generate_token() {
     cd /gramine/CI-Examples/generate-token/
     ./generate.sh
-    mkdir -p /app/sgx/token/
-    cp python.sig /app/sgx/token/
-    cp python.manifest.sgx /app/sgx/token/
-    cp python.token /app/sgx/token/
-    cp python.manifest /app/sgx/token/
+    mkdir -p $EXEC_DIR
+    cp /app/sgx/gramine/CI-Examples/tensorflow_io.py $EXEC_DIR
+    cp python.sig $EXEC_DIR
+    cp python.manifest.sgx $EXEC_DIR
+    cp python.token $EXEC_DIR
+    cp python.manifest $EXEC_DIR
     cd -
 }
 
@@ -81,18 +88,44 @@ elif [ -n "$PCCS_URL" ]; then
         sed -i "s|PCCS_URL=[^ ]*|PCCS_URL=$PCCS_URL|" /etc/sgx_default_qcnl.conf
 fi
 
+TEMPLATE_PATH="/gramine/CI-Examples/generate-token/python.manifest.template"
 # Optional manifest tuning. Each block below runs only when its environment
 # variable is non-empty and edits $TEMPLATE_PATH in place with sed -i.
 # NOTE(review): sed exits 0 even when no line matched, so the success message
 # does not prove that a substitution happened. The values are also
 # interpolated into the sed expression unescaped - confirm they can never
 # contain "/" or "&".
 #
 # Log level: on lines matching loader.log_level, replace the first
 # double-quoted string with "$GRAMINE_LOG_LEVEL".
 if [ -n "$GRAMINE_LOG_LEVEL" ]; then
-        FILE="/gramine/CI-Examples/generate-token/python.manifest.template"
-        sed -i "/loader.log_level/ s/\"[^\"]*\"/\"$GRAMINE_LOG_LEVEL\"/" "$FILE"
-        # Check whether the sed command succeeded
+        sed -i "/loader.log_level/ s/\"[^\"]*\"/\"$GRAMINE_LOG_LEVEL\"/" "$TEMPLATE_PATH"
         if [ $? -eq 0 ]; then
-            echo "Log level changed to $GRAMINE_LOG_LEVEL in $FILE"
+            echo "Log level changed to $GRAMINE_LOG_LEVEL in $TEMPLATE_PATH"
         else
-            echo "Failed to change log level in $FILE"
+            echo "Failed to change log level in $TEMPLATE_PATH"
         fi
 fi
 
 # Enclave size: on lines matching sgx.enclave_size, replace the first
 # double-quoted string with "$GRAMINE_ENCLAVE_SIZE".
+if [ -n "$GRAMINE_ENCLAVE_SIZE" ]; then
+    sed -i "/sgx.enclave_size/ s/\"[^\"]*\"/\"$GRAMINE_ENCLAVE_SIZE\"/" "$TEMPLATE_PATH"
+    if [ $? -eq 0 ]; then
+        echo "Enclave size changed to $GRAMINE_ENCLAVE_SIZE in $TEMPLATE_PATH"
+    else
+        echo "Failed to change enclave size in $TEMPLATE_PATH"
+    fi
+fi
+
 # Thread count: the value is unquoted in the template, so this rewrites the
 # whole "sgx.thread_num = <digits>" assignment instead of a quoted string.
+if [ -n "$GRAMINE_THREAD_NUM" ]; then
+    sed -i "s/sgx.thread_num = [0-9]\+/sgx.thread_num = $GRAMINE_THREAD_NUM/" "$TEMPLATE_PATH"
+    if [ $? -eq 0 ]; then
+        echo "Thread number changed to $GRAMINE_THREAD_NUM in $TEMPLATE_PATH"
+    else
+        echo "Failed to change thread number in $TEMPLATE_PATH"
+    fi
+fi
+
 # Stack size: on lines matching sys.stack.size, replace the first
 # double-quoted string with "$GRAMINE_STACK_SIZE".
+if [ -n "$GRAMINE_STACK_SIZE" ]; then
+    sed -i "/sys.stack.size/ s/\"[^\"]*\"/\"$GRAMINE_STACK_SIZE\"/" "$TEMPLATE_PATH"
+    if [ $? -eq 0 ]; then
+        echo "Stack size changed to $GRAMINE_STACK_SIZE in $TEMPLATE_PATH"
+    else
+        echo "Failed to change stack size in $TEMPLATE_PATH"
+    fi
+fi
+
 # Switch USE_SECURE_CERT from TRUE to FALSE in the QCNL configuration file.
 sed -i 's/USE_SECURE_CERT=TRUE/USE_SECURE_CERT=FALSE/' /etc/sgx_default_qcnl.conf
 mkdir -p /data
 
 # The manifest edits above must run before the token artifacts are generated.
-generate_token
+generate_token
diff --git a/deploy/scripts/sgx/run_data_join_master.sh b/deploy/scripts/sgx/run_data_join_master.sh
@@ -24,7 +24,6 @@ kvstore_type=$(normalize_env_to_args '--kvstore_type' $KVSTORE_TYPE)
 
 source /app/deploy/scripts/sgx/enclave_env.sh
 cp /app/sgx/gramine/CI-Examples/tensorflow_io.py ./
-cp /app/sgx/token/* ./
 unset HTTPS_PROXY https_proxy http_proxy ftp_proxy
 
 make_custom_env 4

diff --git a/deploy/scripts/sgx/run_data_join_worker.sh b/deploy/scripts/sgx/run_data_join_worker.sh
@@ -55,7 +55,6 @@ fi
 
 source /app/deploy/scripts/sgx/enclave_env.sh
 cp /app/sgx/gramine/CI-Examples/tensorflow_io.py ./
-cp /app/sgx/token/* ./
 unset HTTPS_PROXY https_proxy http_proxy ftp_proxy
 
 make_custom_env 4

diff --git a/deploy/scripts/sgx/run_data_portal_master.sh b/deploy/scripts/sgx/run_data_portal_master.sh
@@ -26,7 +26,6 @@ files_per_job_limit=$(normalize_env_to_args '--files_per_job_limit' $FILES_PER_J
 
 source /app/deploy/scripts/sgx/enclave_env.sh
 cp /app/sgx/gramine/CI-Examples/tensorflow_io.py ./
-cp /app/sgx/token/* ./
 unset HTTPS_PROXY https_proxy http_proxy ftp_proxy
 
 make_custom_env 4

diff --git a/deploy/scripts/sgx/run_data_portal_worker.sh b/deploy/scripts/sgx/run_data_portal_worker.sh
@@ -38,7 +38,6 @@ input_data_validation_ratio=$(normalize_env_to_args '--input_data_validation_rat
 
 source /app/deploy/scripts/sgx/enclave_env.sh
 cp /app/sgx/gramine/CI-Examples/tensorflow_io.py ./
-cp /app/sgx/token/* ./
 unset HTTPS_PROXY https_proxy http_proxy ftp_proxy
 
 make_custom_env 4

diff --git a/deploy/scripts/sgx/run_trainer_master_sgx.sh b/deploy/scripts/sgx/run_trainer_master_sgx.sh
@@ -87,10 +87,11 @@ if [[ -n "${CODE_KEY}" ]]; then
 else
   pull_code ${CODE_TAR} $PWD
 fi
-cd ${ROLE}
-cp /app/sgx/gramine/CI-Examples/tensorflow_io.py ./
+
+cp /app/sgx/gramine/CI-Examples/tensorflow_io.py /gramine/follower/
+cp /app/sgx/gramine/CI-Examples/tensorflow_io.py /gramine/leader/
 source /app/deploy/scripts/sgx/enclave_env.sh
-cp /app/sgx/token/* ./
+
 unset HTTPS_PROXY https_proxy http_proxy ftp_proxy
 
 make_custom_env 4
@@ -103,14 +104,15 @@ fi
 
 server_port=$(normalize_env_to_args "--server-port" "$PORT1")
 
+cd $EXEC_DIR
 if [[ -z "${START_CPU_SN}" ]]; then
     START_CPU_SN=0
 fi
 if [[ -z "${END_CPU_SN}" ]]; then
     END_CPU_SN=3
 fi
 
-taskset -c $START_CPU_SN-$END_CPU_SN stdbuf -o0 gramine-sgx python main.py --master \
+taskset -c $START_CPU_SN-$END_CPU_SN stdbuf -o0 gramine-sgx python /gramine/$ROLE/main.py --master \
     --application-id=$APPLICATION_ID \
     --data-source=$DATA_SOURCE \
     --data-path=$DATA_PATH \
@@ -126,4 +128,4 @@ taskset -c $START_CPU_SN-$END_CPU_SN stdbuf -o0 gramine-sgx python main.py --mas
     $summary_save_steps $summary_save_secs \
     $local_data_source $local_data_path $local_start_date \
     $local_end_date $epoch_num $start_date $end_date \
-    $shuffle $shuffle_in_day $extra_params $export_model
+    $shuffle $shuffle_in_day $extra_params $export_model
diff --git a/deploy/scripts/sgx/run_trainer_ps_sgx.sh b/deploy/scripts/sgx/run_trainer_ps_sgx.sh
@@ -20,25 +20,32 @@ export CUDA_VISIBLE_DEVICES=
 cp /app/sgx/gramine/CI-Examples/tensorflow_io.py ./
 source /app/deploy/scripts/hdfs_common.sh || true
 source /app/deploy/scripts/pre_start_hook.sh || true
+source /app/deploy/scripts/env_to_args.sh
 
 LISTEN_PORT=50052
 if [[ -n "${PORT1}" ]]; then
   LISTEN_PORT=${PORT1}
 fi
 
+if [[ -n "${CODE_KEY}" ]]; then
+  pull_code ${CODE_KEY} $PWD
+else
+  pull_code ${CODE_TAR} $PWD
+fi
+
+cp /app/sgx/gramine/CI-Examples/tensorflow_io.py /gramine/leader
+cp /app/sgx/gramine/CI-Examples/tensorflow_io.py /gramine/follower
 source /app/deploy/scripts/sgx/enclave_env.sh
-cp /app/sgx/gramine/CI-Examples/tensorflow_io.py ./
-cp /app/sgx/token/* ./
-unset HTTPS_PROXY https_proxy http_proxy ftp_proxy
 
 make_custom_env 4
 source /root/start_aesm_service.sh
 
+cd $EXEC_DIR
 if [[ -z "${START_CPU_SN}" ]]; then
     START_CPU_SN=0
 fi
 if [[ -z "${END_CPU_SN}" ]]; then
     END_CPU_SN=3
 fi
 
-taskset -c $START_CPU_SN-$END_CPU_SN stdbuf -o0 gramine-sgx python -m fedlearner.trainer.parameter_server $POD_IP:${LISTEN_PORT}
+taskset -c $START_CPU_SN-$END_CPU_SN stdbuf -o0 gramine-sgx python -m fedlearner.trainer.parameter_server $POD_IP:${LISTEN_PORT}
diff --git a/deploy/scripts/sgx/run_trainer_worker_sgx.sh b/deploy/scripts/sgx/run_trainer_worker_sgx.sh
@@ -112,4 +112,4 @@ taskset -c $START_CPU_SN-$END_CPU_SN stdbuf -o0 gramine-sgx python /gramine/$ROL
     --peer-addr="$PEER_ADDR" \
     --worker-rank="$INDEX" \
     $server_port $mode $batch_size \
-    $sparse_estimator $learning_rate
+    $sparse_estimator $learning_rate
diff --git a/sgx/gramine/CI-Examples/generate-token/python.manifest.template b/sgx/gramine/CI-Examples/generate-token/python.manifest.template