Harden the long-haul and scale test helpers.

test/long-haul/utils.sh: ignore already-missing resources when deleting the
CSI driver manifests, and raise the sample-workload kubectl timeouts from 60s
to 300s.
test/scale/run_test.py: retry failed shell commands with a growing delay,
tolerate an existing CSI log directory, raise the workload delete timeout to
600s, and keep a failing `kubectl logs` from aborting log collection.

diff --git a/test/long-haul/utils.sh b/test/long-haul/utils.sh
index c0e9cea8b..bf40a76f4 100755
--- a/test/long-haul/utils.sh
+++ b/test/long-haul/utils.sh
@@ -35,8 +35,8 @@ fast_exit () {
 
 reset_csi_driver () {
     echo "Reset CSI driver"
-    kubectl delete -f $REPO_ROOT_PATH/deploy/csi-azurelustre-controller.yaml
-    kubectl delete -f $REPO_ROOT_PATH/deploy/csi-azurelustre-node.yaml
+    kubectl delete -f $REPO_ROOT_PATH/deploy/csi-azurelustre-controller.yaml --ignore-not-found
+    kubectl delete -f $REPO_ROOT_PATH/deploy/csi-azurelustre-node.yaml --ignore-not-found
     kubectl wait pod -n kube-system --for=delete --selector='app in (csi-azurelustre-controller,csi-azurelustre-node)' --timeout=300s
 
     echo "Reset node label"
@@ -149,8 +149,8 @@ verify_csi_driver () {
 
 start_sample_workload () {
     stop_sample_workload
-    kubectl apply -f ./sample-workload/deployment_write_print_file.yaml --timeout=60s
-    kubectl wait pod --for=condition=Ready --selector=app=azurelustre-longhaulsample-deployment --timeout=60s
+    kubectl apply -f ./sample-workload/deployment_write_print_file.yaml --timeout=300s
+    kubectl wait pod --for=condition=Ready --selector=app=azurelustre-longhaulsample-deployment --timeout=300s
     sleep 15
 }
 
@@ -160,8 +160,8 @@ stop_sample_workload () {
         kubectl patch pvc azurelustre-longhaulsample-pvc -p '{"metadata":{"finalizers":null}}'
     fi
 
-    kubectl delete -f ./sample-workload/deployment_write_print_file.yaml --ignore-not-found --timeout=60s --grace-period=0 --force --cascade
-    kubectl wait pod --for=delete --selector=app=azurelustre-longhaulsample-deployment --timeout=60s
+    kubectl delete -f ./sample-workload/deployment_write_print_file.yaml --ignore-not-found --timeout=300s --grace-period=0 --force --cascade
+    kubectl wait pod --for=delete --selector=app=azurelustre-longhaulsample-deployment --timeout=300s
 }
 
 verify_sample_workload_logs () {
diff --git a/test/scale/run_test.py b/test/scale/run_test.py
index 94e6f2cc0..3f08f5c2f 100644
--- a/test/scale/run_test.py
+++ b/test/scale/run_test.py
@@ -203,15 +203,36 @@ def generate_workload_yaml(self):
         logger.info("generated workload yaml:")
         self.run_command(f"cat {self._generated_workload_yaml}")
 
-    def run_command(self, command: str, need_stdout=False, raise_error=True):
+    def run_command(self, command: str, need_stdout=False, raise_error=True, retries=5):
+        """Run a shell command, retrying with a growing delay on failure.
+
+        command: shell command line, executed with shell=True.
+        need_stdout: capture the command's stdout and log it.
+        raise_error: raise RuntimeError if the last attempt still fails.
+        retries: maximum number of attempts; values below 1 mean one attempt.
+        """
         logger.info(f"run command {command}")
         stdout = None
         if need_stdout:
             stdout = subprocess.PIPE
-        process = subprocess.run(command, shell=True, text=True, stdout=stdout)
-        if process.returncode != 0 and raise_error:
-            raise RuntimeError(f"command {command} exit with error"
-                               f" code {process.returncode}")
+        total_retries = max(retries, 1)  # always run the command at least once
+        retries = total_retries
+        while retries > 0:
+            process = subprocess.run(command, shell=True, text=True, stdout=stdout)
+            retries -= 1
+            if process.returncode == 0:
+                break
+            if retries == 0:
+                # Attempts exhausted: fail now (or give up quietly) instead of
+                # logging a bogus "retrying" message and sleeping once more.
+                if raise_error:
+                    raise RuntimeError(f"command {command} exit with error"
+                                       f" code {process.returncode}")
+                break
+            logger.info(f"command {command} failed, retrying"
+                        f" attempt {retries}")
+            time.sleep(10 * (total_retries - retries))  # linear back-off
+
         stdout = process.stdout
         if need_stdout:
             logger.info(stdout)
@@ -221,7 +242,8 @@ def setup(self, current_scale):
         logger.info("reinstalling CSI driver")
         self.run_command(f"{ROOT_PATH}/deploy/uninstall-driver.sh")
         self.run_command(f"{ROOT_PATH}/deploy/install-driver.sh local")
-        os.mkdir(self._csi_log_path)
+        # tolerate a log directory that already exists
+        os.makedirs(self._csi_log_path, exist_ok=True)
         self._perf_result = PerfResultCollector()
 
         for target_func in self.TARGET_FUNCS:
@@ -256,7 +278,7 @@ def delete_workload(self):
             f" -f {self._generated_workload_yaml}"
             f" --ignore-not-found"
             f" --wait=true"
-            f" --timeout=300s"
+            f" --timeout=600s"
         )
         logger.info("workload was deleted")
 
@@ -289,7 +311,8 @@ def collection_logs(self):
             self.run_command(f"kubectl logs {pod}"
                              f" -nkube-system"
                              f" -cazurelustre"
-                             f" >{self._csi_log_path}/{pod}")
+                             f" >{self._csi_log_path}/{pod}",
+                             raise_error=False)
 
     def parse_result_from_log(self):
         logger.info("parsing log file for perf result")