Skip to content

Commit

Permalink
Merge pull request #169 from t-mialve/mialve/fix-longhaul-tests
Browse files (browse the repository at this point in the history)
Fix longhaul tests when running on fresh cluster
  • Loading branch information
dabradley authored Aug 20, 2024
2 parents 5202f83 + f4946a5 commit d8a5cc1
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 14 deletions.
12 changes: 6 additions & 6 deletions test/long-haul/utils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ fast_exit () {

reset_csi_driver () {
echo "Reset CSI driver"
kubectl delete -f $REPO_ROOT_PATH/deploy/csi-azurelustre-controller.yaml
kubectl delete -f $REPO_ROOT_PATH/deploy/csi-azurelustre-node.yaml
kubectl delete -f $REPO_ROOT_PATH/deploy/csi-azurelustre-controller.yaml --ignore-not-found
kubectl delete -f $REPO_ROOT_PATH/deploy/csi-azurelustre-node.yaml --ignore-not-found
kubectl wait pod -n kube-system --for=delete --selector='app in (csi-azurelustre-controller,csi-azurelustre-node)' --timeout=300s

echo "Reset node label"
Expand Down Expand Up @@ -149,8 +149,8 @@ verify_csi_driver () {

start_sample_workload () {
stop_sample_workload
kubectl apply -f ./sample-workload/deployment_write_print_file.yaml --timeout=60s
kubectl wait pod --for=condition=Ready --selector=app=azurelustre-longhaulsample-deployment --timeout=60s
kubectl apply -f ./sample-workload/deployment_write_print_file.yaml --timeout=300s
kubectl wait pod --for=condition=Ready --selector=app=azurelustre-longhaulsample-deployment --timeout=300s
sleep 15
}

Expand All @@ -160,8 +160,8 @@ stop_sample_workload () {
kubectl patch pvc azurelustre-longhaulsample-pvc -p '{"metadata":{"finalizers":null}}'
fi

kubectl delete -f ./sample-workload/deployment_write_print_file.yaml --ignore-not-found --timeout=60s --grace-period=0 --force --cascade
kubectl wait pod --for=delete --selector=app=azurelustre-longhaulsample-deployment --timeout=60s
kubectl delete -f ./sample-workload/deployment_write_print_file.yaml --ignore-not-found --timeout=300s --grace-period=0 --force --cascade
kubectl wait pod --for=delete --selector=app=azurelustre-longhaulsample-deployment --timeout=300s
}

verify_sample_workload_logs () {
Expand Down
29 changes: 21 additions & 8 deletions test/scale/run_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,15 +203,24 @@ def generate_workload_yaml(self):
logger.info("generated workload yaml:")
self.run_command(f"cat {self._generated_workload_yaml}")

def run_command(self, command: str, need_stdout=False, raise_error=True):
def run_command(self, command: str, need_stdout=False, raise_error=True, retries=5):
logger.info(f"run command {command}")
stdout = None
if need_stdout:
stdout = subprocess.PIPE
process = subprocess.run(command, shell=True, text=True, stdout=stdout)
if process.returncode != 0 and raise_error:
raise RuntimeError(f"command {command} exit with error"
f" code {process.returncode}")
total_retries = retries # used for calculating sleep later
while retries > 0:
process = subprocess.run(command, shell=True, text=True, stdout=stdout)
retries -=1
if process.returncode == 0:
break
elif process.returncode != 0 and raise_error and retries == 0:
raise RuntimeError(f"command {command} exit with error"
f" code {process.returncode}")
logger.info(f"command {command} failed, retrying"
f" attempt {retries}")
time.sleep(10 * (total_retries - retries)) # sleep between retries

stdout = process.stdout
if need_stdout:
logger.info(stdout)
Expand All @@ -221,7 +230,10 @@ def setup(self, current_scale):
logger.info("reinstalling CSI driver")
self.run_command(f"{ROOT_PATH}/deploy/uninstall-driver.sh")
self.run_command(f"{ROOT_PATH}/deploy/install-driver.sh local")
os.mkdir(self._csi_log_path)
if os.path.exists(self._csi_log_path):
pass
else:
os.mkdir(self._csi_log_path)

self._perf_result = PerfResultCollector()
for target_func in self.TARGET_FUNCS:
Expand Down Expand Up @@ -256,7 +268,7 @@ def delete_workload(self):
f" -f {self._generated_workload_yaml}"
f" --ignore-not-found"
f" --wait=true"
f" --timeout=300s"
f" --timeout=600s"
)
logger.info("workload was deleted")

Expand Down Expand Up @@ -289,7 +301,8 @@ def collection_logs(self):
self.run_command(f"kubectl logs {pod}"
f" -nkube-system"
f" -cazurelustre"
f" >{self._csi_log_path}/{pod}")
f" >{self._csi_log_path}/{pod}",
raise_error=False)

def parse_result_from_log(self):
logger.info("parsing log file for perf result")
Expand Down

0 comments on commit d8a5cc1

Please sign in to comment.