TFA-FIX: CEPH-83595932 - To verify crashes while executing drain and mgr failover commands and preempt scrub fix

Signed-off-by: Srinivasa Bharath Kanta <[email protected]>
SrinivasaBharath committed Nov 22, 2024
1 parent c6a0bd4 commit 01241e3
Showing 8 changed files with 149 additions and 46 deletions.
57 changes: 57 additions & 0 deletions ceph/rados/core_workflows.py
@@ -4615,3 +4615,60 @@ def get_rados_df(self, pool_name: str = None):
out = self.run_ceph_command(cmd=_cmd, client_exec=True)

return out["pools"][0] if pool_name else out

def set_service_managed_type(self, service_type, unmanaged) -> bool:
"""
Method to set the service to either managed or unmanaged
The service types are- mon,mgr,osd,rgw, mds
Args:
unmanaged: True or false, for the service management
returns:
Pass -> True, Fail -> false
"""
cmd_export = f"ceph orch ls {service_type} --export"
out = self.run_ceph_command(cmd=cmd_export, client_exec=True)[0]
if unmanaged:
            log.debug(
                f"Setting the {service_type} service as unmanaged by cephadm. Current status : {out}"
            )
            out["unmanaged"] = "true"
        else:
            log.debug(
                f"Setting the {service_type} service as managed by cephadm. Current status : {out}"
            )
            out["unmanaged"] = "false"

file_name = (
f"/tmp/{service_type}_spec_{self.set_service_managed_type.__name__}.yaml"
)
# Creating service config file
self.client.exec_command(sudo=True, cmd=f"touch {file_name}")
json_out = json.dumps(out)
# Adding the spec rules into the file
cmd = f"echo '{json_out}' > {file_name}"
self.client.exec_command(cmd=cmd, sudo=True)

log.debug(f"Contents of {service_type} spec file : {out}")
apply_cmd = f"ceph orch apply -i {file_name}"
log.info(f"Applying the spec file via cmd : {apply_cmd}")
self.client.exec_command(cmd=apply_cmd, sudo=True)

time.sleep(10)
# Checking for the unmanaged setting on service
cmd = "ceph orch ls"
out = self.run_ceph_command(cmd=cmd)
for entry in out:
if entry["service_name"] == service_type:
log.debug(f"Service status : {entry}")
status = entry.get("unmanaged", False)
if status != unmanaged:
log.error(
f"{service_type} Service not in unmamaned={unmanaged} state. Fail"
)
return False
else:
log.info(
f"{service_type} Service in unmamaned={unmanaged} state. Pass"
)
return True
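
A minimal usage sketch for the new helper (illustrative only, not part of this commit): it assumes a core_workflows-style object named rados_obj that exposes set_service_managed_type(), and a placeholder maintenance_action callable standing in for the drain/failover steps.

def run_with_osd_service_unmanaged(rados_obj, maintenance_action):
    # Hypothetical wrapper: take the OSD service out of cephadm management,
    # run the maintenance steps, then restore management in all cases.
    if not rados_obj.set_service_managed_type("osd", unmanaged=True):
        raise Exception("Could not set the OSD service to unmanaged")
    try:
        maintenance_action()  # e.g. drain a host or fail over the mgr
    finally:
        # Restore cephadm management once the maintenance window is over
        rados_obj.set_service_managed_type("osd", unmanaged=False)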
92 changes: 64 additions & 28 deletions ceph/rados/serviceability_workflows.py
@@ -225,22 +225,79 @@ def remove_custom_host(self, host_node_name: str):
Returns:
None | raises exception in case of failure
"""
status_cmd = ""
try:

def wait_osd_operation_status(status_cmd):
status_flag = False
end_time = datetime.datetime.now() + datetime.timedelta(seconds=600)
                log.debug(
                    "The logic used to verify whether the OSD is removed: "
                    "Case 1: If Ceph is still in the process of removing the OSD, the command generates "
                    "valid JSON output and json.loads parses it without failure. "
                    "Case 2: If the OSDs are already removed from the node, the command produces no output. "
                    "In that case json.loads raises a JSONDecodeError, which is the "
                    "confirmation that the OSD removal is complete."
                )
while end_time > datetime.datetime.now():
out, err = self.cephadm.shell([status_cmd])
try:
drain_ops = json.loads(out)
for entry in drain_ops:
                            log.debug(
                                f"OSD removal/drain operation is in progress\nOperations: {entry}"
                            )
                    except json.JSONDecodeError:
                        log.info("The OSD removal/drain operation is completed")
status_flag = True
break
except Exception as error:
log.error(f"Hit issue during drain operations: {error}")
raise Exception(error)
log.debug("Sleeping for 10 seconds and checking again....")
time.sleep(10)
return status_flag

# Removing an OSD host and checking status
rm_host = utils.get_node_by_id(self.cluster, host_node_name)
log.info(
f"Identified host : {rm_host.hostname} to be removed from the cluster"
)

# get list of osd_id on the host to be removed
# Get list of osd_id on the host to be removed
rm_osd_list = self.rados_obj.collect_osd_daemon_ids(osd_node=rm_host)
log.info(
f"The osd id list to be removed from the {rm_host} is {rm_osd_list}"
)
# Get the OSD out list and remove before drain the node
osd_out_list = self.rados_obj.get_osd_list(status="out")
log.info(
f"The out osd id list to be removed from the {rm_host} is {osd_out_list}"
)
if osd_out_list:
                # Iterate over a copy, since rm_osd_list is modified inside the loop
                for osd_id in list(rm_osd_list):
if osd_id in osd_out_list:
osd_utils.osd_remove(
self.cluster, osd_id=osd_id, zap=True, force=True
)
time.sleep(10)
status_cmd = "ceph orch osd rm status -f json"
if wait_osd_operation_status(status_cmd):
log.info("The OSD successfully removed")
else:
log.error(
"OSD removal not completed on the cluster even after 600 seconds"
)
raise Exception("OSD not removed error")
rm_osd_list.remove(osd_id)
dev_path_list = []
if rm_osd_list:
for osd_id in rm_osd_list:
dev_path_list.append(
rados_utils.get_device_path(host=rm_host, osd_id=osd_id)
)
osd_utils.set_osd_out(self.cluster, osd_id=osd_id)
time.sleep(30)
osd_utils.osd_remove(self.cluster, osd_id=osd_id)
time.sleep(30)

@@ -253,36 +310,15 @@ def remove_custom_host(self, host_node_name: str):
# Sleeping for 2 seconds for removal to have started
time.sleep(2)
log.debug(f"Started drain operation on node : {rm_host.hostname}")

status_cmd = "ceph orch osd rm status -f json"
end_time = datetime.datetime.now() + datetime.timedelta(seconds=600)
flag = False
while end_time > datetime.datetime.now():
out, err = self.cephadm.shell([status_cmd])
try:
drain_ops = json.loads(out)
for entry in drain_ops:
log.debug(
f"Drain operations are going on host {rm_host.hostname} \nOperations: {entry}"
)
except json.JSONDecodeError:
log.info(f"Drain operations completed on host : {rm_host.hostname}")
flag = True
break
except Exception as error:
log.error(f"Hit issue during drain operations: {error}")
raise Exception(error)
log.debug("Sleeping for 10 seconds and checking again....")
time.sleep(10)

if not flag:
if wait_osd_operation_status(status_cmd):
log.info(
f"Completed drain operation on the host. {rm_host.hostname}\n Removing host from the cluster"
)
else:
log.error(
"Drain operation not completed on the cluster even after 600 seconds"
)
raise Exception("Execution Error")
log.info(
f"Completed drain operation on the host. {rm_host.hostname}\n Removing host from the cluster"
)
raise Exception("Drain operation-OSD not removed error")

if dev_path_list:
for dev_path in dev_path_list:
13 changes: 9 additions & 4 deletions ceph/rados/utils.py
@@ -45,7 +45,6 @@ def set_osd_devices_unmanaged(ceph_cluster, osd_id, unmanaged):
break

if not service_name:
log.error(f"No orch service found for osd: {osd_id}")
return
log.info(f"Setting OSD service {service_name} to unmanaged={unmanaged}")

@@ -57,7 +56,7 @@ def set_osd_devices_unmanaged(ceph_cluster, osd_id, unmanaged):
# return if no services found
if "No services reported" in out or "No services reported" in err:
log.debug(out)
log.error(err)
log.debug(err)
return
svc = loads(out)[0]

@@ -195,17 +194,23 @@ def set_osd_in(
return ret_val


def osd_remove(ceph_cluster, osd_id, zap=False):
def osd_remove(ceph_cluster, osd_id, zap=False, force=False):
"""
osd remove
Args:
ceph_cluster: ceph cluster
osd_id: osd id
zap: flag to control zapping of device
force: flag to remove the OSD forcefully
"""
config = {"command": "rm", "service": "osd", "pos_args": [osd_id]}
cmd_args = {}
if zap:
config["base_cmd_args"] = {"zap": True}
cmd_args["zap"] = True
if force:
cmd_args["force"] = True
    if cmd_args:
config["base_cmd_args"] = cmd_args
log.info(f"Executing OSD {config.pop('command')} service")
osd = OSD(cluster=ceph_cluster, **config)
osd.rm(config)
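
A short, hedged example of how the extended osd_remove() signature might be invoked from a test (illustrative only; cluster stands for the Ceph cluster object that the test framework already provides).

# Remove OSD 4, zapping its device and forcing removal even if it is already marked 'out'
osd_remove(cluster, osd_id=4, zap=True, force=True)
# Calling without zap/force keeps the original removal behaviour
osd_remove(cluster, osd_id=5)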
1 change: 0 additions & 1 deletion suites/reef/rados/test_rados_all_generic_features.yaml
@@ -1641,7 +1641,6 @@ tests:
pool_type: replicated
pg_num: 16
delete_pool: true
comments: Active BZ-2269089

- test:
name: verify scrub chunk max
1 change: 0 additions & 1 deletion suites/reef/rados/tier-2_rados_test_omap.yaml
@@ -163,7 +163,6 @@ tests:
pool_type: replicated
pg_num: 16
delete_pool: true
comments: Active BZ-2269089

- test:
name: Omap creations on objects
1 change: 0 additions & 1 deletion suites/squid/rados/tier-2_rados_test_omap.yaml
@@ -163,7 +163,6 @@ tests:
pool_type: replicated
pg_num: 16
delete_pool: true
comments: Active BZ-2269089

- test:
name: Omap creations on objects
26 changes: 17 additions & 9 deletions tests/rados/test_node_drain_customer_bug.py
@@ -1,6 +1,6 @@
"""
The file contains the method to check the customer issue -
CEPH-83593996 - Check that the Ceph cluster logs are being generated appropriately according to the log level
CEPH-83595932-To verify crashes while executing drain and mgr failover commands
"""

import datetime
@@ -21,7 +21,7 @@

def run(ceph_cluster, **kw):
"""
# CEPH-83593996
# CEPH-83595932
Bug id - https://bugzilla.redhat.com/show_bug.cgi?id=2305677
    1. Configure a cluster that has more than four OSD nodes
2. Select an OSD node and drain the node
@@ -42,19 +42,20 @@ def run(ceph_cluster, **kw):
service_obj = ServiceabilityMethods(cluster=ceph_cluster, **config)
ceph_nodes = kw.get("ceph_nodes")
config = kw["config"]

# cmd_unset_unmanaged = ""
replicated_config = config.get("replicated_pool")
pool_name = replicated_config["pool_name"]
active_osd_list = rados_obj.get_osd_list(status="up")
active_osd_list = rados_obj.get_active_osd_list()
log.info(f"The active OSDs list before starting the test-{active_osd_list}")
if not rados_obj.create_pool(pool_name=pool_name):
log.error("Failed to create the Pool")
return 1

rados_obj.bench_write(pool_name=pool_name, byte_size="5M", rados_write_duration=90)
rados_obj.bench_write(pool_name=pool_name, byte_size="5M", rados_write_duration=180)
mgr_daemon = Thread(
target=background_mgr_task, kwargs={"mgr_object": mgr_obj}, daemon=True
)

# Printing the hosts in cluster
cmd_host_ls = "ceph orch host ls"
out = rados_obj.run_ceph_command(cmd=cmd_host_ls)
Expand All @@ -64,7 +65,7 @@ def run(ceph_cluster, **kw):
for node in ceph_nodes:
if node.role == "mgr":
mgr_host_object_list.append(node)
log.debug(f"The mgr host node is{node.hostname}")
log.debug(f"The mgr host node is {node.hostname}")

mgr_daemon_list = mgr_obj.get_mgr_daemon_list()
log.debug(f"The MGR daemons list are -{mgr_daemon_list}")
@@ -133,8 +134,10 @@ def run(ceph_cluster, **kw):
try:
osd_count_before_test = get_node_osd_list(rados_obj, ceph_nodes, drain_host)
log.info(
f"The OSDs in the drain node before starting the test - {osd_count_before_test} "
f"st The OSDs in the drain node before starting the te- {osd_count_before_test} "
)
rados_obj.set_service_managed_type("osd", unmanaged=True)
time.sleep(10)
mgr_daemon.start()
service_obj.remove_custom_host(host_node_name=drain_host)
time.sleep(300)
@@ -152,6 +155,9 @@ def run(ceph_cluster, **kw):
"The traceback messages are noticed in logs.The error snippets are noticed in the MGR logs"
)
return 1
rados_obj.set_service_managed_type("osd", unmanaged=False)
time.sleep(10)

log.info(
"Adding the node by providing the deploy_osd as False, because the script is not setting the "
"--unmanaged=true.Once the node is added back to the cluster the OSDs get configured automatically"
@@ -194,7 +200,7 @@ def run(ceph_cluster, **kw):
return 1

if bug_exists:
active_osd_list = rados_obj.get_osd_list(status="up")
active_osd_list = rados_obj.get_active_osd_list()
log.info(
f"The active OSDs list after reproducing the issue is-{active_osd_list}"
)
@@ -237,6 +243,8 @@ def run(ceph_cluster, **kw):
log.info(
"\n \n ************** Execution of finally block begins here *************** \n \n"
)
rados_obj.set_service_managed_type("osd", unmanaged=False)
time.sleep(10)
if replicated_config.get("delete_pool"):
rados_obj.delete_pool(pool=pool_name)
time.sleep(5)
@@ -297,7 +305,7 @@ def background_mgr_task(mgr_object):
mgr_object: mgr object
Returns: None
"""
time.sleep(20)
time.sleep(2)
for _ in range(10):
active_mgr_before_fail = mgr_object.get_active_mgr()
mgr_object.set_mgr_fail()
4 changes: 2 additions & 2 deletions tests/rados/test_rados_preempt_scrub.py
@@ -85,7 +85,7 @@ def run(ceph_cluster, **kw):

log_lines = [
"head preempted",
"WaitReplicas::react(const GotReplicas&) PREEMPTED",
"WaitReplicas::react(const GotReplicas&) PREEMPTED!",
]

init_time, _ = installer.exec_command(cmd="sudo date '+%Y-%m-%d %H:%M:%S'")
@@ -162,7 +162,7 @@ def run(ceph_cluster, **kw):
log.info(traceback.format_exc())
return 1
finally:
log.info("Execution of finally block")
log.info("===================Execution of finally block===================")
if config.get("delete_pool"):
method_should_succeed(rados_object.delete_pool, entry["pool_name"])
log.info("deleted the pool successfully")
