diff --git a/suites/reef/rgw/tier-2_rgw_rados_multisite_ecpool.yaml b/suites/reef/rgw/tier-2_rgw_rados_multisite_ecpool.yaml
new file mode 100644
index 0000000000..e1ca392683
--- /dev/null
+++ b/suites/reef/rgw/tier-2_rgw_rados_multisite_ecpool.yaml
@@ -0,0 +1,280 @@
+# Test suite for evaluating an RGW multi-site deployment scenario.
+# The data bucket is configured to use an EC pool.
+
+# conf: conf/reef/rgw/ms-ec-profile-4+2-cluster.yaml
+---
+
+tests:
+
+  # Cluster deployment stage
+
+  - test:
+      abort-on-fail: true
+      desc: Install software pre-requisites for cluster deployment.
+      module: install_prereq.py
+      name: setup pre-requisites
+
+  - test:
+      abort-on-fail: true
+      clusters:
+        ceph-pri:
+          config:
+            verify_cluster_health: true
+            steps:
+              - config:
+                  command: bootstrap
+                  service: cephadm
+                  args:
+                    mon-ip: node1
+                    orphan-initial-daemons: true
+                    initial-dashboard-password: admin@123
+                    dashboard-password-noupdate: true
+              - config:
+                  command: add_hosts
+                  service: host
+                  args:
+                    attach_ip_address: true
+                    labels: apply-all-labels
+              - config:
+                  command: apply
+                  service: mgr
+                  args:
+                    placement:
+                      label: mgr
+              - config:
+                  command: apply
+                  service: mon
+                  args:
+                    placement:
+                      label: mon
+              - config:
+                  command: apply
+                  service: osd
+                  args:
+                    all-available-devices: true
+              - config:
+                  args:
+                    - "ceph osd erasure-code-profile set rgwec01 k=4 m=2"
+                    - "crush-failure-domain=host crush-device-class=hdd"
+                  command: shell
+              - config:
+                  args:
+                    - "ceph osd pool create primary.rgw.buckets.data 32 32"
+                    - "erasure rgwec01"
+                  command: shell
+              - config:
+                  args:
+                    - "ceph osd pool application enable"
+                    - "primary.rgw.buckets.data rgw"
+                  command: shell
+              - config:
+                  command: apply
+                  service: rgw
+                  pos_args:
+                    - shared.pri
+                  args:
+                    placement:
+                      nodes:
+                        - node7
+        ceph-sec:
+          config:
+            verify_cluster_health: true
+            steps:
+              - config:
+                  command: bootstrap
+                  service: cephadm
+                  args:
+                    mon-ip: node1
+                    orphan-initial-daemons: true
+                    initial-dashboard-password: admin@123
+                    dashboard-password-noupdate: true
+              - config:
+                  command: add_hosts
+                  service: host
+                  args:
+                    attach_ip_address: true
+                    labels: apply-all-labels
+              - config:
+                  command: apply
+                  service: mgr
+                  args:
+                    placement:
+                      label: mgr
+              - config:
+                  command: apply
+                  service: mon
+                  args:
+                    placement:
+                      label: mon
+              - config:
+                  command: apply
+                  service: osd
+                  args:
+                    all-available-devices: true
+              - config:
+                  args:
+                    - "ceph osd erasure-code-profile set rgwec01 k=4 m=2"
+                    - "crush-failure-domain=host crush-device-class=hdd"
+                  command: shell
+              - config:
+                  args:
+                    - "ceph osd pool create secondary.rgw.buckets.data 32 32"
+                    - "erasure rgwec01"
+                  command: shell
+              - config:
+                  args:
+                    - "ceph osd pool application enable"
+                    - "secondary.rgw.buckets.data rgw"
+                  command: shell
+              - config:
+                  command: apply
+                  service: rgw
+                  pos_args:
+                    - shared.sec
+                  args:
+                    placement:
+                      nodes:
+                        - node7
+      desc: RHCS cluster deployment using cephadm.
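+      # Note: each site pre-creates its <zone>.rgw.buckets.data pool with the
+      # rgwec01 EC profile (k=4, m=2, failure domain host) so that RGW stores
+      # bucket data on the EC-backed pool instead of auto-creating a
+      # replicated one.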
+      destroy-cluster: false
+      module: test_cephadm.py
+      name: deploy cluster
+      polarion-id: CEPH-83575222
+
+  - test:
+      abort-on-fail: true
+      clusters:
+        ceph-pri:
+          config:
+            command: add
+            id: client.1
+            node: node8
+            install_packages:
+              - ceph-common
+            copy_admin_keyring: true
+        ceph-sec:
+          config:
+            command: add
+            id: client.1
+            node: node8
+            install_packages:
+              - ceph-common
+            copy_admin_keyring: true
+      desc: Configure the RGW client system
+      destroy-cluster: false
+      module: test_client.py
+      name: configure client
+      polarion-id: CEPH-83573758
+
+  - test:
+      abort-on-fail: true
+      clusters:
+        ceph-pri:
+          config:
+            cephadm: true
+            commands:
+              - "radosgw-admin realm create --rgw-realm india --default"
+              - "radosgw-admin zonegroup create --rgw-realm india --rgw-zonegroup shared --endpoints http://{node_ip:node7}:80 --master --default"
+              - "radosgw-admin zone create --rgw-realm india --rgw-zonegroup shared --rgw-zone primary --endpoints http://{node_ip:node7}:80 --master --default"
+              - "radosgw-admin period update --rgw-realm india --commit"
+              - "radosgw-admin user create --uid=repuser --display_name='Replication user' --access-key 21e86bce636c3aa0 --secret cf764951f1fdde5d --rgw-realm india --system"
+              - "radosgw-admin zone modify --rgw-realm india --rgw-zonegroup shared --rgw-zone primary --access-key 21e86bce636c3aa0 --secret cf764951f1fdde5d"
+              - "radosgw-admin period update --rgw-realm india --commit"
+              - "ceph config set client.rgw.{daemon_id:shared.pri} rgw_realm india"
+              - "ceph config set client.rgw.{daemon_id:shared.pri} rgw_zonegroup shared"
+              - "ceph config set client.rgw.{daemon_id:shared.pri} rgw_zone primary"
+              - "ceph orch restart {service_name:shared.pri}"
+        ceph-sec:
+          config:
+            cephadm: true
+            commands:
+              - "sleep 120"
+              - "radosgw-admin realm pull --rgw-realm india --url http://{node_ip:ceph-pri#node7}:80 --access-key 21e86bce636c3aa0 --secret cf764951f1fdde5d --default"
+              - "radosgw-admin period pull --url http://{node_ip:ceph-pri#node7}:80 --access-key 21e86bce636c3aa0 --secret cf764951f1fdde5d"
+              - "radosgw-admin zone create --rgw-realm india --rgw-zonegroup shared --rgw-zone secondary --endpoints http://{node_ip:node7}:80 --access-key 21e86bce636c3aa0 --secret cf764951f1fdde5d"
+              - "radosgw-admin period update --rgw-realm india --commit"
+              - "ceph config set client.rgw.{daemon_id:shared.sec} rgw_realm india"
+              - "ceph config set client.rgw.{daemon_id:shared.sec} rgw_zonegroup shared"
+              - "ceph config set client.rgw.{daemon_id:shared.sec} rgw_zone secondary"
+              - "ceph orch restart {service_name:shared.sec}"
+      desc: Setting up RGW multisite replication environment
+      module: exec.py
+      name: setup multisite
+      polarion-id: CEPH-10362
+
+  - test:
+      abort-on-fail: true
+      clusters:
+        ceph-pri:
+          config:
+            cephadm: true
+            commands:
+              - "radosgw-admin sync status"
+              - "ceph -s"
+              - "radosgw-admin realm list"
+              - "radosgw-admin zonegroup list"
+              - "radosgw-admin zone list"
+              - "ceph osd dump"
+      desc: Retrieve the configured environment details
+      module: exec.py
+      name: get shared realm info on primary
+      polarion-id: CEPH-83575227
+
+  - test:
+      abort-on-fail: true
+      clusters:
+        ceph-sec:
+          config:
+            cephadm: true
+            commands:
+              - "radosgw-admin sync status"
+              - "ceph -s"
+              - "radosgw-admin realm list"
+              - "radosgw-admin zonegroup list"
+              - "radosgw-admin zone list"
+              - "ceph osd dump"
+      desc: Retrieve the configured environment details
+      module: exec.py
+      name: get shared realm info on secondary
+      polarion-id: CEPH-83575227
+
+  # Test work flow
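+  # The workflow below exercises replication onto the EC-backed data pools:
+  # a non-tenanted user is created on the primary and copied to the secondary,
+  # bucket/object IO runs on the secondary with verification on the primary,
+  # and bilog trimming is then validated with and without OSDs down.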
+
+  - test:
+      clusters:
+        ceph-pri:
+          config:
+            set-env: true
+            script-name: user_create.py
+            config-file-name: non_tenanted_user.yaml
+            copy-user-info-to-site: ceph-sec
+      desc: create non-tenanted user
+      module: sanity_rgw_multisite.py
+      name: create non-tenanted user
+      polarion-id: CEPH-83575199
+
+  - test:
+      clusters:
+        ceph-sec:
+          config:
+            config-file-name: test_Mbuckets_with_Nobjects.yaml
+            script-name: test_Mbuckets_with_Nobjects.py
+            verify-io-on-site: [ "ceph-pri" ]
+      desc: Execute M buckets with N objects on secondary cluster
+      polarion-id: CEPH-83575435
+      module: sanity_rgw_multisite.py
+      name: m buckets with n objects
+
+  - test:
+      name: Bilog trimming test on primary
+      desc: test bilog trimming on primary
+      polarion-id: CEPH-83572658  # CEPH-10722, CEPH-10547
+      module: sanity_rgw_multisite.py
+      clusters:
+        ceph-pri:
+          config:
+            script-name: test_bilog_trimming.py
+            config-file-name: test_bilog_trimming.yaml
+
+  - test:
+      name: scrub + bilog trimming with OSD down
+      desc: test radosgw bilog trimming and deep-scrub with OSDs down
+      polarion-id: CEPH-83575437
+      module: test_bilog_trim.py
diff --git a/suites/squid/rgw/tier-2_rgw_rados_multisite_ecpool.yaml b/suites/squid/rgw/tier-2_rgw_rados_multisite_ecpool.yaml
index 9c55048ad5..dfa4ad387c 100644
--- a/suites/squid/rgw/tier-2_rgw_rados_multisite_ecpool.yaml
+++ b/suites/squid/rgw/tier-2_rgw_rados_multisite_ecpool.yaml
@@ -272,3 +272,9 @@ tests:
           config:
             script-name: test_bilog_trimming.py
             config-file-name: test_bilog_trimming.yaml
+
+  - test:
+      name: scrub + bilog trimming with OSD down
+      desc: test radosgw bilog trimming and deep-scrub with OSDs down
+      polarion-id: CEPH-83575437
+      module: test_bilog_trim.py
diff --git a/tests/rados/test_bilog_trim.py b/tests/rados/test_bilog_trim.py
new file mode 100644
index 0000000000..8f7b2aa164
--- /dev/null
+++ b/tests/rados/test_bilog_trim.py
@@ -0,0 +1,142 @@
+"""
+Tier-2 test to ensure PG deep-scrub does not report inconsistent PGs
+when radosgw bilog trimming occurs on PGs whose secondary OSD is down.
+Customer Bug: 2056818 - [GSS][RADOS][RGW] Scrub errors (omap_digest_mismatch) on
+PGs of RGW metadata pools after upgrade to RHCS 5
+
+This test needs to run as part of rgw/tier-2_rgw_rados_multisite_ecpool.yaml,
+as it requires an RGW multisite setup with data present in the RGW pools.
+"""
+import datetime
+import random
+import time
+from math import floor
+
+from ceph.ceph_admin import CephAdmin
+from ceph.rados import utils as rados_utils
+from ceph.rados.core_workflows import RadosOrchestrator
+from utility.log import Log
+
+log = Log(__name__)
+
+
+def run(ceph_cluster, **kw):
+    """
+    #CEPH-83575437
+    #BZ-2056818
+    This test verifies that no issues are observed during deep-scrub when RGW
+    bilog trimming occurs while OSDs are down.
+    Ref:
+    - https://bugzilla.redhat.com/show_bug.cgi?id=2056818#c42
+    - https://bugzilla.redhat.com/show_bug.cgi?id=2056818#c50
+    Steps:
+    1. Deploy a minimal RGW multisite setup
+    2. Add data to both sites in S3 buckets and keep minimal client IO running
+    3. Keep running deep-scrub on the index pool PGs
+    4. Set the noout flag
+    5. On the primary site (site1), stop the secondary OSD of the first index
+       pool PG along with ~30% of the remaining up OSDs
+    6. Run the following command:
+       radosgw-admin bilog autotrim
+    7. Once the above command finishes, start the OSDs that were stopped in step 5
+    8. Re-trigger deep-scrub on the index pool PGs and verify cluster health
+    """
+    log.info(run.__doc__)
+    config = kw["config"]
+    cephadm = CephAdmin(cluster=ceph_cluster, **config)
+    rados_obj = RadosOrchestrator(node=cephadm)
+
+    try:
+        # set the noout flag so stopped OSDs are not marked out and rebalanced
+        if not rados_utils.configure_osd_flag(ceph_cluster, action="set", flag="noout"):
+            log.error("Could not set noout flag on the cluster")
+            raise Exception("Could not set noout flag on the cluster")
+
+        # get the acting set for the rgw index pool
+        _pool_name = "primary.rgw.buckets.index"
+        acting_set = rados_obj.get_pg_acting_set(pool_name=_pool_name)
+        log.info(f"Acting set for pool {_pool_name}: {acting_set}")
+
+        # secondary OSD of the 1st PG from the rgw index pool
+        second_osd = acting_set[1]
+
+        # get the list of UP OSDs
+        osd_list = rados_obj.get_osd_list(status="up")
+
+        # bring down 30% of the UP OSDs; random.sample avoids picking the same
+        # OSD twice, and the secondary OSD is added if not chosen already
+        osd_down = random.sample(osd_list, k=floor(0.3 * len(osd_list)))
+        if second_osd not in osd_down:
+            osd_down.append(second_osd)
+
+        log.info(f"List of OSDs to be stopped: {osd_down}")
+
+        # trigger deep-scrub on the rgw index pool
+        rados_obj.run_deep_scrub(pool=_pool_name)
+
+        for osd_id in osd_down:
+            if not rados_obj.change_osd_state(action="stop", target=int(osd_id)):
+                log.error(f"Could not stop OSD.{osd_id}")
+                raise Exception(f"Could not stop OSD.{osd_id}")
+
+        # for a duration of 7 mins, trigger bilog trimming along with deep-scrub
+        timeout_time = datetime.datetime.now() + datetime.timedelta(seconds=420)
+        while datetime.datetime.now() < timeout_time:
+            cephadm.shell(args=["radosgw-admin bilog autotrim"])
+            rados_obj.run_deep_scrub(pool=_pool_name)
+            time.sleep(30)
+
+        # start the stopped OSDs
+        for osd_id in osd_down:
+            if not rados_obj.change_osd_state(action="start", target=int(osd_id)):
+                log.error(f"Could not start OSD.{osd_id}")
+                raise Exception(f"Could not start OSD.{osd_id}")
+
+        # trigger deep-scrub on the rgw index pool now that the OSDs are back up
+        rados_obj.run_deep_scrub(pool=_pool_name)
+        pg_id = rados_obj.get_pgid(pool_name=_pool_name)
+        if not rados_obj.start_check_deep_scrub_complete(pg_id=pg_id):
+            log.error(f"PG {pg_id} could not be deep-scrubbed in time")
+            raise Exception(f"PG {pg_id} could not be deep-scrubbed in time")
+
+        # no scrub-related warning should show up in cluster health
+        health_detail, _ = cephadm.shell(args=["ceph health detail"])
+        log.info(f"Cluster health: \n {health_detail}")
+        assert (
+            "inconsistent" not in health_detail
+        ), "'inconsistent' health warning unexpected"
+        assert (
+            "scrub errors" not in health_detail
+        ), "scrub errors reported in cluster health"
+        assert "data damage" not in health_detail, "cluster health reported data damage"
+
+        log.info(
+            "Verification completed, no issues observed during scrubbing and bilog trimming "
+            "in a cluster with down OSDs"
+        )
+    except Exception as e:
+        log.error(f"Execution failed with exception: {e}")
+        log.exception(e)
+        return 1
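+    # The finally block always runs cleanup so that a failure above does not
+    # leak state (noout set, OSDs stopped) into the tests that follow in the
+    # suite run.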
test") + return 1 + return 0