From 90c287c5ddd749df080aa6ebe96a67f9e2d9557e Mon Sep 17 00:00:00 2001 From: Srikishen Pondicherry Shanmugam Date: Thu, 22 Aug 2024 01:15:05 +0000 Subject: [PATCH] Initial changes to bring framework container to SONiC --- .gitmodules | 3 + dockers/docker-framework/Dockerfile.j2 | 33 + dockers/docker-framework/framework.sh | 3 + dockers/docker-framework/start.sh | 4 + dockers/docker-framework/supervisord.conf | 64 + .../docker-framework/versions-deb-bullseye | 225 +++ .../dockers/docker-framework/versions-mirror | 3 + .../dockers/docker-framework/versions-py3 | 26 + .../docker-sonic-vs/versions-deb-bullseye | 1 + files/build_templates/framework.service.j2 | 20 + files/build_templates/init_cfg.json.j2 | 1 + rules/docker-framework.dep | 10 + rules/docker-framework.mk | 35 + rules/framework.dep | 13 + rules/framework.mk | 18 + src/sonic-framework/.gitignore | 45 + src/sonic-framework/Makefile.am | 21 + src/sonic-framework/autogen.sh | 5 + src/sonic-framework/configure.ac | 85 + src/sonic-framework/debian/changelog | 6 + src/sonic-framework/debian/compat | 1 + src/sonic-framework/debian/control | 18 + src/sonic-framework/debian/rules | 38 + src/sonic-framework/gnoi | 1 + src/sonic-framework/rebootbackend/Makefile.am | 19 + .../rebootbackend/container_stop.proto | 30 + .../rebootbackend/gnoi_container_shutdown.xml | 31 + .../rebootbackend/gnoi_reboot.xml | 30 + .../rebootbackend/init_thread.cpp | 529 ++++++ .../rebootbackend/init_thread.h | 260 +++ .../rebootbackend/interfaces.cpp | 133 ++ .../rebootbackend/interfaces.h | 51 + .../rebootbackend/reboot_common.cpp | 14 + .../rebootbackend/reboot_common.h | 20 + .../rebootbackend/reboot_interfaces.h | 88 + .../rebootbackend/reboot_thread.cpp | 961 +++++++++++ .../rebootbackend/reboot_thread.h | 418 +++++ .../rebootbackend/rebootbackend.cpp | 15 + .../rebootbackend/rebootbe.cpp | 330 ++++ src/sonic-framework/rebootbackend/rebootbe.h | 101 ++ .../rebootbackend/redis_utils.cpp | 474 ++++++ .../rebootbackend/redis_utils.h | 289 
++++ .../rebootbackend/telemetry_helper.cpp | 120 ++ .../rebootbackend/telemetry_helper.h | 49 + src/sonic-framework/tests/Makefile.am | 60 + .../tests/init_thread_test.cpp | 923 ++++++++++ .../tests/mock_reboot_interfaces.h | 62 + .../tests/reboot_common_test.cpp | 27 + .../tests/reboot_thread_test.cpp | 1489 +++++++++++++++++ src/sonic-framework/tests/rebootbe_test.cpp | 1294 ++++++++++++++ .../tests/redis_utils_test.cpp | 785 +++++++++ .../tests/telemetry_helper_test.cpp | 394 +++++ src/sonic-framework/tests/test_main.cpp | 7 + .../tests/test_utils_common.cpp | 143 ++ src/sonic-framework/tests/test_utils_common.h | 48 + 55 files changed, 9873 insertions(+) create mode 100644 dockers/docker-framework/Dockerfile.j2 create mode 100755 dockers/docker-framework/framework.sh create mode 100755 dockers/docker-framework/start.sh create mode 100644 dockers/docker-framework/supervisord.conf create mode 100644 files/build/versions/dockers/docker-framework/versions-deb-bullseye create mode 100644 files/build/versions/dockers/docker-framework/versions-mirror create mode 100644 files/build/versions/dockers/docker-framework/versions-py3 create mode 100644 files/build_templates/framework.service.j2 create mode 100644 rules/docker-framework.dep create mode 100644 rules/docker-framework.mk create mode 100644 rules/framework.dep create mode 100644 rules/framework.mk create mode 100644 src/sonic-framework/.gitignore create mode 100644 src/sonic-framework/Makefile.am create mode 100755 src/sonic-framework/autogen.sh create mode 100644 src/sonic-framework/configure.ac create mode 100644 src/sonic-framework/debian/changelog create mode 100644 src/sonic-framework/debian/compat create mode 100644 src/sonic-framework/debian/control create mode 100755 src/sonic-framework/debian/rules create mode 160000 src/sonic-framework/gnoi create mode 100644 src/sonic-framework/rebootbackend/Makefile.am create mode 100644 src/sonic-framework/rebootbackend/container_stop.proto create mode 100644 
src/sonic-framework/rebootbackend/gnoi_container_shutdown.xml create mode 100644 src/sonic-framework/rebootbackend/gnoi_reboot.xml create mode 100644 src/sonic-framework/rebootbackend/init_thread.cpp create mode 100644 src/sonic-framework/rebootbackend/init_thread.h create mode 100644 src/sonic-framework/rebootbackend/interfaces.cpp create mode 100644 src/sonic-framework/rebootbackend/interfaces.h create mode 100644 src/sonic-framework/rebootbackend/reboot_common.cpp create mode 100644 src/sonic-framework/rebootbackend/reboot_common.h create mode 100644 src/sonic-framework/rebootbackend/reboot_interfaces.h create mode 100644 src/sonic-framework/rebootbackend/reboot_thread.cpp create mode 100644 src/sonic-framework/rebootbackend/reboot_thread.h create mode 100644 src/sonic-framework/rebootbackend/rebootbackend.cpp create mode 100644 src/sonic-framework/rebootbackend/rebootbe.cpp create mode 100644 src/sonic-framework/rebootbackend/rebootbe.h create mode 100644 src/sonic-framework/rebootbackend/redis_utils.cpp create mode 100644 src/sonic-framework/rebootbackend/redis_utils.h create mode 100644 src/sonic-framework/rebootbackend/telemetry_helper.cpp create mode 100644 src/sonic-framework/rebootbackend/telemetry_helper.h create mode 100644 src/sonic-framework/tests/Makefile.am create mode 100644 src/sonic-framework/tests/init_thread_test.cpp create mode 100644 src/sonic-framework/tests/mock_reboot_interfaces.h create mode 100644 src/sonic-framework/tests/reboot_common_test.cpp create mode 100644 src/sonic-framework/tests/reboot_thread_test.cpp create mode 100644 src/sonic-framework/tests/rebootbe_test.cpp create mode 100644 src/sonic-framework/tests/redis_utils_test.cpp create mode 100644 src/sonic-framework/tests/telemetry_helper_test.cpp create mode 100644 src/sonic-framework/tests/test_main.cpp create mode 100644 src/sonic-framework/tests/test_utils_common.cpp create mode 100644 src/sonic-framework/tests/test_utils_common.h diff --git a/.gitmodules b/.gitmodules 
index 9013bdb67021..658393f7e588 100644 --- a/.gitmodules +++ b/.gitmodules @@ -124,3 +124,6 @@ [submodule "platform/innovium/sonic-platform-marvell"] path = platform/innovium/sonic-platform-marvell url = https://github.com/Marvell-switching/sonic-platform-marvell +[submodule "src/sonic-framework/gnoi"] + path = src/sonic-framework/gnoi + url = https://github.com/openconfig/gnoi diff --git a/dockers/docker-framework/Dockerfile.j2 b/dockers/docker-framework/Dockerfile.j2 new file mode 100644 index 000000000000..d52b3e0df822 --- /dev/null +++ b/dockers/docker-framework/Dockerfile.j2 @@ -0,0 +1,33 @@ +{% from "dockers/dockerfile-macros.j2" import install_debian_packages, install_python_wheels, copy_files %} +FROM docker-config-engine-bullseye-{{DOCKER_USERNAME}}:{{DOCKER_USERTAG}} + +ARG docker_container_name +RUN [ -f /etc/rsyslog.conf ] && sed -ri "s/%syslogtag%/$docker_container_name#%syslogtag%/;" /etc/rsyslog.conf + +## Make apt-get non-interactive +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && \ + apt-get install -f -y \ + libdbus-1-3 \ + libdbus-c++-1-0v5 + +{% if docker_framework_debs.strip() -%} +# Copy locally-built Debian package dependencies +{{ copy_files("debs/", docker_framework_debs.split(' '), "/debs/") }} + +# Install locally-built Debian packages and implicitly install their dependencies +{{ install_debian_packages(docker_framework_debs.split(' ')) }} +{%- endif %} + +RUN apt-get clean -y && \ + apt-get autoclean -y && \ + apt-get autoremove -y && \ + rm -rf /debs /var/lib/apt/lists/* /tmp/* ~/.cache/ + +COPY ["start.sh", "/usr/bin/"] +COPY ["supervisord.conf", "/etc/supervisor/conf.d/"] +COPY ["files/supervisor-proc-exit-listener", "/usr/bin"] +COPY ["git_commits", "/usr"] + +ENTRYPOINT ["/usr/local/bin/supervisord"] diff --git a/dockers/docker-framework/framework.sh b/dockers/docker-framework/framework.sh new file mode 100755 index 000000000000..2d2e4c2c6fa5 --- /dev/null +++ b/dockers/docker-framework/framework.sh @@ -0,0 +1,3 @@
+#!/usr/bin/env bash + +exec /usr/local/bin/framework --logtostderr diff --git a/dockers/docker-framework/start.sh b/dockers/docker-framework/start.sh new file mode 100755 index 000000000000..d6722a27fc77 --- /dev/null +++ b/dockers/docker-framework/start.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +mkdir -p /var/sonic +echo "# Config files managed by sonic-config-engine" > /var/sonic/config_status diff --git a/dockers/docker-framework/supervisord.conf b/dockers/docker-framework/supervisord.conf new file mode 100644 index 000000000000..306e612ab7cd --- /dev/null +++ b/dockers/docker-framework/supervisord.conf @@ -0,0 +1,64 @@ +[supervisord] +logfile_maxbytes=1MB +logfile_backups=2 +loglevel=warn +nodaemon=true + +[eventlistener:dependent-startup] +command=python3 -m supervisord_dependent_startup --log-level warn +autostart=true +autorestart=unexpected +stdout_logfile=syslog +stderr_logfile=syslog +startretries=0 +exitcodes=0,3 +events=PROCESS_STATE +buffer_size=50 + +[eventlistener:supervisor-proc-exit-listener] +command=/usr/bin/supervisor-proc-exit-listener --container-name framework +events=PROCESS_STATE_EXITED +autostart=true +autorestart=unexpected +stdout_logfile=syslog +stderr_logfile=syslog + +[program:rsyslogd] +command=/usr/sbin/rsyslogd -n -iNONE +priority=1 +autostart=false +autorestart=unexpected +stdout_logfile=syslog +stderr_logfile=syslog +dependent_startup=true + +[program:start] +command=/usr/bin/start.sh +priority=2 +autostart=false +autorestart=false +startsecs=0 +stdout_logfile=syslog +stderr_logfile=syslog +dependent_startup=true +dependent_startup_wait_for=rsyslogd:running + +[program:stateverificationd] +command=/usr/bin/stateverificationd +priority=3 +autostart=false +autorestart=true +stdout_logfile=syslog +stderr_logfile=syslog +dependent_startup=true +dependent_startup_wait_for=start:exited + +[program:rebootbackend] +command=/usr/bin/rebootbackend +priority=3 +autostart=false +autorestart=true +stdout_logfile=syslog +stderr_logfile=syslog 
+dependent_startup=true +dependent_startup_wait_for=start:exited diff --git a/files/build/versions/dockers/docker-framework/versions-deb-bullseye b/files/build/versions/dockers/docker-framework/versions-deb-bullseye new file mode 100644 index 000000000000..7f812389cacf --- /dev/null +++ b/files/build/versions/dockers/docker-framework/versions-deb-bullseye @@ -0,0 +1,225 @@ +adduser==3.118+deb11u1 +apt==2.2.4 +apt-utils==2.2.4 +base-files==11.1+deb11u9 +base-passwd==3.5.51 +bash==5.1-2+deb11u1 +bsdutils==1:2.36.1-8+deb11u2 +ca-certificates==20210119 +coreutils==8.32-4+b1 +curl==7.74.0-1.3+deb11u11 +dash==0.5.11+git20200708+dd9ef66-5 +debconf==1.5.77 +debian-archive-keyring==2021.1.1+deb11u1 +debianutils==4.11.2 +diffutils==1:3.7-5 +dpkg==1.20.13 +dpkg-dev==1.20.13 +e2fsprogs==1.46.2-2 +findutils==4.8.0-1 +framework==1.0.0 +framework-dbg==1.0.0 +gcc-10-base==10.2.1-6 +gcc-9-base==9.3.0-22 +gdb==10.1-1.7 +gdbserver==10.1-1.7 +gpgv==2.2.27-2+deb11u2 +grep==3.6-1+deb11u1 +gzip==1.10-4+deb11u1 +hostname==3.23 +init-system-helpers==1.60 +iproute2==5.10.0-4 +jq==1.6-2.1 +less==551-2+deb11u2 +libabsl20200923==0~20200923.3-2 +libacl1==2.2.53-10 +libapt-pkg6.0==2.2.4 +libatomic1==10.2.1-6 +libattr1==1:2.4.48-6 +libaudit-common==1:3.0-2 +libaudit1==1:3.0-2 +libbabeltrace1==1.5.8-1+b3 +libblkid1==2.36.1-8+deb11u2 +libboost-regex1.74.0==1.74.0-9 +libboost-serialization1.74.0==1.74.0-9 +libbpf0==1:0.3-3 +libbrotli1==1.0.9-2+b2 +libbsd0==0.11.3-1+deb11u1 +libbz2-1.0==1.0.8-4 +libc-ares2==1.17.1-1+deb11u3 +libc-bin==2.31-13+deb11u10 +libc6==2.31-13+deb11u10 +libcap-ng0==0.7.9-2.2+b1 +libcap2==1:2.44-1 +libcap2-bin==1:2.44-1 +libcbor0==0.5.0+dfsg-2 +libcom-err2==1.46.2-2 +libcrypt1==1:4.4.18-4 +libcurl3-gnutls==7.74.0-1.3+deb11u11 +libcurl4==7.74.0-1.3+deb11u11 +libdaemon0==0.14-7.1 +libdb5.3==5.3.28+dfsg1-0.8 +libdbus-1-3==1.12.28-0+deb11u1 +libdbus-c++-1-0v5==0.9.0-8.2 +libdebconfclient0==0.260 +libdebuginfod1==0.183-1 +libdw1==0.183-1 +libecore1==1.25.1-1 
+libedit2==3.1-20191231-2+b1 +libeina1a==1.25.1-1 +libelf1==0.183-1 +libestr0==0.1.10-2.1+b1 +libexpat1==2.2.10-2+deb11u5 +libext2fs2==1.46.2-2 +libfastjson4==0.99.9-1 +libffi7==3.3-6 +libfido2-1==1.6.0-2 +libgcc-s1==10.2.1-6 +libgcrypt20==1.8.7-6 +libgdbm-compat4==1.19-2 +libgdbm6==1.19-2 +libglib2.0-0==2.66.8-1+deb11u3 +libgmp10==2:6.2.1+dfsg-1+deb11u1 +libgnutls30==3.7.1-5+deb11u4 +libgpg-error0==1.38-2 +libgpm2==1.20.7-8 +libgrpc++1==1.30.2-3 +libgrpc10==1.30.2-3 +libgssapi-krb5-2==1.18.3-6+deb11u4 +libhiredis0.14==0.14.1-1 +libhiredis0.14-dbgsym==0.14.1-1 +libhogweed6==3.7.3-1 +libicu67==67.1-7 +libidn2-0==2.3.0-5 +libipt2==2.0.3-1 +libjansson4==2.13.1-1.1 +libjemalloc2==5.2.1-3 +libjq1==1.6-2.1 +libjs-jquery==3.5.1+dfsg+~3.5.5-7 +libk5crypto3==1.18.3-6+deb11u4 +libkeyutils1==1.6.1-2 +libkrb5-3==1.18.3-6+deb11u4 +libkrb5support0==1.18.3-6+deb11u4 +libldap-2.4-2==2.4.57+dfsg-3+deb11u1 +liblognorm5==2.0.5-1.1 +liblua5.1-0==5.1.5-8.1+b3 +liblz4-1==1.9.3-2 +liblzf1==3.6-3 +liblzma5==5.2.5-2.1~deb11u1 +libmd0==1.0.3-3 +libmnl0==1.0.4-3 +libmount1==2.36.1-8+deb11u2 +libmpdec3==2.5.1-1 +libmpfr6==4.1.0-3 +libncurses6==6.2+20201114-2+deb11u2 +libncursesw6==6.2+20201114-2+deb11u2 +libnettle8==3.7.3-1 +libnghttp2-14==1.43.0-1+deb11u1 +libnl-3-200==3.5.0-1 +libnl-cli-3-200==3.5.0-1 +libnl-genl-3-200==3.5.0-1 +libnl-nf-3-200==3.5.0-1 +libnl-route-3-200==3.5.0-1 +libnorm1==1.5.9+dfsg-2 +libnsl2==1.3.0-2 +libonig5==6.9.6-1.1 +libp11-kit0==0.23.22-1 +libpam-modules==1.4.0-9+deb11u1 +libpam-modules-bin==1.4.0-9+deb11u1 +libpam-runtime==1.4.0-9+deb11u1 +libpam0g==1.4.0-9+deb11u1 +libpcre2-8-0==10.36-2+deb11u1 +libpcre3==2:8.39-13 +libperl5.32==5.32.1-4+deb11u3 +libpgm-5.3-0==5.3.128~dfsg-2 +libprocps8==2:3.3.17-5 +libprotobuf23==3.12.4-1+deb11u1 +libpsl5==0.21.0-1.2 +libpython3-stdlib==3.9.2-3 +libpython3.9==3.9.2-1 +libpython3.9-minimal==3.9.2-1 +libpython3.9-stdlib==3.9.2-1 +libreadline8==8.1-1 +librtmp1==2.4+20151223.gitfa8646d.1-2+b2 +libsasl2-2==2.1.27+dfsg-2.1+deb11u1 
+libsasl2-modules-db==2.1.27+dfsg-2.1+deb11u1 +libseccomp2==2.5.1-1+deb11u1 +libselinux1==3.1-3 +libsemanage-common==3.1-1 +libsemanage1==3.1-1+b2 +libsepol1==3.1-1 +libsmartcols1==2.36.1-8+deb11u2 +libsodium23==1.0.18-1 +libsource-highlight-common==3.1.9-3 +libsource-highlight4v5==3.1.9-3+b1 +libsqlite3-0==3.34.1-3 +libss2==1.46.2-2 +libssh2-1==1.9.0-2 +libssl1.1==1.1.1w-0+deb11u1 +libstdc++6==10.2.1-6 +libswsscommon==1.0.0 +libswsscommon-dbgsym==1.0.0 +libsystemd0==247.3-7+deb11u4 +libtasn1-6==4.16.0-2+deb11u1 +libtinfo6==6.2+20201114-2+deb11u2 +libtirpc-common==1.3.1-1+deb11u1 +libtirpc3==1.3.1-1+deb11u1 +libudev1==247.3-7+deb11u4 +libunistring2==0.9.10-4 +libunwind8==1.3.2-2 +libuuid1==2.36.1-8+deb11u2 +libwrap0==7.6.q-31 +libxtables12==1.8.7-1 +libxxhash0==0.8.0-2 +libyang==1.0.73 +libyang-cpp==1.0.73 +libzmq5==4.3.4-1+deb11u1 +libzstd1==1.4.8+dfsg-2.1 +login==1:4.8.1-1 +logsave==1.46.2-2 +lsb-base==11.1.0 +lua-bitop==1.0.2-5 +lua-cjson==2.1.0+dfsg-2.1 +mawk==1.3.4.20200120-2 +media-types==4.0.0 +mount==2.36.1-8+deb11u2 +ncurses-base==6.2+20201114-2+deb11u2 +ncurses-bin==6.2+20201114-2+deb11u2 +net-tools==1.60+git20181103.0eebece-1 +netbase==6.3 +openssh-client==1:8.4p1-5+deb11u3 +openssl==1.1.1w-0+deb11u1 +passwd==1:4.8.1-1 +perl==5.32.1-4+deb11u3 +perl-base==5.32.1-4+deb11u3 +perl-modules-5.32==5.32.1-4+deb11u3 +procps==2:3.3.17-5 +python-is-python3==3.9.2-1 +python3==3.9.2-3 +python3-distutils==3.9.2-1 +python3-lib2to3==3.9.2-1 +python3-minimal==3.9.2-3 +python3-swsscommon==1.0.0 +python3-yang==1.0.73 +python3.9==3.9.2-1 +python3.9-minimal==3.9.2-1 +readline-common==8.1-1 +redis-tools==5:6.0.16-1+deb11u2 +rsyslog==8.2302.0-1~bpo11+1 +sed==4.7-1 +socat==1.7.4.1-3 +sonic-build-hooks==1.0 +sonic-db-cli==1.0.0 +sonic-eventd==1.0.0-0 +sshpass==1.09-1+b1 +strace==5.10-1 +sysvinit-utils==2.96-7+deb11u1 +tar==1.34+dfsg-1+deb11u1 +tzdata==2024a-0+deb11u1 +util-linux==2.36.1-8+deb11u2 +vim==2:8.2.2434-3+deb11u1 +vim-common==2:8.2.2434-3+deb11u1 
+vim-runtime==2:8.2.2434-3+deb11u1 +vim-tiny==2:8.2.2434-3+deb11u1 +xxd==2:8.2.2434-3+deb11u1 +zlib1g==1:1.2.11.dfsg-2+deb11u2 diff --git a/files/build/versions/dockers/docker-framework/versions-mirror b/files/build/versions/dockers/docker-framework/versions-mirror new file mode 100644 index 000000000000..1cc5fd8352b0 --- /dev/null +++ b/files/build/versions/dockers/docker-framework/versions-mirror @@ -0,0 +1,3 @@ +debian-archive.trafficmanager.net_debian-security_dists_bullseye-security==2024-05-10T08:02:59Z +debian-archive.trafficmanager.net_debian_dists_bullseye-backports==2024-05-10T02:16:16Z +debian-archive.trafficmanager.net_debian_dists_bullseye-updates==2024-05-10T02:16:16Z diff --git a/files/build/versions/dockers/docker-framework/versions-py3 b/files/build/versions/dockers/docker-framework/versions-py3 new file mode 100644 index 000000000000..3388288851dc --- /dev/null +++ b/files/build/versions/dockers/docker-framework/versions-py3 @@ -0,0 +1,26 @@ +async-timeout==4.0.3 +bitarray==1.5.3 +ijson==2.6.1 +ipaddress==1.0.23 +j2cli==0.3.10 +jinja2==3.1.4 +jsondiff==2.0.0 +lxml==4.9.1 +markupsafe==2.1.5 +natsort==6.2.1 +netaddr==0.8.0 +pip==24.0 +pyang==2.6.0 +pyangbind==0.8.1 +pyyaml==5.4.1 +redis==4.5.4 +redis-dump-load==1.1 +regex==2024.5.10 +setuptools==58.1.0 +six==1.16.0 +supervisor==4.2.1 +supervisord-dependent-startup==1.4.0 +tabulate==0.8.2 +toposort==1.10 +wheel==0.40.0 +xmltodict==0.12.0 diff --git a/files/build/versions/dockers/docker-sonic-vs/versions-deb-bullseye b/files/build/versions/dockers/docker-sonic-vs/versions-deb-bullseye index c8b815af33ab..72fa75815bb7 100644 --- a/files/build/versions/dockers/docker-sonic-vs/versions-deb-bullseye +++ b/files/build/versions/dockers/docker-sonic-vs/versions-deb-bullseye @@ -13,6 +13,7 @@ fontconfig-config==2.13.1-4.2 fonts-dejavu-core==2.37-2 fonts-font-awesome==5.0.10+really4.7.0~dfsg-4.1 fonts-lato==2.0-2.1 +framework==1.0.0 frr==8.5.4-sonic-0 gettext-base==0.21-4 gir1.2-glib-2.0==1.66.1-1+b1 diff 
--git a/files/build_templates/framework.service.j2 b/files/build_templates/framework.service.j2 new file mode 100644 index 000000000000..c6273a3c4bc9 --- /dev/null +++ b/files/build_templates/framework.service.j2 @@ -0,0 +1,20 @@ +[Unit] +Description=Framework Container +Requires=database.service +After=database.service swss.service +BindsTo=sonic.target +After=sonic.target +Before=ntp-config.service +StartLimitIntervalSec=1200 +StartLimitBurst=3 + +[Service] +User={{ sonicadmin_user }} +ExecStartPre=/usr/bin/{{docker_container_name}}.sh start +ExecStart=/usr/bin/{{docker_container_name}}.sh wait +ExecStop=/usr/bin/{{docker_container_name}}.sh stop +Restart=always +RestartSec=30 + +[Install] +WantedBy=sonic.target diff --git a/files/build_templates/init_cfg.json.j2 b/files/build_templates/init_cfg.json.j2 index f8083f040541..ad36673f53fe 100644 --- a/files/build_templates/init_cfg.json.j2 +++ b/files/build_templates/init_cfg.json.j2 @@ -37,6 +37,7 @@ }, {%- set features = [("bgp", "{% if not DEVICE_RUNTIME_METADATA['ETHERNET_PORTS_PRESENT'] or ('CHASSIS_METADATA' in DEVICE_RUNTIME_METADATA and DEVICE_RUNTIME_METADATA['CHASSIS_METADATA']['module_type'] in ['supervisor']) %}disabled{% else %}enabled{% endif %}", false, "enabled"), ("database", "always_enabled", false, "always_enabled"), + ("framework", "enabled", false, "enabled"), ("lldp", "enabled", true, "enabled"), ("pmon", "enabled", "{% if 'type' in DEVICE_METADATA['localhost'] and DEVICE_METADATA['localhost']['type'] == 'SpineRouter' %}False{% else %}True{% endif %}", "enabled"), ("snmp", "enabled", true, "enabled"), diff --git a/rules/docker-framework.dep b/rules/docker-framework.dep new file mode 100644 index 000000000000..238e16db8b09 --- /dev/null +++ b/rules/docker-framework.dep @@ -0,0 +1,10 @@ +DPATH := $($(DOCKER_FRAMEWORK)_PATH) +DEP_FILES := $(SONIC_COMMON_FILES_LIST) rules/docker-framework.mk rules/docker-framework.dep +DEP_FILES += $(SONIC_COMMON_BASE_FILES_LIST) +DEP_FILES += $(shell git ls-files 
$(DPATH)) + +$(DOCKER_FRAMEWORK)_CACHE_MODE := GIT_CONTENT_SHA +$(DOCKER_FRAMEWORK)_DEP_FLAGS := $(SONIC_COMMON_FLAGS_LIST) +$(DOCKER_FRAMEWORK)_DEP_FILES := $(DEP_FILES) + +$(eval $(call add_dbg_docker,$(DOCKER_FRAMEWORK),$(DOCKER_FRAMEWORK_DBG))) diff --git a/rules/docker-framework.mk b/rules/docker-framework.mk new file mode 100644 index 000000000000..df9a2b0b104a --- /dev/null +++ b/rules/docker-framework.mk @@ -0,0 +1,35 @@ +# docker image for framework + +DOCKER_FRAMEWORK_STEM = docker-framework +DOCKER_FRAMEWORK = $(DOCKER_FRAMEWORK_STEM).gz +DOCKER_FRAMEWORK_DBG = $(DOCKER_FRAMEWORK_STEM)-$(DBG_IMAGE_MARK).gz + +$(DOCKER_FRAMEWORK)_PATH = $(DOCKERS_PATH)/$(DOCKER_FRAMEWORK_STEM) + +$(DOCKER_FRAMEWORK)_DEPENDS += $(FRAMEWORK) +$(DOCKER_FRAMEWORK)_DBG_DEPENDS = $($(DOCKER_CONFIG_ENGINE_BULLSEYE)_DBG_DEPENDS) +$(DOCKER_FRAMEWORK)_DBG_DEPENDS += $(FRAMEWORK_DBG) $(LIBSWSSCOMMON_DBG) +$(DOCKER_FRAMEWORK)_DBG_IMAGE_PACKAGES = $($(DOCKER_CONFIG_ENGINE_BULLSEYE)_DBG_IMAGE_PACKAGES) + +$(DOCKER_FRAMEWORK)_LOAD_DOCKERS += $(DOCKER_CONFIG_ENGINE_BULLSEYE) +$(DOCKER_FRAMEWORK)_LOAD_DOCKERS += $($(DOCKER_CONFIG_ENGINE_BULLSEYE)_LOAD_DOCKERS) + +$(DOCKER_FRAMEWORK)_VERSION = 1.0.0 +$(DOCKER_FRAMEWORK)_PACKAGE_NAME = framework + +SONIC_DOCKER_IMAGES += $(DOCKER_FRAMEWORK) +SONIC_INSTALL_DOCKER_IMAGES += $(DOCKER_FRAMEWORK) + +SONIC_DOCKER_DBG_IMAGES += $(DOCKER_FRAMEWORK_DBG) +SONIC_INSTALL_DOCKER_DBG_IMAGES += $(DOCKER_FRAMEWORK_DBG) + +$(DOCKER_FRAMEWORK)_CONTAINER_NAME = framework +$(DOCKER_FRAMEWORK)_RUN_OPT += --privileged -t +$(DOCKER_FRAMEWORK)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro +$(DOCKER_FRAMEWORK)_GIT_REPOSITORIES += "sonic-swss" +$(DOCKER_FRAMEWORK)_GIT_REPOSITORIES += "sonic-swss-common" + +$(DOCKER_FRAMEWORK)_FILES += $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT) + +SONIC_BULLSEYE_DOCKERS += $(DOCKER_FRAMEWORK) +SONIC_BULLSEYE_DBG_DOCKERS += $(DOCKER_FRAMEWORK_DBG) diff --git a/rules/framework.dep b/rules/framework.dep new file mode 100644 index 
000000000000..ef2585932926 --- /dev/null +++ b/rules/framework.dep @@ -0,0 +1,13 @@ + +SPATH := $($(FRAMEWORK)_SRC_PATH) +DEP_FILES := $(SONIC_COMMON_FILES_LIST) rules/framework.mk rules/framework.dep +DEP_FILES += $(SONIC_COMMON_BASE_FILES_LIST) +SMDEP_PATHS := $(SPATH) $(SPATH)/third_party/gnoi +$(foreach path, $(SMDEP_PATHS), $(eval $(path) :=$(filter-out $(SMDEP_PATHS),$(addprefix $(path)/, \ + $(shell cd $(path) && git ls-files | grep -v " "))))) + +$(FRAMEWORK)_CACHE_MODE := GIT_CONTENT_SHA +$(FRAMEWORK)_DEP_FLAGS := $(SONIC_COMMON_FLAGS_LIST) $(ENABLE_ASAN) +$(FRAMEWORK)_DEP_FILES := $(DEP_FILES) +$(FRAMEWORK)_SMDEP_FILES := $(foreach path, $(SMDEP_PATHS), $($(path))) +$(FRAMEWORK)_SMDEP_PATHS := $(SPATH) diff --git a/rules/framework.mk b/rules/framework.mk new file mode 100644 index 000000000000..9526ebcc20a7 --- /dev/null +++ b/rules/framework.mk @@ -0,0 +1,18 @@ +# framework package + +FRAMEWORK = framework_1.0.0_$(CONFIGURED_ARCH).deb +$(FRAMEWORK)_SRC_PATH = $(SRC_PATH)/sonic-framework +$(FRAMEWORK)_DEPENDS += $(LIBSWSSCOMMON_DEV) + +$(FRAMEWORK)_RDEPENDS += $(LIBSWSSCOMMON) +SONIC_DPKG_DEBS += $(FRAMEWORK) + +FRAMEWORK_DBG = framework-dbg_1.0.0_$(CONFIGURED_ARCH).deb +$(FRAMEWORK_DBG)_DEPENDS += $(FRAMEWORK) +$(FRAMEWORK_DBG)_RDEPENDS += $(FRAMEWORK) +$(eval $(call add_derived_package,$(FRAMEWORK),$(FRAMEWORK_DBG))) + +# The .c, .cpp, .h & .hpp files under src/{$DBG_SRC_ARCHIVE list} +# are archived into one debug image to facilitate debugging.
+# +DBG_SRC_ARCHIVE += sonic-framework diff --git a/src/sonic-framework/.gitignore b/src/sonic-framework/.gitignore new file mode 100644 index 000000000000..b3c79bc051f0 --- /dev/null +++ b/src/sonic-framework/.gitignore @@ -0,0 +1,45 @@ +# Compiled Source # +################### +*.la +*.lo +*.o +*.pyc + +# Packaging Files # +################### +**/Makefile +Makefile.in +aclocal.m4 +autom4te.cache/ +config +config.h +config.h.in +config.h.in~ +config.log +config.status +configure +libtool +stamp-h1 + +debian/.debhelper/ +debian/autoreconf.after +debian/autoreconf.before +debian/debhelper-build-stamp +debian/files +debian/*.debhelper.log +debian/*.substvars + +debian/framework +debian/framework-dbg + +# Prebuilt Source +############### +rebootbackend/common +rebootbackend/system +rebootbackend/types + +# Executables # +############### +rebootbackend/rebootbackend +stateverificationd/stateverificationd + diff --git a/src/sonic-framework/Makefile.am b/src/sonic-framework/Makefile.am new file mode 100644 index 000000000000..a0c98bcc944d --- /dev/null +++ b/src/sonic-framework/Makefile.am @@ -0,0 +1,21 @@ +.PHONY: compile_protobufs +BUILT_SOURCES = rebootbackend_protobuf_compilation rebootbackend_dbus_compilation + +# compile_protobufs target needed by: +# gpins-cicd/scripts/sonic-framework/build_and_test_in_docker.sh +compile_protobufs: rebootbackend_protobuf_compilation rebootbackend_dbus_compilation + +rebootbackend_protobuf_compilation: + /usr/bin/protoc --cpp_out=rebootbackend --proto_path=third_party/gnoi \ + third_party/gnoi/types/types.proto \ + third_party/gnoi/common/common.proto \ + third_party/gnoi/system/system.proto + /usr/bin/protoc --experimental_allow_proto3_optional --cpp_out=. 
rebootbackend/container_stop.proto + +rebootbackend_dbus_compilation: + /usr/bin/dbusxx-xml2cpp rebootbackend/gnoi_reboot.xml \ + --proxy=rebootbackend/gnoi_reboot_dbus.h + /usr/bin/dbusxx-xml2cpp rebootbackend/gnoi_container_shutdown.xml \ + --proxy=rebootbackend/gnoi_container_shutdown_dbus.h + +SUBDIRS = stateverificationd rebootbackend diff --git a/src/sonic-framework/autogen.sh b/src/sonic-framework/autogen.sh new file mode 100755 index 000000000000..c282a898f1c1 --- /dev/null +++ b/src/sonic-framework/autogen.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +libtoolize --force --copy && +autoreconf --force --install -I m4 +rm -Rf autom4te.cache diff --git a/src/sonic-framework/configure.ac b/src/sonic-framework/configure.ac new file mode 100644 index 000000000000..fc156c76aff8 --- /dev/null +++ b/src/sonic-framework/configure.ac @@ -0,0 +1,85 @@ +AC_INIT([sonic-swss],[1.0]) +AC_CONFIG_SRCDIR([]) +AC_CONFIG_AUX_DIR(config) +AM_CONFIG_HEADER(config.h) +AM_INIT_AUTOMAKE([foreign]) +AC_LANG_C +AC_LANG([C++]) +AC_PROG_CC +AC_PROG_CXX +AC_PROG_LIBTOOL +AC_HEADER_STDC + +AC_CHECK_LIB([hiredis], [redisConnect],, + AC_MSG_ERROR([libhiredis is not installed.])) + +PKG_CHECK_MODULES([JANSSON], [jansson]) + +AC_ARG_ENABLE(debug, +[ --enable-debug Compile with debugging flags], +[case "${enableval}" in + yes) debug=true ;; + no) debug=false ;; + *) AC_MSG_ERROR(bad value ${enableval} for --enable-debug) ;; +esac],[debug=false]) +AM_CONDITIONAL(DEBUG, test x$debug = xtrue) + +AC_ARG_WITH(extra-inc, +[ --with-extra-inc=DIR + prefix where extra includes are installed], +[AC_SUBST(CPPFLAGS, "$CPPFLAGS -I${withval} -I${withval}/swss")]) + +AC_ARG_WITH(extra-lib, +[ --with-extra-lib=DIR + prefix where extra libraries are installed], +[AC_SUBST(LDFLAGS, "$LDFLAGS -L${withval}")]) + +CFLAGS_COMMON="-std=c++14 -Wall -fPIC -Wno-write-strings -I/usr/include/swss" + +CFLAGS_COMMON+=" -Werror" +CFLAGS_COMMON+=" -Wno-reorder" +CFLAGS_COMMON+=" -Wcast-align" +CFLAGS_COMMON+=" -Wcast-qual" +#TODO 
(b/314850353): Re-enable conversion errors with updated protoc compiler. +#CFLAGS_COMMON+=" -Wconversion" +CFLAGS_COMMON+=" -Wdisabled-optimization" +CFLAGS_COMMON+=" -Wextra" +CFLAGS_COMMON+=" -Wfloat-equal" +CFLAGS_COMMON+=" -Wformat=2" +CFLAGS_COMMON+=" -Wformat-nonliteral" +CFLAGS_COMMON+=" -Wformat-security" +CFLAGS_COMMON+=" -Wformat-y2k" +CFLAGS_COMMON+=" -Wimport" +CFLAGS_COMMON+=" -Winit-self" +CFLAGS_COMMON+=" -Winvalid-pch" +CFLAGS_COMMON+=" -Wlong-long" +CFLAGS_COMMON+=" -Wmissing-field-initializers" +CFLAGS_COMMON+=" -Wmissing-format-attribute" +CFLAGS_COMMON+=" -Wno-aggregate-return" +CFLAGS_COMMON+=" -Wno-padded" +CFLAGS_COMMON+=" -Wno-switch-enum" +CFLAGS_COMMON+=" -Wno-unused-parameter" +CFLAGS_COMMON+=" -Wpacked" +CFLAGS_COMMON+=" -Wpointer-arith" +CFLAGS_COMMON+=" -Wredundant-decls" +CFLAGS_COMMON+=" -Wstack-protector" +CFLAGS_COMMON+=" -Wstrict-aliasing=3" +CFLAGS_COMMON+=" -Wswitch" +CFLAGS_COMMON+=" -Wswitch-default" +CFLAGS_COMMON+=" -Wunreachable-code" +CFLAGS_COMMON+=" -Wunused" +CFLAGS_COMMON+=" -Wvariadic-macros" +CFLAGS_COMMON+=" -Wno-switch-default" +CFLAGS_COMMON+=" -Wno-long-long" +CFLAGS_COMMON+=" -Wno-redundant-decls" + +AC_SUBST(CFLAGS_COMMON) + +AC_CONFIG_FILES([ + Makefile + stateverificationd/Makefile + rebootbackend/Makefile + tests/Makefile +]) + +AC_OUTPUT diff --git a/src/sonic-framework/debian/changelog b/src/sonic-framework/debian/changelog new file mode 100644 index 000000000000..04ec2aecab85 --- /dev/null +++ b/src/sonic-framework/debian/changelog @@ -0,0 +1,6 @@ +sonic (1.0.0) stable; urgency=medium + + * Initial release. 
+ + -- Runming Wu Wed, 20 Sep 2023 12:00:00 -0800 + diff --git a/src/sonic-framework/debian/compat b/src/sonic-framework/debian/compat new file mode 100644 index 000000000000..f599e28b8ab0 --- /dev/null +++ b/src/sonic-framework/debian/compat @@ -0,0 +1 @@ +10 diff --git a/src/sonic-framework/debian/control b/src/sonic-framework/debian/control new file mode 100644 index 000000000000..d1c19f4ad296 --- /dev/null +++ b/src/sonic-framework/debian/control @@ -0,0 +1,18 @@ +Source: sonic +Maintainer: Runming Wu +Section: net +Priority: optional +Build-Depends: dh-exec (>=0.3), debhelper (>= 9), autotools-dev +Standards-Version: 1.0.0 + +Package: framework +Architecture: any +Depends: ${shlibs:Depends} +Description: This package contains framework service for GPINs project. + +Package: framework-dbg +Architecture: any +Section: debug +Priority: extra +Depends: framework (=${binary:Version}) +Description: debugging symbols for framework diff --git a/src/sonic-framework/debian/rules b/src/sonic-framework/debian/rules new file mode 100755 index 000000000000..a1a2957b9fec --- /dev/null +++ b/src/sonic-framework/debian/rules @@ -0,0 +1,38 @@ +#!/usr/bin/make -f +# See debhelper(7) (uncomment to enable) +# output every command that modifies files on the build system. 
+#export DH_VERBOSE = 1 + +# see EXAMPLES in dpkg-buildflags(1) and read /usr/share/dpkg/* +DPKG_EXPORT_BUILDFLAGS = 1 +include /usr/share/dpkg/default.mk + +# see FEATURE AREAS in dpkg-buildflags(1) +#export DEB_BUILD_MAINT_OPTIONS = hardening=+all + +# see ENVIRONMENT in dpkg-buildflags(1) +# package maintainers to append CFLAGS +#export DEB_CFLAGS_MAINT_APPEND = -Wall -pedantic +# package maintainers to append LDFLAGS +#export DEB_LDFLAGS_MAINT_APPEND = -Wl,--as-needed + + +# main packaging script based on dh7 syntax +%: + dh $@ --with autotools-dev + +# dh_make generated override targets +# This is example for Cmake (See https://bugs.debian.org/641051 ) +#override_dh_auto_configure: +# dh_auto_configure -- \ +# -DCMAKE_LIBRARY_PATH=$(DEB_HOST_MULTIARCH) + +override_dh_auto_configure: + dh_auto_configure -- $(configure_opts) + +override_dh_auto_install: + dh_auto_install --destdir=debian/framework + +override_dh_strip: + dh_strip -pframework --dbg-package=framework-dbg + diff --git a/src/sonic-framework/gnoi b/src/sonic-framework/gnoi new file mode 160000 index 000000000000..122dbac99072 --- /dev/null +++ b/src/sonic-framework/gnoi @@ -0,0 +1 @@ +Subproject commit 122dbac9907279f9a28f2d32066a6d3b0659be82 diff --git a/src/sonic-framework/rebootbackend/Makefile.am b/src/sonic-framework/rebootbackend/Makefile.am new file mode 100644 index 000000000000..afd3de9d0b74 --- /dev/null +++ b/src/sonic-framework/rebootbackend/Makefile.am @@ -0,0 +1,19 @@ +INCLUDES = -I $(top_srcdir) -I/usr/include/dbus-c++-1/ -I $(top_srcdir)/system + +bin_PROGRAMS = rebootbackend + +if DEBUG +DBGFLAGS = -ggdb -DDEBUG +else +DBGFLAGS = -g +endif + +rebootbackend_SOURCES = rebootbackend.cpp rebootbe.cpp interfaces.cpp \ + reboot_thread.cpp init_thread.cpp redis_utils.cpp \ + reboot_common.cpp telemetry_helper.cpp \ + system/system.pb.cc types/types.pb.cc \ + common/common.pb.cc container_stop.pb.cc + +rebootbackend_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_ASAN) 
+rebootbackend_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_ASAN) +rebootbackend_LDADD = $(LDFLAGS_ASAN) -lswsscommon -ldbus-c++-1 -lpthread -lprotobuf diff --git a/src/sonic-framework/rebootbackend/container_stop.proto b/src/sonic-framework/rebootbackend/container_stop.proto new file mode 100644 index 000000000000..a748c29638ea --- /dev/null +++ b/src/sonic-framework/rebootbackend/container_stop.proto @@ -0,0 +1,30 @@ +syntax = "proto3"; + +message StopContainersRequest { + optional string request_id = 1; + repeated string container_names = 2; +} + +enum ShutdownStatus { + NOT_STARTED = 0; // The thread has not been started + WAITING_FOR_NICE_SHUTDOWN = 1; // Nice ("kill") shutdown has been initiated + WAITING_FOR_FORCE_SHUTDOWN = + 2; // Force shutdown ("kill -9") has been initiated + DONE = 3; // Shutdown is complete + ERROR = 4; // An error was encountered +} + +message ContainerStatus { + optional ShutdownStatus status = 1; + optional string error_message = 2; +} + +message StopContainersStatusRequest { + optional string request_id = 1; +} + +message StopContainersResponse { + optional ShutdownStatus status = 1; + optional string error_message = 2; + map<string, ContainerStatus> container_status = 3; // Per-container status; key is presumably the container name (confirm) +} diff --git a/src/sonic-framework/rebootbackend/gnoi_container_shutdown.xml b/src/sonic-framework/rebootbackend/gnoi_container_shutdown.xml new file mode 100644 index 000000000000..fb134889d0a8 --- /dev/null +++ b/src/sonic-framework/rebootbackend/gnoi_container_shutdown.xml @@ -0,0 +1,31 @@ + + + + + + + + + + + + + + + + + + + + + diff --git a/src/sonic-framework/rebootbackend/gnoi_reboot.xml b/src/sonic-framework/rebootbackend/gnoi_reboot.xml new file mode 100644 index 000000000000..1f785a3745cd --- /dev/null +++ b/src/sonic-framework/rebootbackend/gnoi_reboot.xml @@ -0,0 +1,30 @@ + + + + + + + + + + + + + + + + + + + + + diff --git a/src/sonic-framework/rebootbackend/init_thread.cpp b/src/sonic-framework/rebootbackend/init_thread.cpp new file mode 100644 index
000000000000..4112a8916969
--- /dev/null
+++ b/src/sonic-framework/rebootbackend/init_thread.cpp
@@ -0,0 +1,553 @@
+#include "init_thread.h"
+
+#include <errno.h>
+#include <string.h>
+
+#include <ctime>
+#include <functional>
+#include <mutex>
+#include <string>
+#include <system_error>
+#include <thread>
+
+#include "component_state_helper.h"
+#include "dbconnector.h"
+#include "logger.h"
+#include "notificationproducer.h"
+#include "reboot_interfaces.h"
+#include "rebootbe.h"
+#include "redis_utils.h"
+#include "redisselect.h"
+#include "select.h"
+#include "selectableevent.h"
+#include "selectabletimer.h"
+#include "stateverification.h"
+#include "status_code_util.h"
+#include "subscriberstatetable.h"
+#include "warm_restart.h"
+
+namespace rebootbackend {
+
+using WarmStartState = ::swss::WarmStart::WarmStartState;
+using WarmBootStage = ::swss::WarmStart::WarmBootStage;
+
+// Member initializers are listed in declaration order (see init_thread.h) so
+// the list matches the order in which the members are actually constructed.
+InitThread::InitThread(CriticalStateInterface &critical_interface,
+                       TelemetryInterface &telemetry_interface,
+                       swss::SelectableEvent &m_finished,
+                       swss::SelectableEvent &m_stack_unfrozen)
+    : m_finished(m_finished),
+      m_stack_unfrozen(m_stack_unfrozen),
+      m_db("STATE_DB", 0),
+      m_critical_interface(critical_interface),
+      m_telemetry(telemetry_interface) {}
+
+// Launches the init thread; on failure runs the final failure actions and
+// marks the status inactive.
+swss::StatusCode InitThread::Start() {
+  swss::StatusCode result = internal_start();
+  if (result != swss::StatusCode::SWSS_RC_SUCCESS) {
+    do_final_failed_actions();
+    m_status.set_inactive();
+  }
+  return result;
+}
+
+// Records a DETECTED_CRITICAL_STATE error if the system is critical; otherwise
+// marks the status as started and spawns init_thread() on m_thread.
+swss::StatusCode InitThread::internal_start() {
+  SWSS_LOG_ENTER();
+
+  if (m_critical_interface.is_system_critical()) {
+    const std::string error_string =
+        "InitThread: in critical state, not starting stack.";
+    SWSS_LOG_ERROR("%s", error_string.c_str());
+    m_status.set_start_status();
+    m_status.set_error(
+        InitThreadStatus::ErrorCondition::DETECTED_CRITICAL_STATE,
+        error_string);
+    return swss::StatusCode::SWSS_RC_FAILED_PRECONDITION;
+  }
+
+  m_status.set_start_status();
+
+  try {
+    m_thread = std::thread(&InitThread::init_thread, this);
+  } catch (const std::system_error &e) {
+    std::string error_string = "Exception launching init thread: ";
+    error_string += e.what();
+    SWSS_LOG_ERROR("%s", error_string.c_str());
+
+    m_status.set_error(InitThreadStatus::ErrorCondition::INTERNAL_ERROR,
+                       error_string);
+
+    return swss::StatusCode::SWSS_RC_INTERNAL;
+  }
+  return swss::StatusCode::SWSS_RC_SUCCESS;
+}
+
+// Thread body: registration -> reconciliation -> optional state verification
+// and unfreeze. Every exit path signals m_finished.
+void InitThread::init_thread(void) {
+  SWSS_LOG_ENTER();
+
+  // Check if stop was requested before m_stop was constructed. If m_stop has
+  // been signaled already, this will be caught in later Select's.
+  if (sigterm_requested) {
+    const std::string error_string = "sigterm_requested was raised, exiting";
+    SWSS_LOG_ERROR("%s", error_string.c_str());
+    m_status.set_error(InitThreadStatus::ErrorCondition::INTERNAL_ERROR,
+                       error_string);
+    do_final_failed_actions();
+    m_finished.notify();
+    return;
+  }
+
+  swss::SelectableTimer registration_reconciliation_timer(
+      timespec{.tv_sec = m_reconciliation_timeout, .tv_nsec = 0});
+  registration_reconciliation_timer.start();
+
+  m_status.set_detailed_thread_status(
+      InitThreadStatus::ThreadStatus::WAITING_FOR_REGISTRATION);
+  swss::StatusCode result =
+      handle_registration_step(registration_reconciliation_timer);
+  if (result != swss::StatusCode::SWSS_RC_SUCCESS) {
+    m_status.set_error(InitThreadStatus::ErrorCondition::REGISTRATION_FAILED,
+                       "Registration failed with error");
+    do_final_failed_actions();
+    m_finished.notify();
+    return;
+  }
+
+  m_status.set_detailed_thread_status(
+      InitThreadStatus::ThreadStatus::WAITING_FOR_RECONCILIATION);
+  // Reconciliation start time is recorded by the platform layer when the
+  // database is started.
+
+  result = handle_reconciliation_step(registration_reconciliation_timer);
+  if (result == swss::StatusCode::SWSS_RC_SUCCESS) {
+    m_telemetry.record_stage_end(WarmBootStage::STAGE_RECONCILIATION,
+                                 /*success=*/true);
+  } else {
+    m_status.set_error(InitThreadStatus::ErrorCondition::RECONCILIATION_FAILED,
+                       "Reconciliation failed with error");
+    m_telemetry.record_stage_end(WarmBootStage::STAGE_RECONCILIATION,
+                                 /*success=*/false);
+    do_final_failed_actions();
+    m_finished.notify();
+    return;
+  }
+
+  registration_reconciliation_timer.stop();
+
+  bool state_verification_enabled =
+      swss::WarmStart::isStateVerificationBootupEnabled();
+  if (state_verification_enabled) {
+    m_status.set_detailed_thread_status(
+        InitThreadStatus::ThreadStatus::WAITING_FOR_STATE_VERIFICATION);
+    result = handle_state_verification_step();
+    if (result != swss::StatusCode::SWSS_RC_SUCCESS) {
+      m_status.set_error(
+          InitThreadStatus::ErrorCondition::STATE_VERIFICATION_FAILED,
+          "State verification failed with error");
+      do_final_failed_actions();
+      m_finished.notify();
+      return;
+    }
+
+    m_telemetry.record_stage_start(WarmBootStage::STAGE_UNFREEZE);
+    send_nsf_manager_notification(
+        m_db, swss::WarmStart::WarmBootNotification::kUnfreeze);
+    m_stack_unfrozen.notify();
+
+    m_status.set_detailed_thread_status(
+        InitThreadStatus::ThreadStatus::WAITING_FOR_UNFREEZE);
+
+    result = handle_unfreeze_step();
+    if (result == swss::StatusCode::SWSS_RC_SUCCESS) {
+      m_telemetry.record_stage_end(WarmBootStage::STAGE_UNFREEZE,
+                                   /*success=*/true);
+    } else {
+      m_status.set_error(InitThreadStatus::ErrorCondition::UNFREEZE_FAILED,
+                         "Unfreeze failed with error");
+      m_telemetry.record_stage_end(WarmBootStage::STAGE_UNFREEZE,
+                                   /*success=*/false);
+      do_final_failed_actions();
+      m_finished.notify();
+      return;
+    }
+  } else {
+    SWSS_LOG_NOTICE("Skipping state verification and unfreeze polling");
+  }
+
+  m_telemetry.record_overall_end(/*success=*/true);
+  // We've completed warm restart: clear the flag
+  set_warm_restart_enable(m_db, false);
+  m_status.set_success();
+  // Notify calling thread that init thread has exited.
+  // Calling thread MUST call Join() to join and set thread status to inactive.
+  m_finished.notify();
+  SWSS_LOG_NOTICE(
+      "InitThread done post-boot steps. System unblocked for future warmboots");
+}
+
+// Signals m_stop, which select_loop() treats as a request to abort.
+void InitThread::Stop(void) {
+  SWSS_LOG_ENTER();
+  m_stop.notify();
+}
+
+// Joins m_thread and marks the status inactive. Returns false if the thread is
+// not joinable or join() throws.
+bool InitThread::Join(void) {
+  SWSS_LOG_ENTER();
+
+  if (!m_thread.joinable()) {
+    SWSS_LOG_ERROR("InitThread::Join called, but not joinable");
+    return false;
+  }
+
+  bool ret = true;
+  try {
+    m_thread.join();
+    m_status.set_inactive();
+  } catch (const std::system_error &e) {
+    SWSS_LOG_ERROR("Exception calling join: %s", e.what());
+    ret = false;
+  }
+  return ret;
+}
+
+InitThreadStatus::DetailedStatus InitThread::GetDetailedStatus() {
+  return m_status.get_detailed_thread_status();
+}
+
+gnoi::system::RebootStatusResponse InitThread::GetResponse() {
+  return m_status.get_response();
+}
+
+// Maps a Registration result onto the select-loop status.
+InitThread::SelectStatus InitThread::ToSelectStatus(
+    Registration::Response result) {
+  switch (result.status) {
+    case Registration::Status::COMPLETED: {
+      return SelectStatus::SUCCESS;
+    }
+    case Registration::Status::FAILURE: {
+      return SelectStatus::FAILURE;
+    }
+    case Registration::Status::IN_PROCESS: {
+      return SelectStatus::KEEP_WAITING;
+    }
+  }
+  return SelectStatus::FAILURE;
+}
+
+// Maps an InitRegistration status onto the select-loop status.
+InitThread::SelectStatus InitThread::ToSelectStatus(
+    InitRegistration::Status status) {
+  switch (status) {
+    case InitRegistration::Status::COMPLETED: {
+      return SelectStatus::SUCCESS;
+    }
+    case InitRegistration::Status::IN_PROGRESS: {
+      return SelectStatus::KEEP_WAITING;
+    }
+  }
+  return SelectStatus::FAILURE;
+}
+
+// Only SUCCESS maps to SWSS_RC_SUCCESS; everything else is SWSS_RC_INTERNAL.
+swss::StatusCode InitThread::ToStatusCode(SelectStatus select_status) {
+  switch (select_status) {
+    case SelectStatus::SUCCESS: {
+      return swss::StatusCode::SWSS_RC_SUCCESS;
+    }
+    case SelectStatus::FAILURE: {
+      return swss::StatusCode::SWSS_RC_INTERNAL;
+    }
+    case SelectStatus::KEEP_WAITING: {
+      return swss::StatusCode::SWSS_RC_INTERNAL;
+    }
+  }
+  return swss::StatusCode::SWSS_RC_INTERNAL;
+}
+
+// Raises a minor alarm for unfreeze failures, or critical state otherwise (if
+// not already critical), then clears the warm-restart flag and records failure.
+void InitThread::do_final_failed_actions() {
+  SWSS_LOG_ENTER();
+  InitThreadStatus::DetailedStatus detailed_status =
+      m_status.get_detailed_thread_status();
+  if (detailed_status.detailed_thread_status ==
+          InitThreadStatus::ThreadStatus::ERROR &&
+      detailed_status.detailed_thread_error_condition ==
+          InitThreadStatus::ErrorCondition::UNFREEZE_FAILED) {
+    SWSS_LOG_NOTICE(
+        "Error occurred after sending unfreeze, raising minor alarm");
+    m_critical_interface.report_minor_alarm(
+        "Encountered error during unfreeze");
+  } else if (!m_critical_interface.is_system_critical()) {
+    SWSS_LOG_NOTICE(
+        "Error occured and system is not already critical, raising critical "
+        "state");
+    m_critical_interface.report_critical_state(
+        "Encountered error with InitThread in state: " +
+        std::to_string(detailed_status.detailed_thread_error_condition));
+  }
+  set_warm_restart_enable(m_db, false);
+  m_telemetry.record_overall_end(/*success=*/false);
+}
+
+// Waits (via select_loop) for apps to re-register after the warm boot.
+swss::StatusCode InitThread::handle_registration_step(
+    swss::SelectableTimer &timer_select) {
+  SWSS_LOG_ENTER();
+  SWSS_LOG_NOTICE("Starting InitThread Registration step");
+
+  // TODO(b/322034421): Improve critical state detection.
+  if (m_critical_interface.is_system_critical()) {
+    SWSS_LOG_ERROR("InitThread: in critical state, not unfreezing stack.");
+    return swss::StatusCode::SWSS_RC_FAILED_PRECONDITION;
+  }
+
+  swss::SubscriberStateTable table_sub(
+      &m_db, STATE_WARM_RESTART_REGISTRATION_TABLE_NAME);
+
+  InitRegistration init_registration;
+  init_registration.fetch_init_app_info();
+  SWSS_LOG_NOTICE("Waiting for apps to reregister: %s",
+                  init_registration.join_pending_apps().c_str());
+  auto initial_check_lambda = [&]() {
+    return InitThread::ToSelectStatus(
+        init_registration.check_reregistration_status());
+  };
+  auto handle_table_event_lambda =
+      [&](const swss::KeyOpFieldsValuesTuple &kco) {
+        return InitThread::ToSelectStatus(
+            init_registration.handle_registration_event(kco));
+      };
+
+  swss::StatusCode result = select_loop(
+      timer_select, table_sub, initial_check_lambda, handle_table_event_lambda);
+  if (result == swss::StatusCode::SWSS_RC_SUCCESS) {
+    SWSS_LOG_NOTICE("InitThread Registration step reported success");
+  } else {
+    SWSS_LOG_ERROR(
+        "Error while waiting for re-registration: missing apps: %s Error "
+        "text: %s",
+        init_registration.join_pending_apps().c_str(),
+        swss::statusCodeToStr(result).c_str());
+  }
+  return result;
+}
+
+// Waits for all apps to reach the reconciliation stage.
+swss::StatusCode InitThread::handle_reconciliation_step(
+    swss::SelectableTimer &timer_select) {
+  SWSS_LOG_ENTER();
+  SWSS_LOG_NOTICE("Starting InitThread Reconciliation step");
+
+  // TODO(b/322034421): Improve critical state detection.
+  if (m_critical_interface.is_system_critical()) {
+    SWSS_LOG_ERROR("InitThread: in critical state, not unfreezing stack.");
+    return swss::StatusCode::SWSS_RC_FAILED_PRECONDITION;
+  }
+
+  // Precise error logged within.
+  swss::StatusCode result =
+      wait_for_state(WarmBootStage::STAGE_RECONCILIATION, timer_select);
+  SWSS_LOG_NOTICE("InitThread Reconciliation step finished with status: %s",
+                  swss::statusCodeToStr(result).c_str());
+  return result;
+}
+
+// Starts its own m_unfreeze_timeout timer and waits for the unfreeze stage.
+swss::StatusCode InitThread::handle_unfreeze_step() {
+  SWSS_LOG_ENTER();
+  SWSS_LOG_NOTICE("Starting InitThread Unfreeze step");
+
+  // TODO(b/322034421): Improve critical state detection.
+  if (m_critical_interface.is_system_critical()) {
+    SWSS_LOG_ERROR(
+        "InitThread: in critical state, not monitoring for stack unfreeze");
+    return swss::StatusCode::SWSS_RC_FAILED_PRECONDITION;
+  }
+
+  swss::SelectableTimer timer_select(
+      timespec{.tv_sec = m_unfreeze_timeout, .tv_nsec = 0});
+  timer_select.start();
+
+  // Precise error logged within.
+  swss::StatusCode result =
+      wait_for_state(WarmBootStage::STAGE_UNFREEZE, timer_select);
+  SWSS_LOG_NOTICE("InitThread Unfreeze step finished with status: %s",
+                  swss::statusCodeToStr(result).c_str());
+  return result;
+}
+
+// Subscribes to the warm-restart state table and waits for nsf_stage.
+swss::StatusCode InitThread::wait_for_state(
+    WarmBootStage nsf_stage, swss::SelectableTimer &timer_select) {
+  swss::SubscriberStateTable table_sub(&m_db, STATE_WARM_RESTART_TABLE_NAME);
+
+  const std::string stage_name =
+      Registration::get_warm_boot_stage_name(nsf_stage);
+  Registration registration;
+  registration.fetch_registration_info();
+  SWSS_LOG_NOTICE("Waiting for apps: %s to reach state: %s",
+                  registration.join_pending_apps(nsf_stage).c_str(),
+                  stage_name.c_str());
+  auto initial_check_lambda = [&]() {
+    return InitThread::ToSelectStatus(registration.check_stage(nsf_stage));
+  };
+  auto handle_table_event_lambda =
+      [&](const swss::KeyOpFieldsValuesTuple &kco) {
+        return InitThread::ToSelectStatus(
+            registration.handle_state_event(nsf_stage, kco));
+      };
+
+  swss::StatusCode result = select_loop(
+      timer_select, table_sub, initial_check_lambda, handle_table_event_lambda);
+  if (result == swss::StatusCode::SWSS_RC_SUCCESS) {
+    SWSS_LOG_NOTICE("All apps reached state: %s", stage_name.c_str());
+  } else {
+    SWSS_LOG_ERROR(
+        "Error while waiting for state: %s missing apps: %s Error "
+        "text: %s",
+        stage_name.c_str(), registration.join_pending_apps(nsf_stage).c_str(),
+        swss::statusCodeToStr(result).c_str());
+  }
+  return result;
+}
+
+// Triggers state verification and waits for the matching ALL_COMPONENT result.
+// A timeout or SV_NOT_RUN is treated as success with a minor alarm.
+swss::StatusCode InitThread::handle_state_verification_step() {
+  SWSS_LOG_ENTER();
+  SWSS_LOG_NOTICE("Starting InitThread State Verfication step");
+
+  // TODO(b/322034421): Improve critical state detection.
+  if (m_critical_interface.is_system_critical()) {
+    SWSS_LOG_ERROR("InitThread: in critical state, not unfreezing stack.");
+    return swss::StatusCode::SWSS_RC_FAILED_PRECONDITION;
+  }
+
+  swss::SubscriberStateTable table_sub(&m_db, STATE_VERIFICATION_RESP_TABLE);
+  swss::SelectableTimer timer_select(
+      timespec{.tv_sec = m_state_verification_timeout, .tv_nsec = 0});
+  timer_select.start();
+
+  std::string timestamp =
+      send_state_verification_notification(m_db, /*freeze=*/true);
+  SWSS_LOG_NOTICE("State verification triggered, waiting for result");
+
+  auto initial_check_lambda = [&]() -> SelectStatus {
+    return SelectStatus::KEEP_WAITING;
+  };
+  auto handle_verification_event_lambda =
+      [&](const swss::KeyOpFieldsValuesTuple &kco) -> SelectStatus {
+    if (kfvKey(kco) != ALL_COMPONENT) {
+      return SelectStatus::KEEP_WAITING;
+    }
+
+    std::string status;
+    std::string ts;
+    for (const auto &fv : kfvFieldsValues(kco)) {
+      if (fvField(fv) == TIMESTAMP_FIELD) {
+        ts = fvValue(fv);
+      } else if (fvField(fv) == STATUS_FIELD) {
+        status = fvValue(fv);
+      }
+    }
+
+    if (ts != timestamp) {
+      return SelectStatus::KEEP_WAITING;
+    }
+
+    if (status == SV_PASS) {
+      SWSS_LOG_NOTICE("State verification reported success");
+      return SelectStatus::SUCCESS;
+    } else if (status == SV_NOT_RUN) {
+      const std::string message =
+          "State verification did not run. Treating as success for NSF";
+      SWSS_LOG_WARN("%s", message.c_str());
+      m_critical_interface.report_minor_alarm(message);
+      return SelectStatus::SUCCESS;
+    } else if (status == SV_FAIL) {
+      SWSS_LOG_ERROR("State verification reported failure");
+      return SelectStatus::FAILURE;
+    }
+    return SelectStatus::KEEP_WAITING;
+  };
+
+  swss::StatusCode result =
+      select_loop(timer_select, table_sub, initial_check_lambda,
+                  handle_verification_event_lambda);
+  if (result == swss::StatusCode::SWSS_RC_DEADLINE_EXCEEDED) {
+    SWSS_LOG_WARN("State verification timed out, raising minor alarm: %s",
+                  swss::statusCodeToStr(result).c_str());
+    m_critical_interface.report_minor_alarm(
+        "State verification timed out. Treating as success for NSF");
+    return swss::StatusCode::SWSS_RC_SUCCESS;
+  } else if (result != swss::StatusCode::SWSS_RC_SUCCESS) {
+    SWSS_LOG_ERROR("Error while waiting for state verification: %s",
+                   swss::statusCodeToStr(result).c_str());
+  }
+  return result;
+}
+
+// Shared wait loop over m_stop, table_sub and timer_select.
+swss::StatusCode InitThread::select_loop(
+    swss::Selectable &timer_select, swss::SubscriberStateTable &table_sub,
+    const std::function<SelectStatus()> &initial_check,
+    const std::function<SelectStatus(const swss::KeyOpFieldsValuesTuple &)> &table_event_callback) {
+  SWSS_LOG_ENTER();
+
+  swss::Select s;
+  s.addSelectable(&m_stop);
+  s.addSelectable(&table_sub);
+  s.addSelectable(&timer_select);
+
+  SelectStatus select_status = initial_check();
+  if (select_status != SelectStatus::KEEP_WAITING) {
+    return ToStatusCode(select_status);
+  }
+
+  while (true) {
+    swss::Selectable *sel;
+    int select_result;
+    select_result = s.select(&sel);
+
+    if (select_result == swss::Select::ERROR) {
+      // NOTE(review): a persistent select error would make this loop spin.
+      SWSS_LOG_ERROR("Error in select loop: %s", strerror(errno));
+      continue;
+    } else if (select_result != swss::Select::OBJECT) {
+      SWSS_LOG_NOTICE("Got unexpected non-object from select: %d",
+                      select_result);
+      continue;
+    }
+
+    if (sel == &m_stop) {
+      SWSS_LOG_ERROR("m_stop rx'd (SIGTERM) in select loop");
+      return swss::StatusCode::SWSS_RC_INTERNAL;
+    } else if (sel == &timer_select) {
+      SWSS_LOG_ERROR("Timed out in select loop");
+      return swss::StatusCode::SWSS_RC_DEADLINE_EXCEEDED;
+    } else if (sel == &table_sub) {
+      swss::KeyOpFieldsValuesTuple kco;
+      table_sub.pop(kco);
+      select_status = table_event_callback(kco);
+      if (select_status != SelectStatus::KEEP_WAITING) {
+        return ToStatusCode(select_status);
+      }
+    } else {
+      SWSS_LOG_ERROR("Got unexpected object event in select loop");
+    }
+  }
+}
+
+}  // namespace rebootbackend
diff --git a/src/sonic-framework/rebootbackend/init_thread.h b/src/sonic-framework/rebootbackend/init_thread.h
new file mode 100644
index 000000000000..e79201dfa320
--- /dev/null
+++ b/src/sonic-framework/rebootbackend/init_thread.h
@@ -0,0 +1,260 @@
+#pragma once
+
+#include <functional>
+#include <mutex>
+#include <string>
+#include <thread>
+
+#include "dbconnector.h"
+#include "notificationproducer.h"
+#include "reboot_common.h"
+#include "reboot_interfaces.h"
+#include "redis_utils.h"
+#include "select.h"
+#include "selectableevent.h"
+#include "selectabletimer.h"
+#include "subscriberstatetable.h"
+#include "system/system.pb.h"
+#include "warm_restart.h"
+
+namespace rebootbackend {
+
+// Holds a thread safe representation of the InitThread internal state.
+// Thread-safe: the expectation is one thread will write and multiple threads
+// will read.
+class InitThreadStatus {
+ public:
+  enum ThreadStatus {
+    NOT_STARTED = 0,
+    PENDING = 1,
+    WAITING_FOR_REGISTRATION = 2,
+    WAITING_FOR_RECONCILIATION = 3,
+    WAITING_FOR_STATE_VERIFICATION = 4,
+    WAITING_FOR_UNFREEZE = 5,
+    FINALIZE = 6,
+    DONE = 7,
+    ERROR = 8,
+  };
+
+  enum ErrorCondition {
+    NO_ERROR = 0,
+    UNKNOWN = 1,
+    INTERNAL_ERROR = 2,
+    REGISTRATION_FAILED = 3,
+    RECONCILIATION_FAILED = 4,
+    STATE_VERIFICATION_FAILED = 5,
+    UNFREEZE_FAILED = 6,
+    DETECTED_CRITICAL_STATE = 7,
+  };
+
+  struct DetailedStatus {
+    gnoi::system::RebootStatusResponse thread_state;
+    InitThreadStatus::ThreadStatus detailed_thread_status =
+        InitThreadStatus::ThreadStatus::NOT_STARTED;
+    InitThreadStatus::ErrorCondition detailed_thread_error_condition =
+        InitThreadStatus::ErrorCondition::NO_ERROR;
+  };
+
+  InitThreadStatus() {
+    m_status.detailed_thread_status = ThreadStatus::NOT_STARTED;
+    m_status.detailed_thread_error_condition = ErrorCondition::NO_ERROR;
+
+    m_status.thread_state.set_active(false);
+    m_status.thread_state.set_method(gnoi::system::RebootMethod::COLD);
+    m_status.thread_state.mutable_status()->set_status(
+        gnoi::system::RebootStatus_Status::RebootStatus_Status_STATUS_SUCCESS);
+    m_status.thread_state.mutable_status()->set_message("");
+  }
+
+  void set_start_status() {
+    const std::lock_guard<std::mutex> lock(m_mutex);
+    m_status.detailed_thread_status = ThreadStatus::PENDING;
+    m_status.detailed_thread_error_condition = ErrorCondition::NO_ERROR;
+
+    m_status.thread_state.set_active(true);
+    m_status.thread_state.set_method(gnoi::system::RebootMethod::NSF);
+    m_status.thread_state.mutable_status()->set_status(
+        gnoi::system::RebootStatus_Status::RebootStatus_Status_STATUS_UNKNOWN);
+    m_status.thread_state.mutable_status()->set_message("");
+  }
+
+  bool get_active(void) {
+    const std::lock_guard<std::mutex> lock(m_mutex);
+    return m_status.thread_state.active();
+  }
+
+  void set_detailed_thread_status(ThreadStatus new_status) {
+    const std::lock_guard<std::mutex> lock(m_mutex);
+    if (m_status.thread_state.active()) {
+      m_status.detailed_thread_status = new_status;
+    }
+  }
+
+  void set_success() {
+    const std::lock_guard<std::mutex> lock(m_mutex);
+    if (m_status.thread_state.active()) {
+      m_status.detailed_thread_status = ThreadStatus::DONE;
+      m_status.thread_state.mutable_status()->set_status(
+          gnoi::system::RebootStatus_Status::
+              RebootStatus_Status_STATUS_SUCCESS);
+    }
+  }
+
+  void set_error(ErrorCondition error_condition,
+                 const std::string &error_message) {
+    const std::lock_guard<std::mutex> lock(m_mutex);
+    if (m_status.thread_state.active()) {
+      m_status.detailed_thread_status = ThreadStatus::ERROR;
+      m_status.detailed_thread_error_condition = error_condition;
+      m_status.thread_state.mutable_status()->set_status(
+          gnoi::system::RebootStatus_Status::
+              RebootStatus_Status_STATUS_FAILURE);
+      m_status.thread_state.mutable_status()->set_message(error_message);
+    }
+  }
+
+  void set_inactive() {
+    const std::lock_guard<std::mutex> lock(m_mutex);
+    m_status.thread_state.set_active(false);
+  }
+
+  DetailedStatus get_detailed_thread_status() {
+    const std::lock_guard<std::mutex> lock(m_mutex);
+    return m_status;
+  }
+
+  gnoi::system::RebootStatusResponse get_response() {
+    const std::lock_guard<std::mutex> lock(m_mutex);
+    return m_status.thread_state;
+  }
+
+ private:
+  std::mutex m_mutex;
+  DetailedStatus m_status;
+};
+
+class InitThread {
+ public:
+  InitThread(CriticalStateInterface &critical_interface,
+             TelemetryInterface &telemetry_interface,
+             swss::SelectableEvent &m_finished,
+             swss::SelectableEvent &m_stack_unfrozen);
+  virtual ~InitThread() = default;
+
+  // Starts running the init thread tasks. Returns SWSS_RC_SUCCESS if the
+  // internal thread was started successfully, and an error otherwise. If an
+  // error is returned, this call may safely be retried, but will likely
+  // continue to return errors.
+  virtual swss::StatusCode Start();
+
+  // Request InitThread stop/exit. Notifies the internal thread that it must
+  // exit. Only used when platform is shutting down all containers/processes.
+  virtual void Stop(void);
+
+  // Must be called by launching task after notification is sent to m_finished.
+  virtual bool Join(void);
+
+  // Return Status of last reboot attempt.
+  virtual gnoi::system::RebootStatusResponse GetResponse();
+
+  // Returns a representation of the detailed thread status.
+  virtual InitThreadStatus::DetailedStatus GetDetailedStatus();
+
+ private:
+  enum class SelectStatus { SUCCESS, FAILURE, KEEP_WAITING };
+
+  static SelectStatus ToSelectStatus(Registration::Response result);
+  static SelectStatus ToSelectStatus(InitRegistration::Status status);
+  static swss::StatusCode ToStatusCode(SelectStatus select_status);
+
+  // Internal implementation of Start(). Returns SWSS_RC_SUCCESS if the init
+  // thread was started successfully, and an error otherwise. If an error is
+  // returned, final cleanup actions must be taken.
+  swss::StatusCode internal_start();
+
+  // Function containing the core logic. Invoked as a separate thread. Runs
+  // through the steps required for reconciliation monitoring.
+  void init_thread(void);
+
+  // Perform the final required actions before exiting:
+  // 1. Clear the NSF flag.
+  // 2. Record final stats (if able)
+  void do_final_failed_actions();
+
+  // Helper function for the registration step. Waits for all applications that
+  // had registered warmboot intent before the warmboot to re-register warmboot
+  // intent after the warmboot. The provided timer must already have been
+  // started prior to this function call.
+  // Returns SWSS_RC_SUCCESS if re-registration is successful, and an error
+  // otherwise.
+  swss::StatusCode handle_registration_step(
+      swss::SelectableTimer &timer_select);
+
+  // Helper function for the reconciliation step. Wait for all apps to reach
+  // the reconciled state. The provided timer must already have been started
+  // prior to this function call.
+  // Returns SWSS_RC_SUCCESS if waiting for reconciliation is successful, and an
+  // error otherwise.
+  swss::StatusCode handle_reconciliation_step(
+      swss::SelectableTimer &timer_select);
+
+  // Helper function for the unfreeze step. Wait for all apps to reach
+  // the completed state.
+  // Returns SWSS_RC_SUCCESS if waiting for unfreeze is successful, and an
+  // error otherwise.
+  swss::StatusCode handle_unfreeze_step();
+
+  // Wait until all apps reach a target state, or until the provided timer
+  // expires. The timer must already have been started prior to this function
+  // call.
+  swss::StatusCode wait_for_state(swss::WarmStart::WarmBootStage nsf_stage,
+                                  swss::SelectableTimer &timer_select);
+
+  // Helper function for the state verification step. Trigger state verification
+  // then wait for all apps to report successful state verification.
+  // Returns SWSS_RC_SUCCESS if waiting for state verification is successful,
+  // and an error otherwise.
+  swss::StatusCode handle_state_verification_step();
+
+  // Helper function for select loops. Sets up events for m_stop, timer_select,
+  // and table_sub, checks if already done with initial_check, then enters
+  // an event handling loop, forwarding events to table_event_callback until
+  // the callback indicates operation is complete.
+  swss::StatusCode select_loop(
+      swss::Selectable &timer_select, swss::SubscriberStateTable &table_sub,
+      const std::function<SelectStatus()> &initial_check,
+      const std::function<SelectStatus(const swss::KeyOpFieldsValuesTuple &)>
+          &table_event_callback);
+
+  // Thread and internal status.
+  std::thread m_thread;
+  InitThreadStatus m_status;
+
+  // Event handles used to notify the caller when InitThread is finished, when
+  // the stack is unfrozen, and pass Stop events through to the dependent
+  // thread to stop operation prematurely.
+  swss::SelectableEvent &m_finished;
+  swss::SelectableEvent &m_stack_unfrozen;
+  swss::SelectableEvent m_stop;
+
+  // Interfaces to external systems: the Redis database and critical state
+  // system.
+  swss::DBConnector m_db;
+  CriticalStateInterface &m_critical_interface;
+  TelemetryInterface &m_telemetry;
+
+  // Various operation timeouts in seconds: allow unit test to shorten.
+  static constexpr uint32_t kReconciliationTimeout = 300;
+  uint32_t m_reconciliation_timeout = kReconciliationTimeout;
+
+  // Various operation timeouts in seconds: allow unit test to shorten.
+  static constexpr uint32_t kStateVerificationTimeout = 180;
+  uint32_t m_state_verification_timeout = kStateVerificationTimeout;
+
+  static constexpr uint32_t kUnfreezeTimeout = 60;
+  uint32_t m_unfreeze_timeout = kUnfreezeTimeout;
+
+  friend class InitThreadTest;
+};
+
+}  // namespace rebootbackend
diff --git a/src/sonic-framework/rebootbackend/interfaces.cpp b/src/sonic-framework/rebootbackend/interfaces.cpp
new file mode 100644
index 000000000000..f8f95476367f
--- /dev/null
+++ b/src/sonic-framework/rebootbackend/interfaces.cpp
@@ -0,0 +1,140 @@
+#include "interfaces.h"
+
+#include <dbus-c++/dbus.h>  // DBus
+
+#include "component_state_helper.h"
+#include "reboot_interfaces.h"
+
+constexpr char kRebootBusName[] = "org.SONiC.HostService.gnoi_reboot";
+constexpr char kRebootPath[] = "/org/SONiC/HostService/gnoi_reboot";
+
+constexpr char kContainerShutdownBusName[] = "org.SONiC.HostService.gnoi_container_shutdown";
+constexpr char kContainerShutdownPath[] = "/org/SONiC/HostService/gnoi_container_shutdown";
+
+// Returns the process-wide system bus connection, creating it on first use.
+// Initialisation happens inside a function-local static initializer, which the
+// language guarantees runs exactly once even with concurrent callers; the
+// previous unsynchronised null-pointer check did not.
+DBus::Connection& HostServiceDbus::getConnection(void) {
+  static DBus::Connection conn = [] {
+    // Install the default dispatcher before the connection is created.
+    static DBus::BusDispatcher dispatcher;
+    DBus::default_dispatcher = &dispatcher;
+    return DBus::Connection::SystemBus();
+  }();
+  return conn;
+}
+
+// Forwards the JSON reboot request to the gnoi_reboot host service. Returns
+// DBUS_SUCCESS with an empty payload when the service reports status 0.
+DbusInterface::DbusResponse HostServiceDbus::Reboot(
+    const std::string& json_reboot_request) {
+  int32_t status = 1;  // Treated as failure unless the host service says 0.
+  std::string ret_string;
+  std::vector<std::string> options;
+  options.push_back(json_reboot_request);
+
+  GnoiDbusReboot reboot_client(getConnection(), kRebootBusName, kRebootPath);
+  try {
+    reboot_client.issue_reboot(options, status, ret_string);
+  } catch (const DBus::Error& ex) {
+    return DbusResponse{
+        DbusStatus::DBUS_FAIL,
+        "HostServiceDbus::Reboot: failed to call reboot host service"};
+  }
+
+  // gnoi_reboot.py returns 0 for success, 1 for failure
+  if (status == 0) {
+    // Successful reboot response is an empty string.
+    return DbusResponse{DbusStatus::DBUS_SUCCESS, ""};
+  }
+  return DbusResponse{DbusStatus::DBUS_FAIL, ret_string};
+}
+
+// Queries the gnoi_reboot host service; json_status_request is not forwarded.
+DbusInterface::DbusResponse HostServiceDbus::RebootStatus(
+    const std::string& json_status_request) {
+  int32_t status = 1;  // Treated as failure unless the host service says 0.
+  std::string ret_string;
+
+  GnoiDbusReboot reboot_client(getConnection(), kRebootBusName, kRebootPath);
+  try {
+    reboot_client.get_reboot_status(status, ret_string);
+  } catch (const DBus::Error& ex) {
+    return DbusResponse{
+        DbusStatus::DBUS_FAIL,
+        "HostServiceDbus::RebootStatus: failed to call reboot status "
+        "host service"};
+  }
+
+  // gnoi_reboot.py returns 0 for success, 1 for failure
+  if (status == 0) {
+    return DbusResponse{DbusStatus::DBUS_SUCCESS, ret_string};
+  }
+  return DbusResponse{DbusStatus::DBUS_FAIL, ret_string};
+}
+
+// Forwards the JSON stop request to the gnoi_container_shutdown host service.
+DbusInterface::DbusResponse HostServiceDbus::StopContainers(
+    const std::string& json_stop_request) {
+  int32_t status = 1;  // Treated as failure unless the host service says 0.
+  std::string ret_string;
+  std::vector<std::string> options;
+  options.push_back(json_stop_request);
+
+  GnoiDbusContainerShutdown container_client(getConnection(), kContainerShutdownBusName,
+                                             kContainerShutdownPath);
+  try {
+    container_client.stop_container(options, status, ret_string);
+  } catch (const DBus::Error& ex) {
+    return DbusResponse{DbusStatus::DBUS_FAIL,
+                        "HostServiceDbus::StopContainer: failed to call stop "
+                        "container host service"};
+  }
+
+  // gnoi_container_shutdown.py returns 0 for success, 1 for failure
+  if (status == 0) {
+    return DbusResponse{DbusStatus::DBUS_SUCCESS, ""};
+  }
+  return DbusResponse{DbusStatus::DBUS_FAIL, ret_string};
+}
+
+// Fetches stop-container progress from the gnoi_container_shutdown service.
+DbusInterface::DbusResponse HostServiceDbus::StopContainerStatus(
+    const std::string& json_status_request) {
+  int32_t status = 1;  // Treated as failure unless the host service says 0.
+  std::string ret_string;
+  std::vector<std::string> options;
+  options.push_back(json_status_request);
+
+  GnoiDbusContainerShutdown container_client(getConnection(), kContainerShutdownBusName,
+                                             kContainerShutdownPath);
+  try {
+    container_client.stop_container_status(options, status, ret_string);
+  } catch (const DBus::Error& ex) {
+    return DbusResponse{DbusStatus::DBUS_FAIL,
+                        "HostServiceDbus::StopContainerStatus: failed to call "
+                        "stop container status host service"};
+  }
+
+  // gnoi_container_shutdown.py returns 0 for success, 1 for failure
+  if (status == 0) {
+    return DbusResponse{DbusStatus::DBUS_SUCCESS, ret_string};
+  }
+  return DbusResponse{DbusStatus::DBUS_FAIL, ret_string};
+}
+
+// CriticalState forwards to the swss::StateHelperManager singletons.
+bool CriticalState::is_system_critical() {
+  return swss::StateHelperManager::SystemSingleton().IsSystemCritical();
+}
+
+void CriticalState::report_minor_alarm(const std::string& reason) {
+  swss::StateHelperManager::ComponentSingleton(swss::SystemComponent::kHost)
+      .ReportComponentState(swss::ComponentState::kMinor, reason);
+}
+
+void CriticalState::report_critical_state(const std::string& reason) {
+  swss::StateHelperManager::ComponentSingleton(swss::SystemComponent::kHost)
+      .ReportComponentState(swss::ComponentState::kError, reason);
+}
diff --git a/src/sonic-framework/rebootbackend/interfaces.h b/src/sonic-framework/rebootbackend/interfaces.h
new file mode 100644
index 000000000000..e93037929224
--- /dev/null
+++ b/src/sonic-framework/rebootbackend/interfaces.h
@@ -0,0 +1,55 @@
+#pragma once
+#include <dbus-c++/dbus.h>
+
+#include <string>
+
+#include "gnoi_container_shutdown_dbus.h"  // auto generated
+                                           // gnoi_container_shutdown_proxy
+#include "gnoi_reboot_dbus.h"              // auto generated gnoi_reboot_proxy
+#include "reboot_interfaces.h"
+
+// D-Bus proxy for the gnoi_container_shutdown host service object.
+class GnoiDbusContainerShutdown
+    : public org::SONiC::HostService::gnoi_container_shutdown_proxy,
+      public DBus::IntrospectableProxy,
+      public DBus::ObjectProxy {
+ public:
+  GnoiDbusContainerShutdown(DBus::Connection& connection,
+                            const char* dbus_bus_name_p,
+                            const char* dbus_obj_name_p)
+      : DBus::ObjectProxy(connection, dbus_obj_name_p, dbus_bus_name_p) {}
+};
+
+// D-Bus proxy for the gnoi_reboot host service object.
+class GnoiDbusReboot : public org::SONiC::HostService::gnoi_reboot_proxy,
+                       public DBus::IntrospectableProxy,
+                       public DBus::ObjectProxy {
+ public:
+  GnoiDbusReboot(DBus::Connection& connection, const char* dbus_bus_name_p,
+                 const char* dbus_obj_name_p)
+      : DBus::ObjectProxy(connection, dbus_obj_name_p, dbus_bus_name_p) {}
+};
+
+// DbusInterface implementation backed by the host service D-Bus proxies.
+class HostServiceDbus : public DbusInterface {
+ public:
+  DbusInterface::DbusResponse Reboot(
+      const std::string& json_reboot_request) override;
+  DbusInterface::DbusResponse RebootStatus(
+      const std::string& json_status_request) override;
+  DbusInterface::DbusResponse StopContainers(
+      const std::string& json_stop_request) override;
+  DbusInterface::DbusResponse StopContainerStatus(
+      const std::string& json_status_request) override;
+
+ private:
+  static DBus::Connection& getConnection(void);
+};
+
+// CriticalStateInterface implementation (defined in interfaces.cpp).
+class CriticalState : public CriticalStateInterface {
+ public:
+  bool is_system_critical() override;
+  void report_minor_alarm(const std::string& reason) override;
+  void report_critical_state(const std::string& reason) override;
+};
diff --git a/src/sonic-framework/rebootbackend/reboot_common.cpp b/src/sonic-framework/rebootbackend/reboot_common.cpp
new file mode 100644
index 000000000000..e79021302bc7
--- /dev/null
+++ b/src/sonic-framework/rebootbackend/reboot_common.cpp
@@ -0,0 +1,15 @@
+#include "reboot_common.h"
+
+#include <time.h>
+
+namespace rebootbackend {
+
+// Splits a millisecond count into whole seconds plus a nanosecond remainder.
+timespec milliseconds_to_timespec(uint64_t time_ms) {
+  timespec l_timespec;
+  l_timespec.tv_sec = time_ms / ONE_THOUSAND;
+  l_timespec.tv_nsec = (time_ms % ONE_THOUSAND) * ONE_THOUSAND * ONE_THOUSAND;
+  return l_timespec;
+}
+
+}  // namespace rebootbackend
diff --git a/src/sonic-framework/rebootbackend/reboot_common.h b/src/sonic-framework/rebootbackend/reboot_common.h
new file mode
100644 index 000000000000..9a6795376534 --- /dev/null +++ b/src/sonic-framework/rebootbackend/reboot_common.h @@ -0,0 +1,20 @@ +#pragma once + +#include + +#include "status_code_util.h" + +namespace rebootbackend { + +#define ONE_THOUSAND (1000) + +extern bool sigterm_requested; + +extern timespec milliseconds_to_timespec(uint64_t time_ms); + +struct NotificationResponse { + swss::StatusCode status; + std::string json_string; +}; + +} // namespace rebootbackend diff --git a/src/sonic-framework/rebootbackend/reboot_interfaces.h b/src/sonic-framework/rebootbackend/reboot_interfaces.h new file mode 100644 index 000000000000..07c5441780ed --- /dev/null +++ b/src/sonic-framework/rebootbackend/reboot_interfaces.h @@ -0,0 +1,88 @@ +#pragma once + +#include + +#include "warm_restart.h" + +class DbusInterface { + public: + enum class DbusStatus { + DBUS_SUCCESS, + DBUS_FAIL, + }; + + struct DbusResponse { + DbusStatus status; + std::string json_string; + }; + + virtual ~DbusInterface() = default; + virtual DbusResponse Reboot(const std::string& json_reboot_request) = 0; + virtual DbusResponse RebootStatus(const std::string& json_status_request) = 0; + virtual DbusResponse StopContainers(const std::string& json_stop_request) = 0; + virtual DbusResponse StopContainerStatus( + const std::string& json_status_request) = 0; +}; + +class CriticalStateInterface { + public: + virtual ~CriticalStateInterface() = default; + virtual bool is_system_critical() = 0; + virtual void report_minor_alarm(const std::string& reason) = 0; + virtual void report_critical_state(const std::string& reason) = 0; +}; + +namespace rebootbackend { + +// Class to help interfacing with the telemetry tables in Redis. Not thread +// safe. +class TelemetryInterface { + public: + virtual ~TelemetryInterface() = default; + + // Records the warmboot start time. Also increments the warmboot counter. 
+ // Writes entries: + // WARM_RESTART_PERFORMANCE_TABLE|system, fields: status, start-timestamp + // WARM_RESTART_PERFORMANCE_HISTORY||system + // fields: status, start-timestamp + // to the state DB. + // Must be called before snapshot_stage_start or the warmboot counter may be + // corrupted. + virtual void record_overall_start() = 0; + + // Records the warmboot end time, when all operations in the NSF boot have + // been completed. Writes to both the performance and history tables. + // Writes entries: + // WARM_RESTART_PERFORMANCE_TABLE|system, fields: status, finish-timestamp + // WARM_RESTART_PERFORMANCE_HISTORY||system + // fields: status, finish-timestamp + // to the state DB. + virtual void record_overall_end(bool success) = 0; + + // Records the start time of a particular warmboot stage. + // Writes entries: + // WARM_RESTART_PERFORMANCE_TABLE|, fields: status, start-timestamp + // WARM_RESTART_PERFORMANCE_TABLE|| + // fields: status, start-timestamp + // WARM_RESTART_PERFORMANCE_HISTORY||, + // fields: status, start-timestamp + // WARM_RESTART_PERFORMANCE_HISTORY||| + // fields: status, start-timestamp + // to the state DB. + virtual void record_stage_start(swss::WarmStart::WarmBootStage nsf_stage) = 0; + // Records the end time of a particular warmboot stage. + // Writes entries: + // WARM_RESTART_PERFORMANCE_TABLE| + // fields: status, finish-timestamp + // WARM_RESTART_PERFORMANCE_TABLE|| + // fields: status, finish-timestamp + // WARM_RESTART_PERFORMANCE_HISTORY||, + // fields: status, finish-timestamp + // WARM_RESTART_PERFORMANCE_HISTORY||| + // fields: status, finish-timestamp + // to the state DB. 
+ virtual void record_stage_end(swss::WarmStart::WarmBootStage nsf_stage, + bool success) = 0; +}; + +} // namespace rebootbackend \ No newline at end of file diff --git a/src/sonic-framework/rebootbackend/reboot_thread.cpp b/src/sonic-framework/rebootbackend/reboot_thread.cpp new file mode 100644 index 000000000000..59a6b02878ba --- /dev/null +++ b/src/sonic-framework/rebootbackend/reboot_thread.cpp @@ -0,0 +1,961 @@ +#include "reboot_thread.h" + +#include + +#include + +#include "component_state_helper.h" +#include "container_stop.pb.h" +#include "dbconnector.h" +#include "logger.h" +#include "notificationproducer.h" +#include "reboot_common.h" +#include "reboot_interfaces.h" +#include "redis_utils.h" +#include "select.h" +#include "selectableevent.h" +#include "selectabletimer.h" +#include "stateverification.h" +#include "subscriberstatetable.h" +#include "system/system.pb.h" +#include "timestamp.h" +#include "warm_restart.h" + +namespace rebootbackend { + +using namespace ::gnoi::system; +using steady_clock = std::chrono::steady_clock; +using Progress = ::rebootbackend::RebootThread::Progress; +using WarmBootStage = ::swss::WarmStart::WarmBootStage; +using WarmStartState = ::swss::WarmStart::WarmStartState; +namespace gpu = ::google::protobuf::util; + +RebootThread::RebootThread(DbusInterface &dbus_interface, + CriticalStateInterface &critical_interface, + TelemetryInterface &telemetry_interface, + swss::SelectableEvent &m_finished) + : m_db("STATE_DB", 0), + m_finished(m_finished), + m_dbus_interface(dbus_interface), + m_critical_interface(critical_interface), + m_telemetry(telemetry_interface), + m_registration() {} + +void RebootThread::Stop(void) { + SWSS_LOG_ENTER(); + // Notify reboot thread that stop has been requested. 
+ m_stop.notify(); +} + +bool RebootThread::Join(void) { + SWSS_LOG_ENTER(); + + if (!m_thread.joinable()) { + SWSS_LOG_ERROR("RebootThread::Join called, but not joinable"); + return false; + } + + try { + m_thread.join(); + m_status.set_inactive(); + return true; + } catch (const std::system_error &e) { + SWSS_LOG_ERROR("Exception calling join: %s", e.what()); + return false; + } +} + +RebootStatusResponse RebootThread::GetResponse(void) { + return m_status.get_response(); +} + +bool RebootThread::HasRun() { return m_status.get_reboot_count() > 0; } + +Progress RebootThread::platform_reboot_select(swss::Select &s, + swss::SelectableTimer &l_timer) { + SWSS_LOG_ENTER(); + + while (true) { + swss::Selectable *sel; + int select_ret; + select_ret = s.select(&sel); + + if (select_ret == swss::Select::ERROR) { + SWSS_LOG_NOTICE("Error: %s!", strerror(errno)); + } else if (select_ret == swss::Select::OBJECT) { + if (sel == &m_stop) { + // SIGTERM expected after platform reboot request + SWSS_LOG_NOTICE( + "m_stop rx'd (SIGTERM) while waiting for platform reboot"); + return Progress::EXIT_EARLY; + } else if (sel == &l_timer) { + return Progress::PROCEED; + } + } + } +} + +Progress RebootThread::wait_for_platform_reboot(swss::Select &s) { + SWSS_LOG_ENTER(); + + if (check_and_log_critical_state( + "system entered critical state after platfrom reboot request")) { + return Progress::EXIT_EARLY; + } + + // Sleep for a long time: 260 seconds. + // During this time platform should kill us as part of reboot. 
+ swss::SelectableTimer l_timer( + timespec{.tv_sec = m_reboot_timeout, .tv_nsec = 0}); + s.addSelectable(&l_timer); + + l_timer.start(); + + Progress progress = platform_reboot_select(s, l_timer); + + l_timer.stop(); + s.removeSelectable(&l_timer); + return progress; +} + +void RebootThread::do_reboot(void) { + SWSS_LOG_ENTER(); + + swss::Select s; + s.addSelectable(&m_stop); + + // Check if stop was requested before Selectable was setup + if (sigterm_requested) { + SWSS_LOG_ERROR("sigterm_requested was raised, exiting"); + return; + } + + if (m_request.method() == RebootMethod::COLD) { + do_cold_reboot(s); + } else if (m_request.method() == RebootMethod::NSF) { + do_nsf_reboot(s); + } else { + // This shouldn't be possible. Reference check_start_preconditions() + SWSS_LOG_ERROR("Received unrecognized method type = %s", + RebootMethod_Name(m_request.method()).c_str()); + } +} + +RebootThread::Progress RebootThread::send_dbus_reboot_request() { + SWSS_LOG_ENTER(); + SWSS_LOG_NOTICE("Sending reboot request to platform"); + + std::string json_string; + gpu::Status status = gpu::MessageToJsonString(m_request, &json_string); + if (!status.ok()) { + std::string error_string = "unable to convert reboot protobuf to json: " + + status.message().as_string(); + log_error_and_set_non_retry_failure(error_string); + return Progress::EXIT_EARLY; + } + + // Send the reboot request to the reboot host service via dbus. 
+ DbusInterface::DbusResponse dbus_response = + m_dbus_interface.Reboot(json_string); + + if (dbus_response.status == DbusInterface::DbusStatus::DBUS_FAIL) { + log_error_and_set_non_retry_failure(dbus_response.json_string); + return Progress::EXIT_EARLY; + } + return Progress::PROCEED; +} + +RebootThread::Progress RebootThread::nsf_reboot_helper(swss::Select &s) { + SWSS_LOG_ENTER(); + + SWSS_LOG_NOTICE("starting state verification: if enabled"); + if (Progress::EXIT_EARLY == perform_state_verification(s)) { + SWSS_LOG_ERROR("state verification returned EXIT_EARLY"); + return Progress::EXIT_EARLY; + } + + SWSS_LOG_NOTICE("starting freeze and container stop"); + m_telemetry.record_stage_start(WarmBootStage::STAGE_FREEZE); + if (Progress::EXIT_EARLY == perform_freeze_w_container_stop(s)) { + SWSS_LOG_ERROR("perform_freeze_w_container_stop: returned EXIT_EARLY"); + m_telemetry.record_stage_end(WarmBootStage::STAGE_FREEZE, + /*success=*/false); + return Progress::EXIT_EARLY; + } + m_telemetry.record_stage_end(WarmBootStage::STAGE_FREEZE, /*success=*/true); + + SWSS_LOG_NOTICE("starting checkpoint"); + m_telemetry.record_stage_start(WarmBootStage::STAGE_CHECKPOINT); + if (Progress::EXIT_EARLY == perform_checkpoint(s)) { + SWSS_LOG_ERROR("perform_checkpoint: returned EXIT_EARLY"); + m_telemetry.record_stage_end(WarmBootStage::STAGE_CHECKPOINT, + /*success=*/false); + return Progress::EXIT_EARLY; + } + m_telemetry.record_stage_end(WarmBootStage::STAGE_CHECKPOINT, + /*success=*/true); + + SWSS_LOG_NOTICE( + "done all pre-reboot steps, sending reboot request to platform"); + if (send_dbus_reboot_request() == Progress::EXIT_EARLY) { + return Progress::EXIT_EARLY; + } + + // Wait for platform to reboot. If we return, reboot failed. + // Logging, error status and monitoring for critical state are handled within. 
+ return wait_for_platform_reboot(s); +} + +void RebootThread::do_nsf_reboot(swss::Select &s) { + SWSS_LOG_ENTER(); + SWSS_LOG_NOTICE("Starting NSF reboot"); + + // Delete the warm restart state and timestamp for all application + init_warm_reboot_states(m_db); + + m_registration.fetch_registration_info(); + + // Save the list of registered apps. + m_registration.clear_all_init_apps(); + m_registration.save_all_init_apps(); + + m_telemetry.record_overall_start(); + + // Enable system warm restart: WARM_RESTART_ENABLE_TABLE|system + set_warm_restart_enable(m_db, true); + + RebootThread::Progress progress = nsf_reboot_helper(s); + if (progress == Progress::PROCEED) { + // We shouldn't be here. No errors (EXIT_EARLY) occurred during + // reboot process under our control. Platform reboot should've killed us. + log_error_and_set_non_retry_failure("platform failed to reboot"); + + // Set critical state + m_critical_interface.report_critical_state("platform failed to reboot"); + } + + // NSF has failed. Either an error (EXIT_EARLY from nsf_reboot_helper) + // or platform failed to kill us after waiting m_reboot_timeout. + // Clear warm restart flag, close out telemetry. + m_telemetry.record_overall_end(/*success=*/false); + set_warm_restart_enable(m_db, false); +} + +void RebootThread::do_cold_reboot(swss::Select &s) { + SWSS_LOG_ENTER(); + SWSS_LOG_NOTICE("Sending cold reboot request to platform"); + if (send_dbus_reboot_request() == Progress::EXIT_EARLY) { + return; + } + + // Wait for platform to reboot. If we return, reboot failed. + // Logging, error status and monitoring for critical state are handled within. + if (wait_for_platform_reboot(s) == Progress::EXIT_EARLY) { + return; + } + + // We shouldn't be here. Platform reboot should've killed us. 
+ log_error_and_set_non_retry_failure("platform failed to reboot"); + + // Set critical state + m_critical_interface.report_critical_state("platform failed to reboot"); + return; +} + +void RebootThread::reboot_thread(void) { + SWSS_LOG_ENTER(); + + do_reboot(); + + // Notify calling thread that reboot thread has exited. + // Calling thread will call Join(): join and set thread status to inactive. + m_finished.notify(); +} + +bool RebootThread::check_start_preconditions(const RebootRequest &request, + NotificationResponse &response) { + // We have to join a previous executing thread before restarting. + // Active is cleared in Join. + if (m_status.get_active()) { + response.json_string = "RebootThread: can't Start while active"; + response.status = swss::StatusCode::SWSS_RC_IN_USE; + } else if (request.method() != RebootMethod::COLD && + request.method() != RebootMethod::NSF) { + response.json_string = "RebootThread: Start rx'd unsupported method"; + response.status = swss::StatusCode::SWSS_RC_INVALID_PARAM; + } else if (request.delay() != 0) { + response.json_string = "RebootThread: delayed start not supported"; + response.status = swss::StatusCode::SWSS_RC_INVALID_PARAM; + } else if (request.method() == RebootMethod::NSF) { + if (m_status.get_last_reboot_status() == + RebootStatus_Status::RebootStatus_Status_STATUS_FAILURE) { + // If the last reboot failed with a non-retriable failure, don't retry. + // But, we will allow a cold boot to recover. 
+ response.json_string = + "RebootThread: last NSF failed with non-retriable failure"; + response.status = swss::StatusCode::SWSS_RC_FAILED_PRECONDITION; + } else if (m_critical_interface.is_system_critical()) { + response.json_string = "RebootThread: in critical state, NSF not allowed"; + response.status = swss::StatusCode::SWSS_RC_FAILED_PRECONDITION; + } + } + + if (response.status == swss::StatusCode::SWSS_RC_SUCCESS) { + return true; + } + + SWSS_LOG_ERROR("%s", response.json_string.c_str()); + + // Log the reboot request contents. + gpu::Status status; + std::string json_request; + status = gpu::MessageToJsonString(request, &json_request); + if (status.ok()) { + SWSS_LOG_ERROR("check_start_preconditions: RebootRequest = %s", + json_request.c_str()); + } else { + SWSS_LOG_ERROR( + "check_start_preconditions: error calling MessageToJsonString"); + } + return false; +} + +NotificationResponse RebootThread::Start(const RebootRequest &request) { + SWSS_LOG_ENTER(); + + NotificationResponse response = {.status = swss::StatusCode::SWSS_RC_SUCCESS, + .json_string = ""}; + + // Confirm we're not running, method is supported and we're not delayed. + if (!check_start_preconditions(request, response)) { + // Errors logged in check_start_preconditions. + return response; + } + + m_request = request; + + // From this point errors will be reported via RebootStatusRequest. + m_status.set_start_status(request.method(), request.message()); + + try { + m_thread = std::thread(&RebootThread::reboot_thread, this); + } catch (const std::system_error &e) { + std::string error_string = "Exception launching reboot thread: "; + error_string += e.what(); + log_error_and_set_failure_as_retriable(error_string); + + // Notify calling thread that thread has finished. + // Calling thread MUST call Join, which will join and clear active bit. 
+ m_finished.notify(); + } + return response; +} + +bool RebootThread::check_and_log_critical_state( + const std::string error_string) { + SWSS_LOG_ENTER(); + if (m_critical_interface.is_system_critical()) { + // Critical state isn't retriable. + log_error_and_set_non_retry_failure(error_string); + return true; + } + return false; +} + +void RebootThread::log_error_and_set_non_retry_failure( + const std::string error_string) { + SWSS_LOG_ENTER(); + SWSS_LOG_ERROR("%s", error_string.c_str()); + m_status.set_completed_status( + RebootStatus_Status::RebootStatus_Status_STATUS_FAILURE, error_string); +} + +void RebootThread::log_error_and_set_failure_as_retriable( + const std::string error_string) { + SWSS_LOG_ENTER(); + SWSS_LOG_ERROR("%s", error_string.c_str()); + m_status.set_completed_status( + RebootStatus_Status::RebootStatus_Status_STATUS_RETRIABLE_FAILURE, + error_string); +} + +RebootThread::Status RebootThread::handle_state_verification_event( + swss::SubscriberStateTable &sub, std::string &timestamp) { + swss::KeyOpFieldsValuesTuple kco; + sub.pop(kco); + + std::string key = kfvKey(kco); + + if (key != ALL_COMPONENT) { + // we only care about updates to the "all" key + return Status::KEEP_WAITING; + } + + std::vector<swss::FieldValueTuple> fvs = kfvFieldsValues(kco); + std::string status; + std::string ts; + + for (const auto &fv : fvs) { + if (fvField(fv) == TIMESTAMP_FIELD) { + ts = fvValue(fv); + } else if (fvField(fv) == STATUS_FIELD) { + status = fvValue(fv); + } + } + + if (ts != timestamp) { + // if this wasn't our state verification request + return Status::KEEP_WAITING; + } + + // We've received a valid state verification update + // key was ALL_COMPONENT and timestamp matched our + // last request. 
+ + if (status == SV_NOT_RUN) { + // restart state verification + timestamp = send_state_verification_notification(m_db, false); + return Status::KEEP_WAITING; + } + + if (status == SV_PASS) { + return Status::SUCCESS; + } else if (status == SV_FAIL) { + // Hard failure is not retriable: not_run as final status + // is retriable. + log_error_and_set_non_retry_failure( + "state verification failed during reboot"); + return Status::FAILURE; + } + + return Status::KEEP_WAITING; +} + +Progress RebootThread::state_verification_select( + swss::Select &s, swss::SelectableTimer &l_timer, + swss::SubscriberStateTable &sub, std::string &timestamp) { + SWSS_LOG_ENTER(); + + while (true) { + swss::Selectable *sel; + int select_ret; + select_ret = s.select(&sel); + + if (select_ret == swss::Select::ERROR) { + SWSS_LOG_NOTICE("Error: %s!", strerror(errno)); + continue; + } + if (select_ret != swss::Select::OBJECT) { + SWSS_LOG_NOTICE("select returned unexpedted non-OBJECT"); + continue; + } + + if (sel == &m_stop) { + SWSS_LOG_ERROR("m_stop (sigterm) rx'd during reboot state verification"); + return Progress::EXIT_EARLY; + } else if (sel == &l_timer) { + // Timeout during state verification is a retriable error. + log_error_and_set_failure_as_retriable( + "timeout occurred during reboot state verification: retriable error"); + return Progress::EXIT_EARLY; + } else if (sel == &sub) { + Status status = handle_state_verification_event(sub, timestamp); + + if (status == Status::SUCCESS) { + SWSS_LOG_NOTICE("state verification reported success"); + return Progress::PROCEED; + } else if (status == Status::FAILURE) { + // error is logged and error string set in + // handle_state_verification_event. 
+ return Progress::EXIT_EARLY; + } else { + continue; + } + } + } +} + +Progress RebootThread::perform_state_verification(swss::Select &s) { + if (check_and_log_critical_state( + "system entered critical state before reboot state verification")) { + return Progress::EXIT_EARLY; + } + + if (!swss::WarmStart::isStateVerificationShutdownEnabled()) { + // if state verification isn't enabled in CONFIG_DB: skip state verification + SWSS_LOG_NOTICE("State verification not enabled"); + return Progress::PROCEED; + } + + swss::SelectableTimer l_timer( + timespec{.tv_sec = m_state_verification_timeout, .tv_nsec = 0}); + s.addSelectable(&l_timer); + + swss::SubscriberStateTable sub(&m_db, STATE_VERIFICATION_RESP_TABLE); + s.addSelectable(&sub); + + l_timer.start(); + + // Send a non-frozen state verification request. + std::string timestamp = send_state_verification_notification(m_db, false); + SWSS_LOG_NOTICE("State verification triggered, waiting for result"); + + Progress progress = state_verification_select(s, l_timer, sub, timestamp); + + l_timer.stop(); + s.removeSelectable(&l_timer); + s.removeSelectable(&sub); + return progress; +} + +// +// Stop On Freeze Support +// + +// Perform quiescence and container stop in parallel. +// First we request container stop. +// Freeze is sent to all containers. +// We wait for containers to quiesce (or checkpoint). +// We wait for containers to stop. 
+Progress RebootThread::perform_freeze_w_container_stop(swss::Select &s) { + SWSS_LOG_ENTER(); + std::string request_id; + + SWSS_LOG_NOTICE("Requesting container stop on freeze"); + if (Progress::EXIT_EARLY == request_stop_on_freeze(request_id)) { + SWSS_LOG_ERROR("request_stop_on_freeze: returned EXIT_EARLY"); + return Progress::EXIT_EARLY; + } + SWSS_LOG_NOTICE("Stop on freeze request sent."); + + swss::SelectableTimer l_timer( + milliseconds_to_timespec(m_quiescence_timeout_ms)); + s.addSelectable(&l_timer); + l_timer.start(); + + Progress progress = + wait_for_container_stop_and_quiescence(s, l_timer, request_id); + + s.removeSelectable(&l_timer); + return progress; +} + +Progress RebootThread::wait_for_container_stop_and_quiescence( + swss::Select &s, swss::SelectableTimer &l_timer, + const std::string &request_id) { + SWSS_LOG_NOTICE("waiting for containers to stop"); + if (Progress::EXIT_EARLY == wait_for_container_stop(s, request_id, l_timer)) { + SWSS_LOG_ERROR("wait_for_container_stop: returned EXIT_EARLY"); + return Progress::EXIT_EARLY; + } + + SWSS_LOG_NOTICE("starting freeze quiescence"); + if (Progress::EXIT_EARLY == perform_freeze_quiescence(s, l_timer)) { + SWSS_LOG_ERROR( + "perform_freeze_quiescence: returned EXIT_EARLY. 
Outstanding apps: %s", + m_registration + .join_pending_apps(swss::WarmStart::WarmBootStage::STAGE_FREEZE) + .c_str()); + return Progress::EXIT_EARLY; + } + + return Progress::PROCEED; +} + +Progress RebootThread::build_stop_container_request(std::string &json_request, + std::string &request_id) { + SWSS_LOG_ENTER(); + + request_id = swss::getTimestamp(); + StopContainersRequest request; + request.set_request_id(request_id); + + // Get the list of apps that need to be stopped + Registration::RegistrationSet stop_set = + m_registration.get_stop_on_freeze_set(); + + for (const std::string &app : stop_set) { + request.add_container_names(app); + } + + gpu::Status status = gpu::MessageToJsonString(request, &json_request); + + if (!status.ok()) { + std::string error_string = + "unable to convert StopContainersRequest protobuf to json: " + + status.message().as_string(); + log_error_and_set_non_retry_failure(error_string); + return Progress::EXIT_EARLY; + } + return Progress::PROCEED; +} + +Progress RebootThread::request_stop_on_freeze(std::string &request_id) { + SWSS_LOG_ENTER(); + + // Get the list of apps that need to be stopped + Registration::RegistrationSet stop_set = + m_registration.get_stop_on_freeze_set(); + + if (stop_set.empty()) { + return Progress::PROCEED; + } + + std::string json_request; + if (build_stop_container_request(json_request, request_id) == + Progress::EXIT_EARLY) { + return Progress::EXIT_EARLY; + } + + // Send the stop containers request to the stop container host service via + // dbus. 
+ DbusInterface::DbusResponse dbus_response = + m_dbus_interface.StopContainers(json_request); + + if (dbus_response.status == DbusInterface::DbusStatus::DBUS_FAIL) { + log_error_and_set_non_retry_failure(dbus_response.json_string); + return Progress::EXIT_EARLY; + } + return Progress::PROCEED; +} + +RebootThread::Status RebootThread::check_container_stop( + const std::string &request_id) { + SWSS_LOG_ENTER(); + StopContainersStatusRequest request; + request.set_request_id(request_id); + + std::string json_request; + gpu::Status status = gpu::MessageToJsonString(request, &json_request); + + if (!status.ok()) { + SWSS_LOG_ERROR( + "unable to convert StopContainersStatusRequest protobuf to json: %s", + status.message().as_string().c_str()); + return Status::FAILURE; + } + + // Send the stop containers request to the stop container host service via + // dbus. + DbusInterface::DbusResponse dbus_response = + m_dbus_interface.StopContainerStatus(json_request); + + if (dbus_response.status == DbusInterface::DbusStatus::DBUS_FAIL) { + SWSS_LOG_ERROR("StopContainersStatus returned ERROR: %s", + dbus_response.json_string.c_str()); + return Status::FAILURE; + } + + StopContainersResponse response; + status = gpu::JsonStringToMessage(dbus_response.json_string, &response); + if (!status.ok()) { + SWSS_LOG_ERROR( + "unable to convert StopContainersStatus json |%s| to prototobuf: |%s|", + dbus_response.json_string.c_str(), + status.message().as_string().c_str()); + return Status::FAILURE; + } + + if (response.status() == ShutdownStatus::DONE) { + return Status::SUCCESS; + } else if (response.status() == ShutdownStatus::ERROR) { + SWSS_LOG_ERROR( + "Container stop service reported error shutting down containers: %s", + response.DebugString().c_str()); + } + return Status::KEEP_WAITING; +} + +RebootThread::Status RebootThread::precheck_wait_for_container_stop( + const std::string &request_id) { + // Get the list of apps that need to be stopped + Registration::RegistrationSet 
stop_set = + m_registration.get_stop_on_freeze_set(); + + if (stop_set.empty()) { + return Status::SUCCESS; + } + if (check_container_stop(request_id) == Status::SUCCESS) { + return Status::SUCCESS; + } + return Status::KEEP_WAITING; +} + +Progress RebootThread::wait_for_container_stop(swss::Select &s, + const std::string &request_id, + swss::SelectableTimer &l_timer) { + SWSS_LOG_ENTER(); + + // Have containers stopped? Are there no containers to stop? + if (Status::SUCCESS == precheck_wait_for_container_stop(request_id)) { + return Progress::PROCEED; + } + + if (check_and_log_critical_state("system entered critical state while " + "waiting for containers to stop")) { + return Progress::EXIT_EARLY; + } + + while (true) { + swss::Selectable *sel; + int select_ret; + select_ret = s.select(&sel, SELECT_TIMEOUT_500_MS); + + if (Status::SUCCESS == check_container_stop(request_id)) { + return Progress::PROCEED; + } + + if (select_ret == swss::Select::ERROR) { + SWSS_LOG_NOTICE("Error: %s!", strerror(errno)); + continue; + } + + if (select_ret == swss::Select::TIMEOUT) { + // Don't flood logs on timeout. 
+ continue; + } + + if (select_ret != swss::Select::OBJECT) { + SWSS_LOG_NOTICE("select returned unexpected non-OBJECT"); + continue; + } + + if (sel == &m_stop) { + SWSS_LOG_NOTICE( + "m_stop (sigterm) rx'd while waiting for containers to stop"); + return Progress::EXIT_EARLY; + } else if (sel == &l_timer) { + log_error_and_set_non_retry_failure( + "timeout occurred waiting for containers to stop"); + return Progress::EXIT_EARLY; + } + } +} + +// +// Freeze Quiescence Support +// +Progress RebootThread::perform_freeze_quiescence( + swss::Select &s, swss::SelectableTimer &l_timer) { + SWSS_LOG_ENTER(); + if (check_and_log_critical_state( + "system entered critical state before freezing")) { + return Progress::EXIT_EARLY; + } + + swss::SubscriberStateTable sub(&m_db, STATE_WARM_RESTART_TABLE_NAME); + s.addSelectable(&sub); + + send_nsf_manager_notification(m_db, + swss::WarmStart::WarmBootNotification::kFreeze); + SWSS_LOG_NOTICE( + "freeze signal sent, waiting for apps to reach frozen state: %s", + m_registration + .join_pending_apps(swss::WarmStart::WarmBootStage::STAGE_FREEZE) + .c_str()); + + Progress progress = freeze_quiescence_select(s, l_timer, sub); + + s.removeSelectable(&sub); + return progress; +} + +Progress RebootThread::freeze_quiescence_select( + swss::Select &s, swss::SelectableTimer &l_timer, + swss::SubscriberStateTable &sub) { + SWSS_LOG_ENTER(); + + steady_clock::time_point start_time; + bool quiesced = false; + + // Check the current status of all registered apps. 
+ Registration::Response response = m_registration.check_quiesced(); + + if (response.status == Registration::Status::FAILURE) { + log_error_and_set_non_retry_failure(response.error_string); + return Progress::EXIT_EARLY; + } + + if (response.status == Registration::Status::COMPLETED) { + // We're quiesced: set start time for 10 second quiescence hold timer + quiesced = true; + start_time = steady_clock::now(); + } + + while (true) { + swss::Selectable *sel; + int select_ret; + + // Set timeout to 250 milli-seconds. We'll wake up at least every + // quarter second and can check if quiescence hold time has passed. + select_ret = s.select(&sel, SELECT_TIMEOUT_250_MS); + + if (quiesced) { + if (steady_clock::now() - start_time > + std::chrono::milliseconds(m_quiescence_hold_time_ms)) { + // We've been quiesced for 10 seconds: we're ready to PROCEED. + return Progress::PROCEED; + } + } + + if (select_ret == swss::Select::TIMEOUT) { + // Don't flood logs on timeout. + continue; + } + + if (select_ret == swss::Select::ERROR) { + SWSS_LOG_NOTICE("Error: %s!", strerror(errno)); + continue; + } + if (select_ret != swss::Select::OBJECT) { + SWSS_LOG_NOTICE("select returned unexpected non-OBJECT"); + continue; + } + + if (sel == &m_stop) { + std::string error_string = + "m_stop (sigterm) rx'd during reboot " + + Registration::get_warm_boot_stage_name(WarmBootStage::STAGE_FREEZE); + SWSS_LOG_ERROR("%s\n", error_string.c_str()); + return Progress::EXIT_EARLY; + } else if (sel == &l_timer) { + // TODO: use getWarmBootStageFromState() or warmBootStateToStageMap() + // to get warm restart stage rather state + std::string error_string = + "timeout occurred during reboot stage " + + Registration::get_warm_boot_stage_name(WarmBootStage::STAGE_FREEZE); + log_error_and_set_non_retry_failure(error_string); + return Progress::EXIT_EARLY; + } else if (sel == &sub) { + swss::KeyOpFieldsValuesTuple kco; + sub.pop(kco); + + Registration::Response response = + 
m_registration.handle_state_event(WarmBootStage::STAGE_FREEZE, kco); + + if (response.status == Registration::Status::FAILURE) { + log_error_and_set_non_retry_failure(response.error_string); + return Progress::EXIT_EARLY; + } else if (response.status == Registration::Status::COMPLETED) { + // We're quiesced: set start time for 10 second quiescence hold timer + quiesced = true; + start_time = steady_clock::now(); + } else { + // Registration::Status::IN_PROCESS + quiesced = false; + } + } + } +} + +// +// Checkpoint support. +// +Progress RebootThread::perform_checkpoint(swss::Select &s) { + SWSS_LOG_ENTER(); + if (check_and_log_critical_state( + "system entered critical state before checkpointing")) { + return Progress::EXIT_EARLY; + } + + swss::SelectableTimer l_timer( + timespec{.tv_sec = m_checkpoint_timeout, .tv_nsec = 0}); + s.addSelectable(&l_timer); + + swss::SubscriberStateTable sub(&m_db, STATE_WARM_RESTART_TABLE_NAME); + s.addSelectable(&sub); + + l_timer.start(); + + send_nsf_manager_notification( + m_db, swss::WarmStart::WarmBootNotification::kCheckpoint); + SWSS_LOG_NOTICE( + "checkpoint signal sent, waiting for apps to reach checkpointed state: " + "%s", + m_registration + .join_pending_apps(swss::WarmStart::WarmBootStage::STAGE_CHECKPOINT) + .c_str()); + + Progress progress = checkpoint_select_stage_one(s, l_timer, sub); + + l_timer.stop(); + s.removeSelectable(&l_timer); + s.removeSelectable(&sub); + return progress; +} + +Progress RebootThread::checkpoint_select_stage_one( + swss::Select &s, swss::SelectableTimer &l_timer, + swss::SubscriberStateTable &sub) { + SWSS_LOG_ENTER(); + + // We're subscribed: so we wont miss any events. + // Check the current status of all registered apps. 
+ Registration::Response response = m_registration.check_checkpointed(); + if (response.status == Registration::Status::COMPLETED) { + return Progress::PROCEED; + } + if (response.status == Registration::Status::FAILURE) { + log_error_and_set_non_retry_failure( + "check_checkpointed returned error: " + response.error_string + + ". Outstanding apps: " + + m_registration.join_pending_apps( + swss::WarmStart::WarmBootStage::STAGE_CHECKPOINT)); + return Progress::EXIT_EARLY; + } + return checkpoint_stage_two(s, l_timer, sub); +} + +Progress RebootThread::checkpoint_stage_two(swss::Select &s, + swss::SelectableTimer &l_timer, + swss::SubscriberStateTable &sub) { + while (true) { + swss::Selectable *sel; + int select_ret; + select_ret = s.select(&sel); + + if (select_ret == swss::Select::ERROR) { + SWSS_LOG_NOTICE("Error: %s!", strerror(errno)); + continue; + } + if (select_ret != swss::Select::OBJECT) { + SWSS_LOG_NOTICE("select returned unexpected non-OBJECT"); + continue; + } + + if (sel == &m_stop) { + std::string error_string = "m_stop (sigterm) rx'd during reboot " + + Registration::get_warm_boot_stage_name( + WarmBootStage::STAGE_CHECKPOINT); + SWSS_LOG_ERROR("%s\n", error_string.c_str()); + return Progress::EXIT_EARLY; + } else if (sel == &l_timer) { + // TODO: use getWarmBootStageFromState() or warmBootStateToStageMap() + // to get warm restart stage rather state + std::string error_string = + "timeout occurred during reboot stage " + + Registration::get_warm_boot_stage_name( + WarmBootStage::STAGE_CHECKPOINT) + + ". 
Outstanding apps: " + + m_registration.join_pending_apps( + swss::WarmStart::WarmBootStage::STAGE_CHECKPOINT); + log_error_and_set_non_retry_failure(error_string); + return Progress::EXIT_EARLY; + } else if (sel == &sub) { + swss::KeyOpFieldsValuesTuple kco; + sub.pop(kco); + Registration::Response response = m_registration.handle_state_event( + WarmBootStage::STAGE_CHECKPOINT, kco); + if (response.status == Registration::Status::COMPLETED) { + return Progress::PROCEED; + } else if (response.status == Registration::Status::FAILURE) { + log_error_and_set_non_retry_failure(response.error_string); + return Progress::EXIT_EARLY; + } else { + continue; + } + } + } +} + +} // namespace rebootbackend diff --git a/src/sonic-framework/rebootbackend/reboot_thread.h b/src/sonic-framework/rebootbackend/reboot_thread.h new file mode 100644 index 000000000000..d1558faa52b4 --- /dev/null +++ b/src/sonic-framework/rebootbackend/reboot_thread.h @@ -0,0 +1,418 @@ +#pragma once + +#include <chrono> +#include <mutex> +#include <thread> + +#include "dbconnector.h" +#include "notificationproducer.h" +#include "reboot_common.h" +#include "reboot_interfaces.h" +#include "redis_utils.h" +#include "select.h" +#include "selectableevent.h" +#include "selectabletimer.h" +#include "subscriberstatetable.h" +#include "system/system.pb.h" + +namespace rebootbackend { + +#define SELECT_TIMEOUT_250_MS (250) +#define SELECT_TIMEOUT_500_MS (500) + +// Hold/manage the contents of a RebootStatusResponse as defined +// in system.proto +// Thread-safe: expectation is one thread will write and multiple +// threads can read. +class ThreadStatus { + public: + ThreadStatus() { + m_proto_status.set_active(false); + + // Reason for reboot as specified in message from a RebootRequest. + // This is "message" in RebootRequest. + m_proto_status.set_reason(""); + + // Number of reboots since active. 
+ m_proto_status.set_count(0); + + // RebootMethod is the type of reboot: cold, nsf, warm, fast from a + // RebootRequest + m_proto_status.set_method(gnoi::system::RebootMethod::UNKNOWN); + + // Status can be UNKNOWN, SUCCESS, RETRIABLE_FAILURE or FAILURE. + m_proto_status.mutable_status()->set_status( + gnoi::system::RebootStatus_Status::RebootStatus_Status_STATUS_UNKNOWN); + + // In the event of error: message is human readable error explanation. + m_proto_status.mutable_status()->set_message(""); + } + + void set_start_status(const gnoi::system::RebootMethod &method, + const std::string &reason) { + m_mutex.lock(); + + m_proto_status.set_active(true); + m_proto_status.set_reason(reason); + m_proto_status.set_count(m_proto_status.count() + 1); + m_proto_status.set_method(method); + m_proto_status.mutable_status()->set_status( + gnoi::system::RebootStatus_Status::RebootStatus_Status_STATUS_UNKNOWN); + m_proto_status.mutable_status()->set_message(""); + + // Set "when" to the time the reboot starts + std::chrono::nanoseconds ns = + std::chrono::system_clock::now().time_since_epoch(); + m_proto_status.set_when(ns.count()); + + m_mutex.unlock(); + } + + bool get_active(void) { + m_mutex.lock(); + bool ret = m_proto_status.active(); + m_mutex.unlock(); + return ret; + } + + void set_completed_status(const gnoi::system::RebootStatus_Status &status, + const std::string &message) { + m_mutex.lock(); + + // Status should only be updated while reboot is active + if (m_proto_status.active()) { + m_proto_status.mutable_status()->set_status(status); + m_proto_status.mutable_status()->set_message(message); + } + + m_mutex.unlock(); + } + + void set_inactive(void) { + m_mutex.lock(); + m_proto_status.set_active(false); + m_mutex.unlock(); + } + + int get_reboot_count() { + const std::lock_guard lock(m_mutex); + return m_proto_status.count(); + } + + gnoi::system::RebootStatus_Status get_last_reboot_status(void) { + gnoi::system::RebootStatusResponse response = get_response(); + return 
response.status().status(); + } + + gnoi::system::RebootStatusResponse get_response(void) { + m_mutex.lock(); + // make a copy + gnoi::system::RebootStatusResponse lstatus = m_proto_status; + m_mutex.unlock(); + + if (lstatus.active()) { + // RebootStatus isn't applicable if we're active + lstatus.mutable_status()->set_status( + gnoi::system::RebootStatus_Status:: + RebootStatus_Status_STATUS_UNKNOWN); + lstatus.mutable_status()->set_message(""); + } else { + // When is only valid while we're active (since delayed + // start isn't supported). Value is set when reboot begins. + lstatus.set_when(0); + } + + return lstatus; + } + + private: + std::mutex m_mutex; + gnoi::system::RebootStatusResponse m_proto_status; +}; + +// RebootThread performs reboot actions leading up to a platform +// request to reboot. +// thread-compatible: expectation is Stop, Start and Join will be +// called from the same thread. +class RebootThread { + public: + enum class Status { SUCCESS, FAILURE, KEEP_WAITING }; + enum class Progress { PROCEED, EXIT_EARLY }; + + // dbus_interface: dbus reboot host service access + // m_finished: let launching task know thread has finished + RebootThread(DbusInterface &dbus_interface, + CriticalStateInterface &critical_interface, + TelemetryInterface &telemetry_interface, + swss::SelectableEvent &m_finished); + + NotificationResponse Start(const gnoi::system::RebootRequest &request); + + // Request thread stop/exit. Only used when platform is shutting down + // all containers/processes. + void Stop(void); + + // Called by launching task after notification sent to m_finished. + bool Join(void); + + // Return Status of last reboot attempt + gnoi::system::RebootStatusResponse GetResponse(); + + // Returns true if the RebootThread has been started since the last reboot, + // and false otherwise. 
+ bool HasRun(); + + private: + void reboot_thread(void); + void do_reboot(void); + Progress send_dbus_reboot_request(); + Progress nsf_reboot_helper(swss::Select &s); + void do_nsf_reboot(swss::Select &s); + void do_cold_reboot(swss::Select &s); + + // Inner loop select handler to wait for platform reboot. + // wait for timeout + // wait for a stop request (sigterm) + // Returns: + // EXIT_EARLY: an issue occurred that stops NSF + // PROCEED: if reboot timeout expired + Progress platform_reboot_select(swss::Select &s, + swss::SelectableTimer &l_timer); + + // Wait for platform to reboot while waiting for possible stop + // Returns: + // EXIT_EARLY: an issue occurred that stops NSF + // PROCEED: if reboot timeout expired + Progress wait_for_platform_reboot(swss::Select &s); + + // Check for critical state: log error and update status. + // Returns: + // true: if system is in critical state + // false: all is well + bool check_and_log_critical_state(const std::string error_string); + + // Log error string, set status to RebootStatus_Status_STATUS_FAILURE + // Set status message to error_string. + void log_error_and_set_non_retry_failure(const std::string error_string); + + // Log error string, set status to + // RebootStatus_Status_STATUS_RETRIABLE_FAILURE Set status message to + // error_string. + void log_error_and_set_failure_as_retriable(const std::string error_string); + + // Handle a database subscription update to STATE_VERIFICATION_RESP_TABLE + // Confirm update is for the "all" component with correct timestamp. 
+ // if update is not_run: then restart + // Args: + // sub: [input] selectable subscription to STATE_VERIFICATION_RESP_TABLE + // data is pending + // timestamp: [input && output] the timestamp sent to state verification + // with the state verification request + // Returns: + // KEEP_WAITING: keep waiting for success, fail or timeout + // SUCCESS: state verification passed, proceed + // FAILURE: state verification failed + Status handle_state_verification_event(swss::SubscriberStateTable &sub, + std::string &timestamp); + + // Infinite loop select for state verification + // Listen for a stop, timer expiration or database update + // Args: + // s: select to be monitored (stop, timer, subscription) events + // l_timer: timeout expiration selectable + // sub: subscription to STATE_VERIFICATION_RESP_TABLE + // timestamp: [input] the timestamp sent with the state verification + // request. Used to match response with request. + // Returns: + // EXIT_EARLY: an issue occurred that stops NSF + // PROCEED: if reboot timeout expired + Progress state_verification_select(swss::Select &s, + swss::SelectableTimer &l_timer, + swss::SubscriberStateTable &sub, + std::string &timestamp); + + // If enabled: perform non-frozen state verification + // Check for critical state, listen for stop, handle a timeout + // Returns: + // EXIT_EARLY: an issue occurred that stops NSF + // PROCEED: if reboot timeout expired + Progress perform_state_verification(swss::Select &s); + + // Perform freeze/quiescence with container stop support. + // Request platform stop (stop on freeze) containers. + // Request applications freeze and wait for registered apps + // to quiesce or checkpoint. + // Poll platform till containers have stopped. 
+ // Check for critical state, listen for stop, handle a timeout + // Returns: + // EXIT_EARLY: an issue occurred that stops NSF + // PROCEED: if reboot timeout expired + Progress perform_freeze_w_container_stop(swss::Select &s); + + // Wait for stop_on_freeze containers to exit, then + // wait for freeze containers to quiesce (or checkpoint). + // Check for critical state, listen for stop, handle a timeout + // Args: + // s: select to be monitored (stop, timer, subscription) events + // l_timer: timeout expiration selectable, running at function + // start + // request_id: [input] request_id populated in StopContainersRequest. + // This is used/needed when sending StopContainersStatusRequest. + // Returns: + // EXIT_EARLY: an issue occurred that stops NSF + // PROCEED: if reboot timeout expired + Progress wait_for_container_stop_and_quiescence( + swss::Select &s, swss::SelectableTimer &l_timer, + const std::string &request_id); + + // Build a json formatted stop container request proto message. + // Message populated with containers that registered stop on freeze. + // Args: + // json_request: [output] json formatted StopContainersRequest + // message + // request_id: [output] request_id populated in StopContainersRequest. + // This is used/needed when sending StopContainersStatusRequest. + Progress build_stop_container_request(std::string &json_request, + std::string &request_id); + + // Send a StopContainersRequest message to the gnoi_stop_container + // sonic host service requesting list of containers be stopped. + // Args: + // request_id: [output] request_id populated in StopContainersRequest. + // This is used/needed when sending StopContainersStatusRequest. + Progress request_stop_on_freeze(std::string &request_id); + + // Send a StopContainersStatusRequest message to the gnoi_stop_container + // sonic host service to check if all containers have stopped. + // Args: + // request_id: [input] request_id populated in StopContainersRequest. 
+ // This is used/needed when sending StopContainersStatusRequest. + // Returns: + // SUCCESS: containers have stopped, we're done. + // KEEP_WAITING: containers haven't stopped + // FAILURE: dbus error or protobuf conversion error + // suggest retry till timeout + Status check_container_stop(const std::string &request_id); + + // Check if containers have stopped, or there are no containers to stop + // Args: + // request_id: [input] request_id populated in StopContainersRequest. + // This is used/needed when sending StopContainersStatusRequest. + // Returns: + // SUCCESS: containers have stopped, we're done. + // KEEP_WAITING: containers haven't stopped + // FAILURE: dbus error or protobuf conversion error + // suggest retry till timeout + Status precheck_wait_for_container_stop(const std::string &request_id); + + // Poll the gnoi_stop_container host service to determine if requested + // set of containers have exited. + // Check for critical state, listen for stop, handle a timeout + // Args: + // s: [input] select statement that has m_stop as a selectable + // request_id: [input] request_id populated in StopContainersRequest. + // This is used/needed when sending StopContainersStatusRequest. + // l_timer: timeout expiration selectable, running at function + // start + Progress wait_for_container_stop(swss::Select &s, + const std::string &request_id, + swss::SelectableTimer &l_timer); + + // Perform freeze/quiescence. + // Request applications freeze and wait for registered apps + // to quiesce or checkpoint. 
+ // Check for critical state, listen for stop, handle a timeout + // Args: + // s: [input] select statement that has m_stop as a selectable + // l_timer: [input] timeout expiration selectable, running at function start + // Returns: + // EXIT_EARLY: an issue occurred that stops NSF + // PROCEED: if reboot timeout expired + Progress perform_freeze_quiescence(swss::Select &s, + swss::SelectableTimer &l_timer); + + // Helper function for freeze quiescence stage: + // Check current database status before waiting for subscriptions updates. + // infinite loop select for checkpoint + // Listen for a stop, timer expiration or database update + // We must remain quiescent for the quiescence hold time before PROCEED. + // Returns: + // EXIT_EARLY: an issue occurred that stops NSF + // PROCEED: if reboot timeout expired + Progress freeze_quiescence_select(swss::Select &s, + swss::SelectableTimer &l_timer, + swss::SubscriberStateTable &sub); + + // Perform checkpointing + // Request applications checkpoint and wait for registered apps + // to checkpoint. + // Check for critical state, listen for stop, handle a timeout + // Returns: + // EXIT_EARLY: an issue (critical state, stop, failure, timeout) stops NSF + // PROCEED: the checkpoint stage completed for registered apps + Progress perform_checkpoint(swss::Select &s); + + // Helper function for checkpoint: Check initial checkpoint + // status before entering select loop. + // Returns: + // EXIT_EARLY: an issue (stop, failure, timeout) stops NSF + // PROCEED: the checkpoint stage completed for registered apps + Progress checkpoint_select_stage_one(swss::Select &s, + swss::SelectableTimer &l_timer, + swss::SubscriberStateTable &sub); + + // Helper function for checkpoint stage: + // infinite loop select for checkpoint + // Listen for a stop, timer expiration or database update + // Returns: + // EXIT_EARLY: an issue (stop, failure, timeout) stops NSF + // PROCEED: the checkpoint stage completed for registered apps + Progress checkpoint_stage_two(swss::Select &s, swss::SelectableTimer &l_timer, + swss::SubscriberStateTable &sub); + + // Request is input only. 
+ // Response is output only. + // Return true if preconditions met, false otherwise. + bool check_start_preconditions(const gnoi::system::RebootRequest &request, + NotificationResponse &response); + + std::thread m_thread; + + // Signal m_finished to let main thread know we've completed. + // Main thread should call Join. + swss::SelectableEvent &m_finished; + + // m_stop signalled by main thread on sigterm: cleanup and exit. + swss::SelectableEvent m_stop; + DbusInterface &m_dbus_interface; + CriticalStateInterface &m_critical_interface; + TelemetryInterface &m_telemetry; + swss::DBConnector m_db; + ThreadStatus m_status; + gnoi::system::RebootRequest m_request; + Registration m_registration; + + // Wait for system to reboot: allow unit test to shorten. + // TODO: there is a plan to make these timer values + // available in CONFIG_DB + static constexpr uint32_t kRebootTime = 260; + uint32_t m_reboot_timeout = kRebootTime; + + static constexpr uint32_t kStateVerificationTime = 180; + uint32_t m_state_verification_timeout = kStateVerificationTime; + + static constexpr uint32_t kQuiescenceTimeMs = 60 * ONE_THOUSAND; + uint32_t m_quiescence_timeout_ms = kQuiescenceTimeMs; + + // We must remain quiescent for 5 seconds. 
+ static constexpr uint32_t kQuiescenceHoldTimeMs = 5 * ONE_THOUSAND; + uint32_t m_quiescence_hold_time_ms = kQuiescenceHoldTimeMs; + + static constexpr uint32_t kCheckpointTime = 30; + uint32_t m_checkpoint_timeout = kCheckpointTime; + + friend class RebootBETestWithoutStop; + friend class RebootThreadTest; +}; + +} // namespace rebootbackend diff --git a/src/sonic-framework/rebootbackend/rebootbackend.cpp b/src/sonic-framework/rebootbackend/rebootbackend.cpp new file mode 100644 index 000000000000..27c12c33edf3 --- /dev/null +++ b/src/sonic-framework/rebootbackend/rebootbackend.cpp @@ -0,0 +1,15 @@ +#include "interfaces.h" +#include "reboot_interfaces.h" +#include "rebootbe.h" +#include "telemetry_helper.h" + +using namespace ::rebootbackend; + +int main(int argc, char** argv) { + HostServiceDbus dbus_interface; + CriticalState critical_interface; + TelemetryHelper telemetry_helper; + RebootBE rebootbe(dbus_interface, critical_interface, telemetry_helper); + rebootbe.Start(); + return 0; +} diff --git a/src/sonic-framework/rebootbackend/rebootbe.cpp b/src/sonic-framework/rebootbackend/rebootbe.cpp new file mode 100644 index 000000000000..6871f10d1728 --- /dev/null +++ b/src/sonic-framework/rebootbackend/rebootbe.cpp @@ -0,0 +1,330 @@ +#include "rebootbe.h" + +#include <google/protobuf/util/json_util.h> +#include <unistd.h> + +#include <mutex> +#include <string> +#include <vector> + +#include "init_thread.h" +#include "logger.h" +#include "notificationconsumer.h" +#include "notificationproducer.h" +#include "reboot_common.h" +#include "reboot_interfaces.h" +#include "select.h" +#include "status_code_util.h" +#include "warm_restart.h" + +namespace rebootbackend { + +namespace gpu = ::google::protobuf::util; + +bool sigterm_requested = false; + +RebootBE::RebootBE(DbusInterface &dbus_interface, + CriticalStateInterface &critical_interface, + TelemetryInterface &telemetry_interface) + : m_db("STATE_DB", 0), + m_rebootResponse(&m_db, REBOOT_RESPONSE_NOTIFICATION_CHANNEL), + m_notificationConsumer(&m_db, 
REBOOT_REQUEST_NOTIFICATION_CHANNEL), + m_dbus(dbus_interface), + m_critical(critical_interface), + m_telemetry(telemetry_interface), + m_init_thread( + std::make_unique(critical_interface, telemetry_interface, + m_init_thread_done, m_stack_unfrozen)), + m_reboot_thread(dbus_interface, critical_interface, telemetry_interface, + m_reboot_thread_finished) { + swss::Logger::linkToDbNative("rebootbackend"); +} + +RebootBE::NsfManagerStatus RebootBE::GetCurrentStatus() { + const std::lock_guard lock(m_status_mutex); + return m_current_status; +} + +void RebootBE::SetCurrentStatus(NsfManagerStatus new_status) { + const std::lock_guard lock(m_status_mutex); + m_current_status = new_status; +} + +void RebootBE::Start() { + SWSS_LOG_ENTER(); + SWSS_LOG_NOTICE("--- Starting rebootbackend ---"); + + swss::WarmStart::initialize("rebootbackend", "sonic-framework"); + swss::WarmStart::checkWarmStart("rebootbackend", "sonic-framework", + /*incr_restore_cnt=*/false); + + swss::Select s; + s.addSelectable(&m_notificationConsumer); + s.addSelectable(&m_done); + s.addSelectable(&m_init_thread_done); + s.addSelectable(&m_stack_unfrozen); + s.addSelectable(&m_reboot_thread_finished); + + if (swss::WarmStart::isWarmStart()) { + SWSS_LOG_NOTICE("Launching init thread for warm start"); + SetCurrentStatus(NsfManagerStatus::NSF_INIT_WAIT); + swss::StatusCode result = m_init_thread->Start(); + if (result != swss::StatusCode::SWSS_RC_SUCCESS) { + SetCurrentStatus(NsfManagerStatus::IDLE); + SWSS_LOG_ERROR("Error launching init thread: %s", + swss::statusCodeToStr(result).c_str()); + } + } else { + SWSS_LOG_NOTICE("Warm restart not enabled, not starting init thread"); + } + + SWSS_LOG_NOTICE("RebootBE entering operational loop"); + while (true) { + swss::Selectable *sel; + int ret; + + ret = s.select(&sel); + if (ret == swss::Select::ERROR) { + SWSS_LOG_NOTICE("Error: %s!", strerror(errno)); + } else if (ret == swss::Select::OBJECT) { + if (sel == &m_notificationConsumer) { + 
do_task(m_notificationConsumer); + } else if (sel == &m_stack_unfrozen) { + handle_unfreeze(); + } else if (sel == &m_init_thread_done) { + handle_init_finish(); + } else if (sel == &m_reboot_thread_finished) { + handle_reboot_finish(); + } else if (sel == &m_done) { + handle_done(); + break; + } + } + } + return; +} + +void RebootBE::Stop() { + SWSS_LOG_ENTER(); + m_done.notify(); + return; +} + +bool RebootBE::retrieve_notification_data( + swss::NotificationConsumer &consumer, + RebootBE::NotificationRequest &request) { + SWSS_LOG_ENTER(); + + request.op = ""; + request.ret_string = ""; + + std::string data; + std::vector<swss::FieldValueTuple> values; + consumer.pop(request.op, data, values); + + for (auto &fv : values) { + if (DATA_TUPLE_KEY == fvField(fv)) { + request.ret_string = fvValue(fv); + return true; + } + } + return false; +} + +// Send a response on the Reboot_Response_Channel notification channel. +// Key is one of: Reboot, RebootStatus, or CancelReboot +// code is swss::StatusCode, hopefully SWSS_RC_SUCCESS. +// message is json formatted RebootResponse, RebootStatusResponse +// or CancelRebootResponse as defined in system.proto +void RebootBE::send_notification_response(const std::string key, + const swss::StatusCode code, + const std::string message) { + SWSS_LOG_ENTER(); + + std::vector<swss::FieldValueTuple> ret_values; + ret_values.push_back(swss::FieldValueTuple(DATA_TUPLE_KEY, message)); + + m_rebootResponse.send(key, swss::statusCodeToStr(code), ret_values); +} + +NotificationResponse RebootBE::handle_reboot_request( + const std::string &json_reboot_request) { + using namespace google::protobuf::util; + + SWSS_LOG_ENTER(); + + // On success an empty string is returned. RebootResponse in system.proto + // is an empty proto. 
+ NotificationResponse response = {.status = swss::StatusCode::SWSS_RC_SUCCESS, + .json_string = ""}; + + gnoi::system::RebootRequest request; + Status status = gpu::JsonStringToMessage(json_reboot_request, &request); + + if (!status.ok()) { + std::string error_string = + "unable to convert json to rebootRequest protobuf: " + + status.message().as_string(); + SWSS_LOG_ERROR("%s", error_string.c_str()); + SWSS_LOG_ERROR("json = |%s|", json_reboot_request.c_str()); + response.status = swss::StatusCode::SWSS_RC_INTERNAL; + response.json_string = error_string; + return response; + } + + if (!reboot_allowed(request.method())) { + response.status = swss::StatusCode::SWSS_RC_IN_USE; + response.json_string = + "Reboot not allowed at this time. Reboot or " + "post-warmboot NSF in progress"; + SWSS_LOG_WARN("%s", response.json_string.c_str()); + return response; + } + + SWSS_LOG_NOTICE("Forwarding request to RebootThread: %s", + request.DebugString().c_str()); + response = m_reboot_thread.Start(request); + if (response.status == swss::StatusCode::SWSS_RC_SUCCESS) { + if (request.method() == gnoi::system::RebootMethod::COLD) { + SetCurrentStatus(NsfManagerStatus::COLD_REBOOT_IN_PROGRESS); + } else if (request.method() == gnoi::system::RebootMethod::NSF) { + SetCurrentStatus(NsfManagerStatus::NSF_REBOOT_IN_PROGRESS); + } + } + return response; +} + +bool RebootBE::reboot_allowed(const gnoi::system::RebootMethod reboot_method) { + NsfManagerStatus current_status = GetCurrentStatus(); + switch (current_status) { + case NsfManagerStatus::COLD_REBOOT_IN_PROGRESS: + case NsfManagerStatus::NSF_REBOOT_IN_PROGRESS: { + return false; + } + case NsfManagerStatus::NSF_INIT_WAIT: { + return reboot_method == gnoi::system::RebootMethod::COLD; + } + case NsfManagerStatus::IDLE: { + return true; + } + default: { + return true; + } + } +} + +NotificationResponse RebootBE::handle_status_request( + const std::string &json_status_request) { + SWSS_LOG_ENTER(); + + 
gnoi::system::RebootStatusResponse reboot_response = + m_reboot_thread.HasRun() ? m_reboot_thread.GetResponse() + : m_init_thread->GetResponse(); + + std::string json_reboot_response_string; + google::protobuf::util::Status status = + gpu::MessageToJsonString(reboot_response, &json_reboot_response_string); + + NotificationResponse response; + if (status.ok()) { + response.status = swss::StatusCode::SWSS_RC_SUCCESS; + response.json_string = json_reboot_response_string; + } else { + std::string error_string = + "unable to convert reboot status response protobuf to json: " + + status.message().as_string(); + SWSS_LOG_ERROR("%s", error_string.c_str()); + response.status = swss::StatusCode::SWSS_RC_INTERNAL; + response.json_string = error_string; + } + + return response; +} + +NotificationResponse RebootBE::handle_cancel_request( + const std::string &json_cancel_request) { + SWSS_LOG_ENTER(); + + NotificationResponse response; + + // CancelReboot isn't supported: not needed until/unless delayed support + // is added: return unimplemented. + response.status = swss::StatusCode::SWSS_RC_UNIMPLEMENTED; + response.json_string = "Cancel reboot isn't supported"; + SWSS_LOG_WARN("%s", response.json_string.c_str()); + return response; +} + +void RebootBE::do_task(swss::NotificationConsumer &consumer) { + SWSS_LOG_ENTER(); + + NotificationResponse response; + RebootBE::NotificationRequest request; + + if (!retrieve_notification_data(consumer, request)) { + // Response is simple string (not json) on error. 
+ response.json_string = + "MESSAGE not present in reboot notification request message, op = " + + request.op; + SWSS_LOG_ERROR("%s", response.json_string.c_str()); + response.status = swss::StatusCode::SWSS_RC_INVALID_PARAM; + } else if (request.op == REBOOT_KEY) { + response = handle_reboot_request(request.ret_string); + } else if (request.op == REBOOT_STATUS_KEY) { + response = handle_status_request(request.ret_string); + } else if (request.op == CANCEL_REBOOT_KEY) { + response = handle_cancel_request(request.ret_string); + } else { + // Response is simple string (not json) on error. + response.json_string = + "Unrecognized op in reboot request, op = " + request.op; + SWSS_LOG_ERROR("%s", response.json_string.c_str()); + response.status = swss::StatusCode::SWSS_RC_INVALID_PARAM; + } + send_notification_response(request.op, response.status, response.json_string); +} + +void RebootBE::handle_unfreeze() { + SWSS_LOG_ENTER(); + SWSS_LOG_NOTICE("Receieved notification that UNFREEZE signal has been sent"); +} + +void RebootBE::handle_init_finish() { + SWSS_LOG_ENTER(); + SWSS_LOG_NOTICE("Receieved notification that InitThread is done"); + NsfManagerStatus current_status = GetCurrentStatus(); + if (current_status == NsfManagerStatus::NSF_INIT_WAIT) { + SetCurrentStatus(NsfManagerStatus::IDLE); + } + if (m_init_thread->GetResponse().active()) { + bool result = m_init_thread->Join(); + if (!result) { + SWSS_LOG_ERROR("Encountered error trying to join init thread"); + } + } +} + +void RebootBE::handle_reboot_finish() { + SWSS_LOG_ENTER(); + SWSS_LOG_WARN( + "Receieved notification that reboot has finished. This probably means " + "something is wrong"); + m_reboot_thread.Join(); + SetCurrentStatus(m_init_thread->GetResponse().active() + ? 
NsfManagerStatus::NSF_INIT_WAIT + : NsfManagerStatus::IDLE); +} + +void RebootBE::handle_done() { + SWSS_LOG_INFO("RebootBE received signal to stop"); + if (m_init_thread->GetResponse().active()) { + m_init_thread->Stop(); + m_init_thread->Join(); + } + if (m_reboot_thread.GetResponse().active()) { + m_reboot_thread.Stop(); + m_reboot_thread.Join(); + } +} + +} // namespace rebootbackend diff --git a/src/sonic-framework/rebootbackend/rebootbe.h b/src/sonic-framework/rebootbackend/rebootbe.h new file mode 100644 index 000000000000..86afd07c92c7 --- /dev/null +++ b/src/sonic-framework/rebootbackend/rebootbe.h @@ -0,0 +1,101 @@ +#pragma once +#include "dbconnector.h" +#include "init_thread.h" +#include "notificationconsumer.h" +#include "notificationproducer.h" +#include "reboot_common.h" +#include "reboot_interfaces.h" +#include "reboot_thread.h" +#include "selectableevent.h" +#include "status_code_util.h" + +namespace rebootbackend { + +#define REBOOT_REQUEST_NOTIFICATION_CHANNEL "Reboot_Request_Channel" +#define REBOOT_RESPONSE_NOTIFICATION_CHANNEL "Reboot_Response_Channel" +#define REBOOT_KEY "Reboot" +#define REBOOT_STATUS_KEY "RebootStatus" +#define CANCEL_REBOOT_KEY "CancelReboot" +#define DATA_TUPLE_KEY "MESSAGE" + +class RebootBE { + public: + struct NotificationRequest { + std::string op; + std::string ret_string; + }; + + enum class NsfManagerStatus { + NSF_INIT_WAIT, + IDLE, + COLD_REBOOT_IN_PROGRESS, + NSF_REBOOT_IN_PROGRESS + }; + + RebootBE(DbusInterface &interface, + CriticalStateInterface &critical_interface, + TelemetryInterface &telemetry_interface); + + NsfManagerStatus GetCurrentStatus(); + + void Start(); + void Stop(); + + private: + std::mutex m_status_mutex; + NsfManagerStatus m_current_status = NsfManagerStatus::IDLE; + swss::SelectableEvent m_done; + + swss::DBConnector m_db; + swss::NotificationProducer m_rebootResponse; + swss::NotificationConsumer m_notificationConsumer; + + DbusInterface &m_dbus; + CriticalStateInterface &m_critical; + 
TelemetryInterface &m_telemetry; + + // Signals for init thread. + swss::SelectableEvent m_init_thread_done; + swss::SelectableEvent m_stack_unfrozen; + std::unique_ptr<InitThread> m_init_thread; + + // Signalled by reboot thread when thread completes. + swss::SelectableEvent m_reboot_thread_finished; + RebootThread m_reboot_thread; + + void SetCurrentStatus(NsfManagerStatus new_status); + + // Reboot_Request_Channel notifications should all contain {"MESSAGE" : Data} + // in the notification Data field. + // Return true if "MESSAGE" is found, false otherwise. + // Set request.ret_string to the Data string if found, "" otherwise. + // consumer is input: this is the consumer from which we pop + // reboot/cancel/status requests. + // request is output: this is the request received from consumer + bool retrieve_notification_data(swss::NotificationConsumer &consumer, + NotificationRequest &request); + NotificationResponse handle_reboot_request( + const std::string &json_reboot_request); + NotificationResponse handle_status_request( + const std::string &json_status_request); + NotificationResponse handle_cancel_request( + const std::string &json_cancel_request); + void send_notification_response(const std::string key, + const swss::StatusCode code, + const std::string message); + + // Returns true if a reboot is allowed at this time given the current NSF + // manager state and reboot type, and false otherwise. 
+ bool reboot_allowed(const gnoi::system::RebootMethod reboot_method); + + void do_task(swss::NotificationConsumer &consumer); + + void handle_unfreeze(); + void handle_init_finish(); + void handle_reboot_finish(); + void handle_done(); + + friend class RebootBETestWithoutStop; +}; + +} // namespace rebootbackend diff --git a/src/sonic-framework/rebootbackend/redis_utils.cpp b/src/sonic-framework/rebootbackend/redis_utils.cpp new file mode 100644 index 000000000000..9e23239b9c6b --- /dev/null +++ b/src/sonic-framework/rebootbackend/redis_utils.cpp @@ -0,0 +1,474 @@ +#include "redis_utils.h" + +#include +#include +#include +#include + +#include "dbconnector.h" +#include "notificationproducer.h" +#include "stateverification.h" +#include "table.h" +#include "timestamp.h" +#include "warm_restart.h" + +namespace rebootbackend { + +using WarmStartState = ::swss::WarmStart::WarmStartState; + +const std::unordered_map> + Registration::kStageToTargetStates = { + {swss::WarmStart::WarmBootStage::STAGE_FREEZE, + {get_warm_start_state_name(WarmStartState::QUIESCENT), + get_warm_start_state_name(WarmStartState::CHECKPOINTED)}}, + {swss::WarmStart::WarmBootStage::STAGE_CHECKPOINT, + {get_warm_start_state_name(WarmStartState::CHECKPOINTED)}}, + {swss::WarmStart::WarmBootStage::STAGE_RECONCILIATION, + {get_warm_start_state_name(WarmStartState::RECONCILED)}}, + {swss::WarmStart::WarmBootStage::STAGE_UNFREEZE, + {get_warm_start_state_name(WarmStartState::COMPLETED)}}, +}; + +void send_nsf_manager_notification( + swss::DBConnector &db, swss::WarmStart::WarmBootNotification notification) { + swss::NotificationProducer producer( + &db, swss::WarmStart::kNsfManagerNotificationChannel); + + std::vector<swss::FieldValueTuple> values; + std::string notification_string = + swss::WarmStart::warmBootNotificationNameMap()->at(notification); + + producer.send(notification_string, "", values); +} + +std::string send_state_verification_notification(swss::DBConnector &ldb, + const bool freeze) { + 
swss::NotificationProducer producer(&ldb, STATE_VERIFICATION_REQ_CHANNEL); + + std::vector values; + values.push_back( + swss::FieldValueTuple(FREEZE_FIELD, freeze ? "true" : "false")); + + std::string timestamp = swss::getTimestamp(); + producer.send(ALL_COMPONENT, timestamp, values); + return timestamp; +} + +void init_warm_reboot_states(const swss::DBConnector &db) { + swss::Table table(&db, STATE_WARM_RESTART_TABLE_NAME); + std::vector keys; + + table.getKeys(keys); + for (auto &key : keys) { + table.hdel(key, "state"); + table.hdel(key, "timestamp"); + } +} + +void set_warm_restart_enable(const swss::DBConnector &db, bool enabled) { + swss::Table table(&db, STATE_WARM_RESTART_ENABLE_TABLE_NAME); + table.hset("system", "enable", enabled ? "true" : "false"); +} + +bool is_valid_key(const std::string &key, const std::string &separator) { + if (separator.empty()) { + return false; + } + + size_t pos = key.find(separator); + // The separator must exist in the string, and cannot be the first or last + // character. 
+ return !(pos == std::string::npos || pos == 0 || pos == key.size() - 1); +} + +bool get_docker_app_from_key(const std::string &key, + const std::string &separator, std::string &docker, + std::string &app) { + SWSS_LOG_ENTER(); + + size_t pos = key.find(separator); + + if (separator.empty()) { + SWSS_LOG_ERROR("separator [%s] shouldn't be empty", separator.c_str()); + return false; + } + + if (pos == std::string::npos) { + SWSS_LOG_ERROR("key [%s] should contain separator [%s]", key.c_str(), + separator.c_str()); + return false; + } + + docker = key.substr(0, pos); + app = key.substr(pos + separator.length(), std::string::npos); + + if (docker.empty()) { + SWSS_LOG_ERROR("docker name shouldn't be empty, key = %s", key.c_str()); + return false; + } + + if (app.empty()) { + SWSS_LOG_ERROR("app name shouldn't be empty, key = %s", key.c_str()); + return false; + } + return true; +} + +std::string get_warm_start_state_name(swss::WarmStart::WarmStartState state) { + return swss::WarmStart::warmStartStateNameMap()->at(state).c_str(); +} + +void set_warm_restart_counter(swss::DBConnector &db, int count) { + swss::Table table(&db, "BOOT_INFO"); + table.hset("system", "warmboot-count", std::to_string(count)); +} + +std::string get_warm_restart_counter(swss::DBConnector &db) { + swss::Table warmRestartTable(&db, "BOOT_INFO"); + std::string counter; + warmRestartTable.hget("system", "warmboot-count", counter); + return counter; +} + +Registration::Registration() + : m_db("STATE_DB", 0), + m_separator(swss::TableBase::getTableSeparator(m_db.getDbId())), + m_remaining_apps( + {{swss::WarmStart::WarmBootStage::STAGE_FREEZE, {}}, + {swss::WarmStart::WarmBootStage::STAGE_CHECKPOINT, {}}, + {swss::WarmStart::WarmBootStage::STAGE_RECONCILIATION, {}}, + {swss::WarmStart::WarmBootStage::STAGE_UNFREEZE, {}}}) {} + +std::string Registration::get_warm_boot_stage_name( + swss::WarmStart::WarmBootStage stage) { + return swss::WarmStart::warmBootStageToNameMap()->at(stage); +} + +void 
Registration::clear_contents() { + m_registered.clear(); + m_stop.clear(); + for (auto &stage_set_pair : m_remaining_apps) { + stage_set_pair.second.clear(); + } +} + +Registration::RegistrationSet Registration::get_stop_on_freeze_set() { + return m_stop; +} + +Registration::RegistrationSet Registration::get_registered_app_set() { + return m_registered; +} + +void Registration::fetch_registration_info() { + SWSS_LOG_ENTER(); + + clear_contents(); + + swss::Table table(&m_db, STATE_WARM_RESTART_REGISTRATION_TABLE_NAME); + std::vector keys; + + table.getKeys(keys); + for (auto &key : keys) { + std::string docker, app; + if (!get_docker_app_from_key(key, m_separator, docker, app)) { + SWSS_LOG_ERROR("skipping registration for key = %s", key.c_str()); + continue; + } + + m_registered.insert(key); + + std::vector values; + table.get(key, values); + for (auto &v : values) { + // We only care about this field if value is "true". + // Skip this key if value is false. + if ("false" == fvValue(v)) continue; + + if (swss::WarmStart::kRegistrationStopOnFreezeKey == fvField(v)) { + m_stop.insert(docker); + } + if (swss::WarmStart::kRegistrationFreezeKey == fvField(v)) { + m_ro_quiescent_list.insert(app); + m_remaining_apps.at(swss::WarmStart::WarmBootStage::STAGE_FREEZE) + .insert(app); + m_remaining_apps.at(swss::WarmStart::WarmBootStage::STAGE_UNFREEZE) + .insert(app); + } + if (swss::WarmStart::kRegistrationCheckpointKey == fvField(v)) { + m_remaining_apps.at(swss::WarmStart::WarmBootStage::STAGE_CHECKPOINT) + .insert(app); + } + if (swss::WarmStart::kRegistrationReconciliationKey == fvField(v)) { + m_remaining_apps + .at(swss::WarmStart::WarmBootStage::STAGE_RECONCILIATION) + .insert(app); + } + } + } +} + +Registration::Response Registration::check_quiesced() { + return check_stage(swss::WarmStart::WarmBootStage::STAGE_FREEZE); +} + +Registration::Response Registration::check_checkpointed() { + return check_stage(swss::WarmStart::WarmBootStage::STAGE_CHECKPOINT); +} + 
+Registration::Response Registration::check_reconciled() {
+  return check_stage(swss::WarmStart::WarmBootStage::STAGE_RECONCILIATION);
+}
+
+Registration::Response Registration::check_unfrozen() {
+  return check_stage(swss::WarmStart::WarmBootStage::STAGE_UNFREEZE);
+}
+
+Registration::Response Registration::check_stage(
+    swss::WarmStart::WarmBootStage nsf_stage) {
+  Registration::Response response = check_states_are(
+      m_remaining_apps.at(nsf_stage), kStageToTargetStates.at(nsf_stage));
+  if (response.status == Registration::Status::FAILURE) {
+    response.error_string =
+        "check_stage: app: " + response.error_string +
+        " reported FAILED during stage: " + get_warm_boot_stage_name(nsf_stage);
+  }
+  return response;
+}
+
+Registration::Response Registration::check_states_are(
+    RegistrationSet &set_to_check,
+    const std::unordered_set<std::string> &state_names) {
+  Registration::Response response;
+
+  swss::Table warmRestartTable(&m_db, STATE_WARM_RESTART_TABLE_NAME);
+  for (auto key = set_to_check.begin(); key != set_to_check.end();) {
+    std::string state;
+
+    warmRestartTable.hget(*key, "state", state);
+
+    if (state == get_warm_start_state_name(WarmStartState::FAILED)) {
+      response.status = Registration::Status::FAILURE;
+      response.error_string = *key;
+      return response;
+    }
+
+    if (state_names.find(state) != std::end(state_names)) {
+      key = set_to_check.erase(key);
+    } else {
+      ++key;
+    }
+  }
+
+  if (set_to_check.empty()) {
+    response.status = Registration::Status::COMPLETED;
+    return response;
+  }
+
+  response.status = Registration::Status::IN_PROCESS;
+  return response;
+}
+
+Registration::Response Registration::handle_state_event(
+    swss::WarmStart::WarmBootStage monitored_stage,
+    const swss::KeyOpFieldsValuesTuple &kco) {
+  SWSS_LOG_ENTER();
+
+  RegistrationSet &set_to_check = m_remaining_apps.at(monitored_stage);
+
+  std::string op = kfvOp(kco);
+  if (op != "SET") {
+    SWSS_LOG_ERROR("ignoring non-SET event: %s", op.c_str());
+    return {set_to_check.empty() ? Registration::Status::COMPLETED
+                                 : Registration::Status::IN_PROCESS,
+            ""};
+  }
+
+  Registration::Response response =
+      filter_app_list(set_to_check, kfvKey(kco), monitored_stage, kco,
+                      kStageToTargetStates.at(monitored_stage));
+
+  if (response.status == Registration::Status::FAILURE) {
+    return response;
+  }
+
+  if (monitored_stage == swss::WarmStart::WarmBootStage::STAGE_FREEZE) {
+    response = handle_quiescence_event(kco);
+  }
+  return response;
+}
+
+// Helper for handle_state_event.
+// Pre-condition: caller verifies that operation is a set.
+// Caller handles apps entering FAILED state.
+Registration::Response Registration::handle_quiescence_event(
+    const swss::KeyOpFieldsValuesTuple &kco) {
+  SWSS_LOG_ENTER();
+
+  RegistrationSet &set_to_check =
+      m_remaining_apps.at(swss::WarmStart::WarmBootStage::STAGE_FREEZE);
+
+  std::string app_name = kfvKey(kco);
+  std::string new_state = extract_event_state(kco);
+
+  const std::unordered_set<std::string> &state_names =
+      kStageToTargetStates.at(swss::WarmStart::WarmBootStage::STAGE_FREEZE);
+
+  if (state_names.find(new_state) == std::end(state_names)) {
+    // the new_state is not QUIESCENT or CHECKPOINTED, the app isn't quiescent
+    if (m_ro_quiescent_list.find(app_name) != std::end(m_ro_quiescent_list)) {
+      // We are monitoring quiescence for this app: app isn't quiescent
+      // re-add app to monitoring list (it might already be there ...)
+      set_to_check.insert(app_name);
+    }
+  }
+  return {set_to_check.empty() ? Registration::Status::COMPLETED
+                               : Registration::Status::IN_PROCESS,
+          ""};
+}
+
+Registration::Response Registration::filter_app_list(
+    RegistrationSet &set_to_filter, const std::string app_name,
+    const swss::WarmStart::WarmBootStage monitored_stage,
+    const swss::KeyOpFieldsValuesTuple &kco,
+    const std::unordered_set<std::string> &state_names) {
+  std::string new_state = extract_event_state(kco);
+
+  if (new_state == get_warm_start_state_name(WarmStartState::FAILED)) {
+    return {Registration::Status::FAILURE,
+            "handle_state_event: app: " + app_name +
+                " reported FAILED when looking for state: " +
+                get_warm_boot_stage_name(monitored_stage)};
+  }
+
+  if (state_names.find(new_state) != std::end(state_names)) {
+    set_to_filter.erase(app_name);
+  }
+
+  Registration::Response response;
+  response.status = set_to_filter.empty() ? Registration::Status::COMPLETED
+                                          : Registration::Status::IN_PROCESS;
+
+  return response;
+}
+
+std::string Registration::extract_event_state(
+    const swss::KeyOpFieldsValuesTuple &kco) {
+  for (const auto &field_value : kfvFieldsValues(kco)) {
+    if (fvField(field_value) == "state") {
+      return fvValue(field_value);
+    }
+  }
+  return "";
+}
+
+void Registration::clear_all_init_apps() {
+  swss::Table table(&m_db, STATE_WARM_RESTART_INIT_TABLE_NAME);
+  std::vector<std::string> keys;
+  table.getKeys(keys);
+  for (auto &key : keys) {
+    table.del(key);
+  }
+}
+
+void Registration::save_all_init_apps() {
+  SWSS_LOG_ENTER();
+  std::ostringstream stream;
+  std::copy(m_registered.begin(), m_registered.end(),
+            std::ostream_iterator<std::string>(stream, ","));
+  SWSS_LOG_NOTICE("Saving registered apps to init table: %s",
+                  stream.str().c_str());
+
+  swss::Table table(&m_db, STATE_WARM_RESTART_INIT_TABLE_NAME);
+  std::string separator = swss::TableBase::getTableSeparator(m_db.getDbId());
+  for (const auto &key : m_registered) {
+    if (is_valid_key(key, separator)) {
+      table.hset(key, "timestamp", swss::getTimestamp());
+    } else {
+      SWSS_LOG_ERROR("skipping saving key = %s", key.c_str());
+    }
+  }
+}
+
+std::string Registration::join_pending_apps(
+    swss::WarmStart::WarmBootStage target_stage) {
+  std::ostringstream stream;
+  std::copy(m_remaining_apps.at(target_stage).begin(),
+            m_remaining_apps.at(target_stage).end(),
+            std::ostream_iterator<std::string>(stream, ","));
+  return stream.str();
+}
+
+InitRegistration::InitRegistration()
+    : m_db("STATE_DB", 0),
+      m_separator(swss::TableBase::getTableSeparator(m_db.getDbId())) {}
+
+void InitRegistration::fetch_init_app_info() {
+  m_missing_registrations.clear();
+
+  swss::Table table(&m_db, STATE_WARM_RESTART_INIT_TABLE_NAME);
+  std::vector<std::string> keys;
+
+  table.getKeys(keys);
+  for (auto &key : keys) {
+    if (!is_valid_key(key, m_separator)) {
+      SWSS_LOG_ERROR("Could not parse init app name from key = %s",
+                     key.c_str());
+      continue;
+    }
+
+    m_missing_registrations.insert(key);
+  }
+}
+
+InitRegistration::Status InitRegistration::get_reregistration_status() {
+  return m_missing_registrations.empty()
+             ? InitRegistration::Status::COMPLETED
+             : InitRegistration::Status::IN_PROGRESS;
+}
+
+InitRegistration::Status InitRegistration::check_reregistration_status() {
+  SWSS_LOG_ENTER();
+
+  swss::Table table(&m_db, STATE_WARM_RESTART_REGISTRATION_TABLE_NAME);
+  std::vector<std::string> keys;
+
+  table.getKeys(keys);
+  for (auto &key : keys) {
+    remove_pending_app(key);
+  }
+
+  return get_reregistration_status();
+}
+
+InitRegistration::Status InitRegistration::handle_registration_event(
+    const swss::KeyOpFieldsValuesTuple &kco) {
+  remove_pending_app(kfvKey(kco));
+  return get_reregistration_status();
+}
+
+const InitRegistration::RegistrationSet &InitRegistration::get_pending_apps()
+    const {
+  return m_missing_registrations;
+}
+
+void InitRegistration::remove_pending_app(const std::string &key) {
+  SWSS_LOG_ENTER();
+  if (!is_valid_key(key, m_separator)) {
+    SWSS_LOG_ERROR("ignoring invalid key for reregistration = %s", key.c_str());
+    return;
+  }
+  // erase() by key is a no-op when the key is absent, so no prior find() is
+  // needed.
+  m_missing_registrations.erase(key);
+}
+
+std::string InitRegistration::join_pending_apps() {
+  std::ostringstream stream;
+  std::copy(m_missing_registrations.begin(), m_missing_registrations.end(),
+            std::ostream_iterator<std::string>(stream, ","));
+  return stream.str();
+}
+
+}  // namespace rebootbackend
diff --git a/src/sonic-framework/rebootbackend/redis_utils.h b/src/sonic-framework/rebootbackend/redis_utils.h
new file mode 100644
index 000000000000..e071e69f01e1
--- /dev/null
+++ b/src/sonic-framework/rebootbackend/redis_utils.h
@@ -0,0 +1,289 @@
+#pragma once
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+
+#include "dbconnector.h"
+#include "notificationconsumer.h"
+#include "notificationproducer.h"
+#include "selectableevent.h"
+#include "status_code_util.h"
+#include "warm_restart.h"
+
+namespace rebootbackend {
+
+// Return string corresponding to state
+std::string get_warm_start_state_name(
+    const swss::WarmStart::WarmStartState state);
+
+// Send Freeze, Checkpoint or Unfreeze notifications to all subscribed
+// apps on NSF_MANAGER_COMMON_NOTIFICATION_CHANNEL
+void send_nsf_manager_notification(
+    swss::DBConnector &db, swss::WarmStart::WarmBootNotification notification);
+
+// For all keys in STATE_WARM_RESTART_TABLE_NAME: delete state and timestamp
+// fields. This is executed at the beginning of nsf/warm restart to clear out
+// existing states.
+// From:
+// https://github.com/sonic-net/sonic-utilities/blob/20d1495b6f7e82c4d9aa377c3c281d8d0d9d8594/scripts/fast-reboot#L167
+void init_warm_reboot_states(const swss::DBConnector &db);
+
+// Set the system warm start state to a new enabled/disabled state.
+// STATE_WARM_RESTART_ENABLE_TABLE_NAME
+// key = system, field = enable, value = "true"/"false"
+void set_warm_restart_enable(const swss::DBConnector &db, bool enabled);
+
+// Send a request to state verification daemon to perform state
+// verification.
+// Set freeze == true if system is frozen (i.e. after reboot,
+// before reconciliation and unfreeze have occurred).
+// timestamp is used to verify that a state update is a response
+// to our request, and not someone else's.
+std::string send_state_verification_notification(swss::DBConnector &db,
+                                                 const bool freeze);
+
+// Returns true if key is in the form "<text><separator><text>", and false
+// otherwise.
+bool is_valid_key(const std::string &key, const std::string &separator);
+
+// Helper function: given key of form "docker|app"
+// extract docker and app. (separator = | in this case)
+// return false if docker or app are empty or separator
+// isn't present, else true.
+// key and separator are inputs
+// docker and app are outputs
+bool get_docker_app_from_key(const std::string &key,
+                             const std::string &separator, std::string &docker,
+                             std::string &app);
+
+std::string get_warm_start_state_name(swss::WarmStart::WarmStartState state);
+
+// Sets the warm restart count in the database.
+void set_warm_restart_counter(swss::DBConnector &db, int count);
+
+// Returns the current warm restart count from the database. Returns an empty
+// string if the warm restart count is not set, and a string representation
+// of an integer otherwise.
+std::string get_warm_restart_counter(swss::DBConnector &db);
+
+// This class is meant to handle registration information in the
+// STATE_WARM_RESTART_REGISTRATION_TABLE_NAME.
+// - we maintain the list of registered applications
+//   used after reboot to wait for all to register
+// - the list of apps we will wait to quiesce
+// - the list of apps we will wait to checkpoint
+// - the list of apps we will wait to reconcile
+// - the list of containers that have requested stop on freeze.
+// Not thread safe.
+class Registration {
+ public:
+  enum class Status { COMPLETED, FAILURE, IN_PROCESS };
+
+  struct Response {
+    Status status = Status::IN_PROCESS;
+    std::string error_string = "";
+  };
+
+  typedef std::unordered_set<std::string> RegistrationSet;
+
+  Registration();
+
+  static std::string get_warm_boot_stage_name(
+      swss::WarmStart::WarmBootStage stage);
+
+  // Populate this class with contents of
+  // STATE_WARM_RESTART_REGISTRATION_TABLE_NAME.
+  void fetch_registration_info();
+
+  // Return the set of containers that have requested stop on freeze.
+  RegistrationSet get_stop_on_freeze_set();
+
+  // Return the set of applications that have registered.
+  RegistrationSet get_registered_app_set();
+
+  // Check application states in warm restart table.
+  // Pop applications that are quiesced or checkpointed.
+  // Returns:
+  //   IN_PROCESS: not all apps have quiesced.
+  //   COMPLETED: all registered apps have quiesced.
+  //   FAILURE: an application set its state to failed.
+  //            error_string in response is populated
+  Response check_quiesced();
+
+  // Check application states in warm restart table.
+  // Pop applications that are checkpointed.
+  // Returns:
+  //   IN_PROCESS: not all apps have checkpointed.
+  //   COMPLETED: all registered apps have checkpointed.
+  //   FAILURE: an application set its state to failed.
+  //            error_string in response is populated
+  Response check_checkpointed();
+
+  // Check application states in warm restart table.
+  // Pop applications that are reconciled.
+  // Returns:
+  //   IN_PROCESS: not all apps have reconciled.
+  //   COMPLETED: all registered apps have reconciled.
+  //   FAILURE: an application set its state to failed.
+  //            error_string in response is populated
+  Response check_reconciled();
+
+  // Check application states in warm restart table.
+  // Pop applications that are unfrozen.
+  // Returns:
+  //   IN_PROCESS: not all apps have unfrozen.
+  //   COMPLETED: all registered apps have unfrozen.
+  //   FAILURE: an application set its state to failed.
+  //            error_string in response is populated
+  Response check_unfrozen();
+
+  // Check application states in warm restart table.
+  // Pop applications that have reached the target state.
+  // Returns:
+  //   IN_PROCESS: not all apps have reached the target state.
+  //   COMPLETED: all registered apps have reached the target state.
+  //   FAILURE: an application set its state to failed.
+  //            error_string in response is populated
+  Response check_stage(swss::WarmStart::WarmBootStage nsf_stage);
+
+  // Handle an application state change, and update the tracked progress
+  // towards that state.
+  // Pop applications that have reached the monitored state.
+  // NOTE(review): FAILURE is not latched by this class; it is returned only
+  // for the event whose state is FAILED -- confirm whether that is intended.
+  // Returns:
+  //   IN_PROCESS: not all apps have reached monitored_stage.
+  //   COMPLETED: all registered apps have reached monitored_stage.
+  //   FAILURE: an application set its state to failed.
+  //            error_string in response is populated
+  Response handle_state_event(swss::WarmStart::WarmBootStage monitored_stage,
+                              const swss::KeyOpFieldsValuesTuple &kco);
+
+  // Clear all registered app names in the WARM_RESTART_INIT_TABLE.
+  void clear_all_init_apps();
+
+  // Saves all registered app names to the WARM_RESTART_INIT_TABLE.
+  void save_all_init_apps();
+
+  // Returns a string representation of the current apps that have not
+  // reached the target state. For debug and logging only.
+  std::string join_pending_apps(swss::WarmStart::WarmBootStage target_stage);
+
+ private:
+  // Helper function for check_ functions.
+  // Pop applications that have reached one of the target state names.
+  // Returns:
+  //   IN_PROCESS: not all apps have reached the correct state.
+  //   COMPLETED: all registered apps have reached the correct state.
+  //   FAILURE: an application set its state to failed.
+  //            error_string in response is populated
+  Response check_states_are(RegistrationSet &set_to_check,
+                            const std::unordered_set<std::string> &state_names);
+
+  // Helper for handle_state_event: specific handling for FREEZE stage.
+  // Pre-condition: caller verifies that operation is a set.
+  //                caller handles apps entering FAILED state.
+  // Push applications that report a state other than quiescence/checkpointed.
+  // -- applications may exit/enter quiescence state.
+  // Returns:
+  //   IN_PROCESS: not all apps have reached monitored_stage.
+  //   COMPLETED: all registered apps have reached monitored_stage.
+  Response handle_quiescence_event(const swss::KeyOpFieldsValuesTuple &kco);
+
+  // Filter a specified app list based on an application state change event.
+  // Pop applications that have reached one of the target states.
+  //   IN_PROCESS: not all apps have reached the correct state.
+  //   COMPLETED: all registered apps have reached the correct state.
+  //   FAILURE: an application set its state to failed.
+  //            error_string in response is populated
+  Response filter_app_list(RegistrationSet &set_to_filter,
+                           const std::string app_name,
+                           const swss::WarmStart::WarmBootStage monitored_state,
+                           const swss::KeyOpFieldsValuesTuple &kco,
+                           const std::unordered_set<std::string> &state_names);
+
+  // Extracts the "state" field from the event, and returns the value of it.
+  // Returns an empty string if the "state" field is not present.
+  std::string extract_event_state(const swss::KeyOpFieldsValuesTuple &kco);
+
+  // Clear contents of all sets.
+  void clear_contents();
+
+  const static std::unordered_map<swss::WarmStart::WarmBootStage, std::unordered_set<std::string>>
+      kStageToTargetStates;
+
+  RegistrationSet m_registered;
+  RegistrationSet m_stop;
+  RegistrationSet m_ro_quiescent_list;
+  std::unordered_map<swss::WarmStart::WarmBootStage, RegistrationSet>
+      m_remaining_apps;
+
+  swss::DBConnector m_db;
+
+  std::string m_separator;
+
+  friend class RebootThreadTest;
+  friend class RedisTest;
+};
+
+// This class handles the monitoring of applications re-registering their
+// warmboot requirements after a warmboot. In general, data from
+// WARM_RESTART_INIT_TABLE is used to generate a list of apps that must
+// re-register, and the provided API functions monitor
+// WARM_RESTART_REGISTRATION_TABLE to determine if all of these apps
+// have registered again.
+class InitRegistration {
+ public:
+  enum class Status { COMPLETED, IN_PROGRESS };
+
+  typedef std::unordered_set<std::string> RegistrationSet;
+
+  InitRegistration();
+
+  // Reads the list of apps that must reregister from the
+  // WARM_RESTART_INIT_TABLE.
+  void fetch_init_app_info();
+
+  // Returns the current state of re-registration.
+  // Returns:
+  //   COMPLETED: all apps that were registered before the warmboot have
+  //     re-registered. This is a requirement for warmboot.
+  //   IN_PROGRESS: not all apps have re-registered.
+  Status get_reregistration_status();
+
+  // Polls the WARM_RESTART_REGISTRATION_TABLE to determine if re-registration
+  // is complete. Pops elements from the internal set.
+  // Returns:
+  //   COMPLETED: all apps that were registered before the warmboot have
+  //     re-registered. This is a requirement for warmboot.
+  //   IN_PROGRESS: not all apps have re-registered.
+  Status check_reregistration_status();
+
+  // Handles a registration event in the WARM_RESTART_REGISTRATION_TABLE. Pops
+  // elements from the internal set, and returns the current re-registration
+  // status.
+  // Returns:
+  //   COMPLETED: all apps that were registered before the warmboot have
+  //     re-registered. This is a requirement for warmboot.
+  //   IN_PROGRESS: not all apps have re-registered.
+  Status handle_registration_event(const swss::KeyOpFieldsValuesTuple &kco);
+
+  const RegistrationSet &get_pending_apps() const;
+
+  // Returns a string representation of the current apps that have not
+  // re-registered. For debug and logging only.
+  std::string join_pending_apps();
+
+ private:
+  // Pops the app name in key from the internal set, if it exists.
+  void remove_pending_app(const std::string &key);
+
+  RegistrationSet m_missing_registrations;
+  swss::DBConnector m_db;
+  std::string m_separator;
+
+  friend class RedisTest;
+};
+
+}  // namespace rebootbackend
diff --git a/src/sonic-framework/rebootbackend/telemetry_helper.cpp b/src/sonic-framework/rebootbackend/telemetry_helper.cpp
new file mode 100644
index 000000000000..0dc8264423ff
--- /dev/null
+++ b/src/sonic-framework/rebootbackend/telemetry_helper.cpp
@@ -0,0 +1,120 @@
+#include "telemetry_helper.h"
+
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "dbconnector.h"
+#include "reboot_interfaces.h"
+#include "redis_utils.h"
+#include "table.h"
+#include "warm_restart.h"
+
+namespace rebootbackend {
+
+TelemetryHelper::TelemetryHelper()
+    : m_db("STATE_DB", 0),
+      m_separator(swss::TableBase::getTableSeparator(m_db.getDbId())) {}
+
+void TelemetryHelper::record_overall_start() {
+  clear_performance_table();
+
+  swss::WarmStart::updateSystemWarmBootStart();
+
+  initialize_warmboot_count();
+  backup_overall_values(m_reboot_count);
+}
+
+void TelemetryHelper::record_overall_end(bool success) {
+  swss::WarmStart::updateSystemWarmBootEnd(success ? "success" : "failure");
+
+  backup_overall_values(get_reboot_count());
+}
+
+void TelemetryHelper::record_stage_start(
+    swss::WarmStart::WarmBootStage nsf_stage) {
+  swss::WarmStart::updateWarmBootStageStart(nsf_stage);
+
+  backup_stage_values(nsf_stage);
+}
+
+void TelemetryHelper::record_stage_end(swss::WarmStart::WarmBootStage nsf_stage,
+                                       bool success) {
+  swss::WarmStart::updateWarmBootStageEnd(nsf_stage,
+                                          success ? "success" : "failure");
+
+  backup_stage_values(nsf_stage);
+}
+
+// Called at start of an nsf reboot: at least one nsf reboot (the one in
+// progress) has occurred
+void TelemetryHelper::initialize_warmboot_count() {
+  const std::string warmboot_counter_str = get_warm_restart_counter(m_db);
+  m_reboot_count = 1;  // Default when the counter is unset or unparsable.
+  if (!warmboot_counter_str.empty()) {
+    try {
+      m_reboot_count = std::stoi(warmboot_counter_str) + 1;
+    } catch (const std::logic_error &) {
+      // Not a number or out of range: keep the default of 1.
+    }
+  }
+  set_warm_restart_counter(m_db, m_reboot_count);
+  m_fetched_count = true;
+}
+
+void TelemetryHelper::backup_overall_values(int count) {
+  swss::Table perf_table(&m_db, STATE_WARM_RESTART_PERF_TABLE_NAME);
+  std::vector<swss::FieldValueTuple> values;
+  if (!perf_table.get("system", values)) {
+    return;
+  }
+
+  swss::Table hist_table(&m_db, STATE_WARM_RESTART_PERF_HISTORY_TABLE_NAME);
+  hist_table.set(std::to_string(count) + m_separator + "system", values);
+}
+
+void TelemetryHelper::backup_stage_values(
+    swss::WarmStart::WarmBootStage nsf_stage) {
+  std::string stage_name =
+      swss::WarmStart::warmBootStageToNameMap()->at(nsf_stage);
+  int count = get_reboot_count();
+
+  swss::Table perf_table(&m_db, STATE_WARM_RESTART_PERF_TABLE_NAME);
+  swss::Table hist_table(&m_db, STATE_WARM_RESTART_PERF_HISTORY_TABLE_NAME);
+
+  std::vector<std::string> perf_keys;
+  perf_table.getKeys(perf_keys);
+  for (const auto &key : perf_keys) {
+    if (key.rfind(stage_name, 0) == 0) {
+      std::vector<swss::FieldValueTuple> values;
+      if (!perf_table.get(key, values)) {
+        continue;
+      }
+
+      hist_table.set(std::to_string(count) + m_separator + key, values);
+    }
+  }
+}
+
+int TelemetryHelper::get_reboot_count() {
+  if (!m_fetched_count) {
+    try {
+      m_reboot_count = std::stoi(get_warm_restart_counter(m_db));
+    } catch (const std::logic_error &e) {
+      m_reboot_count = 0;
+    }
+    m_fetched_count = true;
+  }
+  return m_reboot_count;
+}
+
+void TelemetryHelper::clear_performance_table() {
+  swss::Table perf_table(&m_db, STATE_WARM_RESTART_PERF_TABLE_NAME);
+  std::vector<std::string> perf_keys;
+  perf_table.getKeys(perf_keys);
+  for (const auto &key : perf_keys) {
+    perf_table.del(key);
+  }
+}
+
+}  // namespace rebootbackend
diff --git a/src/sonic-framework/rebootbackend/telemetry_helper.h b/src/sonic-framework/rebootbackend/telemetry_helper.h
new file mode 100644
index 000000000000..242bfa1be411
--- /dev/null
+++ b/src/sonic-framework/rebootbackend/telemetry_helper.h
@@ -0,0 +1,49 @@
+#pragma once
+
+#include "dbconnector.h"
+#include "reboot_interfaces.h"
+#include "warm_restart.h"
+
+namespace rebootbackend {
+
+class TelemetryHelper : public TelemetryInterface {
+ public:
+  TelemetryHelper();
+  ~TelemetryHelper() override = default;
+
+  void record_overall_start() override;
+  void record_overall_end(bool success) override;
+
+  void record_stage_start(swss::WarmStart::WarmBootStage nsf_stage) override;
+  void record_stage_end(swss::WarmStart::WarmBootStage nsf_stage,
+                        bool success) override;
+
+ private:
+  // Initialize the warmboot count in Redis for a new reboot. Sets the count to
+  // 1 if it is not populated or not parsable, and increments it otherwise.
+  void initialize_warmboot_count();
+
+  // Backup the performance entries for the overall NSF reboot from
+  // WARM_RESTART_PERFORMANCE_TABLE to WARM_RESTART_PERFORMANCE_HISTORY_TABLE.
+  void backup_overall_values(int count);
+
+  // Backup the performance entries for a particular stage of the NSF reboot
+  // from WARM_RESTART_PERFORMANCE_TABLE to
+  // WARM_RESTART_PERFORMANCE_HISTORY_TABLE.
+  void backup_stage_values(swss::WarmStart::WarmBootStage nsf_stage);
+
+  // Fetch the warmboot count from the database, or return the cached
+  // value if it has already been fetched (0 if it cannot be parsed).
+  int get_reboot_count();
+
+  // Clears all keys from the performance table in preparation for a new reboot.
+  void clear_performance_table();
+
+  swss::DBConnector m_db;
+  std::string m_separator;
+
+  bool m_fetched_count = false;
+  int m_reboot_count = 0;
+};
+
+}  // namespace rebootbackend
\ No newline at end of file
diff --git a/src/sonic-framework/tests/Makefile.am b/src/sonic-framework/tests/Makefile.am
new file mode 100644
index 000000000000..a61aaadd4886
--- /dev/null
+++ b/src/sonic-framework/tests/Makefile.am
@@ -0,0 +1,60 @@
+INCLUDES = -I $(top_srcdir) -I $(top_srcdir)/rebootbackend -I $(top_srcdir)/rebootbackend/system
+
+TESTS = tests tests_asan tests_tsan tests_usan
+
+noinst_PROGRAMS = tests tests_asan tests_tsan tests_usan
+
+if DEBUG
+DBGFLAGS = -ggdb -DDEBUG
+else
+DBGFLAGS = -g -DNDEBUG
+endif
+
+CFLAGS_GTEST =
+LDADD_GTEST = -lgtest -lgtest_main -lgmock -lgmock_main
+CFLAGS_COVERAGE = --coverage -fprofile-arcs -ftest-coverage
+LDADD_COVERAGE = -lgcov
+CFLAGS_ASAN = -fsanitize=address
+CFLAGS_TSAN = -fsanitize=thread
+CFLAGS_USAN = -fsanitize=undefined
+
+tests_SOURCES = test_utils_common.cpp \
+                $(top_srcdir)/rebootbackend/telemetry_helper.cpp \
+                telemetry_helper_test.cpp \
+                rebootbe_test.cpp \
+                $(top_srcdir)/rebootbackend/rebootbe.cpp \
+                reboot_thread_test.cpp \
+                $(top_srcdir)/rebootbackend/reboot_thread.cpp \
+                init_thread_test.cpp \
+                $(top_srcdir)/rebootbackend/init_thread.cpp \
+                $(top_srcdir)/rebootbackend/system/system.pb.cc \
+                $(top_srcdir)/rebootbackend/types/types.pb.cc \
+                $(top_srcdir)/rebootbackend/common/common.pb.cc \
+                $(top_srcdir)/rebootbackend/container_stop.pb.cc \
+                redis_utils_test.cpp \
+                $(top_srcdir)/rebootbackend/redis_utils.cpp \
+                reboot_common_test.cpp \
+                $(top_srcdir)/rebootbackend/reboot_common.cpp \
+                test_main.cpp
+
+tests_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_GTEST) $(CFLAGS_COVERAGE) $(CFLAGS_SAI)
+tests_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_GTEST) $(CFLAGS_COVERAGE) $(CFLAGS_SAI)
+tests_LDADD = $(LDADD_GTEST) $(LDADD_COVERAGE) -lswsscommon -lpthread -lprotobuf
+
+tests_asan_SOURCES = $(tests_SOURCES) +tests_asan_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_GTEST) $(CFLAGS_ASAN) $(CFLAGS_SAI) +tests_asan_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_GTEST) $(CFLAGS_ASAN) $(CFLAGS_SAI) +tests_asan_LDFLAGS = $(CFLAGS_ASAN) +tests_asan_LDADD = $(LDADD_GTEST) -lswsscommon -lpthread -lprotobuf + +tests_tsan_SOURCES = $(tests_SOURCES) +tests_tsan_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_GTEST) $(CFLAGS_TSAN) $(CFLAGS_SAI) +tests_tsan_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_GTEST) $(CFLAGS_TSAN) $(CFLAGS_SAI) +tests_tsan_LDFLAGS = $(CFLAGS_TSAN) +tests_tsan_LDADD = $(LDADD_GTEST) -lswsscommon -lpthread -lprotobuf + +tests_usan_SOURCES = $(tests_SOURCES) +tests_usan_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_GTEST) $(CFLAGS_USAN) $(CFLAGS_SAI) +tests_usan_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_GTEST) $(CFLAGS_USAN) $(CFLAGS_SAI) +tests_usan_LDFLAGS = $(CFLAGS_USAN) +tests_usan_LDADD = $(LDADD_GTEST) -lswsscommon -lpthread -lprotobuf diff --git a/src/sonic-framework/tests/init_thread_test.cpp b/src/sonic-framework/tests/init_thread_test.cpp new file mode 100644 index 000000000000..482170b76d6b --- /dev/null +++ b/src/sonic-framework/tests/init_thread_test.cpp @@ -0,0 +1,923 @@ +#include "init_thread.h" + +#include +#include +#include + +#include +#include +#include +#include + +#include "mock_reboot_interfaces.h" +#include "reboot_interfaces.h" +#include "redis_utils.h" +#include "select.h" +#include "selectableevent.h" +#include "stateverification.h" +#include "status_code_util.h" +#include "table.h" +#include "test_utils_common.h" +#include "timestamp.h" +#include "warm_restart.h" + +namespace rebootbackend { + +using WarmBootStage = ::swss::WarmStart::WarmBootStage; + +using ::testing::_; +using ::testing::AtLeast; +using ::testing::ExplainMatchResult; +using ::testing::IsEmpty; +using ::testing::Not; +using 
::testing::Return; +using ::testing::StrEq; +using ::testing::StrictMock; + +constexpr int kSelectTimeoutSeconds = 5; +constexpr int kShortSelectTimeoutSeconds = 1; + +MATCHER(IsDoneStatus, "") { + const InitThreadStatus::DetailedStatus &status = arg; + if (status.thread_state.active()) { + *result_listener << "Status was active, expected inactive"; + return false; + } + if (status.detailed_thread_status != InitThreadStatus::ThreadStatus::DONE) { + *result_listener << "Status was not DONE: " + << status.detailed_thread_status; + return false; + } + if (status.thread_state.method() != gnoi::system::RebootMethod::NSF) { + *result_listener << "Proto method was not NSF: " + << status.thread_state.status().status(); + } + if (status.thread_state.status().status() != + gnoi::system::RebootStatus_Status::RebootStatus_Status_STATUS_SUCCESS) { + *result_listener << "Proto status was not SUCCESS: " + << status.thread_state.status().status(); + return false; + } + return true; +} + +MATCHER_P(IsActiveStatus, state_matcher, "") { + const InitThreadStatus::DetailedStatus &status = arg; + if (!status.thread_state.active()) { + *result_listener << "Status was inactive, expected active"; + return false; + } + if (status.thread_state.status().status() != + gnoi::system::RebootStatus_Status::RebootStatus_Status_STATUS_UNKNOWN) { + *result_listener << "Proto status was not UNKNOWN: " + << status.thread_state.status().status(); + return false; + } + return ExplainMatchResult(state_matcher, status.detailed_thread_status, + result_listener); +} + +MATCHER_P(IsErrorStatus, error_condition_matcher, "") { + const InitThreadStatus::DetailedStatus &status = arg; + if (status.thread_state.active()) { + *result_listener << "Status was active, expected inactive"; + return false; + } + if (status.detailed_thread_status != InitThreadStatus::ThreadStatus::ERROR) { + *result_listener << "Status was not ERROR: " + << status.detailed_thread_status; + return false; + } + if 
(status.thread_state.method() != gnoi::system::RebootMethod::NSF) { + // Report the offending method (not the status) and fail the match. + *result_listener << "Proto method was not NSF: " + << status.thread_state.method(); + return false; + } + if (status.thread_state.status().status() != + gnoi::system::RebootStatus_Status::RebootStatus_Status_STATUS_FAILURE) { + *result_listener << "Proto status was not FAILURE: " + << status.thread_state.status().status(); + return false; + } + return ExplainMatchResult(error_condition_matcher, + status.detailed_thread_error_condition, + result_listener); +} + +class InitThreadTest : public ::testing::Test { + public: + InitThreadTest() + : m_db("STATE_DB", 0), + m_config_db("CONFIG_DB", 0), + m_critical_interface(), + m_nsf_channel(&m_db, swss::WarmStart::kNsfManagerNotificationChannel), + m_init_thread(m_critical_interface, m_telemetry, m_finished, + m_stack_unfrozen) { + swss::WarmStart::initialize("fake_app", "fake_docker"); + // sigterm_requested and the Redis tables have global state that is + // maintained across tests. 
+ sigterm_requested = false; + TestUtils::clear_tables(m_db); + init_redis_defaults(); + overwrite_reconciliation_timeout(5); + } + + void init_redis_defaults() { + set_warm_restart_enable(m_db, true); + TestUtils::set_state_verification_enable(m_config_db, /*bootup=*/true, + /*enabled=*/true); + } + + void set_default_success_expects() { + EXPECT_CALL(m_critical_interface, is_system_critical()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(false)); + EXPECT_CALL(m_telemetry, record_overall_end(true)).Times(1); + } + + void check_final_success_expects(bool state_verification_enabled) { + TestUtils::check_warmboot_enabled(m_db, /*expected_state=*/false); + if (state_verification_enabled) { + check_nsf_manager_notification_sent( + swss::WarmStart::WarmBootNotification::kUnfreeze); + } + } + + void check_final_failure_expects() { + TestUtils::check_warmboot_enabled(m_db, /*expected_state=*/false); + } + + void check_nsf_manager_notification_sent( + swss::WarmStart::WarmBootNotification notification_type) { + swss::Select s; + s.addSelectable(&m_nsf_channel); + + swss::Selectable *sel; + int select_result = s.select(&sel, 2000); + EXPECT_EQ(select_result, swss::Select::OBJECT); + + if (sel == &m_nsf_channel) { + std::string op, data; + std::vector values; + m_nsf_channel.pop(op, data, values); + EXPECT_EQ(op, swss::WarmStart::warmBootNotificationNameMap()->at( + notification_type)); + } + } + + void set_telemetry_stage_expects(WarmBootStage nsf_stage, bool success) { + EXPECT_CALL(m_telemetry, record_stage_start(nsf_stage)).Times(1); + EXPECT_CALL(m_telemetry, record_stage_end(nsf_stage, success)).Times(1); + } + + void overwrite_reconciliation_timeout(uint32_t timeout_seconds) { + m_init_thread.m_reconciliation_timeout = timeout_seconds; + } + + void overwrite_state_verification_timeout(uint32_t timeout_seconds) { + m_init_thread.m_state_verification_timeout = timeout_seconds; + } + + void overwrite_unfreeze_timeout(uint32_t timeout_seconds) { + 
m_init_thread.m_unfreeze_timeout = timeout_seconds; + } + + void populate_default_init_table() { + swss::Table initTable(&m_db, STATE_WARM_RESTART_INIT_TABLE_NAME); + initTable.hset("docker1|app1", "timestamp", ""); + initTable.hset("docker2|app2", "timestamp", ""); + initTable.hset("docker3|app3", "timestamp", ""); + // The invalid entry should not end up in the list of apps. + initTable.hset("invalid", "timestamp", ""); + } + + void advance_through_registration() { + populate_default_init_table(); + TestUtils::populate_registration_table(m_db, "docker1|app1", false, false, + false, true); + TestUtils::populate_registration_table(m_db, "docker2|app2", true, true, + true, false); + TestUtils::populate_registration_table(m_db, "docker3|app3", false, false, + true, false); + } + + void set_apps_to_state(std::string state) { + TestUtils::populate_restart_table_state(m_db, "app1", state); + TestUtils::populate_restart_table_state(m_db, "app2", state); + TestUtils::populate_restart_table_state(m_db, "app3", state); + } + + void advance_through_reconciliation() { + EXPECT_CALL(m_telemetry, + record_stage_end(WarmBootStage::STAGE_RECONCILIATION, + /*success=*/true)) + .Times(1); + advance_through_registration(); + set_apps_to_state("reconciled"); + } + + // Must be run in a separate thread. + void advance_through_state_verification() { + swss::NotificationConsumer nc(&m_db, STATE_VERIFICATION_REQ_CHANNEL); + + advance_through_reconciliation(); + + std::string timestamp = TestUtils::wait_for_state_verification_trigger( + nc, kSelectTimeoutSeconds, /*freeze=*/true); + EXPECT_THAT(timestamp, Not(IsEmpty())); + + if (timestamp.empty()) { + return; + } + + TestUtils::write_state_verification_result(m_db, ALL_COMPONENT, SV_PASS, + timestamp); + } + + // Must be run in a separate thread. 
+ void advance_through_unfreeze_with_state(std::string final_state) { + set_telemetry_stage_expects(WarmBootStage::STAGE_UNFREEZE, + final_state == "completed"); + + swss::NotificationConsumer nc(&m_db, STATE_VERIFICATION_REQ_CHANNEL); + + advance_through_reconciliation(); + + std::string timestamp = TestUtils::wait_for_state_verification_trigger( + nc, kSelectTimeoutSeconds, /*freeze=*/true); + EXPECT_THAT(timestamp, Not(IsEmpty())); + if (timestamp.empty()) { + return; + } + + // Set apps to their final state before unfreeze runs. + set_apps_to_state(final_state); + + TestUtils::write_state_verification_result(m_db, ALL_COMPONENT, SV_PASS, + timestamp); + } + + protected: + swss::DBConnector m_db; + swss::DBConnector m_config_db; + StrictMock m_critical_interface; + StrictMock m_telemetry; + swss::NotificationConsumer m_nsf_channel; + swss::SelectableEvent m_finished; + swss::SelectableEvent m_stack_unfrozen; + InitThread m_init_thread; +}; + +TEST_F(InitThreadTest, TestJoinWithoutStart) { + EXPECT_FALSE(m_init_thread.Join()); +} + +TEST_F(InitThreadTest, NoNsfIfCritical) { + EXPECT_CALL(m_critical_interface, is_system_critical()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(true)); + EXPECT_CALL(m_telemetry, record_overall_end(false)).Times(1); + + EXPECT_EQ(m_init_thread.Start(), + swss::StatusCode::SWSS_RC_FAILED_PRECONDITION); + + EXPECT_THAT( + m_init_thread.GetDetailedStatus(), + IsErrorStatus(InitThreadStatus::ErrorCondition::DETECTED_CRITICAL_STATE)); + check_final_failure_expects(); +} + +TEST_F(InitThreadTest, RegistrationCancelsForCritical) { + EXPECT_CALL(m_critical_interface, is_system_critical()) + .Times(AtLeast(2)) + .WillOnce(Return(false)) + .WillRepeatedly(Return(true)); + EXPECT_CALL(m_telemetry, record_overall_end(false)).Times(1); + + populate_default_init_table(); + + EXPECT_EQ(m_init_thread.Start(), swss::StatusCode::SWSS_RC_SUCCESS); + m_init_thread.Join(); + + EXPECT_THAT( + m_init_thread.GetDetailedStatus(), + 
IsErrorStatus(InitThreadStatus::ErrorCondition::REGISTRATION_FAILED)); + check_final_failure_expects(); +} + +TEST_F(InitThreadTest, SigtermRequestedBeforeRun) { + EXPECT_CALL(m_critical_interface, is_system_critical()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(false)); + EXPECT_CALL(m_critical_interface, report_critical_state(_)).Times(1); + EXPECT_CALL(m_telemetry, record_overall_end(false)).Times(1); + + populate_default_init_table(); + sigterm_requested = true; + + EXPECT_EQ(m_init_thread.Start(), swss::StatusCode::SWSS_RC_SUCCESS); + m_init_thread.Join(); + + EXPECT_THAT(m_init_thread.GetDetailedStatus(), IsErrorStatus(_)); + check_final_failure_expects(); +} + +TEST_F(InitThreadTest, RegistrationStopped) { + EXPECT_CALL(m_critical_interface, is_system_critical()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(false)); + EXPECT_CALL(m_critical_interface, report_critical_state(_)).Times(1); + EXPECT_CALL(m_telemetry, record_overall_end(false)).Times(1); + + populate_default_init_table(); + + EXPECT_EQ(m_init_thread.Start(), swss::StatusCode::SWSS_RC_SUCCESS); + // This will stop the thread in the registration loop. 
+ std::this_thread::sleep_for(std::chrono::milliseconds(100)); + m_init_thread.Stop(); + m_init_thread.Join(); + + EXPECT_THAT( + m_init_thread.GetDetailedStatus(), + IsErrorStatus(InitThreadStatus::ErrorCondition::REGISTRATION_FAILED)); + check_final_failure_expects(); +} + +TEST_F(InitThreadTest, RegistrationTimeout) { + EXPECT_CALL(m_critical_interface, is_system_critical()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(false)); + EXPECT_CALL(m_critical_interface, report_critical_state(_)).Times(1); + EXPECT_CALL(m_telemetry, record_overall_end(false)).Times(1); + + populate_default_init_table(); + overwrite_reconciliation_timeout(1); + + EXPECT_EQ(m_init_thread.Start(), swss::StatusCode::SWSS_RC_SUCCESS); + m_init_thread.Join(); + + EXPECT_THAT( + m_init_thread.GetDetailedStatus(), + IsErrorStatus(InitThreadStatus::ErrorCondition::REGISTRATION_FAILED)); + check_final_failure_expects(); +} + +TEST_F(InitThreadTest, ReconciliationStopped) { + EXPECT_CALL(m_critical_interface, is_system_critical()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(false)); + EXPECT_CALL(m_critical_interface, report_critical_state(_)).Times(1); + EXPECT_CALL(m_telemetry, record_overall_end(false)).Times(1); + EXPECT_CALL(m_telemetry, record_stage_end(WarmBootStage::STAGE_RECONCILIATION, + /*success=*/false)) + .Times(1); + + advance_through_registration(); + + EXPECT_EQ(m_init_thread.Start(), swss::StatusCode::SWSS_RC_SUCCESS); + // Registration is done, so this will stop the thread in the reconciliation + // loop. 
+ std::this_thread::sleep_for(std::chrono::milliseconds(100)); + m_init_thread.Stop(); + m_init_thread.Join(); + + EXPECT_THAT( + m_init_thread.GetDetailedStatus(), + IsErrorStatus(InitThreadStatus::ErrorCondition::RECONCILIATION_FAILED)); + check_final_failure_expects(); +} + +TEST_F(InitThreadTest, ReconciliationCritical) { + EXPECT_CALL(m_critical_interface, is_system_critical()) + .Times(AtLeast(3)) + .WillOnce(Return(false)) + .WillOnce(Return(false)) + .WillRepeatedly(Return(true)); + EXPECT_CALL(m_telemetry, record_overall_end(false)).Times(1); + EXPECT_CALL(m_telemetry, record_stage_end(WarmBootStage::STAGE_RECONCILIATION, + /*success=*/false)) + .Times(1); + + advance_through_registration(); + + EXPECT_EQ(m_init_thread.Start(), swss::StatusCode::SWSS_RC_SUCCESS); + m_init_thread.Join(); + + EXPECT_THAT( + m_init_thread.GetDetailedStatus(), + IsErrorStatus(InitThreadStatus::ErrorCondition::RECONCILIATION_FAILED)); + check_final_failure_expects(); +} + +TEST_F(InitThreadTest, ReconciliationTimeout) { + EXPECT_CALL(m_critical_interface, is_system_critical()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(false)); + EXPECT_CALL(m_critical_interface, report_critical_state(_)).Times(1); + EXPECT_CALL(m_telemetry, record_overall_end(false)).Times(1); + EXPECT_CALL(m_telemetry, record_stage_end(WarmBootStage::STAGE_RECONCILIATION, + /*success=*/false)) + .Times(1); + + advance_through_registration(); + + overwrite_reconciliation_timeout(1); + + EXPECT_EQ(m_init_thread.Start(), swss::StatusCode::SWSS_RC_SUCCESS); + m_init_thread.Join(); + + EXPECT_THAT( + m_init_thread.GetDetailedStatus(), + IsErrorStatus(InitThreadStatus::ErrorCondition::RECONCILIATION_FAILED)); + check_final_failure_expects(); +} + +TEST_F(InitThreadTest, ReconciliationAlreadyFailed) { + EXPECT_CALL(m_critical_interface, is_system_critical()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(false)); + EXPECT_CALL(m_critical_interface, report_critical_state(_)).Times(1); + EXPECT_CALL(m_telemetry, 
record_overall_end(false)).Times(1); + EXPECT_CALL(m_telemetry, record_stage_end(WarmBootStage::STAGE_RECONCILIATION, + /*success=*/false)) + .Times(1); + + advance_through_registration(); + TestUtils::populate_restart_table_state(m_db, "app1", "failed"); + + swss::Select s; + s.addSelectable(&m_finished); + + EXPECT_EQ(m_init_thread.Start(), swss::StatusCode::SWSS_RC_SUCCESS); + m_init_thread.Join(); + + EXPECT_THAT( + m_init_thread.GetDetailedStatus(), + IsErrorStatus(InitThreadStatus::ErrorCondition::RECONCILIATION_FAILED)); + check_final_failure_expects(); +} + +TEST_F(InitThreadTest, ReconciliationAlreadyDone) { + TestUtils::set_state_verification_enable(m_config_db, /*bootup=*/true, + /*enabled=*/false); + set_default_success_expects(); + EXPECT_CALL(m_telemetry, record_stage_end(WarmBootStage::STAGE_RECONCILIATION, + /*success=*/true)) + .Times(1); + + advance_through_registration(); + TestUtils::populate_restart_table_state(m_db, "app1", "reconciled"); + TestUtils::populate_restart_table_state(m_db, "app2", "reconciled"); + TestUtils::populate_restart_table_state(m_db, "app3", "reconciled"); + + swss::Select s; + s.addSelectable(&m_finished); + + EXPECT_EQ(m_init_thread.Start(), swss::StatusCode::SWSS_RC_SUCCESS); + TestUtils::wait_for_finish(s, m_finished, kSelectTimeoutSeconds); + m_init_thread.Join(); + + EXPECT_THAT(m_init_thread.GetDetailedStatus(), IsDoneStatus()); + check_final_success_expects(/*state_verification_enabled=*/false); +} + +TEST_F(InitThreadTest, ReconciliationFails) { + EXPECT_CALL(m_critical_interface, is_system_critical()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(false)); + EXPECT_CALL(m_critical_interface, report_critical_state(_)).Times(1); + EXPECT_CALL(m_telemetry, record_overall_end(false)).Times(1); + EXPECT_CALL(m_telemetry, record_stage_end(WarmBootStage::STAGE_RECONCILIATION, + /*success=*/false)) + .Times(1); + + advance_through_registration(); + + // Apps register one-by-one. 
+ auto test_sequence = [&] { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + TestUtils::populate_restart_table_state(m_db, "app1", "failed"); + }; + std::thread test_thread = std::thread(test_sequence); + + EXPECT_EQ(m_init_thread.Start(), swss::StatusCode::SWSS_RC_SUCCESS); + swss::Select s; + s.addSelectable(&m_finished); + TestUtils::wait_for_finish(s, m_finished, kSelectTimeoutSeconds); + m_init_thread.Join(); + + test_thread.join(); + + EXPECT_THAT( + m_init_thread.GetDetailedStatus(), + IsErrorStatus(InitThreadStatus::ErrorCondition::RECONCILIATION_FAILED)); + check_final_failure_expects(); +} + +TEST_F(InitThreadTest, StateVerificationStopped) { + EXPECT_CALL(m_critical_interface, is_system_critical()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(false)); + EXPECT_CALL(m_critical_interface, report_critical_state(_)).Times(1); + EXPECT_CALL(m_telemetry, record_overall_end(false)).Times(1); + + advance_through_reconciliation(); + + EXPECT_EQ(m_init_thread.Start(), swss::StatusCode::SWSS_RC_SUCCESS); + // Registration and reconciliation are done, so this will stop the thread in + // the state verification loop. 
+ std::this_thread::sleep_for(std::chrono::milliseconds(100)); + m_init_thread.Stop(); + m_init_thread.Join(); + + EXPECT_THAT(m_init_thread.GetDetailedStatus(), + IsErrorStatus( + InitThreadStatus::ErrorCondition::STATE_VERIFICATION_FAILED)); + check_final_failure_expects(); +} + +TEST_F(InitThreadTest, StateVerificationCritical) { + EXPECT_CALL(m_critical_interface, is_system_critical()) + .Times(AtLeast(4)) + .WillOnce(Return(false)) + .WillOnce(Return(false)) + .WillOnce(Return(false)) + .WillRepeatedly(Return(true)); + EXPECT_CALL(m_telemetry, record_overall_end(false)).Times(1); + + advance_through_reconciliation(); + + EXPECT_EQ(m_init_thread.Start(), swss::StatusCode::SWSS_RC_SUCCESS); + m_init_thread.Join(); + + EXPECT_THAT(m_init_thread.GetDetailedStatus(), + IsErrorStatus( + InitThreadStatus::ErrorCondition::STATE_VERIFICATION_FAILED)); + check_final_failure_expects(); +} + +TEST_F(InitThreadTest, StateVerificationFails) { + EXPECT_CALL(m_critical_interface, is_system_critical()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(false)); + EXPECT_CALL(m_critical_interface, report_critical_state(_)).Times(1); + EXPECT_CALL(m_telemetry, record_overall_end(false)).Times(1); + + advance_through_reconciliation(); + + overwrite_state_verification_timeout(1); + + std::string timestamp; + auto test_sequence = [&] { + swss::NotificationConsumer nc(&m_db, STATE_VERIFICATION_REQ_CHANNEL); + timestamp = TestUtils::wait_for_state_verification_trigger( + nc, kSelectTimeoutSeconds, /*freeze=*/true); + + TestUtils::write_state_verification_result(m_db, ALL_COMPONENT, SV_FAIL, + timestamp); + }; + std::thread test_thread = std::thread(test_sequence); + + EXPECT_EQ(m_init_thread.Start(), swss::StatusCode::SWSS_RC_SUCCESS); + m_init_thread.Join(); + + test_thread.join(); + + EXPECT_THAT(timestamp, Not(IsEmpty())); + EXPECT_THAT(m_init_thread.GetDetailedStatus(), + IsErrorStatus( + InitThreadStatus::ErrorCondition::STATE_VERIFICATION_FAILED)); + check_final_failure_expects(); 
+} + +TEST_F(InitThreadTest, UnfreezeStopped) { + EXPECT_CALL(m_critical_interface, is_system_critical()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(false)); + EXPECT_CALL(m_critical_interface, report_minor_alarm(_)).Times(1); + EXPECT_CALL(m_telemetry, record_overall_end(false)).Times(1); + set_telemetry_stage_expects(WarmBootStage::STAGE_UNFREEZE, + /*success=*/false); + + swss::Select s; + s.addSelectable(&m_stack_unfrozen); + std::thread test_thread = + std::thread(&InitThreadTest::advance_through_state_verification, this); + + EXPECT_EQ(m_init_thread.Start(), swss::StatusCode::SWSS_RC_SUCCESS); + TestUtils::wait_for_finish(s, m_stack_unfrozen, kSelectTimeoutSeconds); + // Registration, reconciliation, and state verification are done, so this + // will stop the thread in the unfreeze loop. + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + m_init_thread.Stop(); + m_init_thread.Join(); + + test_thread.join(); + + EXPECT_THAT(m_init_thread.GetDetailedStatus(), + IsErrorStatus(InitThreadStatus::ErrorCondition::UNFREEZE_FAILED)); + check_final_failure_expects(); +} + +TEST_F(InitThreadTest, UnfreezeCritical) { + EXPECT_CALL(m_critical_interface, is_system_critical()) + .Times(AtLeast(5)) + .WillOnce(Return(false)) + .WillOnce(Return(false)) + .WillOnce(Return(false)) + .WillOnce(Return(false)) + .WillRepeatedly(Return(true)); + EXPECT_CALL(m_critical_interface, report_minor_alarm(_)).Times(1); + EXPECT_CALL(m_telemetry, record_overall_end(false)).Times(1); + set_telemetry_stage_expects(WarmBootStage::STAGE_UNFREEZE, + /*success=*/false); + + swss::Select s; + s.addSelectable(&m_stack_unfrozen); + std::thread test_thread = + std::thread(&InitThreadTest::advance_through_state_verification, this); + + EXPECT_EQ(m_init_thread.Start(), swss::StatusCode::SWSS_RC_SUCCESS); + TestUtils::wait_for_finish(s, m_stack_unfrozen, kSelectTimeoutSeconds); + m_init_thread.Join(); + + test_thread.join(); + + EXPECT_THAT(m_init_thread.GetDetailedStatus(), + 
IsErrorStatus(InitThreadStatus::ErrorCondition::UNFREEZE_FAILED)); + check_final_failure_expects(); +} + +TEST_F(InitThreadTest, UnfreezeTimeout) { + EXPECT_CALL(m_critical_interface, is_system_critical()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(false)); + EXPECT_CALL(m_critical_interface, report_minor_alarm(_)).Times(1); + EXPECT_CALL(m_telemetry, record_overall_end(false)).Times(1); + set_telemetry_stage_expects(WarmBootStage::STAGE_UNFREEZE, + /*success=*/false); + + swss::Select s; + s.addSelectable(&m_stack_unfrozen); + std::thread test_thread = + std::thread(&InitThreadTest::advance_through_state_verification, this); + + overwrite_unfreeze_timeout(1); + + EXPECT_EQ(m_init_thread.Start(), swss::StatusCode::SWSS_RC_SUCCESS); + TestUtils::wait_for_finish(s, m_stack_unfrozen, kSelectTimeoutSeconds); + m_init_thread.Join(); + + test_thread.join(); + + EXPECT_THAT(m_init_thread.GetDetailedStatus(), + IsErrorStatus(InitThreadStatus::ErrorCondition::UNFREEZE_FAILED)); + check_final_failure_expects(); +} + +TEST_F(InitThreadTest, UnfreezeAlreadyFailed) { + EXPECT_CALL(m_critical_interface, is_system_critical()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(false)); + EXPECT_CALL(m_critical_interface, report_minor_alarm(_)).Times(1); + EXPECT_CALL(m_telemetry, record_overall_end(false)).Times(1); + + swss::Select s; + s.addSelectable(&m_stack_unfrozen); + std::thread test_thread = std::thread( + &InitThreadTest::advance_through_unfreeze_with_state, this, "failed"); + + EXPECT_EQ(m_init_thread.Start(), swss::StatusCode::SWSS_RC_SUCCESS); + TestUtils::wait_for_finish(s, m_stack_unfrozen, kSelectTimeoutSeconds); + m_init_thread.Join(); + + test_thread.join(); + + EXPECT_THAT(m_init_thread.GetDetailedStatus(), + IsErrorStatus(InitThreadStatus::ErrorCondition::UNFREEZE_FAILED)); + check_final_failure_expects(); +} + +TEST_F(InitThreadTest, UnfreezeAlreadyDone) { + set_default_success_expects(); + + swss::Select s; + s.addSelectable(&m_stack_unfrozen); + std::thread 
test_thread = std::thread( + &InitThreadTest::advance_through_unfreeze_with_state, this, "completed"); + + EXPECT_EQ(m_init_thread.Start(), swss::StatusCode::SWSS_RC_SUCCESS); + TestUtils::wait_for_finish(s, m_stack_unfrozen, kSelectTimeoutSeconds); + m_init_thread.Join(); + + test_thread.join(); + + EXPECT_THAT(m_init_thread.GetDetailedStatus(), IsDoneStatus()); + check_final_success_expects(/*state_verification_enabled=*/true); +} + +class InitThreadTestWithSvResult + : public InitThreadTest, + public testing::WithParamInterface {}; + +TEST_P(InitThreadTestWithSvResult, FullPassWorks) { + set_default_success_expects(); + if (GetParam() == SV_NOT_RUN) { + EXPECT_CALL(m_critical_interface, report_minor_alarm(_)).Times(1); + } + EXPECT_CALL(m_telemetry, record_stage_end(WarmBootStage::STAGE_RECONCILIATION, + /*success=*/true)) + .Times(1); + set_telemetry_stage_expects(WarmBootStage::STAGE_UNFREEZE, + /*success=*/true); + + populate_default_init_table(); + + // Apps register one-by-one. + auto test_sequence = [&] { + // Registration step. + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + EXPECT_THAT(m_init_thread.GetDetailedStatus(), + IsActiveStatus( + InitThreadStatus::ThreadStatus::WAITING_FOR_REGISTRATION)); + TestUtils::populate_registration_table(m_db, "docker1|app1", false, true, + true, true); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + TestUtils::populate_registration_table(m_db, "docker2|app2", false, true, + true, true); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + TestUtils::populate_registration_table(m_db, "docker3|app3", false, true, + true, true); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + // Reconciliation step. 
+ EXPECT_THAT( + m_init_thread.GetDetailedStatus(), + IsActiveStatus( + InitThreadStatus::ThreadStatus::WAITING_FOR_RECONCILIATION)); + TestUtils::populate_restart_table_state(m_db, "app1", "reconciled"); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + TestUtils::populate_restart_table_state(m_db, "app2", "reconciled"); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + // Start listening for the verification request just before completing + // reconciliation. + swss::NotificationConsumer nc(&m_db, STATE_VERIFICATION_REQ_CHANNEL); + TestUtils::populate_restart_table_state(m_db, "app3", "reconciled"); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + // State verification step. + EXPECT_THAT( + m_init_thread.GetDetailedStatus(), + IsActiveStatus( + InitThreadStatus::ThreadStatus::WAITING_FOR_STATE_VERIFICATION)); + std::string timestamp = TestUtils::wait_for_state_verification_trigger( + nc, kSelectTimeoutSeconds, /*freeze=*/true); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + TestUtils::write_state_verification_result(m_db, ALL_COMPONENT, GetParam(), + "wrong_timestamp"); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + TestUtils::write_state_verification_result(m_db, "fake_component", + GetParam(), "timestamp"); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + swss::Select unfreeze_select; + unfreeze_select.addSelectable(&m_stack_unfrozen); + TestUtils::write_state_verification_result(m_db, ALL_COMPONENT, GetParam(), + timestamp); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + // Unfreeze step. 
+ EXPECT_THAT( + m_init_thread.GetDetailedStatus(), + IsActiveStatus(InitThreadStatus::ThreadStatus::WAITING_FOR_UNFREEZE)); + TestUtils::wait_for_finish(unfreeze_select, m_stack_unfrozen, + kSelectTimeoutSeconds); + TestUtils::populate_restart_table_state(m_db, "app1", "completed"); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + TestUtils::populate_restart_table_state(m_db, "app2", "completed"); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + TestUtils::populate_restart_table_state(m_db, "app3", "completed"); + }; + std::thread test_thread = std::thread(test_sequence); + + EXPECT_EQ(m_init_thread.Start(), swss::StatusCode::SWSS_RC_SUCCESS); + + swss::Select s; + s.addSelectable(&m_finished); + TestUtils::wait_for_finish(s, m_finished, kSelectTimeoutSeconds); + + test_thread.join(); + m_init_thread.Join(); + + EXPECT_THAT(m_init_thread.GetDetailedStatus(), IsDoneStatus()); + check_final_success_expects(/*state_verification_enabled=*/true); +} + +INSTANTIATE_TEST_SUITE_P(FullPassSuite, InitThreadTestWithSvResult, + testing::Values(SV_PASS, SV_NOT_RUN)); + +TEST_F(InitThreadTest, StateVerificationTimeoutIsSuccess) { + EXPECT_CALL(m_critical_interface, is_system_critical()) + .Times(AtLeast(1)) + .WillRepeatedly(Return(false)); + EXPECT_CALL(m_critical_interface, report_minor_alarm(_)).Times(1); + EXPECT_CALL(m_telemetry, record_overall_end(true)).Times(1); + + advance_through_reconciliation(); + set_telemetry_stage_expects(WarmBootStage::STAGE_UNFREEZE, + /*success=*/true); + + overwrite_state_verification_timeout(1); + + std::string timestamp; + auto test_sequence = [&] { + swss::Select unfreeze_select; + unfreeze_select.addSelectable(&m_stack_unfrozen); + + // State verification step. State Verification daemon reports no results + // and times out. 
+ swss::NotificationConsumer nc(&m_db, STATE_VERIFICATION_REQ_CHANNEL); + timestamp = TestUtils::wait_for_state_verification_trigger( + nc, kSelectTimeoutSeconds, /*freeze=*/true); + + // Unfreeze step. + TestUtils::wait_for_finish(unfreeze_select, m_stack_unfrozen, + kSelectTimeoutSeconds); + EXPECT_THAT( + m_init_thread.GetDetailedStatus(), + IsActiveStatus(InitThreadStatus::ThreadStatus::WAITING_FOR_UNFREEZE)); + TestUtils::populate_restart_table_state(m_db, "app1", "completed"); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + TestUtils::populate_restart_table_state(m_db, "app2", "completed"); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + TestUtils::populate_restart_table_state(m_db, "app3", "completed"); + }; + std::thread test_thread = std::thread(test_sequence); + + EXPECT_EQ(m_init_thread.Start(), swss::StatusCode::SWSS_RC_SUCCESS); + + swss::Select s; + s.addSelectable(&m_finished); + TestUtils::wait_for_finish(s, m_finished, kSelectTimeoutSeconds); + + test_thread.join(); + m_init_thread.Join(); + + EXPECT_THAT(m_init_thread.GetDetailedStatus(), IsDoneStatus()); + check_final_success_expects(/*state_verification_enabled=*/true); +} + +TEST_F(InitThreadTest, FullPassNoStateVerification) { + TestUtils::set_state_verification_enable(m_config_db, /*bootup=*/true, + /*enabled=*/false); + + set_default_success_expects(); + EXPECT_CALL(m_telemetry, record_stage_end(WarmBootStage::STAGE_RECONCILIATION, + /*success=*/true)) + .Times(1); + + populate_default_init_table(); + + // Apps register one-by-one. + auto test_sequence = [&] { + // Registration step. 
+ std::this_thread::sleep_for(std::chrono::milliseconds(100)); + TestUtils::populate_registration_table(m_db, "docker1|app1", false, true, + true, true); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + TestUtils::populate_registration_table(m_db, "docker2|app2", false, true, + true, true); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + TestUtils::populate_registration_table(m_db, "docker3|app3", false, true, + true, true); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + // Reconciliation step. + TestUtils::populate_restart_table_state(m_db, "app1", "reconciled"); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + TestUtils::populate_restart_table_state(m_db, "app2", "reconciled"); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + // Start listening for the verification request just before completing + // reconciliation. + swss::Select unfreeze_select; + unfreeze_select.addSelectable(&m_stack_unfrozen); + swss::NotificationConsumer nc(&m_db, STATE_VERIFICATION_REQ_CHANNEL); + TestUtils::populate_restart_table_state(m_db, "app3", "reconciled"); + + // No state verification signal. 
+ TestUtils::confirm_no_state_verification_trigger( + nc, kShortSelectTimeoutSeconds); + }; + std::thread test_thread = std::thread(test_sequence); + + EXPECT_EQ(m_init_thread.Start(), swss::StatusCode::SWSS_RC_SUCCESS); + swss::Select s; + s.addSelectable(&m_finished); + TestUtils::wait_for_finish(s, m_finished, kSelectTimeoutSeconds); + + m_init_thread.Join(); + test_thread.join(); + + EXPECT_THAT(m_init_thread.GetDetailedStatus(), IsDoneStatus()); + check_final_success_expects(/*state_verification_enabled=*/false); +} + +} // namespace rebootbackend diff --git a/src/sonic-framework/tests/mock_reboot_interfaces.h b/src/sonic-framework/tests/mock_reboot_interfaces.h new file mode 100644 index 000000000000..58a34792cff2 --- /dev/null +++ b/src/sonic-framework/tests/mock_reboot_interfaces.h @@ -0,0 +1,62 @@ +#pragma once +#include + +#include "init_thread.h" +#include "reboot_interfaces.h" +#include "selectableevent.h" +#include "system/system.pb.h" + +namespace rebootbackend { + +class MockDbusInterface : public DbusInterface { + public: + MOCK_METHOD(DbusInterface::DbusResponse, Reboot, (const std::string &), + (override)); + MOCK_METHOD(DbusInterface::DbusResponse, RebootStatus, (const std::string &), + (override)); + MOCK_METHOD(DbusInterface::DbusResponse, StopContainers, (const std::string&), + (override)); + MOCK_METHOD(DbusInterface::DbusResponse, StopContainerStatus, + (const std::string&), (override)); +}; + +class MockCriticalStateInterface : public CriticalStateInterface { + public: + MOCK_METHOD(bool, is_system_critical, (), (override)); + MOCK_METHOD(void, report_minor_alarm, (const std::string &), (override)); + MOCK_METHOD(void, report_critical_state, (const std::string &), (override)); +}; + +class MockTelemetryInterface : public TelemetryInterface { + public: + ~MockTelemetryInterface() override = default; + + MOCK_METHOD(void, record_overall_start, (), (override)); + MOCK_METHOD(void, record_overall_end, (bool success), (override)); + 
MOCK_METHOD(void, record_stage_start, + (swss::WarmStart::WarmBootStage nsf_stage), (override)); + MOCK_METHOD(void, record_stage_end, + (swss::WarmStart::WarmBootStage nsf_stage, bool success), + (override)); +}; + +class MockInitThread : public InitThread { + public: + MockInitThread() + : InitThread(m_unused_critical_state, m_unused_telemetry, m_unused_event, + m_unused_event) {} + + MOCK_METHOD(swss::StatusCode, Start, (), (override)); + MOCK_METHOD(void, Stop, (), (override)); + MOCK_METHOD(bool, Join, (), (override)); + MOCK_METHOD(gnoi::system::RebootStatusResponse, GetResponse, (), (override)); + MOCK_METHOD(InitThreadStatus::DetailedStatus, GetDetailedStatus, (), + (override)); + + private: + MockCriticalStateInterface m_unused_critical_state; + MockTelemetryInterface m_unused_telemetry; + swss::SelectableEvent m_unused_event; +}; + +} // namespace rebootbackend \ No newline at end of file diff --git a/src/sonic-framework/tests/reboot_common_test.cpp b/src/sonic-framework/tests/reboot_common_test.cpp new file mode 100644 index 000000000000..72f0cd9dc7ca --- /dev/null +++ b/src/sonic-framework/tests/reboot_common_test.cpp @@ -0,0 +1,27 @@ +#include "reboot_common.h" + +#include <gmock/gmock.h> +#include <gtest/gtest.h> +#include <unistd.h> + +namespace rebootbackend { + +using ::testing::_; +using ::testing::AtLeast; +using ::testing::ExplainMatchResult; +using ::testing::StrEq; + +MATCHER_P2(CheckTimespec, secs, nsecs, "") { + return (arg.tv_sec == secs && arg.tv_nsec == nsecs); +} + +TEST(RebootCommon, MillisecToTimespec) { + timespec l_timespec = milliseconds_to_timespec(0); + EXPECT_THAT(l_timespec, CheckTimespec(0, 0)); + l_timespec = milliseconds_to_timespec(200); + EXPECT_THAT(l_timespec, CheckTimespec(0, 200 * 1000 * 1000)); + l_timespec = milliseconds_to_timespec(1800); + EXPECT_THAT(l_timespec, CheckTimespec(1, 800 * 1000 * 1000)); +} + +} // namespace rebootbackend diff --git a/src/sonic-framework/tests/reboot_thread_test.cpp b/src/sonic-framework/tests/reboot_thread_test.cpp new file 
mode 100644 index 000000000000..1bdfa4ea6c88 --- /dev/null +++ b/src/sonic-framework/tests/reboot_thread_test.cpp @@ -0,0 +1,1489 @@ +#include "reboot_thread.h" + +#include <gmock/gmock.h> +#include <google/protobuf/util/json_util.h> +#include <gtest/gtest.h> +#include <unistd.h> + +#include <chrono> +#include <string> +#include <thread> + +#include "container_stop.pb.h" +#include "mock_reboot_interfaces.h" +#include "reboot_common.h" +#include "reboot_interfaces.h" +#include "redis_utils.h" +#include "select.h" +#include "selectableevent.h" +#include "stateverification.h" +#include "status_code_util.h" +#include "system/system.pb.h" +#include "test_utils_common.h" +#include "timestamp.h" +#include "warm_restart.h" + +namespace rebootbackend { + +#define TENTH_SECOND_MS (100) + +using namespace gnoi::system; +namespace gpu = ::google::protobuf::util; +using WarmStartState = ::swss::WarmStart::WarmStartState; +using Progress = ::rebootbackend::RebootThread::Progress; +using RebootThread = ::rebootbackend::RebootThread; +using ::testing::_; +using ::testing::ExplainMatchResult; +using ::testing::HasSubstr; +using ::testing::NiceMock; +using ::testing::Return; +using ::testing::StrEq; +using ::testing::StrictMock; + +MATCHER_P2(IsStatus, status, message, "") { + return (arg.status().status() == status && + ExplainMatchResult(message, arg.status().message(), result_listener)); +} + +class RebootStatusTest : public ::testing::Test { + protected: + RebootStatusTest() : m_status() {} + ThreadStatus m_status; +}; + +TEST_F(RebootStatusTest, TestInit) { + RebootStatusResponse response = m_status.get_response(); + + EXPECT_FALSE(response.active()); + EXPECT_THAT(response.reason(), StrEq("")); + EXPECT_EQ(response.count(), 0); + EXPECT_EQ(response.method(), RebootMethod::UNKNOWN); + EXPECT_EQ(response.status().status(), + RebootStatus_Status::RebootStatus_Status_STATUS_UNKNOWN); + EXPECT_THAT(response.status().message(), StrEq("")); + + EXPECT_FALSE(m_status.get_active()); +} +TEST_F(RebootStatusTest, TestStartStatus) { + m_status.set_start_status(RebootMethod::NSF, "reboot 
because"); + + RebootStatusResponse response = m_status.get_response(); + + EXPECT_TRUE(response.active()); + EXPECT_THAT(response.reason(), StrEq("reboot because")); + EXPECT_EQ(response.count(), 1); + EXPECT_EQ(response.method(), RebootMethod::NSF); + EXPECT_THAT(response.status().message(), StrEq("")); + + EXPECT_TRUE(m_status.get_active()); +} + +TEST_F(RebootStatusTest, TestSets) { + m_status.set_start_status(RebootMethod::NSF, "reboot because"); + + RebootStatus_Status reboot_status = m_status.get_last_reboot_status(); + EXPECT_EQ(reboot_status, + RebootStatus_Status::RebootStatus_Status_STATUS_UNKNOWN); + + m_status.set_completed_status( + RebootStatus_Status::RebootStatus_Status_STATUS_FAILURE, "timeout"); + + // Have to be inactive to read the status + m_status.set_inactive(); + EXPECT_FALSE(m_status.get_active()); + + RebootStatusResponse response = m_status.get_response(); + EXPECT_THAT(response.status().message(), StrEq("timeout")); + EXPECT_EQ(response.status().status(), + RebootStatus_Status::RebootStatus_Status_STATUS_FAILURE); + + // Can't set message status while inactive + m_status.set_completed_status( + RebootStatus_Status::RebootStatus_Status_STATUS_RETRIABLE_FAILURE, + "anything"); + response = m_status.get_response(); + EXPECT_THAT(response.status().message(), StrEq("timeout")); + EXPECT_EQ(response.status().status(), + RebootStatus_Status::RebootStatus_Status_STATUS_FAILURE); +} + +TEST_F(RebootStatusTest, TestGetStatus) { + std::chrono::nanoseconds curr_ns = std::chrono::high_resolution_clock::now().time_since_epoch(); + + m_status.set_start_status(RebootMethod::COLD, "reboot because"); + + RebootStatusResponse response = m_status.get_response(); + EXPECT_EQ(response.status().status(), + RebootStatus_Status::RebootStatus_Status_STATUS_UNKNOWN); + + m_status.set_completed_status( + RebootStatus_Status::RebootStatus_Status_STATUS_SUCCESS, "anything"); + + response = m_status.get_response(); + + // message should be empty while reboot is 
active + EXPECT_THAT(response.status().message(), StrEq("")); + + uint64_t reboot_ns = response.when(); + EXPECT_GT(reboot_ns, static_cast<uint64_t>(curr_ns.count())); + + m_status.set_inactive(); + response = m_status.get_response(); + EXPECT_THAT(response.status().message(), StrEq("anything")); + EXPECT_EQ(response.status().status(), + RebootStatus_Status::RebootStatus_Status_STATUS_SUCCESS); + EXPECT_EQ(0, response.when()); +} + +class RebootThreadTest : public ::testing::Test { + protected: + RebootThreadTest() + : m_db("STATE_DB", 0), + m_config_db("CONFIG_DB", 0), + m_dbus_interface(), + m_critical_interface(), + m_reboot_thread(m_dbus_interface, m_critical_interface, m_telemetry, + m_finished) { + swss::WarmStart::initialize("app1", "docker1"); + TestUtils::clear_tables(m_db); + sigterm_requested = false; + } + + void overwrite_reboot_timeout(uint32_t timeout_seconds) { + m_reboot_thread.m_reboot_timeout = timeout_seconds; + } + + void overwrite_state_verification_timeout(uint32_t timeout_seconds) { + m_reboot_thread.m_state_verification_timeout = timeout_seconds; + } + + void overwrite_quiescence_timeout_ms(uint32_t timeout_ms) { + m_reboot_thread.m_quiescence_timeout_ms = timeout_ms; + } + + void overwrite_quiescence_hold_time_ms(uint32_t timeout_ms) { + m_reboot_thread.m_quiescence_hold_time_ms = timeout_ms; + } + + void overwrite_checkpoint_timeout(uint32_t timeout_seconds) { + m_reboot_thread.m_checkpoint_timeout = timeout_seconds; + } + + RebootStatusResponse get_response(void) { + return m_reboot_thread.m_status.get_response(); + } + + void set_start_status(const RebootMethod &method, const std::string &reason) { + return m_reboot_thread.m_status.set_start_status(method, reason); + } + + void set_completed_status(const RebootStatus_Status &status, + const std::string &message) { + return m_reboot_thread.m_status.set_completed_status(status, message); + } + + void force_inactive(void) { return m_reboot_thread.m_status.set_inactive(); } + + void
force_active(void) { return m_reboot_thread.m_status.set_inactive(); } // NOTE(review): identical to force_inactive(); confirm whether an "active" setter was intended. + + void do_reboot(void) { return m_reboot_thread.do_reboot(); } + + Progress wait_for_platform_reboot(swss::Select &s) { + return m_reboot_thread.wait_for_platform_reboot(s); + } + + Progress perform_state_verification(swss::Select &s) { + return m_reboot_thread.perform_state_verification(s); + } + + Progress perform_state_verification_select(swss::Select &s, + swss::SelectableTimer &l_timer, + swss::SubscriberStateTable &sub, + std::string &timestamp) { + return m_reboot_thread.state_verification_select(s, l_timer, sub, + timestamp); + } + + RebootThread::Status handle_state_verification_event( + swss::SubscriberStateTable &sub, std::string &timestamp) { + return m_reboot_thread.handle_state_verification_event(sub, timestamp); + } + + // stop: set to true if calling m_reboot_thread.Stop() is desired. + // timeout_ms: replaces m_quiescence_timeout_ms if >= 0 + Progress perform_freeze_quiescence_w_stop(bool stop, int timeout_ms = -1) { + timespec l_timespec; + if (timeout_ms >= 0) { + l_timespec = milliseconds_to_timespec(timeout_ms); + } else { + l_timespec = + milliseconds_to_timespec(m_reboot_thread.m_quiescence_timeout_ms); + } + + swss::SelectableTimer l_timer(l_timespec); + + swss::Select s; + s.addSelectable(&l_timer); + l_timer.start(); + s.addSelectable(&(return_m_stop_reference())); + + if (stop) { + m_reboot_thread.Stop(); + } + return m_reboot_thread.perform_freeze_quiescence(s, l_timer); + } + + Progress checkpoint_stage_two(swss::Select &s, swss::SelectableTimer &l_timer, + swss::SubscriberStateTable &sub) { + return m_reboot_thread.checkpoint_stage_two(s, l_timer, sub); + } + + Progress perform_checkpoint(swss::Select &s) { + return m_reboot_thread.perform_checkpoint(s); + } + + RebootThread::Status check_container_stop(const std::string &request_id) { + return m_reboot_thread.check_container_stop(request_id); + } + + Progress wait_for_container_stop(int timeout_ms = -1) { + timespec
l_timespec; + if (timeout_ms >= 0) { + l_timespec = milliseconds_to_timespec(timeout_ms); + } else { + l_timespec = + milliseconds_to_timespec(m_reboot_thread.m_quiescence_timeout_ms); + } + + swss::SelectableTimer l_timer(l_timespec); + + swss::Select s; + s.addSelectable(&l_timer); + l_timer.start(); + s.addSelectable(&(return_m_stop_reference())); + + m_reboot_thread.Stop(); + + return m_reboot_thread.wait_for_container_stop(s, "reqA", l_timer); + } + + void fetch_registration_info() { + m_reboot_thread.m_registration.fetch_registration_info(); + } + + swss::SelectableEvent &return_m_stop_reference() { + return m_reboot_thread.m_stop; + } + + swss::DBConnector m_db; + swss::DBConnector m_config_db; + NiceMock<MockDbusInterface> m_dbus_interface; + NiceMock<MockCriticalStateInterface> m_critical_interface; + StrictMock<MockTelemetryInterface> m_telemetry; + swss::SelectableEvent m_finished; + RebootThread m_reboot_thread; +}; + +MATCHER_P2(Status, status, message, "") { + return (arg.status().status() == status && arg.status().message() == message); +} + +TEST_F(RebootThreadTest, TestStop) { + EXPECT_CALL(m_dbus_interface, Reboot(_)) + .Times(1) + .WillOnce(Return(DbusInterface::DbusResponse{ + DbusInterface::DbusStatus::DBUS_SUCCESS, ""})); + EXPECT_CALL(m_critical_interface, is_system_critical()) + .Times(1) + .WillOnce(Return(false)); + EXPECT_CALL(m_critical_interface, + report_critical_state("platform failed to reboot")) + .Times(0); + RebootRequest request; + request.set_method(RebootMethod::COLD); + overwrite_reboot_timeout(2); + m_reboot_thread.Start(request); + m_reboot_thread.Stop(); + m_reboot_thread.Join(); + gnoi::system::RebootStatusResponse response = m_reboot_thread.GetResponse(); + EXPECT_THAT( + response, + IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_UNKNOWN, "")); +} + +TEST_F(RebootThreadTest, TestCleanExit) { + EXPECT_CALL(m_dbus_interface, Reboot(_)) + .Times(1) + .WillOnce(Return(DbusInterface::DbusResponse{ + DbusInterface::DbusStatus::DBUS_SUCCESS, ""})); + EXPECT_CALL(m_critical_interface, + 
report_critical_state(("platform failed to reboot"))) + .Times(1); + EXPECT_CALL(m_critical_interface, is_system_critical()) + .Times(1) + .WillOnce(Return(false)); + overwrite_reboot_timeout(1); + + swss::Select s; + s.addSelectable(&m_finished); + + RebootRequest request; + request.set_method(RebootMethod::COLD); + request.set_message("time to reboot"); + m_reboot_thread.Start(request); + TestUtils::wait_for_finish(s, m_finished, 5); + + // Status should be active until we call join + RebootStatusResponse response = get_response(); + EXPECT_TRUE(response.active()); + EXPECT_THAT(response.reason(), StrEq("time to reboot")); + EXPECT_EQ(response.count(), 1); + + EXPECT_THAT(response.status().message(), StrEq("")); + + m_reboot_thread.Join(); + + response = get_response(); + EXPECT_FALSE(response.active()); + EXPECT_THAT(response.status().message(), StrEq("platform failed to reboot")); +} + +TEST_F(RebootThreadTest, TestJoinWithoutStart) { + bool ret = m_reboot_thread.Join(); + EXPECT_FALSE(ret); +} + +// Call Start a second time while first thread is still executing. +TEST_F(RebootThreadTest, TestStartWhileRunning) { + EXPECT_CALL(m_dbus_interface, Reboot(_)) + .Times(1) + .WillOnce(Return(DbusInterface::DbusResponse{ + DbusInterface::DbusStatus::DBUS_SUCCESS, ""})); + EXPECT_CALL(m_critical_interface, + report_critical_state("platform failed to reboot")) + .Times(1); + EXPECT_CALL(m_critical_interface, is_system_critical()) + .Times(1) + .WillOnce(Return(false)); + overwrite_reboot_timeout(2); + + RebootRequest request; + request.set_method(RebootMethod::COLD); + request.set_message("time to reboot"); + m_reboot_thread.Start(request); + + // First thread is still running ... 
+ NotificationResponse response = m_reboot_thread.Start(request); + EXPECT_EQ(response.status, swss::StatusCode::SWSS_RC_IN_USE); + EXPECT_THAT(response.json_string, + StrEq("RebootThread: can't Start while active")); + + bool ret = m_reboot_thread.Join(); + EXPECT_TRUE(ret); +} + +// Call Start a second time after first thread completed +// but before first thread was joined. +// Second start should fail. +TEST_F(RebootThreadTest, TestStartWithoutJoin) { + EXPECT_CALL(m_dbus_interface, Reboot(_)) + .Times(1) + .WillOnce(Return(DbusInterface::DbusResponse{ + DbusInterface::DbusStatus::DBUS_SUCCESS, ""})); + EXPECT_CALL(m_critical_interface, + report_critical_state("platform failed to reboot")) + .Times(1); + EXPECT_CALL(m_critical_interface, is_system_critical()) + .Times(1) + .WillOnce(Return(false)); + overwrite_reboot_timeout(1); + + swss::Select s; + s.addSelectable(&m_finished); + + RebootRequest request; + request.set_method(RebootMethod::COLD); + request.set_message("time to reboot"); + m_reboot_thread.Start(request); + TestUtils::wait_for_finish(s, m_finished, 3); + + // First thread has stopped: we need to join before + // restart will succeed + NotificationResponse response = m_reboot_thread.Start(request); + EXPECT_EQ(response.status, swss::StatusCode::SWSS_RC_IN_USE); + + // This should join the first start. 
+ bool ret = m_reboot_thread.Join(); + EXPECT_TRUE(ret); +} + +TEST_F(RebootThreadTest, TestUnsupportedRebootType) { + RebootRequest request; + request.set_method(RebootMethod::POWERDOWN); + + NotificationResponse response = m_reboot_thread.Start(request); + EXPECT_EQ(response.status, swss::StatusCode::SWSS_RC_INVALID_PARAM); + EXPECT_EQ(response.json_string, + "RebootThread: Start rx'd unsupported method"); +} + +TEST_F(RebootThreadTest, TestDelayedStartUnsupported) { + RebootRequest request; + request.set_method(RebootMethod::NSF); + request.set_delay(1); + + NotificationResponse response = m_reboot_thread.Start(request); + EXPECT_EQ(response.status, swss::StatusCode::SWSS_RC_INVALID_PARAM); + EXPECT_THAT(response.json_string, + StrEq("RebootThread: delayed start not supported")); +} + +TEST_F(RebootThreadTest, TestNoNSFIfNonRetriableFailure) { + set_start_status(RebootMethod::NSF, "time to reboot"); + set_completed_status(RebootStatus_Status::RebootStatus_Status_STATUS_FAILURE, + "platform failed to reboot"); + force_inactive(); + + RebootRequest request; + request.set_method(RebootMethod::NSF); + + NotificationResponse response = m_reboot_thread.Start(request); + EXPECT_EQ(response.status, swss::StatusCode::SWSS_RC_FAILED_PRECONDITION); + EXPECT_EQ(response.json_string, + "RebootThread: last NSF failed with non-retriable failure"); +} + +TEST_F(RebootThreadTest, TestNoNSFIfSystemCritical) { + EXPECT_CALL(m_critical_interface, is_system_critical()) + .Times(1) + .WillOnce(Return(true)); + set_start_status(RebootMethod::NSF, "time to reboot"); + + force_inactive(); + + RebootRequest request; + request.set_method(RebootMethod::NSF); + + NotificationResponse response = m_reboot_thread.Start(request); + EXPECT_EQ(response.status, swss::StatusCode::SWSS_RC_FAILED_PRECONDITION); + EXPECT_EQ(response.json_string, + "RebootThread: in critical state, NSF not allowed"); +} + +TEST_F(RebootThreadTest, TestSigTermStartofDoReboot) { + sigterm_requested = true; + 
set_start_status(RebootMethod::NSF, "time to reboot"); + do_reboot(); + force_inactive(); + RebootStatusResponse response = m_reboot_thread.GetResponse(); + EXPECT_THAT( + response, + IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_UNKNOWN, "")); +} + +TEST_F(RebootThreadTest, TestInvalidMethodfDoReboot) { + set_start_status(RebootMethod::POWERUP, "time to reboot"); + do_reboot(); + force_inactive(); + RebootStatusResponse response = m_reboot_thread.GetResponse(); + EXPECT_THAT( + response, + IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_UNKNOWN, "")); +} + +TEST_F(RebootThreadTest, TestWaitForRebootPositive) { + overwrite_reboot_timeout(1); + EXPECT_CALL(m_critical_interface, is_system_critical()) + .Times(1) + .WillOnce(Return(false)); + set_start_status(RebootMethod::NSF, "time to reboot"); + swss::Select s; + swss::SelectableEvent m_stop; + s.addSelectable(&m_stop); + RebootThread::Progress progress = wait_for_platform_reboot(s); + EXPECT_EQ(progress, RebootThread::Progress::PROCEED); + // EXPECT_EQ(progress, RebootThread::Progress::EXIT_EARLY); +} + +TEST_F(RebootThreadTest, TestWaitForRebootCriticalState) { + EXPECT_CALL(m_critical_interface, is_system_critical()) + .Times(1) + .WillOnce(Return(true)); + set_start_status(RebootMethod::NSF, "time to reboot"); + swss::Select s; + swss::SelectableEvent m_stop; + s.addSelectable(&m_stop); + RebootThread::Progress progress = wait_for_platform_reboot(s); + EXPECT_EQ(progress, RebootThread::Progress::EXIT_EARLY); + force_inactive(); + RebootStatusResponse response = m_reboot_thread.GetResponse(); + EXPECT_THAT( + response, + IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_FAILURE, + "system entered critical state after platfrom reboot request")); +} + +TEST_F(RebootThreadTest, TestWaitForRebootRxStop) { + EXPECT_CALL(m_critical_interface, is_system_critical()) + .Times(1) + .WillOnce(Return(false)); + set_start_status(RebootMethod::NSF, "time to reboot"); + + swss::Select s; + 
s.addSelectable(&(return_m_stop_reference())); + m_reboot_thread.Stop(); + RebootThread::Progress progress = wait_for_platform_reboot(s); + EXPECT_EQ(progress, RebootThread::Progress::EXIT_EARLY); + force_inactive(); + RebootStatusResponse response = m_reboot_thread.GetResponse(); + EXPECT_THAT( + response, + IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_UNKNOWN, "")); +} + +// +// State Verification Tests +// + +TEST_F(RebootThreadTest, TestStateVerificationCriticalState) { + EXPECT_CALL(m_critical_interface, is_system_critical()) + .Times(1) + .WillOnce(Return(true)); + set_start_status(RebootMethod::NSF, "time to reboot"); + + swss::Select s; + s.addSelectable(&(return_m_stop_reference())); + + RebootThread::Progress progress = perform_state_verification(s); + EXPECT_EQ(progress, RebootThread::Progress::EXIT_EARLY); + + force_inactive(); + RebootStatusResponse response = m_reboot_thread.GetResponse(); + EXPECT_THAT( + response, + IsStatus( + RebootStatus_Status::RebootStatus_Status_STATUS_FAILURE, + "system entered critical state before reboot state verification")); +} + +TEST_F(RebootThreadTest, TestStateVerificationDisabled) { + EXPECT_CALL(m_critical_interface, is_system_critical()) + .Times(1) + .WillOnce(Return(false)); + + set_start_status(RebootMethod::NSF, "time to reboot"); + + swss::Table warmRestartTable(&m_config_db, CFG_WARM_RESTART_TABLE_NAME); + warmRestartTable.hset("system", "state_verification_shutdown", "false"); + + swss::Select s; + s.addSelectable(&(return_m_stop_reference())); + + RebootThread::Progress progress = perform_state_verification(s); + EXPECT_EQ(progress, RebootThread::Progress::PROCEED); + + force_inactive(); + RebootStatusResponse response = m_reboot_thread.GetResponse(); + EXPECT_THAT( + response, + IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_UNKNOWN, "")); +} + +TEST_F(RebootThreadTest, TestStateVerificationTimeout) { + EXPECT_CALL(m_critical_interface, is_system_critical()) + .Times(1) + 
.WillOnce(Return(false)); + + overwrite_state_verification_timeout(1); + + set_start_status(RebootMethod::NSF, "time to reboot"); + + swss::Table warmRestartTable(&m_config_db, CFG_WARM_RESTART_TABLE_NAME); + warmRestartTable.hset("system", "state_verification_shutdown", "true"); + + swss::Select s; + s.addSelectable(&(return_m_stop_reference())); + + RebootThread::Progress progress = perform_state_verification(s); + EXPECT_EQ(progress, RebootThread::Progress::EXIT_EARLY); + + force_inactive(); + RebootStatusResponse response = m_reboot_thread.GetResponse(); + EXPECT_THAT( + response, + IsStatus( + RebootStatus_Status::RebootStatus_Status_STATUS_RETRIABLE_FAILURE, + "timeout occurred during reboot state verification: retriable " + "error")); +} + +TEST_F(RebootThreadTest, TestStateVerificationStop) { + EXPECT_CALL(m_critical_interface, is_system_critical()) + .Times(1) + .WillOnce(Return(false)); + + overwrite_state_verification_timeout(1); + + set_start_status(RebootMethod::NSF, "time to reboot"); + + swss::Table warmRestartTable(&m_config_db, CFG_WARM_RESTART_TABLE_NAME); + warmRestartTable.hset("system", "state_verification_shutdown", "true"); + + swss::Select s; + s.addSelectable(&(return_m_stop_reference())); + m_reboot_thread.Stop(); + + RebootThread::Progress progress = perform_state_verification(s); + EXPECT_EQ(progress, RebootThread::Progress::EXIT_EARLY); + + force_inactive(); + RebootStatusResponse response = m_reboot_thread.GetResponse(); + EXPECT_THAT( + response, + IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_UNKNOWN, "")); +} + +TEST_F(RebootThreadTest, TestStateVerificationSelectTimeout) { + set_start_status(RebootMethod::NSF, "time to reboot"); + + swss::Select s; + s.addSelectable(&(return_m_stop_reference())); + + swss::SelectableTimer l_timer(timespec{.tv_sec = 1, .tv_nsec = 0}); + s.addSelectable(&l_timer); + + swss::SubscriberStateTable sub(&m_db, STATE_VERIFICATION_RESP_TABLE); + s.addSelectable(&sub); + + l_timer.start(); + + 
std::string timestamp = "timestamp-b"; + TestUtils::write_state_verification_result(m_db, ALL_COMPONENT, SV_PASS, + "wrong-timestamp"); + + RebootThread::Progress progress = + perform_state_verification_select(s, l_timer, sub, timestamp); + + EXPECT_EQ(progress, RebootThread::Progress::EXIT_EARLY); + + force_inactive(); + RebootStatusResponse response = m_reboot_thread.GetResponse(); + EXPECT_THAT( + response, + IsStatus( + RebootStatus_Status::RebootStatus_Status_STATUS_RETRIABLE_FAILURE, + "timeout occurred during reboot state verification: retriable " + "error")); +} + +TEST_F(RebootThreadTest, TestStateVerificationSelectTimeoutNotRun) { + set_start_status(RebootMethod::NSF, "time to reboot"); + + swss::Select s; + s.addSelectable(&(return_m_stop_reference())); + + swss::SelectableTimer l_timer(timespec{.tv_sec = 1, .tv_nsec = 0}); + s.addSelectable(&l_timer); + + swss::SubscriberStateTable sub(&m_db, STATE_VERIFICATION_RESP_TABLE); + s.addSelectable(&sub); + + l_timer.start(); + + std::string timestamp = "timestamp-b"; + TestUtils::write_state_verification_result(m_db, ALL_COMPONENT, SV_NOT_RUN, + timestamp); + + RebootThread::Progress progress = + perform_state_verification_select(s, l_timer, sub, timestamp); + + EXPECT_EQ(progress, RebootThread::Progress::EXIT_EARLY); + EXPECT_NE("timestamp-b", timestamp); + + force_inactive(); + RebootStatusResponse response = m_reboot_thread.GetResponse(); + EXPECT_THAT( + response, + IsStatus( + RebootStatus_Status::RebootStatus_Status_STATUS_RETRIABLE_FAILURE, + "timeout occurred during reboot state verification: retriable " + "error")); +} + +TEST_F(RebootThreadTest, TestHandleStateVerificationKeepWaiting) { + set_start_status(RebootMethod::NSF, "time to reboot"); + + swss::Select s; + swss::SubscriberStateTable sub(&m_db, STATE_VERIFICATION_RESP_TABLE); + s.addSelectable(&sub); + swss::Selectable *sel; + + // Unrecognized component returns KEEP_WAITING + std::string timestamp = "timestamp-a"; + 
TestUtils::write_state_verification_result(m_db, "not-all-component", + "dont care", timestamp); + s.select(&sel); + RebootThread::Status status = handle_state_verification_event(sub, timestamp); + EXPECT_EQ(status, RebootThread::Status::KEEP_WAITING); + + // Wrong timestamp returns KEEP_WAITING + TestUtils::write_state_verification_result(m_db, ALL_COMPONENT, SV_PASS, + "wrong-timestamp"); + s.select(&sel); + status = handle_state_verification_event(sub, timestamp); + EXPECT_EQ(status, RebootThread::Status::KEEP_WAITING); + + // Unrecognized status + correct timestamp = KEEP_WAITING + TestUtils::write_state_verification_result(m_db, ALL_COMPONENT, + "undefined-status", timestamp); + s.select(&sel); + status = handle_state_verification_event(sub, timestamp); + EXPECT_EQ(status, RebootThread::Status::KEEP_WAITING); + + // If we receive NOT_RUN as a status: we KEEP_WAITING + // timestamp should be updated with new value after re-request of state + // verification + TestUtils::write_state_verification_result(m_db, ALL_COMPONENT, SV_NOT_RUN, + timestamp); + s.select(&sel); + status = handle_state_verification_event(sub, timestamp); + EXPECT_EQ(status, RebootThread::Status::KEEP_WAITING); + EXPECT_NE(timestamp, "timestamp-a"); +} + +TEST_F(RebootThreadTest, TestHandleStateVerificationSuccess) { + set_start_status(RebootMethod::NSF, "time to reboot"); + + swss::Select s; + swss::SubscriberStateTable sub(&m_db, STATE_VERIFICATION_RESP_TABLE); + s.addSelectable(&sub); + swss::Selectable *sel; + + // Pass + Correct timestamp == SUCCESS + std::string timestamp = "timestamp-b"; + TestUtils::write_state_verification_result(m_db, ALL_COMPONENT, SV_PASS, + timestamp); + int select_ret = s.select(&sel); + EXPECT_EQ(select_ret, swss::Select::OBJECT); + EXPECT_EQ(sel, &sub); + RebootThread::Status status = handle_state_verification_event(sub, timestamp); + EXPECT_EQ(status, RebootThread::Status::SUCCESS); +} + +TEST_F(RebootThreadTest, TestHandleStateVerificationFail) { + 
set_start_status(RebootMethod::NSF, "time to reboot"); + + swss::Select s; + swss::SubscriberStateTable sub(&m_db, STATE_VERIFICATION_RESP_TABLE); + s.addSelectable(&sub); + swss::Selectable *sel; + + // Fail with correct timestamp = FAILURE + // status and message are updated + std::string timestamp = "timestamp-b"; + TestUtils::write_state_verification_result(m_db, ALL_COMPONENT, SV_FAIL, + timestamp); + s.select(&sel); + RebootThread::Status status = handle_state_verification_event(sub, timestamp); + EXPECT_EQ(status, RebootThread::Status::FAILURE); + + force_inactive(); + RebootStatusResponse response = m_reboot_thread.GetResponse(); + EXPECT_THAT(response, + IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_FAILURE, + "state verification failed during reboot")); +} + +// +// Quiescence Tests +// + +TEST_F(RebootThreadTest, TestPerformFreezeQuiescenceCriticalState) { + EXPECT_CALL(m_critical_interface, is_system_critical()) + .Times(1) + .WillOnce(Return(true)); + set_start_status(RebootMethod::NSF, "time to reboot"); + + Progress progress = perform_freeze_quiescence_w_stop(false); + EXPECT_EQ(progress, RebootThread::Progress::EXIT_EARLY); + force_inactive(); + RebootStatusResponse response = m_reboot_thread.GetResponse(); + EXPECT_THAT(response, + IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_FAILURE, + "system entered critical state before freezing")); +} + +TEST_F(RebootThreadTest, TestPerformFreezeQuiescenceTimeout) { + EXPECT_CALL(m_critical_interface, is_system_critical()) + .Times(1) + .WillOnce(Return(false)); + + TestUtils::populate_registration_table(m_db, "docker1|app1", false, true, + false, false); + TestUtils::populate_registration_table(m_db, "docker2|app2", false, true, + false, false); + fetch_registration_info(); + + overwrite_quiescence_timeout_ms(300); + + set_start_status(RebootMethod::NSF, "time to reboot"); + + Progress progress = perform_freeze_quiescence_w_stop(false); + EXPECT_EQ(progress, 
RebootThread::Progress::EXIT_EARLY); + + force_inactive(); + RebootStatusResponse response = m_reboot_thread.GetResponse(); + EXPECT_THAT(response, + IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_FAILURE, + "timeout occurred during reboot stage freeze")); +} + +TEST_F(RebootThreadTest, TestPerformFreezeQuiescenceStop) { + EXPECT_CALL(m_critical_interface, is_system_critical()) + .Times(1) + .WillOnce(Return(false)); + + TestUtils::populate_registration_table(m_db, "docker1|app1", false, true, + false, false); + TestUtils::populate_registration_table(m_db, "docker2|app2", false, true, + false, false); + fetch_registration_info(); + overwrite_quiescence_timeout_ms(1000); + + set_start_status(RebootMethod::NSF, "time to reboot"); + + Progress progress = perform_freeze_quiescence_w_stop(true); + EXPECT_EQ(progress, RebootThread::Progress::EXIT_EARLY); + + force_inactive(); + RebootStatusResponse response = m_reboot_thread.GetResponse(); + + // No error on request to stop, just log. 
+ EXPECT_THAT( + response, + IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_UNKNOWN, "")); +} + +TEST_F(RebootThreadTest, TestPerformFreezeQuiescenceStartCompleted) { + EXPECT_CALL(m_critical_interface, is_system_critical()) + .Times(1) + .WillOnce(Return(false)); + + TestUtils::populate_registration_table(m_db, "docker1|app1", false, true, + false, false); + TestUtils::populate_registration_table(m_db, "docker2|app2", false, true, + false, false); + TestUtils::populate_restart_table_state( + m_db, "app1", get_warm_start_state_name(WarmStartState::CHECKPOINTED)); + TestUtils::populate_restart_table_state( + m_db, "app2", get_warm_start_state_name(WarmStartState::QUIESCENT)); + fetch_registration_info(); + overwrite_quiescence_hold_time_ms(100); + + set_start_status(RebootMethod::NSF, "time to reboot"); + + Progress progress = perform_freeze_quiescence_w_stop(false, 1000); + EXPECT_EQ(progress, RebootThread::Progress::PROCEED); +} + +TEST_F(RebootThreadTest, + TestPerformFreezeQuiescenceUninterestingStatesAtStart) { + EXPECT_CALL(m_critical_interface, is_system_critical()) + .Times(1) + .WillOnce(Return(false)); + + TestUtils::populate_registration_table(m_db, "docker2|app2", false, true, + false, false); + TestUtils::populate_restart_table_state( + m_db, "app2", get_warm_start_state_name(WarmStartState::RECONCILED)); + TestUtils::populate_restart_table_state( + m_db, "app2", get_warm_start_state_name(WarmStartState::INITIALIZED)); + TestUtils::populate_restart_table_state( + m_db, "app2", get_warm_start_state_name(WarmStartState::FROZEN)); + fetch_registration_info(); + overwrite_quiescence_timeout_ms(1000); + + set_start_status(RebootMethod::NSF, "time to reboot"); + + Progress progress = perform_freeze_quiescence_w_stop(false); + EXPECT_EQ(progress, RebootThread::Progress::EXIT_EARLY); + + force_inactive(); + RebootStatusResponse response = m_reboot_thread.GetResponse(); + EXPECT_THAT(response, + 
IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_FAILURE, + "timeout occurred during reboot stage freeze")); +} + +TEST_F(RebootThreadTest, TestPerformFreezeQuiescenceStartFailed) { + EXPECT_CALL(m_critical_interface, is_system_critical()) + .Times(1) + .WillOnce(Return(false)); + + TestUtils::populate_registration_table(m_db, "docker1|app1", false, true, + false, false); + TestUtils::populate_registration_table(m_db, "docker2|app2", false, true, + false, false); + TestUtils::populate_restart_table_state( + m_db, "app1", get_warm_start_state_name(WarmStartState::CHECKPOINTED)); + TestUtils::populate_restart_table_state( + m_db, "app2", get_warm_start_state_name(WarmStartState::FAILED)); + fetch_registration_info(); + overwrite_quiescence_timeout_ms(500); + + set_start_status(RebootMethod::NSF, "time to reboot"); + + Progress progress = perform_freeze_quiescence_w_stop(true); + EXPECT_EQ(progress, RebootThread::Progress::EXIT_EARLY); + + force_inactive(); + RebootStatusResponse response = m_reboot_thread.GetResponse(); + EXPECT_THAT( + response, + IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_FAILURE, + "check_stage: app: app2 reported FAILED during stage: freeze")); +} + +TEST_F(RebootThreadTest, TestPerformFreezeQuiescenceCompleted) { + overwrite_quiescence_hold_time_ms(100); + TestUtils::populate_registration_table(m_db, "docker1|app1", false, true, + false, false); + TestUtils::populate_registration_table(m_db, "docker2|app2", false, true, + false, false); + fetch_registration_info(); + + set_start_status(RebootMethod::NSF, "time to reboot"); + auto test_sequence = [&] { + // We want to skip past the initial completed check at start of + // freeze_quiescence_select and process below as subscriptions updates + std::this_thread::sleep_for(std::chrono::milliseconds(TENTH_SECOND_MS)); + TestUtils::populate_restart_table_state( + m_db, "app1", get_warm_start_state_name(WarmStartState::CHECKPOINTED)); + TestUtils::populate_restart_table_state( + 
m_db, "app2", get_warm_start_state_name(WarmStartState::CHECKPOINTED));
+  };
+
+  std::thread test_thread = std::thread(test_sequence);
+
+  Progress progress = perform_freeze_quiescence_w_stop(false);
+  EXPECT_EQ(progress, Progress::PROCEED);
+  test_thread.join();
+}
+
+// app1 only ever reports RECONCILED, WSDISABLED and WSUNKNOWN (written from a
+// helper thread after a 100 ms delay). Expects EXIT_EARLY and a freeze-stage
+// timeout in the final status.
+TEST_F(RebootThreadTest,
+       TestPerformFreezeQuiescenceUninterestingStatesViaSubscription) {
+  TestUtils::populate_registration_table(m_db, "docker1|app1", false, true,
+                                         false, false);
+  fetch_registration_info();
+
+  set_start_status(RebootMethod::NSF, "time to reboot");
+
+  auto test_sequence = [&] {
+    std::this_thread::sleep_for(std::chrono::milliseconds(TENTH_SECOND_MS));
+    TestUtils::populate_restart_table_state(
+        m_db, "app1", get_warm_start_state_name(WarmStartState::RECONCILED));
+    TestUtils::populate_restart_table_state(
+        m_db, "app1", get_warm_start_state_name(WarmStartState::WSDISABLED));
+    TestUtils::populate_restart_table_state(
+        m_db, "app1", get_warm_start_state_name(WarmStartState::WSUNKNOWN));
+  };
+
+  std::thread test_thread = std::thread(test_sequence);
+
+  Progress progress = perform_freeze_quiescence_w_stop(false, 300);
+  EXPECT_EQ(progress, Progress::EXIT_EARLY);
+
+  force_inactive();
+  RebootStatusResponse response = m_reboot_thread.GetResponse();
+  EXPECT_THAT(response,
+              IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_FAILURE,
+                       "timeout occurred during reboot stage freeze"));
+  test_thread.join();
+}
+
+// app1 reports CHECKPOINTED and app2 reports FAILED from a helper thread while
+// the stage is running. Expects EXIT_EARLY and a handle_state_event failure
+// message naming app2.
+TEST_F(RebootThreadTest, TestPerformFreezeQuiescenceFailed) {
+  TestUtils::populate_registration_table(m_db, "docker1|app1", false, true,
+                                         false, false);
+  TestUtils::populate_registration_table(m_db, "docker2|app2", false, true,
+                                         false, false);
+  fetch_registration_info();
+
+  set_start_status(RebootMethod::NSF, "time to reboot");
+
+  auto test_sequence = [&] {
+    std::this_thread::sleep_for(std::chrono::milliseconds(TENTH_SECOND_MS));
+    TestUtils::populate_restart_table_state(
+        m_db, "app1", get_warm_start_state_name(WarmStartState::CHECKPOINTED));
+
TestUtils::populate_restart_table_state(
+        m_db, "app2", get_warm_start_state_name(WarmStartState::FAILED));
+  };
+
+  std::thread test_thread = std::thread(test_sequence);
+
+  Progress progress = perform_freeze_quiescence_w_stop(false, 500);
+  EXPECT_EQ(progress, Progress::EXIT_EARLY);
+
+  force_inactive();
+  RebootStatusResponse response = m_reboot_thread.GetResponse();
+  EXPECT_THAT(response,
+              IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_FAILURE,
+                       "handle_state_event: app: app2 reported FAILED when "
+                       "looking for state: freeze"));
+  test_thread.join();
+}
+
+// The hold time is set to 2000 ms while the stage helper is called with 500
+// (presumably the stage timeout in ms -- confirm against the fixture). app2
+// goes QUIESCENT -> INITIALIZED -> CHECKPOINTED, yet the stage is expected to
+// exit early with a freeze-stage timeout.
+TEST_F(RebootThreadTest, TestQuiescenceTimeoutDuringHoldTime) {
+  overwrite_quiescence_hold_time_ms(2000);
+
+  TestUtils::populate_registration_table(m_db, "docker2|app2", false, true,
+                                         false, false);
+  fetch_registration_info();
+
+  set_start_status(RebootMethod::NSF, "time to reboot");
+
+  auto test_sequence = [&] {
+    std::this_thread::sleep_for(std::chrono::milliseconds(TENTH_SECOND_MS));
+    TestUtils::populate_restart_table_state(
+        m_db, "app2", get_warm_start_state_name(WarmStartState::QUIESCENT));
+    TestUtils::populate_restart_table_state(
+        m_db, "app2", get_warm_start_state_name(WarmStartState::INITIALIZED));
+    TestUtils::populate_restart_table_state(
+        m_db, "app2", get_warm_start_state_name(WarmStartState::CHECKPOINTED));
+  };
+
+  std::thread test_thread = std::thread(test_sequence);
+
+  Progress progress = perform_freeze_quiescence_w_stop(false, 500);
+  EXPECT_EQ(progress, Progress::EXIT_EARLY);
+
+  force_inactive();
+  RebootStatusResponse response = m_reboot_thread.GetResponse();
+  EXPECT_THAT(response,
+              IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_FAILURE,
+                       "timeout occurred during reboot stage freeze"));
+  test_thread.join();
+}
+
+// Same as previous test with shorter hold_time.
+// The hold time is 100 ms here; app2 goes QUIESCENT -> INITIALIZED ->
+// CHECKPOINTED and the stage is expected to return PROCEED.
+TEST_F(RebootThreadTest, TestQuiescenceSuccessAfterHoldTime) {
+  overwrite_quiescence_hold_time_ms(100);
+
+  TestUtils::populate_registration_table(m_db, "docker2|app2", false, true,
+                                         false, false);
+  fetch_registration_info();
+
+  set_start_status(RebootMethod::NSF, "time to reboot");
+
+  auto test_sequence = [&] {
+    std::this_thread::sleep_for(std::chrono::milliseconds(TENTH_SECOND_MS));
+    TestUtils::populate_restart_table_state(
+        m_db, "app2", get_warm_start_state_name(WarmStartState::QUIESCENT));
+    TestUtils::populate_restart_table_state(
+        m_db, "app2", get_warm_start_state_name(WarmStartState::INITIALIZED));
+    TestUtils::populate_restart_table_state(
+        m_db, "app2", get_warm_start_state_name(WarmStartState::CHECKPOINTED));
+  };
+
+  std::thread test_thread = std::thread(test_sequence);
+
+  Progress progress = perform_freeze_quiescence_w_stop(false, 500);
+  EXPECT_EQ(progress, Progress::PROCEED);
+  test_thread.join();
+}
+
+// app2 enters QUIESCENT and then drops back to INITIALIZED while the 200 ms
+// hold time is configured. The stage is expected to exit early with a
+// freeze-stage timeout.
+TEST_F(RebootThreadTest, TestFailWhenExitQuiescence) {
+  overwrite_quiescence_hold_time_ms(200);
+
+  TestUtils::populate_registration_table(m_db, "docker2|app2", false, true,
+                                         false, false);
+  fetch_registration_info();
+
+  set_start_status(RebootMethod::NSF, "time to reboot");
+
+  auto test_sequence = [&] {
+    std::this_thread::sleep_for(std::chrono::milliseconds(TENTH_SECOND_MS));
+    // Enter quiescent state
+    TestUtils::populate_restart_table_state(
+        m_db, "app2", get_warm_start_state_name(WarmStartState::QUIESCENT));
+    // Exit quiescent state during hold time.
+    TestUtils::populate_restart_table_state(
+        m_db, "app2", get_warm_start_state_name(WarmStartState::INITIALIZED));
+  };
+
+  std::thread test_thread = std::thread(test_sequence);
+
+  Progress progress = perform_freeze_quiescence_w_stop(false, 500);
+  EXPECT_EQ(progress, Progress::EXIT_EARLY);
+
+  force_inactive();
+  RebootStatusResponse response = m_reboot_thread.GetResponse();
+  EXPECT_THAT(response,
+              IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_FAILURE,
+                       "timeout occurred during reboot stage freeze"));
+  test_thread.join();
+}
+
+//
+// Checkpoint
+//
+
+// is_system_critical() returns true, so perform_checkpoint is expected to
+// exit early and record the "critical state before checkpointing" failure.
+TEST_F(RebootThreadTest, TestPerformCheckpointCriticalState) {
+  EXPECT_CALL(m_critical_interface, is_system_critical())
+      .Times(1)
+      .WillOnce(Return(true));
+  set_start_status(RebootMethod::NSF, "time to reboot");
+
+  swss::Select s;
+  s.addSelectable(&(return_m_stop_reference()));
+  Progress progress = perform_checkpoint(s);
+  EXPECT_EQ(progress, Progress::EXIT_EARLY);
+
+  force_inactive();
+  RebootStatusResponse response = m_reboot_thread.GetResponse();
+  EXPECT_THAT(response,
+              IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_FAILURE,
+                       "system entered critical state before checkpointing"));
+}
+
+// Two apps are registered but no restart-table state is ever written for
+// them. With overwrite_checkpoint_timeout(1) the stage is expected to exit
+// early with a checkpoint timeout.
+TEST_F(RebootThreadTest, TestPerformCheckpointTimeout) {
+  EXPECT_CALL(m_critical_interface, is_system_critical())
+      .Times(1)
+      .WillOnce(Return(false));
+
+  TestUtils::populate_registration_table(m_db, "docker1|app1", false, false,
+                                         true, false);
+  TestUtils::populate_registration_table(m_db, "docker2|app2", false, false,
+                                         true, false);
+  fetch_registration_info();
+
+  overwrite_checkpoint_timeout(1);
+
+  set_start_status(RebootMethod::NSF, "time to reboot");
+
+  swss::Select s;
+  s.addSelectable(&(return_m_stop_reference()));
+
+  Progress progress = perform_checkpoint(s);
+  EXPECT_EQ(progress, Progress::EXIT_EARLY);
+
+  force_inactive();
+  RebootStatusResponse response = m_reboot_thread.GetResponse();
+  EXPECT_THAT(
+      response,
+
IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_FAILURE,
+               HasSubstr("timeout occurred during reboot stage checkpoint")));
+}
+
+// Stop() is requested before perform_checkpoint runs and no app has reported
+// a state. Expects EXIT_EARLY with the status left at STATUS_UNKNOWN and an
+// empty message.
+TEST_F(RebootThreadTest, TestPerformCheckpointStop) {
+  EXPECT_CALL(m_critical_interface, is_system_critical())
+      .Times(1)
+      .WillOnce(Return(false));
+
+  TestUtils::populate_registration_table(m_db, "docker1|app1", false, false,
+                                         true, false);
+  TestUtils::populate_registration_table(m_db, "docker2|app2", false, false,
+                                         true, false);
+  fetch_registration_info();
+  overwrite_quiescence_timeout_ms(1000);
+
+  set_start_status(RebootMethod::NSF, "time to reboot");
+
+  swss::Select s;
+  s.addSelectable(&(return_m_stop_reference()));
+  m_reboot_thread.Stop();
+
+  Progress progress = perform_checkpoint(s);
+  EXPECT_EQ(progress, Progress::EXIT_EARLY);
+
+  force_inactive();
+  RebootStatusResponse response = m_reboot_thread.GetResponse();
+
+  // No error on request to stop, just log.
+  EXPECT_THAT(
+      response,
+      IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_UNKNOWN, ""));
+}
+
+// Both apps are already CHECKPOINTED before perform_checkpoint runs. PROCEED
+// is expected even though Stop() was requested beforehand.
+TEST_F(RebootThreadTest, TestPerformCheckpointStartCompleted) {
+  EXPECT_CALL(m_critical_interface, is_system_critical())
+      .Times(1)
+      .WillOnce(Return(false));
+
+  TestUtils::populate_registration_table(m_db, "docker1|app1", false, false,
+                                         true, false);
+  TestUtils::populate_registration_table(m_db, "docker2|app2", false, false,
+                                         true, false);
+  TestUtils::populate_restart_table_state(
+      m_db, "app1", get_warm_start_state_name(WarmStartState::CHECKPOINTED));
+  TestUtils::populate_restart_table_state(
+      m_db, "app2", get_warm_start_state_name(WarmStartState::CHECKPOINTED));
+  fetch_registration_info();
+  overwrite_quiescence_timeout_ms(1000);
+
+  set_start_status(RebootMethod::NSF, "time to reboot");
+
+  swss::Select s;
+  s.addSelectable(&(return_m_stop_reference()));
+  m_reboot_thread.Stop();
+
+  Progress progress = perform_checkpoint(s);
+  EXPECT_EQ(progress, Progress::PROCEED);
+}
+
+// app2 is already FAILED before perform_checkpoint runs. Expects EXIT_EARLY
+// and a check_stage failure message for the checkpoint stage.
+TEST_F(RebootThreadTest, TestPerformCheckpointStartFailed) {
+
EXPECT_CALL(m_critical_interface, is_system_critical())
+      .Times(1)
+      .WillOnce(Return(false));
+
+  TestUtils::populate_registration_table(m_db, "docker1|app1", false, false,
+                                         true, false);
+  TestUtils::populate_registration_table(m_db, "docker2|app2", false, false,
+                                         true, false);
+  TestUtils::populate_restart_table_state(
+      m_db, "app1", get_warm_start_state_name(WarmStartState::CHECKPOINTED));
+  TestUtils::populate_restart_table_state(
+      m_db, "app2", get_warm_start_state_name(WarmStartState::FAILED));
+  fetch_registration_info();
+  overwrite_quiescence_timeout_ms(1000);
+
+  set_start_status(RebootMethod::NSF, "time to reboot");
+
+  swss::Select s;
+  s.addSelectable(&(return_m_stop_reference()));
+  m_reboot_thread.Stop();
+
+  Progress progress = perform_checkpoint(s);
+  EXPECT_EQ(progress, Progress::EXIT_EARLY);
+
+  force_inactive();
+  RebootStatusResponse response = m_reboot_thread.GetResponse();
+  EXPECT_THAT(response,
+              IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_FAILURE,
+                       HasSubstr("check_stage: app: app2 reported FAILED "
+                                 "during stage: checkpoint")));
+}
+
+// Calls checkpoint_stage_two directly with a Select that holds the stop
+// event, a 1 s timer and a subscriber on the warm-restart state table. Both
+// apps report CHECKPOINTED after the subscriber is created, so PROCEED is
+// expected.
+TEST_F(RebootThreadTest, TestPerformCheckpointCompleted) {
+  TestUtils::populate_registration_table(m_db, "docker1|app1", false, false,
+                                         true, false);
+  TestUtils::populate_registration_table(m_db, "docker2|app2", false, false,
+                                         true, false);
+  fetch_registration_info();
+
+  swss::Select s;
+  s.addSelectable(&(return_m_stop_reference()));
+
+  swss::SelectableTimer l_timer(timespec{.tv_sec = 1, .tv_nsec = 0});
+  s.addSelectable(&l_timer);
+
+  swss::SubscriberStateTable sub(&m_db, STATE_WARM_RESTART_TABLE_NAME);
+  s.addSelectable(&sub);
+
+  set_start_status(RebootMethod::NSF, "time to reboot");
+
+  TestUtils::populate_restart_table_state(
+      m_db, "app1", get_warm_start_state_name(WarmStartState::CHECKPOINTED));
+  TestUtils::populate_restart_table_state(
+      m_db, "app2", get_warm_start_state_name(WarmStartState::CHECKPOINTED));
+
+  l_timer.start();
+
+  Progress progress =
checkpoint_stage_two(s, l_timer, sub);
+  EXPECT_EQ(progress, Progress::PROCEED);
+}
+
+// Only app1 is registered, and it reports RECONCILED. app2, which is not
+// registered in this test, reports INITIALIZED and FROZEN. The 1 s timer is
+// expected to expire, giving EXIT_EARLY with a checkpoint timeout.
+TEST_F(RebootThreadTest, TestPerformCheckpointUninterestingStatesIgnored) {
+  TestUtils::populate_registration_table(m_db, "docker1|app1", false, false,
+                                         true, false);
+  fetch_registration_info();
+
+  swss::Select s;
+  s.addSelectable(&(return_m_stop_reference()));
+
+  swss::SelectableTimer l_timer(timespec{.tv_sec = 1, .tv_nsec = 0});
+  s.addSelectable(&l_timer);
+
+  swss::SubscriberStateTable sub(&m_db, STATE_WARM_RESTART_TABLE_NAME);
+  s.addSelectable(&sub);
+
+  set_start_status(RebootMethod::NSF, "time to reboot");
+
+  // Confirm a non checkpoint state isn't treated as CHECKPOINTED
+  TestUtils::populate_restart_table_state(
+      m_db, "app1", get_warm_start_state_name(WarmStartState::RECONCILED));
+  TestUtils::populate_restart_table_state(
+      m_db, "app2", get_warm_start_state_name(WarmStartState::INITIALIZED));
+  TestUtils::populate_restart_table_state(
+      m_db, "app2", get_warm_start_state_name(WarmStartState::FROZEN));
+
+  l_timer.start();
+
+  Progress progress = checkpoint_stage_two(s, l_timer, sub);
+  EXPECT_EQ(progress, Progress::EXIT_EARLY);
+
+  force_inactive();
+  RebootStatusResponse response = m_reboot_thread.GetResponse();
+  EXPECT_THAT(
+      response,
+      IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_FAILURE,
+               HasSubstr("timeout occurred during reboot stage checkpoint")));
+}
+
+// app1 reports CHECKPOINTED and app2 reports FAILED after the subscriber is
+// created. checkpoint_stage_two is expected to exit early with a
+// handle_state_event failure message naming app2.
+TEST_F(RebootThreadTest, TestPerformCheckpointFailed) {
+  TestUtils::populate_registration_table(m_db, "docker1|app1", false, false,
+                                         true, false);
+  TestUtils::populate_registration_table(m_db, "docker2|app2", false, false,
+                                         true, false);
+  fetch_registration_info();
+
+  swss::Select s;
+  s.addSelectable(&(return_m_stop_reference()));
+
+  swss::SelectableTimer l_timer(timespec{.tv_sec = 1, .tv_nsec = 0});
+  s.addSelectable(&l_timer);
+
+  swss::SubscriberStateTable sub(&m_db, STATE_WARM_RESTART_TABLE_NAME);
+  s.addSelectable(&sub);
+
+  set_start_status(RebootMethod::NSF, "time to
reboot");
+
+  TestUtils::populate_restart_table_state(
+      m_db, "app1", get_warm_start_state_name(WarmStartState::CHECKPOINTED));
+  TestUtils::populate_restart_table_state(
+      m_db, "app2", get_warm_start_state_name(WarmStartState::FAILED));
+
+  l_timer.start();
+
+  Progress progress = checkpoint_stage_two(s, l_timer, sub);
+
+  EXPECT_EQ(progress, Progress::EXIT_EARLY);
+  force_inactive();
+  RebootStatusResponse response = m_reboot_thread.GetResponse();
+  EXPECT_THAT(
+      response,
+      IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_FAILURE,
+               "handle_state_event: app: app2 reported FAILED when looking for "
+               "state: checkpoint"));
+}
+
+// app1 reports INITIALIZED, RESTORED and REPLAYED but never CHECKPOINTED.
+// Expects EXIT_EARLY with a checkpoint timeout once the 1 s timer fires.
+TEST_F(RebootThreadTest, TestPerformCheckpointUnexpectedStatesViaSubscription) {
+  TestUtils::populate_registration_table(m_db, "docker1|app1", false, false,
+                                         true, false);
+  fetch_registration_info();
+
+  swss::Select s;
+  s.addSelectable(&(return_m_stop_reference()));
+
+  swss::SelectableTimer l_timer(timespec{.tv_sec = 1, .tv_nsec = 0});
+  s.addSelectable(&l_timer);
+
+  swss::SubscriberStateTable sub(&m_db, STATE_WARM_RESTART_TABLE_NAME);
+  s.addSelectable(&sub);
+
+  set_start_status(RebootMethod::NSF, "time to reboot");
+
+  TestUtils::populate_restart_table_state(
+      m_db, "app1", get_warm_start_state_name(WarmStartState::INITIALIZED));
+  TestUtils::populate_restart_table_state(
+      m_db, "app1", get_warm_start_state_name(WarmStartState::RESTORED));
+  TestUtils::populate_restart_table_state(
+      m_db, "app1", get_warm_start_state_name(WarmStartState::REPLAYED));
+
+  l_timer.start();
+
+  Progress progress = checkpoint_stage_two(s, l_timer, sub);
+
+  EXPECT_EQ(progress, Progress::EXIT_EARLY);
+  force_inactive();
+  RebootStatusResponse response = m_reboot_thread.GetResponse();
+  EXPECT_THAT(
+      response,
+      IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_FAILURE,
+               HasSubstr("timeout occurred during reboot stage checkpoint")));
+}
+
+//
+// Stop On Freeze Tests
+//
+
+// StopContainerStatus returns DBUS_FAIL; check_container_stop is expected to
+// return FAILURE.
+TEST_F(RebootThreadTest, TestCheckContainerStopDbusFail)
{
+  DbusInterface::DbusResponse dbus_response{
+      DbusInterface::DbusStatus::DBUS_FAIL, "dbus reboot failed"};
+  EXPECT_CALL(m_dbus_interface, StopContainerStatus(_))
+      .Times(1)
+      .WillOnce(Return(dbus_response));
+
+  RebootThread::Status status = check_container_stop("requestA");
+  EXPECT_EQ(status, RebootThread::Status::FAILURE);
+}
+
+// The D-Bus call reports DBUS_SUCCESS but its payload is the plain string
+// "dbus reboot failed" rather than a JSON StopContainersResponse; FAILURE is
+// expected.
+TEST_F(RebootThreadTest, TestCheckContainerStopJsonParseFailure) {
+  DbusInterface::DbusResponse dbus_response{
+      DbusInterface::DbusStatus::DBUS_SUCCESS, "dbus reboot failed"};
+  EXPECT_CALL(m_dbus_interface, StopContainerStatus(_))
+      .Times(1)
+      .WillOnce(Return(dbus_response));
+
+  RebootThread::Status status = check_container_stop("requestA");
+  EXPECT_EQ(status, RebootThread::Status::FAILURE);
+}
+
+// The D-Bus call reports DBUS_SUCCESS with a JSON-encoded
+// StopContainersResponse whose status is DONE; SUCCESS is expected.
+TEST_F(RebootThreadTest, TestCheckContainerStopSuccess) {
+  StopContainersResponse response;
+  response.set_status(ShutdownStatus::DONE);
+
+  std::string json_response;
+  gpu::MessageToJsonString(response, &json_response);
+
+  DbusInterface::DbusResponse dbus_response{
+      DbusInterface::DbusStatus::DBUS_SUCCESS, json_response.c_str()};
+  EXPECT_CALL(m_dbus_interface, StopContainerStatus(_))
+      .Times(1)
+      .WillOnce(Return(dbus_response));
+
+  RebootThread::Status status = check_container_stop("requestA");
+  EXPECT_EQ(status, RebootThread::Status::SUCCESS);
+}
+
+// No StopContainerStatus expectation is installed in this test.
+// wait_for_container_stop(300) is expected to exit early and leave the status
+// untouched.
+// NOTE(review): the stop signal is presumably raised inside the fixture's
+// wait_for_container_stop helper -- confirm, it is not visible here.
+TEST_F(RebootThreadTest, TestWaitForContainerStopMStopSignal) {
+  TestUtils::populate_registration_table(m_db, "docker1|app1", true, false,
+                                         false, false);
+  fetch_registration_info();
+  set_start_status(RebootMethod::NSF, "time to reboot");
+
+  Progress progress = wait_for_container_stop(300);
+
+  EXPECT_EQ(progress, Progress::EXIT_EARLY);
+  force_inactive();
+  RebootStatusResponse response = m_reboot_thread.GetResponse();
+
+  // No error on request to stop, just log.
+  EXPECT_THAT(
+      response,
+      IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_UNKNOWN, ""));
+}
+
+// StopContainerStatus is expected exactly once and answers with a JSON
+// StopContainersResponse whose status is DONE; wait_for_container_stop(300)
+// is expected to return PROCEED.
+TEST_F(RebootThreadTest, TestWaitForContainerStopDbusReturnsStopped) {
+  StopContainersResponse response;
+  response.set_status(ShutdownStatus::DONE);
+
+  std::string json_response;
+  gpu::MessageToJsonString(response, &json_response);
+
+  DbusInterface::DbusResponse dbus_response{
+      DbusInterface::DbusStatus::DBUS_SUCCESS, json_response.c_str()};
+  EXPECT_CALL(m_dbus_interface, StopContainerStatus(_))
+      .Times(1)
+      .WillOnce(Return(dbus_response));
+
+  TestUtils::populate_registration_table(m_db, "docker1|app1", true, false,
+                                         false, false);
+  fetch_registration_info();
+  set_start_status(RebootMethod::NSF, "time to reboot");
+
+  Progress progress = wait_for_container_stop(300);
+
+  EXPECT_EQ(progress, Progress::PROCEED);
+}
+
+}  // namespace rebootbackend
diff --git a/src/sonic-framework/tests/rebootbe_test.cpp b/src/sonic-framework/tests/rebootbe_test.cpp
new file mode 100644
index 000000000000..6effce3029be
--- /dev/null
+++ b/src/sonic-framework/tests/rebootbe_test.cpp
@@ -0,0 +1,1294 @@
+#include "rebootbe.h"
+
+// NOTE(review): the header names of the seven bare #include lines below appear
+// to have been stripped from this text -- restore them before building.
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include "container_stop.pb.h"
+#include "mock_reboot_interfaces.h"
+#include "reboot_common.h"
+#include "select.h"
+#include "stateverification.h"
+#include "status_code_util.h"
+#include "system/system.pb.h"
+#include "test_utils_common.h"
+#include "timestamp.h"
+
+namespace rebootbackend {
+
+// Durations used by the tests in this file; the *_MS names are in
+// milliseconds, the others in seconds.
+#define ONE_SECOND (1)
+#define TWO_SECONDS (2)
+#define TENTH_SECOND_MS (100)
+#define HALF_SECOND_MS (500)
+#define ONE_SECOND_MS (1000)
+#define FIFTEEN_HUNDRED_MS (1500)
+#define TWO_SECONDS_MS (2000)
+
+// Shorthand for the protobuf utility namespace (JSON <-> message helpers).
+namespace gpu = ::google::protobuf::util;
+using namespace gnoi::system;
+using WarmStartState = ::swss::WarmStart::WarmStartState;
+using WarmBootStage = ::swss::WarmStart::WarmBootStage;
+
+using ::testing::_;
+using ::testing::AllOf;
+using ::testing::AtLeast;
+using
::testing::ExplainMatchResult;
+using ::testing::HasSubstr;
+using ::testing::InSequence;
+using ::testing::Invoke;
+using ::testing::NiceMock;
+using ::testing::Return;
+using ::testing::StrEq;
+using ::testing::StrictMock;
+
+// Matches a response whose status().status() equals `status` and whose
+// status().message() satisfies `message` (a plain value or a nested matcher,
+// evaluated through ExplainMatchResult).
+MATCHER_P2(IsStatus, status, message, "") {
+  return (arg.status().status() == status &&
+          ExplainMatchResult(message, arg.status().message(), result_listener));
+}
+
+// Matches a response on its active(), count() and method() fields.
+MATCHER_P3(ActiveCountMethod, active, count, method, "") {
+  return (arg.active() == active && arg.count() == (uint32_t)count &&
+          arg.method() == method);
+}
+
+// Fixture that builds a RebootBE on top of mocked interfaces and replaces its
+// init thread with a mock. Its destructor is defaulted, so tests using this
+// fixture directly stop and join RebootBE themselves.
+class RebootBETestWithoutStop : public ::testing::Test {
+ protected:
+  RebootBETestWithoutStop()
+      : m_dbus_interface(),
+        m_critical_interface(),
+        m_db("STATE_DB", 0),
+        m_config_db("CONFIG_DB", 0),
+        m_rebootbeRequestChannel(&m_db, REBOOT_REQUEST_NOTIFICATION_CHANNEL),
+        m_rebootbeReponseChannel(&m_db, REBOOT_RESPONSE_NOTIFICATION_CHANNEL),
+        m_rebootbe(m_dbus_interface, m_critical_interface, m_telemetry) {
+    sigterm_requested = false;
+    TestUtils::clear_tables(m_db);
+
+    // Keep a raw pointer for setting expectations, then hand ownership of the
+    // mock to RebootBE.
+    // NOTE(review): the template argument of std::make_unique appears to be
+    // missing on the next line -- restore the mock init-thread type.
+    auto mock_init_thread = std::make_unique>();
+    m_init_thread = mock_init_thread.get();
+    m_rebootbe.m_init_thread = std::move(mock_init_thread);
+
+    m_s.addSelectable(&m_rebootbeReponseChannel);
+
+    // Make the tests log to stdout, instead of syslog.
+    swss::Table logging_table(&m_config_db, CFG_LOGGER_TABLE_NAME);
+    logging_table.hset("rebootbackend", swss::DAEMON_LOGOUTPUT, "STDOUT");
+    swss::Logger::restartLogger();
+  }
+  virtual ~RebootBETestWithoutStop() = default;
+
+  // Writes the warm-restart "enable" flag for "system" and "sonic-framework",
+  // and sets rebootbackend's restore_count to "0" when enabled or "" otherwise.
+  void force_warm_start_state(bool enabled) {
+    swss::Table enable_table(&m_db, STATE_WARM_RESTART_ENABLE_TABLE_NAME);
+    enable_table.hset("system", "enable", enabled ? "true" : "false");
+    enable_table.hset("sonic-framework", "enable", enabled ? "true" : "false");
+
+    swss::Table restart_table(&m_db, STATE_WARM_RESTART_TABLE_NAME);
+    restart_table.hset("rebootbackend", "restore_count", enabled ?
"0" : "");
+  }
+
+  // Response of an InitThreadStatus that was never started.
+  gnoi::system::RebootStatusResponse default_not_started_status() {
+    InitThreadStatus status;
+    return status.get_response();
+  }
+
+  // Response of an InitThreadStatus that was started, marked successful and
+  // then set inactive.
+  gnoi::system::RebootStatusResponse default_done_status() {
+    InitThreadStatus status;
+    // We can't edit the status without it being active.
+    status.set_start_status();
+    status.set_success();
+    status.set_inactive();
+    return status.get_response();
+  }
+
+  // Response of a started InitThreadStatus that is still waiting for
+  // reconciliation.
+  gnoi::system::RebootStatusResponse default_running_status() {
+    InitThreadStatus status;
+    status.set_start_status();
+    status.set_detailed_thread_status(
+        InitThreadStatus::ThreadStatus::WAITING_FOR_RECONCILIATION);
+    return status.get_response();
+  }
+
+  // Response of a started InitThreadStatus that recorded a
+  // RECONCILIATION_FAILED error.
+  gnoi::system::RebootStatusResponse default_error_status() {
+    InitThreadStatus status;
+    status.set_start_status();
+    status.set_error(InitThreadStatus::ErrorCondition::RECONCILIATION_FAILED,
+                     "Fake reconciliation failed");
+    return status.get_response();
+  }
+
+  // Runs RebootBE::Start on a background thread owned by the fixture.
+  // std::make_unique cannot deduce its object type, so the std::thread
+  // template argument must be spelled out.
+  void start_rebootbe() {
+    m_rebootbe_thread =
+        std::make_unique<std::thread>(&RebootBE::Start, &m_rebootbe);
+  }
+
+  // Makes the mocked D-Bus Reboot call succeed by default.
+  void set_mock_defaults() {
+    ON_CALL(m_dbus_interface, Reboot(_))
+        .WillByDefault(Return(DbusInterface::DbusResponse{
+            DbusInterface::DbusStatus::DBUS_SUCCESS, ""}));
+  }
+
+  // The overwrite_* helpers below poke timeout fields of the reboot thread
+  // owned by RebootBE so tests can shorten them.
+  void overwrite_reboot_timeout(uint32_t timeout_seconds) {
+    m_rebootbe.m_reboot_thread.m_reboot_timeout = timeout_seconds;
+  }
+
+  void overwrite_state_verification_timeout(uint32_t timeout_seconds) {
+    m_rebootbe.m_reboot_thread.m_state_verification_timeout = timeout_seconds;
+  }
+
+  void overwrite_quiescent_timeout_ms(uint64_t timeout_ms) {
+    m_rebootbe.m_reboot_thread.m_quiescence_timeout_ms = timeout_ms;
+  }
+
+  void overwrite_quiescent_hold_time_ms(uint64_t timeout_ms) {
+    m_rebootbe.m_reboot_thread.m_quiescence_hold_time_ms = timeout_ms;
+  }
+
+  void overwrite_checkpoint_timeout(uint32_t timeout_seconds) {
+    m_rebootbe.m_reboot_thread.m_checkpoint_timeout = timeout_seconds;
+  }
+
+  // Asks the reboot thread inside RebootBE to stop.
+  void send_stop_reboot_thread() { m_rebootbe.m_reboot_thread.Stop(); }
+
+  void
SendRebootRequest(const std::string &op, const std::string &data, + const std::string &field, const std::string &value) { + std::vector values; + values.push_back(swss::FieldValueTuple{field, value}); + + m_rebootbeRequestChannel.send(op, data, values); + } + + void SendRebootViaProto(RebootRequest &request) { + std::string json_string; + gpu::MessageToJsonString(request, &json_string); + + SendRebootRequest("Reboot", "StatusCode", DATA_TUPLE_KEY, json_string); + } + + void SendRebootStatusRequest(void) { + SendRebootRequest("RebootStatus", "StatusCode", DATA_TUPLE_KEY, + "json status request"); + } + + void start_reboot_via_rpc( + RebootRequest &request, + swss::StatusCode expected_result = swss::StatusCode::SWSS_RC_SUCCESS) { + SendRebootViaProto(request); + while (true) { + int ret; + swss::Selectable *sel; + ret = m_s.select(&sel, SELECT_TIMEOUT_MS); + if (ret != swss::Select::OBJECT) continue; + if (sel != &m_rebootbeReponseChannel) continue; + break; + } + std::string op, data; + std::vector ret_values; + m_rebootbeReponseChannel.pop(op, data, ret_values); + + EXPECT_THAT(op, StrEq("Reboot")); + EXPECT_THAT(data, StrEq(swss::statusCodeToStr(expected_result))); + } + + gnoi::system::RebootStatusResponse do_reboot_status_rpc() { + SendRebootStatusRequest(); + while (true) { + int ret; + swss::Selectable *sel; + ret = m_s.select(&sel, SELECT_TIMEOUT_MS); + if (ret != swss::Select::OBJECT) continue; + if (sel != &m_rebootbeReponseChannel) continue; + break; + } + std::string op, data; + std::vector ret_values; + m_rebootbeReponseChannel.pop(op, data, ret_values); + + EXPECT_THAT(op, StrEq("RebootStatus")); + EXPECT_EQ(data, swss::statusCodeToStr(swss::StatusCode::SWSS_RC_SUCCESS)); + + std::string json_response; + for (auto &fv : ret_values) { + if (DATA_TUPLE_KEY == fvField(fv)) { + json_response = fvValue(fv); + } + } + gnoi::system::RebootStatusResponse response; + gpu::JsonStringToMessage(json_response, &response); + return response; + } + + void 
GetNotificationResponse(swss::NotificationConsumer &consumer,
+                               std::string &op, std::string &data,
+                               std::vector<swss::FieldValueTuple> &values) {
+    // Waits once (up to SELECT_TIMEOUT_MS) on `consumer`, then pops whatever
+    // is queued into the output arguments.
+    swss::Select s;
+    s.addSelectable(&consumer);
+    swss::Selectable *sel;
+    s.select(&sel, SELECT_TIMEOUT_MS);
+
+    consumer.pop(op, data, values);
+  }
+
+  // Forwards directly to RebootBE::handle_reboot_request.
+  NotificationResponse handle_reboot_request(std::string &json_request) {
+    return m_rebootbe.handle_reboot_request(json_request);
+  }
+
+  // Installs telemetry expectations for a whole run: the overall record, the
+  // freeze stage, and the checkpoint stage only when freeze succeeds.
+  void set_all_telemetry_expects(bool freeze_status = true,
+                                 bool checkpoint_status = true) {
+    set_telemetry_overall_expects(freeze_status && checkpoint_status);
+    set_telemetry_stage_expects(WarmBootStage::STAGE_FREEZE, freeze_status);
+    if (freeze_status) {
+      set_telemetry_stage_expects(WarmBootStage::STAGE_CHECKPOINT,
+                                  checkpoint_status);
+    }
+  }
+
+  // Expects one record_overall_start(); record_overall_end() is expected only
+  // for the failure case.
+  void set_telemetry_overall_expects(bool success) {
+    EXPECT_CALL(m_telemetry, record_overall_start()).Times(1);
+    if (!success) {
+      EXPECT_CALL(m_telemetry, record_overall_end(success)).Times(1);
+    }
+  }
+
+  // Expects exactly one start and one end record for `nsf_stage`.
+  void set_telemetry_stage_expects(WarmBootStage nsf_stage, bool success) {
+    EXPECT_CALL(m_telemetry, record_stage_start(nsf_stage)).Times(1);
+    EXPECT_CALL(m_telemetry, record_stage_end(nsf_stage, success)).Times(1);
+  }
+
+  // Accessors for RebootBE's internal events so tests can trigger them.
+  swss::SelectableEvent &get_stack_unfrozen_select() {
+    return m_rebootbe.m_stack_unfrozen;
+  }
+
+  swss::SelectableEvent &get_init_done_select() {
+    return m_rebootbe.m_init_thread_done;
+  }
+
+  // Mock interfaces.
+  // NOTE(review): the mock class template arguments of the three members below
+  // appear to be missing -- restore them from mock_reboot_interfaces.h.
+  NiceMock m_dbus_interface;
+  NiceMock m_critical_interface;
+  StrictMock m_telemetry;
+
+  // DB connectors
+  swss::DBConnector m_db;
+  swss::DBConnector m_config_db;
+
+  // Reboot thread signaling.
+  swss::NotificationProducer m_rebootbeRequestChannel;
+  swss::Select m_s;
+  swss::NotificationConsumer m_rebootbeReponseChannel;
+
+  // Module under test.
+  // The thread that runs RebootBE::Start; null until start_rebootbe() runs.
+  std::unique_ptr<std::thread> m_rebootbe_thread;
+  RebootBE m_rebootbe;
+
+  // Not owned by test.
+  // NOTE(review): the mock class template argument appears to be missing on
+  // the next line -- restore the mock init-thread type.
+  StrictMock *m_init_thread;
+};
+
+// Same fixture, but teardown stops RebootBE and joins its thread so each test
+// body only has to call start_rebootbe().
+class RebootBETest : public RebootBETestWithoutStop {
+ protected:
+  ~RebootBETest() override {
+    m_rebootbe.Stop();
+    // start_rebootbe() may not have run (for example when a test bails out
+    // early); dereferencing the null thread pointer would crash the whole
+    // test binary instead of failing one test.
+    if (m_rebootbe_thread && m_rebootbe_thread->joinable()) {
+      m_rebootbe_thread->join();
+    }
+  }
+};
+
+// Init sequence testing.
+
+// Warm start disabled: RebootBE is expected to be IDLE shortly after start
+// and a status request to report an inactive COLD method with STATUS_SUCCESS.
+TEST_F(RebootBETest, ColdbootInitWorks) {
+  force_warm_start_state(false);
+
+  EXPECT_CALL(*m_init_thread, GetResponse())
+      .Times(2)
+      .WillRepeatedly(Return(default_not_started_status()));
+
+  start_rebootbe();
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));
+  EXPECT_EQ(m_rebootbe.GetCurrentStatus(), RebootBE::NsfManagerStatus::IDLE);
+
+  gnoi::system::RebootStatusResponse response = do_reboot_status_rpc();
+  EXPECT_THAT(response, ActiveCountMethod(false, 0, RebootMethod::COLD));
+  EXPECT_THAT(
+      response,
+      IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_SUCCESS, ""));
+}
+
+// Warm start enabled: RebootBE stays in NSF_INIT_WAIT until the init-done
+// event fires, then becomes IDLE. Status is queried during and after init.
+TEST_F(RebootBETest, WarmbootInitWorks) {
+  force_warm_start_state(true);
+
+  {
+    InSequence seq;
+    EXPECT_CALL(*m_init_thread, Start())
+        .WillOnce(Return(swss::StatusCode::SWSS_RC_SUCCESS));
+
+    // Status request during warmboot init, then during Join sequence.
+    EXPECT_CALL(*m_init_thread, GetResponse())
+        .Times(2)
+        .WillRepeatedly(Return(default_running_status()))
+        .RetiresOnSaturation();
+
+    // Normal Join sequence when reaching COMPLETED.
+    EXPECT_CALL(*m_init_thread, Join()).WillOnce(Return(true));
+
+    // Status request after warmboot init, then cleanup sequence.
+    EXPECT_CALL(*m_init_thread, GetResponse())
+        .Times(2)
+        .WillRepeatedly(Return(default_done_status()));
+  }
+
+  start_rebootbe();
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));
+  EXPECT_EQ(m_rebootbe.GetCurrentStatus(),
+            RebootBE::NsfManagerStatus::NSF_INIT_WAIT);
+
+  // Check status during init.
+  EXPECT_THAT(
+      do_reboot_status_rpc(),
+      AllOf(ActiveCountMethod(true, 0, RebootMethod::NSF),
+            IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_UNKNOWN,
+                     "")));
+
+  get_stack_unfrozen_select().notify();
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));
+  EXPECT_EQ(m_rebootbe.GetCurrentStatus(),
+            RebootBE::NsfManagerStatus::NSF_INIT_WAIT);
+
+  get_init_done_select().notify();
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));
+  EXPECT_EQ(m_rebootbe.GetCurrentStatus(), RebootBE::NsfManagerStatus::IDLE);
+
+  // Check that NSF status is sticky after init, before a new coldboot starts.
+  EXPECT_THAT(
+      do_reboot_status_rpc(),
+      AllOf(ActiveCountMethod(false, 0, RebootMethod::NSF),
+            IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_SUCCESS,
+                     "")));
+}
+
+// The mocked init thread's Start() returns SWSS_RC_INTERNAL; RebootBE is
+// expected to be IDLE (not NSF_INIT_WAIT) shortly afterwards.
+TEST_F(RebootBETest, InitThreadFailsToStart) {
+  force_warm_start_state(true);
+
+  {
+    InSequence seq;
+    EXPECT_CALL(*m_init_thread, Start())
+        .WillOnce(Return(swss::StatusCode::SWSS_RC_INTERNAL));
+
+    // Cleanup sequence.
+    EXPECT_CALL(*m_init_thread, GetResponse())
+        .WillOnce(Return(default_not_started_status()));
+  }
+
+  start_rebootbe();
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));
+  EXPECT_EQ(m_rebootbe.GetCurrentStatus(), RebootBE::NsfManagerStatus::IDLE);
+}
+
+// While RebootBE is still in NSF_INIT_WAIT, an NSF reboot request is expected
+// to be answered with SWSS_RC_IN_USE and to leave the state unchanged.
+TEST_F(RebootBETest, WarmbootInProgressBlocksNewWarmboot) {
+  force_warm_start_state(true);
+
+  // Start InitThread, but do not run to completion.
+  {
+    InSequence seq;
+    EXPECT_CALL(*m_init_thread, Start())
+        .WillOnce(Return(swss::StatusCode::SWSS_RC_SUCCESS));
+
+    // Cleanup sequence.
+    EXPECT_CALL(*m_init_thread, GetResponse())
+        .WillOnce(Return(default_done_status()));
+  }
+
+  start_rebootbe();
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));
+  EXPECT_EQ(m_rebootbe.GetCurrentStatus(),
+            RebootBE::NsfManagerStatus::NSF_INIT_WAIT);
+
+  // Send a warmboot request, confirm it fails.
+  RebootRequest request;
+  request.set_method(RebootMethod::NSF);
+  start_reboot_via_rpc(request, swss::StatusCode::SWSS_RC_IN_USE);
+
+  std::this_thread::sleep_for(std::chrono::milliseconds(TENTH_SECOND_MS));
+  EXPECT_EQ(m_rebootbe.GetCurrentStatus(),
+            RebootBE::NsfManagerStatus::NSF_INIT_WAIT);
+}
+
+// While RebootBE is still in NSF_INIT_WAIT, a COLD reboot request is expected
+// to be accepted and to move the state to COLD_REBOOT_IN_PROGRESS.
+TEST_F(RebootBETest, ColdbootWhileWarmbootInProgress) {
+  force_warm_start_state(true);
+  set_mock_defaults();
+
+  // Start InitThread, but do not run to completion.
+  {
+    InSequence seq;
+    EXPECT_CALL(*m_init_thread, Start())
+        .WillOnce(Return(swss::StatusCode::SWSS_RC_SUCCESS));
+
+    // Cleanup sequence.
+    EXPECT_CALL(*m_init_thread, GetResponse())
+        .WillOnce(Return(default_done_status()));
+  }
+
+  start_rebootbe();
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));
+  EXPECT_EQ(m_rebootbe.GetCurrentStatus(),
+            RebootBE::NsfManagerStatus::NSF_INIT_WAIT);
+
+  // Send a coldboot request, confirm it starts.
+  RebootRequest request;
+  request.set_method(RebootMethod::COLD);
+  start_reboot_via_rpc(request);
+
+  std::this_thread::sleep_for(std::chrono::milliseconds(TENTH_SECOND_MS));
+  EXPECT_EQ(m_rebootbe.GetCurrentStatus(),
+            RebootBE::NsfManagerStatus::COLD_REBOOT_IN_PROGRESS);
+
+  // Cleanup without going through the whole reboot.
+  send_stop_reboot_thread();
+}
+
+// RebootBE is stopped while still in NSF_INIT_WAIT. The mock init thread is
+// expected to be stopped and joined, and the reported state to remain
+// NSF_INIT_WAIT after the RebootBE thread has been joined.
+TEST_F(RebootBETestWithoutStop, WarmbootStopDuringInit) {
+  force_warm_start_state(true);
+
+  {
+    InSequence seq;
+    EXPECT_CALL(*m_init_thread, Start())
+        .WillOnce(Return(swss::StatusCode::SWSS_RC_SUCCESS));
+
+    // Stop triggers the cleanup sequence without either of the
+    // SelectableEvents being triggered.
+ EXPECT_CALL(*m_init_thread, GetResponse()) + .WillOnce(Return(default_running_status())); + EXPECT_CALL(*m_init_thread, Stop()).Times(1); + EXPECT_CALL(*m_init_thread, Join()).WillOnce(Return(true)); + } + + start_rebootbe(); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + EXPECT_EQ(m_rebootbe.GetCurrentStatus(), + RebootBE::NsfManagerStatus::NSF_INIT_WAIT); + + // Manually join and verify state. + m_rebootbe.Stop(); + m_rebootbe_thread->join(); + EXPECT_EQ(m_rebootbe.GetCurrentStatus(), + RebootBE::NsfManagerStatus::NSF_INIT_WAIT); +} + +// The mocked Start() fires the init-done event itself and GetResponse() then +// reports an error, without the stack-unfrozen event ever being notified; +// RebootBE is expected to become IDLE and stay IDLE after Stop(). +TEST_F(RebootBETestWithoutStop, WarmbootErrorBeforeUnfreeze) { + force_warm_start_state(true); + + { + InSequence seq; + // Immediately report an error from the InitThread after starting. + auto done_lambda = [&] { + get_init_done_select().notify(); + return swss::StatusCode::SWSS_RC_SUCCESS; + }; + EXPECT_CALL(*m_init_thread, Start()).WillOnce(Invoke(done_lambda)); + + // Normal Join sequence when reaching COMPLETED. + EXPECT_CALL(*m_init_thread, GetResponse()) + .WillOnce(Return(default_error_status())) + .RetiresOnSaturation(); + EXPECT_CALL(*m_init_thread, Join()).WillOnce(Return(false)); + + // Cleanup sequence. + EXPECT_CALL(*m_init_thread, GetResponse()) + .WillOnce(Return(default_done_status())); + } + + start_rebootbe(); + + // Immediately handle InitThread error and become IDLE. + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + EXPECT_EQ(m_rebootbe.GetCurrentStatus(), RebootBE::NsfManagerStatus::IDLE); + + // Manually join and verify state. + m_rebootbe.Stop(); + m_rebootbe_thread->join(); + EXPECT_EQ(m_rebootbe.GetCurrentStatus(), RebootBE::NsfManagerStatus::IDLE); +} + +// The stack-unfrozen event is notified first (status stays NSF_INIT_WAIT), +// then the init-done event arrives with an error status and RebootBE is +// expected to become IDLE. +TEST_F(RebootBETestWithoutStop, WarmbootErrorBeforeComplete) { + force_warm_start_state(true); + + { + InSequence seq; + EXPECT_CALL(*m_init_thread, Start()) + .WillOnce(Return(swss::StatusCode::SWSS_RC_SUCCESS)); + + // Normal Join sequence when reaching COMPLETED. 
+ EXPECT_CALL(*m_init_thread, GetResponse()) + .WillOnce(Return(default_error_status())) + .RetiresOnSaturation(); + EXPECT_CALL(*m_init_thread, Join()).WillOnce(Return(false)); + + // Cleanup sequence. + EXPECT_CALL(*m_init_thread, GetResponse()) + .WillOnce(Return(default_error_status())); + EXPECT_CALL(*m_init_thread, Stop()).Times(1); + EXPECT_CALL(*m_init_thread, Join()).WillOnce(Return(false)); + } + + start_rebootbe(); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + EXPECT_EQ(m_rebootbe.GetCurrentStatus(), + RebootBE::NsfManagerStatus::NSF_INIT_WAIT); + + // Advance to waiting for unfreeze. + get_stack_unfrozen_select().notify(); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + EXPECT_EQ(m_rebootbe.GetCurrentStatus(), + RebootBE::NsfManagerStatus::NSF_INIT_WAIT); + + // Triggered as part of InitThread error reporting. + get_init_done_select().notify(); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + EXPECT_EQ(m_rebootbe.GetCurrentStatus(), RebootBE::NsfManagerStatus::IDLE); + + // Manually join and verify state. + m_rebootbe.Stop(); + m_rebootbe_thread->join(); + EXPECT_EQ(m_rebootbe.GetCurrentStatus(), RebootBE::NsfManagerStatus::IDLE); +} + +// Test fixture to skip through the startup sequence into the main loop. +// Param indicates if RebootBE should be initialized into a state where the +// system came up in warmboot. 
+class RebootBEAutoStartTest : public RebootBETest, + public ::testing::WithParamInterface<bool> { + protected: + RebootBEAutoStartTest() { + force_warm_start_state(GetParam()); + + if (GetParam()) { + /* Warm start: the InitThread is started and joined once; it reports running on the first GetResponse() and done afterwards. */ + EXPECT_CALL(*m_init_thread, Start()) + .WillOnce(Return(swss::StatusCode::SWSS_RC_SUCCESS)); + EXPECT_CALL(*m_init_thread, Join()).WillOnce(Return(true)); + EXPECT_CALL(*m_init_thread, GetResponse()) + .WillOnce(Return(default_running_status())) + .WillRepeatedly(Return(default_done_status())); + } else { + /* Cold start: the InitThread is never started. */ + EXPECT_CALL(*m_init_thread, GetResponse()) + .WillRepeatedly(Return(default_not_started_status())); + } + + start_rebootbe(); + + if (GetParam()) { + /* Drive the warm-start init sequence to completion. */ + get_stack_unfrozen_select().notify(); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + get_init_done_select().notify(); + } + + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + EXPECT_EQ(m_rebootbe.GetCurrentStatus(), RebootBE::NsfManagerStatus::IDLE); + } +}; + +// Normal operation testing. +TEST_P(RebootBEAutoStartTest, NonExistentMessage) { + swss::NotificationConsumer consumer(&m_db, + REBOOT_RESPONSE_NOTIFICATION_CHANNEL); + + // No "MESSAGE" in field/values + SendRebootRequest("Reboot", "StatusCode", "field1", "field1_value"); + EXPECT_EQ(m_rebootbe.GetCurrentStatus(), RebootBE::NsfManagerStatus::IDLE); + + std::string op, data; + std::vector<swss::FieldValueTuple> ret_values; + GetNotificationResponse(consumer, op, data, ret_values); + + EXPECT_THAT(op, StrEq("Reboot")); + EXPECT_THAT( + data, + StrEq(swss::statusCodeToStr(swss::StatusCode::SWSS_RC_INVALID_PARAM))); +} + +// A CancelReboot request is answered with SWSS_RC_UNIMPLEMENTED and leaves +// RebootBE IDLE. +TEST_P(RebootBEAutoStartTest, TestCancelReboot) { + swss::NotificationConsumer consumer(&m_db, + REBOOT_RESPONSE_NOTIFICATION_CHANNEL); + + SendRebootRequest("CancelReboot", "StatusCode", DATA_TUPLE_KEY, + "json cancelreboot request"); + EXPECT_EQ(m_rebootbe.GetCurrentStatus(), RebootBE::NsfManagerStatus::IDLE); + + std::string op, data; + std::vector<swss::FieldValueTuple> ret_values; + GetNotificationResponse(consumer, op, data, ret_values); + + EXPECT_THAT(op, 
StrEq("CancelReboot")); + EXPECT_THAT( + data, + StrEq(swss::statusCodeToStr(swss::StatusCode::SWSS_RC_UNIMPLEMENTED))); +} + +TEST_P(RebootBEAutoStartTest, TestUnrecognizedOP) { + swss::NotificationConsumer consumer(&m_db, + REBOOT_RESPONSE_NOTIFICATION_CHANNEL); + + SendRebootRequest("NonOp", "StatusCode", DATA_TUPLE_KEY, "invalid op code"); + EXPECT_EQ(m_rebootbe.GetCurrentStatus(), RebootBE::NsfManagerStatus::IDLE); + + std::string op, data; + std::vector<swss::FieldValueTuple> ret_values; + GetNotificationResponse(consumer, op, data, ret_values); + + EXPECT_THAT(op, StrEq("NonOp")); + EXPECT_THAT( + data, + StrEq(swss::statusCodeToStr(swss::StatusCode::SWSS_RC_INVALID_PARAM))); +} + +/* Three cold reboot requests in a row: the mocked dbus Reboot call succeeds each time but the platform never reboots, so every attempt is reported as "platform failed to reboot" and the count reaches 3. */ +TEST_P(RebootBEAutoStartTest, TestColdRebootDbusToCompletion) { + DbusInterface::DbusResponse dbus_response{ + DbusInterface::DbusStatus::DBUS_SUCCESS, ""}; + EXPECT_CALL(m_dbus_interface, Reboot(_)) + .Times(3) + .WillRepeatedly(Return(dbus_response)); + + EXPECT_CALL(m_critical_interface, + report_critical_state("platform failed to reboot")) + .Times(3); + overwrite_reboot_timeout(1); + + RebootRequest request; + request.set_method(RebootMethod::COLD); + start_reboot_via_rpc(request); + + std::this_thread::sleep_for(std::chrono::milliseconds(TENTH_SECOND_MS)); + EXPECT_EQ(m_rebootbe.GetCurrentStatus(), + RebootBE::NsfManagerStatus::COLD_REBOOT_IN_PROGRESS); + sleep(TWO_SECONDS); + + EXPECT_EQ(m_rebootbe.GetCurrentStatus(), RebootBE::NsfManagerStatus::IDLE); + gnoi::system::RebootStatusResponse response = do_reboot_status_rpc(); + EXPECT_THAT(response, ActiveCountMethod(false, 1, RebootMethod::COLD)); + EXPECT_THAT(response, + IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_FAILURE, + "platform failed to reboot")); + + start_reboot_via_rpc(request); + sleep(TWO_SECONDS); + + start_reboot_via_rpc(request); + sleep(TWO_SECONDS); + + response = do_reboot_status_rpc(); + // Verify count is 3 after three reboot attempts. 
+ EXPECT_THAT(response, ActiveCountMethod(false, 3, RebootMethod::COLD)); + EXPECT_THAT(response, + IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_FAILURE, + "platform failed to reboot")); +} + +TEST_P(RebootBEAutoStartTest, TestColdBootSigterm) { + sigterm_requested = true; + set_mock_defaults(); + overwrite_reboot_timeout(1); + + RebootRequest request; + request.set_method(RebootMethod::COLD); + start_reboot_via_rpc(request); + + sleep(ONE_SECOND); + + EXPECT_EQ(m_rebootbe.GetCurrentStatus(), RebootBE::NsfManagerStatus::IDLE); + gnoi::system::RebootStatusResponse second_resp = do_reboot_status_rpc(); + EXPECT_THAT(second_resp, ActiveCountMethod(false, 1, RebootMethod::COLD)); + EXPECT_THAT( + second_resp, + IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_UNKNOWN, "")); +} + +TEST_P(RebootBEAutoStartTest, TestColdBootDbusError) { + // Return FAIL from dbus reboot call. + DbusInterface::DbusResponse dbus_response{ + DbusInterface::DbusStatus::DBUS_FAIL, "dbus reboot failed"}; + EXPECT_CALL(m_dbus_interface, Reboot(_)) + .Times(1) + .WillOnce(Return(dbus_response)); + + RebootRequest request; + request.set_method(RebootMethod::COLD); + start_reboot_via_rpc(request); + + sleep(TWO_SECONDS); + + EXPECT_EQ(m_rebootbe.GetCurrentStatus(), RebootBE::NsfManagerStatus::IDLE); + gnoi::system::RebootStatusResponse second_resp = do_reboot_status_rpc(); + EXPECT_THAT(second_resp, ActiveCountMethod(false, 1, RebootMethod::COLD)); + EXPECT_THAT(second_resp, + IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_FAILURE, + "dbus reboot failed")); +} + +// Stopping the reboot thread while a cold reboot is in progress returns +// RebootBE to IDLE; the status RPC then reports STATUS_UNKNOWN with count 1. +TEST_P(RebootBEAutoStartTest, TestStopDuringColdBoot) { + set_mock_defaults(); + + RebootRequest request; + request.set_method(RebootMethod::COLD); + start_reboot_via_rpc(request); + std::this_thread::sleep_for(std::chrono::milliseconds(TENTH_SECOND_MS)); + EXPECT_EQ(m_rebootbe.GetCurrentStatus(), + RebootBE::NsfManagerStatus::COLD_REBOOT_IN_PROGRESS); + + send_stop_reboot_thread(); + 
std::this_thread::sleep_for(std::chrono::milliseconds(TENTH_SECOND_MS)); + EXPECT_EQ(m_rebootbe.GetCurrentStatus(), RebootBE::NsfManagerStatus::IDLE); + + gnoi::system::RebootStatusResponse response = do_reboot_status_rpc(); + EXPECT_THAT(response, ActiveCountMethod(false, 1, RebootMethod::COLD)); + EXPECT_THAT( + response, + IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_UNKNOWN, "")); +} + +TEST_P(RebootBEAutoStartTest, TestNSFToCompletion) { + set_mock_defaults(); + + EXPECT_CALL(m_critical_interface, + report_critical_state("platform failed to reboot")) + .Times(1); + + set_telemetry_overall_expects(false); + set_telemetry_stage_expects(WarmBootStage::STAGE_FREEZE, true); + set_telemetry_stage_expects(WarmBootStage::STAGE_CHECKPOINT, true); + + overwrite_reboot_timeout(1); + overwrite_quiescent_hold_time_ms(100); + + // Skip state verification. + TestUtils::set_state_verification_enable(m_config_db, false, false); + + TestUtils::populate_registration_table(m_db, "docker1|app1", false, true, + true, false); + + // Run on a helper thread below: after a short delay, check that the NSF + // reboot is in progress and then mark app1 as CHECKPOINTED. + auto test_sequence = [&] { + std::this_thread::sleep_for(std::chrono::milliseconds(TENTH_SECOND_MS)); + EXPECT_EQ(m_rebootbe.GetCurrentStatus(), + RebootBE::NsfManagerStatus::NSF_REBOOT_IN_PROGRESS); + TestUtils::populate_restart_table_state( + m_db, "app1", get_warm_start_state_name(WarmStartState::CHECKPOINTED)); + }; + + RebootRequest request; + request.set_method(RebootMethod::NSF); + start_reboot_via_rpc(request); + + // Warm start states are cleared at beginning of NSF boot. 
+ std::thread test_thread = std::thread(test_sequence); + + // 1 second reboot timeout + // 1/10 second delay before warm state is written in test + // 1/10 second delay for quiescent hold time + std::this_thread::sleep_for(std::chrono::milliseconds(TWO_SECONDS_MS)); + + gnoi::system::RebootStatusResponse response = do_reboot_status_rpc(); + test_thread.join(); + + EXPECT_EQ(m_rebootbe.GetCurrentStatus(), RebootBE::NsfManagerStatus::IDLE); + EXPECT_THAT(response, ActiveCountMethod(false, 1, RebootMethod::NSF)); + EXPECT_THAT(response, + IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_FAILURE, + "platform failed to reboot")); + TestUtils::check_warmboot_enabled(m_db, /*expected_state=*/false); +} + +// With state verification enabled and its timeout overwritten to 1, the NSF +// reboot ends with a retriable state-verification timeout failure. +TEST_P(RebootBEAutoStartTest, TestStateVerificationFailedTimeout) { + set_mock_defaults(); + + overwrite_state_verification_timeout(1); + set_telemetry_overall_expects(/*success=*/false); + + TestUtils::set_state_verification_enable(m_config_db, false, true); + + // Empty registration: if we fail, it should be because of + // state verification + TestUtils::populate_registration_table(m_db, "docker1|app1", false, false, + false, false); + + RebootRequest request; + request.set_method(RebootMethod::NSF); + start_reboot_via_rpc(request); + + std::this_thread::sleep_for(std::chrono::milliseconds(TENTH_SECOND_MS)); + EXPECT_EQ(m_rebootbe.GetCurrentStatus(), + RebootBE::NsfManagerStatus::NSF_REBOOT_IN_PROGRESS); + + // We have to wait for the 1 second state verification timeout + std::this_thread::sleep_for(std::chrono::milliseconds(TWO_SECONDS_MS)); + + EXPECT_EQ(m_rebootbe.GetCurrentStatus(), RebootBE::NsfManagerStatus::IDLE); + gnoi::system::RebootStatusResponse response = do_reboot_status_rpc(); + EXPECT_THAT(response, ActiveCountMethod(false, 1, RebootMethod::NSF)); + EXPECT_THAT( + response, + IsStatus( + RebootStatus_Status::RebootStatus_Status_STATUS_RETRIABLE_FAILURE, + "timeout occurred during reboot state verification: retriable " + "error")); + 
TestUtils::check_warmboot_enabled(m_db, /*expected_state=*/false); +} + +TEST_P(RebootBEAutoStartTest, TestQuiescenceFailedTimeout) { + set_mock_defaults(); + + overwrite_quiescent_timeout_ms(400); + set_all_telemetry_expects(/*freeze_status=*/false, + /*checkpoint_status=*/false); + + TestUtils::set_state_verification_enable(m_config_db, false, false); + + TestUtils::populate_registration_table(m_db, "docker1|app1", false, true, + false, false); + + RebootRequest request; + request.set_method(RebootMethod::NSF); + start_reboot_via_rpc(request); + + std::this_thread::sleep_for(std::chrono::milliseconds(TENTH_SECOND_MS)); + EXPECT_EQ(m_rebootbe.GetCurrentStatus(), + RebootBE::NsfManagerStatus::NSF_REBOOT_IN_PROGRESS); + + // Sleep past the 400 ms quiescence timeout configured above. + std::this_thread::sleep_for(std::chrono::milliseconds(ONE_SECOND_MS)); + + EXPECT_EQ(m_rebootbe.GetCurrentStatus(), RebootBE::NsfManagerStatus::IDLE); + gnoi::system::RebootStatusResponse response = do_reboot_status_rpc(); + EXPECT_THAT(response, ActiveCountMethod(false, 1, RebootMethod::NSF)); + EXPECT_THAT(response, + IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_FAILURE, + "timeout occurred during reboot stage freeze")); + TestUtils::check_warmboot_enabled(m_db, /*expected_state=*/false); +} + +// app1 is registered with only the checkpoint flag set and never reports +// CHECKPOINTED, so the NSF reboot fails with a checkpoint-stage timeout. +TEST_P(RebootBEAutoStartTest, TestCheckpointFailedTimeout) { + set_mock_defaults(); + + overwrite_checkpoint_timeout(1); + overwrite_quiescent_hold_time_ms(100); + set_all_telemetry_expects(/*freeze_status=*/true, + /*checkpoint_status=*/false); + + TestUtils::set_state_verification_enable(m_config_db, false, false); + + TestUtils::populate_registration_table(m_db, "docker1|app1", false, false, + true, false); + + RebootRequest request; + request.set_method(RebootMethod::NSF); + start_reboot_via_rpc(request); + + std::this_thread::sleep_for(std::chrono::milliseconds(TENTH_SECOND_MS)); + EXPECT_EQ(m_rebootbe.GetCurrentStatus(), + RebootBE::NsfManagerStatus::NSF_REBOOT_IN_PROGRESS); + + // 1/10 
second for quiescence hold time + // 1 second for checkpoint timeout + std::this_thread::sleep_for(std::chrono::milliseconds(TWO_SECONDS_MS)); + + EXPECT_EQ(m_rebootbe.GetCurrentStatus(), RebootBE::NsfManagerStatus::IDLE); + gnoi::system::RebootStatusResponse response = do_reboot_status_rpc(); + EXPECT_THAT(response, ActiveCountMethod(false, 1, RebootMethod::NSF)); + EXPECT_THAT( + response, + IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_FAILURE, + HasSubstr("timeout occurred during reboot stage checkpoint"))); + TestUtils::check_warmboot_enabled(m_db, /*expected_state=*/false); +} + +// A dbus failure on the Reboot call ends the NSF attempt with STATUS_FAILURE +// carrying the dbus error text; the freeze and checkpoint stages are expected +// to have succeeded. +TEST_P(RebootBEAutoStartTest, TestNSFDbusRebootError) { + // Return FAIL from dbus reboot call. + DbusInterface::DbusResponse dbus_response{ + DbusInterface::DbusStatus::DBUS_FAIL, "dbus reboot failed"}; + EXPECT_CALL(m_dbus_interface, Reboot(_)) + .Times(1) + .WillOnce(Return(dbus_response)); + set_telemetry_overall_expects(/*status=*/false); + set_telemetry_stage_expects(WarmBootStage::STAGE_FREEZE, /*status=*/true); + set_telemetry_stage_expects(WarmBootStage::STAGE_CHECKPOINT, + /*status=*/true); + + overwrite_quiescent_hold_time_ms(100); + + TestUtils::set_state_verification_enable(m_config_db, false, false); + + // Empty registration. 
+ TestUtils::populate_registration_table(m_db, "docker1|app1", false, false, + false, false); + + RebootRequest request; + request.set_method(RebootMethod::NSF); + start_reboot_via_rpc(request); + + // Short wait: there should be no state verification, checkpoint or + // platform reboot delays + // 1/10 second for quiescent hold time + // the quiescent select timeout is 250ms + std::this_thread::sleep_for(std::chrono::milliseconds(HALF_SECOND_MS)); + + EXPECT_EQ(m_rebootbe.GetCurrentStatus(), RebootBE::NsfManagerStatus::IDLE); + gnoi::system::RebootStatusResponse response = do_reboot_status_rpc(); + EXPECT_THAT(response, ActiveCountMethod(false, 1, RebootMethod::NSF)); + EXPECT_THAT(response, + IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_FAILURE, + "dbus reboot failed")); + TestUtils::check_warmboot_enabled(m_db, /*expected_state=*/false); +} + +// Test redis tables are cleared. +// - warm boot states should be cleared +// - existing apps in init table should be cleared +TEST_P(RebootBEAutoStartTest, TestRedisNSFSetup) { + set_mock_defaults(); + set_telemetry_overall_expects(/*success=*/false); + + // Seed a stale warm-restart state for app1 and confirm it reads back. + TestUtils::populate_restart_table_state( + m_db, "app1", get_warm_start_state_name(WarmStartState::CHECKPOINTED)); + + swss::Table warmRestartTable(&m_db, STATE_WARM_RESTART_TABLE_NAME); + std::string state = ""; + warmRestartTable.hget("app1", "state", state); + EXPECT_EQ(state, get_warm_start_state_name(WarmStartState::CHECKPOINTED)); + + // Seed a stale init-table entry for docker2|app2 and confirm it reads back. + swss::Table initTable(&m_db, STATE_WARM_RESTART_INIT_TABLE_NAME); + initTable.hset("docker2|app2", "timestamp", "fake-timestamp"); + std::string timestamp = ""; + initTable.hget("docker2|app2", "timestamp", timestamp); + EXPECT_THAT(timestamp, StrEq("fake-timestamp")); + + overwrite_state_verification_timeout(1); + + TestUtils::set_state_verification_enable(m_config_db, false, true); + + // Empty registration: if we fail, it should be because of + // state verification + TestUtils::populate_registration_table(m_db, 
"docker1|app1", false, false, + false, false); + + RebootRequest request; + request.set_method(RebootMethod::NSF); + start_reboot_via_rpc(request); + + std::this_thread::sleep_for(std::chrono::milliseconds(TENTH_SECOND_MS)); + EXPECT_EQ(m_rebootbe.GetCurrentStatus(), + RebootBE::NsfManagerStatus::NSF_REBOOT_IN_PROGRESS); + + // We have to wait for the 1 second state verification timeout + std::this_thread::sleep_for(std::chrono::milliseconds(TWO_SECONDS_MS)); + + // Both stale entries seeded earlier in the test must no longer read back. + state = ""; + warmRestartTable.hget("app1", "state", state); + EXPECT_NE(state, get_warm_start_state_name(WarmStartState::CHECKPOINTED)); + + timestamp = ""; + initTable.hget("docker2|app2", "timestamp", timestamp); + EXPECT_NE(timestamp, "fake-timestamp"); + + EXPECT_EQ(m_rebootbe.GetCurrentStatus(), RebootBE::NsfManagerStatus::IDLE); + gnoi::system::RebootStatusResponse response = do_reboot_status_rpc(); + EXPECT_THAT(response, ActiveCountMethod(false, 1, RebootMethod::NSF)); + EXPECT_THAT( + response, + IsStatus( + RebootStatus_Status::RebootStatus_Status_STATUS_RETRIABLE_FAILURE, + HasSubstr("timeout occurred during reboot state verification: " + "retriable error"))); + TestUtils::check_warmboot_enabled(m_db, /*expected_state=*/false); +} + +// An NSF attempt that fails at the checkpoint stage is followed by a COLD +// reboot request, which is accepted; the reboot count then reads 2. +TEST_P(RebootBEAutoStartTest, TestNSFFailureFollowedByColdBoot) { + DbusInterface::DbusResponse dbus_response{ + DbusInterface::DbusStatus::DBUS_SUCCESS, ""}; + EXPECT_CALL(m_dbus_interface, Reboot(_)) + .Times(1) + .WillRepeatedly(Return(dbus_response)); + + EXPECT_CALL(m_critical_interface, + report_critical_state("platform failed to reboot")) + .Times(1); + overwrite_reboot_timeout(1); + overwrite_checkpoint_timeout(1); + overwrite_quiescent_hold_time_ms(100); + + set_all_telemetry_expects(/*freeze_status=*/true, + /*checkpoint_status=*/false); + + TestUtils::set_state_verification_enable(m_config_db, false, false); + + TestUtils::populate_registration_table(m_db, "docker1|app1", false, false, + true, false); + + RebootRequest request; + 
request.set_method(RebootMethod::NSF); + start_reboot_via_rpc(request); + + std::this_thread::sleep_for(std::chrono::milliseconds(TENTH_SECOND_MS)); + EXPECT_EQ(m_rebootbe.GetCurrentStatus(), + RebootBE::NsfManagerStatus::NSF_REBOOT_IN_PROGRESS); + + // 1/10 second quiescence hold time + // 1 second checkpoint timeout + std::this_thread::sleep_for(std::chrono::milliseconds(TWO_SECONDS_MS)); + + gnoi::system::RebootStatusResponse response = do_reboot_status_rpc(); + + EXPECT_THAT(response, ActiveCountMethod(false, 1, RebootMethod::NSF)); + EXPECT_THAT( + response, + IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_FAILURE, + HasSubstr("timeout occurred during reboot stage checkpoint"))); + + request.set_method(RebootMethod::COLD); + start_reboot_via_rpc(request); + + // We have to wait for the 1 second reboot timeout + std::this_thread::sleep_for(std::chrono::milliseconds(TWO_SECONDS_MS)); + + EXPECT_EQ(m_rebootbe.GetCurrentStatus(), RebootBE::NsfManagerStatus::IDLE); + response = do_reboot_status_rpc(); + EXPECT_THAT(response, ActiveCountMethod(false, 2, RebootMethod::COLD)); + EXPECT_THAT(response, + IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_FAILURE, + "platform failed to reboot")); + TestUtils::check_warmboot_enabled(m_db, /*expected_state=*/false); +} + +// A request body that is not valid JSON is rejected with SWSS_RC_INTERNAL. +TEST_P(RebootBEAutoStartTest, TestInvalidJsonRebootRequest) { + std::string json_request = "abcd"; + NotificationResponse response = handle_reboot_request(json_request); + EXPECT_EQ(swss::StatusCode::SWSS_RC_INTERNAL, response.status); +} + +TEST_P(RebootBEAutoStartTest, TestStopDuringRebootStateVerification) { + set_telemetry_overall_expects(/*success=*/false); + + // Enable state verification (with the default 260 sec timeout). + TestUtils::set_state_verification_enable(m_config_db, false, true); + + // Empty registration. 
+ TestUtils::populate_registration_table(m_db, "docker1|app1", false, false, + false, false); + + RebootRequest request; + request.set_method(RebootMethod::NSF); + start_reboot_via_rpc(request); + + std::this_thread::sleep_for(std::chrono::milliseconds(TENTH_SECOND_MS)); + + EXPECT_EQ(m_rebootbe.GetCurrentStatus(), + RebootBE::NsfManagerStatus::NSF_REBOOT_IN_PROGRESS); + gnoi::system::RebootStatusResponse response = do_reboot_status_rpc(); + EXPECT_THAT(response, ActiveCountMethod(true, 1, RebootMethod::NSF)); + TestUtils::check_warmboot_enabled(m_db, /*expected_state=*/true); + + // Reboot thread is active: it is waiting for state verification to complete. + // TearDown will call rebootbe.Stop() which will Stop and Join the + // reboot thread +} + +TEST_P(RebootBEAutoStartTest, TestStopDuringRebootFreezeStage) { + set_all_telemetry_expects(/*freeze_status=*/false, + /*checkpoint_status=*/false); + + // Disable state verification + TestUtils::set_state_verification_enable(m_config_db, false, false); + + // Register app1 for freeze (quiescence) monitoring + TestUtils::populate_registration_table(m_db, "docker1|app1", false, true, + false, false); + + RebootRequest request; + request.set_method(RebootMethod::NSF); + start_reboot_via_rpc(request); + + std::this_thread::sleep_for(std::chrono::milliseconds(TENTH_SECOND_MS)); + + EXPECT_EQ(m_rebootbe.GetCurrentStatus(), + RebootBE::NsfManagerStatus::NSF_REBOOT_IN_PROGRESS); + gnoi::system::RebootStatusResponse response = do_reboot_status_rpc(); + EXPECT_THAT(response, ActiveCountMethod(true, 1, RebootMethod::NSF)); + TestUtils::check_warmboot_enabled(m_db, /*expected_state=*/true); + + // Reboot thread is active: it is waiting for app1 to quiesce. + // TearDown will call rebootbe.Stop() which will Stop and Join the + // reboot thread +} + +TEST_P(RebootBEAutoStartTest, TestStopDuringRebootCheckpointStage) { + set_all_telemetry_expects(/*freeze_status=*/true, + /*checkpoint_status=*/false); + + // Disable state verification. 
+ overwrite_quiescent_hold_time_ms(10); + TestUtils::set_state_verification_enable(m_config_db, false, false); + + // Register app1 for checkpointing. + TestUtils::populate_registration_table(m_db, "docker1|app1", false, false, + true, false); + + RebootRequest request; + request.set_method(RebootMethod::NSF); + start_reboot_via_rpc(request); + + // With the short hold time we need 250+ms to allow the quiescence hold + // time select timeout to fire. + std::this_thread::sleep_for(std::chrono::milliseconds(HALF_SECOND_MS)); + + EXPECT_EQ(m_rebootbe.GetCurrentStatus(), + RebootBE::NsfManagerStatus::NSF_REBOOT_IN_PROGRESS); + gnoi::system::RebootStatusResponse response = do_reboot_status_rpc(); + EXPECT_THAT(response, ActiveCountMethod(true, 1, RebootMethod::NSF)); + TestUtils::check_warmboot_enabled(m_db, /*expected_state=*/true); + + // Reboot thread is active: it is waiting for app1 to checkpoint. + // TearDown will call rebootbe.Stop() which will Stop and Join the + // reboot thread +} + +// With nothing registered for freeze or checkpoint, the NSF reboot is still +// in progress after half a second; the fixture's TearDown stops it there. +TEST_P(RebootBEAutoStartTest, TestStopDuringWaitPlatformReboot) { + set_telemetry_overall_expects(/*status=*/false); + set_telemetry_stage_expects(WarmBootStage::STAGE_FREEZE, /*status=*/true); + set_telemetry_stage_expects(WarmBootStage::STAGE_CHECKPOINT, + /*status=*/true); + + // Disable state verification. + overwrite_quiescent_hold_time_ms(10); + TestUtils::set_state_verification_enable(m_config_db, false, false); + + // Empty registration. + TestUtils::populate_registration_table(m_db, "docker1|app1", false, false, + false, false); + + RebootRequest request; + request.set_method(RebootMethod::NSF); + start_reboot_via_rpc(request); + + // With the short hold time we need 250+ms to allow the quiescence hold + // time select timeout to fire. 
+ std::this_thread::sleep_for(std::chrono::milliseconds(HALF_SECOND_MS)); + + EXPECT_EQ(m_rebootbe.GetCurrentStatus(), + RebootBE::NsfManagerStatus::NSF_REBOOT_IN_PROGRESS); + gnoi::system::RebootStatusResponse response = do_reboot_status_rpc(); + EXPECT_THAT(response, ActiveCountMethod(true, 1, RebootMethod::NSF)); + TestUtils::check_warmboot_enabled(m_db, /*expected_state=*/true); + + // Reboot thread is active: it is waiting for the platform to reboot. + // TearDown will call rebootbe.Stop() which will Stop and Join the + // reboot thread +} + +// +// Stop On Freeze Tests +// +TEST_P(RebootBEAutoStartTest, TestStopDuringWaitForStopOnFreeze) { + set_all_telemetry_expects(false); + + overwrite_quiescent_hold_time_ms(50); + overwrite_quiescent_timeout_ms(1000); + + // An empty string is returned as the stop container status, so docker1 is + // never reported as stopped while this test runs. + DbusInterface::DbusResponse dbus_response{ + DbusInterface::DbusStatus::DBUS_SUCCESS, ""}; + EXPECT_CALL(m_dbus_interface, StopContainerStatus(_)) + .Times(AtLeast(1)) + .WillRepeatedly(Return(dbus_response)); + + // Disable state verification. + TestUtils::set_state_verification_enable(m_config_db, false, false); + + // Register app1 for stop-on-freeze (first flag set; presumably the + // stop-on-freeze flag, matching the other stop-on-freeze tests). 
+ TestUtils::populate_registration_table(m_db, "docker1|app1", true, false, + false, false); + + RebootRequest request; + request.set_method(RebootMethod::NSF); + start_reboot_via_rpc(request); + + std::this_thread::sleep_for(std::chrono::milliseconds(TENTH_SECOND_MS)); + + EXPECT_EQ(m_rebootbe.GetCurrentStatus(), + RebootBE::NsfManagerStatus::NSF_REBOOT_IN_PROGRESS); + gnoi::system::RebootStatusResponse response = do_reboot_status_rpc(); + EXPECT_THAT(response, ActiveCountMethod(true, 1, RebootMethod::NSF)); + TestUtils::check_warmboot_enabled(m_db, /*expected_state=*/true); + + // Reboot thread is active: it is waiting for docker1 to exit/stop. + // TearDown will call rebootbe.Stop() which will Stop and Join the + // reboot thread +} + +// StopContainerStatus never yields a valid response, so the NSF reboot is +// expected to fail with a container-stop timeout. +TEST_P(RebootBEAutoStartTest, TestStopOnFreezeTimeout) { + set_all_telemetry_expects(false); + + overwrite_quiescent_hold_time_ms(50); + overwrite_quiescent_timeout_ms(1000); + + // An empty string is not a valid json stop container status response + DbusInterface::DbusResponse dbus_response{ + DbusInterface::DbusStatus::DBUS_SUCCESS, ""}; + EXPECT_CALL(m_dbus_interface, StopContainerStatus(_)) + .Times(AtLeast(1)) + .WillRepeatedly(Return(dbus_response)); + + TestUtils::set_state_verification_enable(m_config_db, false, false); + + TestUtils::populate_registration_table(m_db, "docker1|app1", true, false, + false, false); + + RebootRequest request; + request.set_method(RebootMethod::NSF); + start_reboot_via_rpc(request); + + // Container stop status is checked every half second. 
+ std::this_thread::sleep_for(std::chrono::milliseconds(FIFTEEN_HUNDRED_MS)); + + EXPECT_EQ(m_rebootbe.GetCurrentStatus(), RebootBE::NsfManagerStatus::IDLE); + gnoi::system::RebootStatusResponse response = do_reboot_status_rpc(); + EXPECT_THAT(response, ActiveCountMethod(false, 1, RebootMethod::NSF)); + EXPECT_THAT(response, + IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_FAILURE, + "timeout occurred waiting for containers to stop")); + TestUtils::check_warmboot_enabled(m_db, /*expected_state=*/false); +} + +TEST_P(RebootBEAutoStartTest, TestDbusErrorRequestingContainerStop) { + set_all_telemetry_expects(false); + + // Return FAIL from the dbus StopContainers call. + DbusInterface::DbusResponse dbus_response{ + DbusInterface::DbusStatus::DBUS_FAIL, + "dbus error calling StopContainers"}; + EXPECT_CALL(m_dbus_interface, StopContainers(_)) + .Times(1) + .WillOnce(Return(dbus_response)); + + TestUtils::set_state_verification_enable(m_config_db, false, false); + + TestUtils::populate_registration_table(m_db, "docker1|app1", true, false, + false, false); + + RebootRequest request; + request.set_method(RebootMethod::NSF); + start_reboot_via_rpc(request); + + // Container stop status is checked every half second. 
+ std::this_thread::sleep_for(std::chrono::milliseconds(ONE_SECOND_MS)); + + EXPECT_EQ(m_rebootbe.GetCurrentStatus(), RebootBE::NsfManagerStatus::IDLE); + + gnoi::system::RebootStatusResponse response = do_reboot_status_rpc(); + EXPECT_THAT(response, ActiveCountMethod(false, 1, RebootMethod::NSF)); + EXPECT_THAT(response, + IsStatus(RebootStatus_Status::RebootStatus_Status_STATUS_FAILURE, + "dbus error calling StopContainers")); + TestUtils::check_warmboot_enabled(m_db, /*expected_state=*/false); +} + +INSTANTIATE_TEST_SUITE_P(TestWithStartupWarmbootEnabledState, + RebootBEAutoStartTest, testing::Values(true, false)); + +} // namespace rebootbackend diff --git a/src/sonic-framework/tests/redis_utils_test.cpp b/src/sonic-framework/tests/redis_utils_test.cpp new file mode 100644 index 000000000000..c42161691a22 --- /dev/null +++ b/src/sonic-framework/tests/redis_utils_test.cpp @@ -0,0 +1,785 @@ +#include "redis_utils.h" + +// NOTE(review): the angle-bracket targets of the six bare #include lines that follow are blank in this copy of the patch; restore them from the original commit before applying. +#include +#include +#include + +#include +#include +#include + +#include "select.h" +#include "stateverification.h" +#include "table.h" +#include "test_utils_common.h" +#include "timestamp.h" +#include "warm_restart.h" + +namespace rebootbackend { + +using WarmStartState = ::swss::WarmStart::WarmStartState; +using WarmBootStage = ::swss::WarmStart::WarmBootStage; +using WarmBootNotification = ::swss::WarmStart::WarmBootNotification; + +using ::testing::AllOf; +using ::testing::HasSubstr; +using ::testing::StrEq; + +// Fixture for the redis_utils tests: connects to STATE_DB, clears the test tables on construction and exposes size helpers over the Registration / InitRegistration members. +class RedisTest : public ::testing::Test { + protected: + RedisTest() : m_db("STATE_DB", 0), m_reg(), m_init_reg() { + TestUtils::clear_tables(m_db); + } + + swss::DBConnector m_db; + Registration m_reg; + InitRegistration m_init_reg; + + void clear_contents() { return m_reg.clear_contents(); } + + // Number of apps still tracked by m_reg for the given stage. + size_t get_state_set_size(WarmBootStage nsf_stage) { + return m_reg.m_remaining_apps.at(nsf_stage).size(); + } + + // Number of entries in m_init_reg's missing-registration set. + size_t get_reregistration_set_size() { + return m_init_reg.m_missing_registrations.size(); + } + + // Special version of name 
mapping that is compatible with the DB values. + std::string warm_boot_stage_name(WarmBootStage stage) { + switch (stage) { + case (WarmBootStage::STAGE_FREEZE): { + return "quiescent"; + } + case (WarmBootStage::STAGE_CHECKPOINT): { + return "checkpointed"; + } + case (WarmBootStage::STAGE_RECONCILIATION): { + return "reconciled"; + } + case (WarmBootStage::STAGE_UNFREEZE): { + return "completed"; + } + default: { + return ""; + } + } + } + + void populate_default_init_table() { + swss::Table initTable(&m_db, STATE_WARM_RESTART_INIT_TABLE_NAME); + initTable.hset("docker1|app1", "timestamp", ""); + initTable.hset("docker2|app2", "timestamp", ""); + initTable.hset("docker3|app3", "timestamp", ""); + initTable.hset("docker4|app1", "timestamp", ""); + // The invalid entry should not end up in the list of apps. + initTable.hset("invalid", "timestamp", ""); + } + + // NOTE(review): this makes Registration a friend of RedisTest, not the reverse; access to Registration's private members has to be granted inside Registration itself. + friend class Registration; +}; + +// Each NSF manager notification (freeze, unfreeze, checkpoint) must arrive on +// the NSF manager channel with the matching op name. +TEST_F(RedisTest, testSendNsfManagerNotification) { + swss::NotificationConsumer nc( + &m_db, swss::WarmStart::kNsfManagerNotificationChannel); + swss::Select s; + s.addSelectable(&nc); + + send_nsf_manager_notification(m_db, WarmBootNotification::kFreeze); + + swss::Selectable *sel = nullptr; + // select() returns an integer result code; keep it as int so every non-OBJECT code stays distinguishable in a failure message. + int ret = s.select(&sel, 1); + EXPECT_EQ(ret, swss::Select::OBJECT); + + std::string op, data; + std::vector<swss::FieldValueTuple> values; + nc.pop(op, data, values); + + EXPECT_EQ(op, swss::WarmStart::warmBootNotificationNameMap()->at( + WarmBootNotification::kFreeze)); + + send_nsf_manager_notification(m_db, WarmBootNotification::kUnfreeze); + + ret = s.select(&sel, 1); + EXPECT_EQ(ret, swss::Select::OBJECT); + + nc.pop(op, data, values); + + EXPECT_EQ(op, swss::WarmStart::warmBootNotificationNameMap()->at( + WarmBootNotification::kUnfreeze)); + + send_nsf_manager_notification(m_db, WarmBootNotification::kCheckpoint); + + ret = s.select(&sel, 1); + EXPECT_EQ(ret, swss::Select::OBJECT); + + nc.pop(op, data, values); + + EXPECT_EQ(op, 
swss::WarmStart::warmBootNotificationNameMap()->at( + WarmBootNotification::kCheckpoint)); +} + +/* Sends two state-verification requests (freeze=true, then freeze=false) and checks the op, the timestamp payload and the FREEZE_FIELD value of each, plus that the two returned timestamps differ. The select() result is kept as an int rather than narrowed to bool, and the popped vector is asserted non-empty before its first element is read. */ +TEST_F(RedisTest, testSendStateVerification) { + swss::NotificationConsumer nc(&m_db, STATE_VERIFICATION_REQ_CHANNEL); + swss::Select s; + s.addSelectable(&nc); + + std::string timestamp = send_state_verification_notification(m_db, true); + + swss::Selectable *sel; + int ret = s.select(&sel, 1); + EXPECT_EQ(ret, swss::Select::OBJECT); + + std::string op, data; + std::vector<swss::FieldValueTuple> values; + nc.pop(op, data, values); + ASSERT_FALSE(values.empty()); + auto fv = values[0]; + + EXPECT_EQ(op, ALL_COMPONENT); + EXPECT_EQ(data, timestamp); + EXPECT_EQ(FREEZE_FIELD, fvField(fv)); + EXPECT_EQ("true", fvValue(fv)); + + std::string second_timestamp = + send_state_verification_notification(m_db, false); + + ret = s.select(&sel, 1); + EXPECT_EQ(ret, swss::Select::OBJECT); + + nc.pop(op, data, values); + ASSERT_FALSE(values.empty()); + fv = values[0]; + + EXPECT_EQ(op, ALL_COMPONENT); + EXPECT_EQ(data, second_timestamp); + EXPECT_EQ(FREEZE_FIELD, fvField(fv)); + EXPECT_EQ("false", fvValue(fv)); + EXPECT_NE(timestamp, second_timestamp); +} + +/* Seeds "state" and "timestamp" fields for app1 and app2 in STATE_WARM_RESTART_TABLE_NAME, calls init_warm_reboot_states(), and expects none of those four fields to be readable afterwards. */ +TEST_F(RedisTest, testInitWarmRebootStates) { + swss::Table warmRestartTable(&m_db, STATE_WARM_RESTART_TABLE_NAME); + + warmRestartTable.hset("app1", "state", "disabled"); + warmRestartTable.hset("app1", "timestamp", "abcdefg"); + warmRestartTable.hset("app2", "state", "reconciled"); + warmRestartTable.hset("app2", "timestamp", "zyxwvu"); + + std::string value; + bool ret = warmRestartTable.hget("app1", "state", value); + EXPECT_TRUE(ret); + + ret = warmRestartTable.hget("app1", "timestamp", value); + EXPECT_TRUE(ret); + + init_warm_reboot_states(m_db); + + ret = warmRestartTable.hget("app1", "state", value); + EXPECT_FALSE(ret); + + ret = warmRestartTable.hget("app1", "timestamp", value); + EXPECT_FALSE(ret); + + ret = warmRestartTable.hget("app2", "state", value); + EXPECT_FALSE(ret); + + ret = warmRestartTable.hget("app2", "timestamp", value); + EXPECT_FALSE(ret); +} + +TEST_F(RedisTest,
testSetWarmRestartEnable) { + /* For both true and false: delete the "system" key, call set_warm_restart_enable(), and expect the "enable" field to read back as "true" / "false". */ + swss::Table warmRestartTable(&m_db, STATE_WARM_RESTART_ENABLE_TABLE_NAME); + + for (const auto &enabled : {true, false}) { + warmRestartTable.del("system"); + + set_warm_restart_enable(m_db, enabled); + + std::string value; + bool ret = warmRestartTable.hget("system", "enable", value); + EXPECT_TRUE(ret); + EXPECT_EQ(value, enabled ? "true" : "false"); + } +} + +/* Exercises is_valid_key() and get_docker_app_from_key() together: "abc|def" with separator "|" is accepted and split into "abc" / "def"; a key with an empty side, a key with no separator, and an empty separator are all rejected by both functions. */ +TEST_F(RedisTest, TestIsValidKeyAndGetDockerAppFromKey) { + std::string key = "abc|def"; + std::string separator = "|"; + std::string docker, app; + + EXPECT_TRUE(is_valid_key(key, separator)); + EXPECT_TRUE(get_docker_app_from_key(key, separator, docker, app)); + EXPECT_EQ(docker, "abc"); + EXPECT_EQ(app, "def"); + + key = "abcd|"; + EXPECT_FALSE(is_valid_key(key, separator)); + EXPECT_FALSE(get_docker_app_from_key(key, separator, docker, app)); + + key = "|abcd"; + EXPECT_FALSE(is_valid_key(key, separator)); + EXPECT_FALSE(get_docker_app_from_key(key, separator, docker, app)); + + key = "abcd"; + EXPECT_FALSE(is_valid_key(key, separator)); + EXPECT_FALSE(get_docker_app_from_key(key, separator, docker, app)); + + separator = ""; + key = "abc|def"; + EXPECT_FALSE(is_valid_key(key, separator)); + EXPECT_FALSE(get_docker_app_from_key(key, separator, docker, app)); +} + +/* Expects an empty string from get_warm_restart_counter() before anything is set, then a round trip of the values 0..4 through set_warm_restart_counter(). */ +TEST_F(RedisTest, GetWarmRestartCounter) { + EXPECT_THAT(get_warm_restart_counter(m_db), StrEq("")); + for (int i = 0; i < 5; i++) { + set_warm_restart_counter(m_db, i); + EXPECT_THAT(get_warm_restart_counter(m_db), StrEq(std::to_string(i))); + } +} + +/* Registers one key without a separator and three "docker|app" keys, then fetches the registration info. NOTE(review): the meaning of the four boolean arguments of populate_registration_table() is not visible in this file - check test_utils_common.h. */ +TEST_F(RedisTest, TestFetchRegistrationInfo) { + TestUtils::populate_registration_table(m_db, "invalid", false, false, false, + true); + TestUtils::populate_registration_table(m_db, "docker1|app1", false, false, + false, true); + TestUtils::populate_registration_table(m_db, "docker2|app2", true, true, true, + false); + TestUtils::populate_registration_table(m_db, "docker3|app3", false, false, + true, false); + + m_reg.fetch_registration_info(); + +
Registration::RegistrationSet set = m_reg.get_registered_app_set(); + + /* Each of the three well-formed keys registered above must be present; the third check targets docker3|app3 so that every key is verified exactly once. */ + EXPECT_TRUE(set.count("docker1|app1")); + EXPECT_TRUE(set.count("docker2|app2")); + EXPECT_TRUE(set.count("docker3|app3")); + EXPECT_EQ(set.size(), 3); +} + +/* Registers three apps and expects get_stop_on_freeze_set() to contain exactly "docker1" and "docker2", the two registered with the first boolean argument set to true. */ +TEST_F(RedisTest, TestStopOnFreezeList) { + TestUtils::populate_registration_table(m_db, "docker1|app1", true, false, + false, true); + TestUtils::populate_registration_table(m_db, "docker2|app2", true, false, + false, false); + TestUtils::populate_registration_table(m_db, "docker3|app3", false, false, + true, false); + + m_reg.fetch_registration_info(); + Registration::RegistrationSet set = m_reg.get_stop_on_freeze_set(); + EXPECT_EQ(2, set.size()); + EXPECT_EQ(1, set.count("docker1")); + EXPECT_EQ(1, set.count("docker2")); +} + +/* Walks check_quiesced() through COMPLETED (nothing registered), IN_PROCESS, COMPLETED and finally FAILURE as the per-app restart states are updated. */ +TEST_F(RedisTest, TestCheckQuiesced) { + // No apps registered. + m_reg.fetch_registration_info(); + Registration::Response response = m_reg.check_quiesced(); + EXPECT_EQ(response.status, Registration::Status::COMPLETED); + + // Apps registered, but have not reached the correct state. + TestUtils::populate_registration_table(m_db, "docker1|app1", false, true, + false, false); + TestUtils::populate_registration_table(m_db, "docker2|app2", false, true, + false, false); + TestUtils::populate_registration_table(m_db, "docker3|app3", false, true, + false, false); + TestUtils::populate_registration_table(m_db, "docker4|app4", true, false, + false, false); + + m_reg.fetch_registration_info(); + EXPECT_EQ(3, get_state_set_size(WarmBootStage::STAGE_FREEZE)); + + // app1 and app2 reach the correct state.
+ /* For the freeze stage this test treats both QUIESCENT (app1) and CHECKPOINTED (app2) as having reached the target: the pending count drops from 3 to 1 below. */ + TestUtils::populate_restart_table_state( + m_db, "app1", get_warm_start_state_name(WarmStartState::QUIESCENT)); + TestUtils::populate_restart_table_state( + m_db, "app2", get_warm_start_state_name(WarmStartState::CHECKPOINTED)); + + response = m_reg.check_quiesced(); + EXPECT_EQ(response.status, Registration::Status::IN_PROCESS); + EXPECT_EQ(1, get_state_set_size(WarmBootStage::STAGE_FREEZE)); + + // app3 reaches the correct state. + TestUtils::populate_restart_table_state( + m_db, "app3", get_warm_start_state_name(WarmStartState::CHECKPOINTED)); + + response = m_reg.check_quiesced(); + EXPECT_EQ(response.status, Registration::Status::COMPLETED); + EXPECT_EQ(0, get_state_set_size(WarmBootStage::STAGE_FREEZE)); + + // app3 reports an error. + m_reg.fetch_registration_info(); + TestUtils::populate_restart_table_state( + m_db, "app3", get_warm_start_state_name(WarmStartState::FAILED)); + + response = m_reg.check_quiesced(); + EXPECT_EQ(response.status, Registration::Status::FAILURE); + EXPECT_FALSE(response.error_string.empty()); +} + +/* Same progression as the quiescence test, for check_checkpointed(): COMPLETED with nothing registered, IN_PROCESS while apps are outstanding, COMPLETED once all three are CHECKPOINTED, then FAILURE after a FAILED state is written. */ +TEST_F(RedisTest, TestCheckCheckpointed) { + // No apps registered. + m_reg.fetch_registration_info(); + Registration::Response response = m_reg.check_checkpointed(); + EXPECT_EQ(response.status, Registration::Status::COMPLETED); + + // Apps registered, but have not reached the correct state. + TestUtils::populate_registration_table(m_db, "docker1|app1", false, false, + true, false); + TestUtils::populate_registration_table(m_db, "docker2|app2", false, false, + true, false); + TestUtils::populate_registration_table(m_db, "docker3|app3", false, false, + true, false); + TestUtils::populate_registration_table(m_db, "docker4|app4", true, false, + false, false); + + m_reg.fetch_registration_info(); + EXPECT_EQ(3, get_state_set_size(WarmBootStage::STAGE_CHECKPOINT)); + + // app2 reaches the correct state. app1 has changed state, but is not yet + // checkpointed.
+ /* Unlike the freeze stage, QUIESCENT does not count for the checkpoint stage: only app2 is removed, leaving 2 pending below. */ + TestUtils::populate_restart_table_state( + m_db, "app1", get_warm_start_state_name(WarmStartState::QUIESCENT)); + TestUtils::populate_restart_table_state( + m_db, "app2", get_warm_start_state_name(WarmStartState::CHECKPOINTED)); + + response = m_reg.check_checkpointed(); + EXPECT_EQ(response.status, Registration::Status::IN_PROCESS); + EXPECT_EQ(2, get_state_set_size(WarmBootStage::STAGE_CHECKPOINT)); + + // app1 and app3 reach the correct state. + TestUtils::populate_restart_table_state( + m_db, "app1", get_warm_start_state_name(WarmStartState::CHECKPOINTED)); + TestUtils::populate_restart_table_state( + m_db, "app3", get_warm_start_state_name(WarmStartState::CHECKPOINTED)); + + response = m_reg.check_checkpointed(); + EXPECT_EQ(response.status, Registration::Status::COMPLETED); + EXPECT_EQ(0, get_state_set_size(WarmBootStage::STAGE_CHECKPOINT)); + + // app3 reports an error. + m_reg.fetch_registration_info(); + TestUtils::populate_restart_table_state( + m_db, "app3", get_warm_start_state_name(WarmStartState::FAILED)); + + response = m_reg.check_checkpointed(); + EXPECT_EQ(response.status, Registration::Status::FAILURE); + EXPECT_FALSE(response.error_string.empty()); +} + +/* Drives check_reconciled() through COMPLETED (nothing registered), IN_PROCESS, COMPLETED and FAILURE, also checking the text returned by join_pending_apps() along the way. */ +TEST_F(RedisTest, TestCheckReconciled) { + // No apps registered. + m_reg.fetch_registration_info(); + Registration::Response response = m_reg.check_reconciled(); + EXPECT_EQ(response.status, Registration::Status::COMPLETED); + + // Apps registered, but have not reached the correct state.
+ TestUtils::populate_registration_table(m_db, "docker1|app1", false, false, + false, true); + TestUtils::populate_registration_table(m_db, "docker2|app2", false, false, + false, true); + TestUtils::populate_registration_table(m_db, "docker3|app3", false, false, + false, true); + TestUtils::populate_registration_table(m_db, "docker4|app4", true, false, + false, false); + + m_reg.fetch_registration_info(); + EXPECT_EQ(3, get_state_set_size(WarmBootStage::STAGE_RECONCILIATION)); + EXPECT_THAT(m_reg.join_pending_apps(WarmBootStage::STAGE_RECONCILIATION), + AllOf(HasSubstr("app1"), HasSubstr("app2"), HasSubstr("app3"))); + + // app1 and app2 reach the correct state. + TestUtils::populate_restart_table_state( + m_db, "app1", get_warm_start_state_name(WarmStartState::RECONCILED)); + TestUtils::populate_restart_table_state( + m_db, "app2", get_warm_start_state_name(WarmStartState::RECONCILED)); + + response = m_reg.check_reconciled(); + EXPECT_EQ(response.status, Registration::Status::IN_PROCESS); + EXPECT_EQ(1, get_state_set_size(WarmBootStage::STAGE_RECONCILIATION)); + EXPECT_THAT(m_reg.join_pending_apps(WarmBootStage::STAGE_RECONCILIATION), + HasSubstr("app3")); + + // app3 reaches the correct state. + TestUtils::populate_restart_table_state( + m_db, "app3", get_warm_start_state_name(WarmStartState::RECONCILED)); + + response = m_reg.check_reconciled(); + EXPECT_EQ(response.status, Registration::Status::COMPLETED); + EXPECT_EQ(0, get_state_set_size(WarmBootStage::STAGE_RECONCILIATION)); + EXPECT_THAT(m_init_reg.join_pending_apps(), StrEq("")); + + // app3 reports an error. 
+ /* Re-fetch so the reconciliation set is repopulated before the FAILED state is written. */ + m_reg.fetch_registration_info(); + TestUtils::populate_restart_table_state( + m_db, "app3", get_warm_start_state_name(WarmStartState::FAILED)); + + response = m_reg.check_reconciled(); + EXPECT_EQ(response.status, Registration::Status::FAILURE); + EXPECT_FALSE(response.error_string.empty()); + EXPECT_THAT(m_reg.join_pending_apps(WarmBootStage::STAGE_RECONCILIATION), + HasSubstr("app3")); +} + +/* Drives check_unfrozen() through COMPLETED (nothing registered), IN_PROCESS, COMPLETED and FAILURE. Only app1 and app2 are counted for STAGE_UNFREEZE here: the pending set has size 2 even though four apps are registered. */ +TEST_F(RedisTest, TestCheckUnfrozen) { + // No apps registered. + m_reg.fetch_registration_info(); + Registration::Response response = m_reg.check_unfrozen(); + EXPECT_EQ(response.status, Registration::Status::COMPLETED); + + // Apps registered. app4 reaches the correct state, but the others have not. + TestUtils::populate_registration_table(m_db, "docker1|app1", false, true, + false, false); + TestUtils::populate_registration_table(m_db, "docker2|app2", false, true, + false, false); + TestUtils::populate_registration_table(m_db, "docker3|app3", false, false, + false, true); + TestUtils::populate_registration_table(m_db, "docker4|app4", true, false, + false, false); + TestUtils::populate_restart_table_state( + m_db, "app4", get_warm_start_state_name(WarmStartState::COMPLETED)); + + m_reg.fetch_registration_info(); + response = m_reg.check_unfrozen(); + EXPECT_EQ(response.status, Registration::Status::IN_PROCESS); + EXPECT_EQ(2, get_state_set_size(WarmBootStage::STAGE_UNFREEZE)); + EXPECT_THAT(m_reg.join_pending_apps(WarmBootStage::STAGE_UNFREEZE), + AllOf(HasSubstr("app1"), HasSubstr("app2"))); + + // app1 reaches the correct state. + TestUtils::populate_restart_table_state( + m_db, "app1", get_warm_start_state_name(WarmStartState::COMPLETED)); + + response = m_reg.check_unfrozen(); + EXPECT_EQ(response.status, Registration::Status::IN_PROCESS); + EXPECT_EQ(1, get_state_set_size(WarmBootStage::STAGE_UNFREEZE)); + EXPECT_THAT(m_reg.join_pending_apps(WarmBootStage::STAGE_UNFREEZE), + HasSubstr("app2")); + + // app 2 reaches the correct state. We do not monitor app 3.
+ TestUtils::populate_restart_table_state( + m_db, "app2", get_warm_start_state_name(WarmStartState::COMPLETED)); + + response = m_reg.check_unfrozen(); + EXPECT_EQ(response.status, Registration::Status::COMPLETED); + EXPECT_EQ(0, get_state_set_size(WarmBootStage::STAGE_UNFREEZE)); + EXPECT_THAT(m_reg.join_pending_apps(WarmBootStage::STAGE_UNFREEZE), + StrEq("")); + + // app1 reports an error. + m_reg.fetch_registration_info(); + TestUtils::populate_restart_table_state( + m_db, "app1", get_warm_start_state_name(WarmStartState::FAILED)); + + response = m_reg.check_unfrozen(); + EXPECT_EQ(response.status, Registration::Status::FAILURE); + EXPECT_FALSE(response.error_string.empty()); + EXPECT_THAT(m_reg.join_pending_apps(WarmBootStage::STAGE_UNFREEZE), + AllOf(HasSubstr("app1"))); +} + +/* Parameterized variant of the RedisTest fixture; GetParam() is compared against WarmBootStage values below. NOTE(review): the template argument of WithParamInterface is missing in this copy of the patch - presumably WarmBootStage, confirm against the original. */ +class RedisTestWithWarmStartState + : public RedisTest, + public ::testing::WithParamInterface {}; + +/* Feeds handle_state_event() one event at a time for the stage under test: a "DEL" is ignored, each "SET" with the stage's final state name removes one app from the pending set, and a "failed" state yields FAILURE. */ +TEST_P(RedisTestWithWarmStartState, TestEventHandling) { + // Apps registered. No app has reported state. + const std::vector test_keys( + {"docker1|app1", "docker2|app2", "docker3|app3"}); + for (const auto &key : test_keys) { + TestUtils::populate_registration_table( + m_db, key, false, + GetParam() == WarmBootStage::STAGE_FREEZE || + GetParam() == WarmBootStage::STAGE_UNFREEZE, + GetParam() == WarmBootStage::STAGE_CHECKPOINT, + GetParam() == WarmBootStage::STAGE_RECONCILIATION); + } + TestUtils::populate_registration_table(m_db, "docker4|app4", true, false, + false, false); + m_reg.fetch_registration_info(); + + // Ignore invalid operation + swss::KeyOpFieldsValuesTuple state_event = {"app1", "DEL", {{"state", ""}}}; + Registration::Response response = + m_reg.handle_state_event(GetParam(), state_event); + EXPECT_EQ(response.status, Registration::Status::IN_PROCESS); + EXPECT_EQ(3, get_state_set_size(GetParam())); + + // app1 reaches its final state, but the others have not reported state.
+ state_event = {"app1", "SET", {{"state", warm_boot_stage_name(GetParam())}}}; + response = m_reg.handle_state_event(GetParam(), state_event); + EXPECT_EQ(response.status, Registration::Status::IN_PROCESS); + EXPECT_EQ(2, get_state_set_size(GetParam())); + + // All apps report final state one-by-one. + /* Starts at index 1 because app1 (index 0) already reported above. */ + for (size_t i = 1; i < test_keys.size(); i++) { + std::string docker, app; + bool ret = get_docker_app_from_key(test_keys[i], "|", docker, app); + EXPECT_TRUE(ret); + + state_event = {app, "SET", {{"state", warm_boot_stage_name(GetParam())}}}; + response = m_reg.handle_state_event(GetParam(), state_event); + + if (i < test_keys.size() - 1) { + EXPECT_EQ(response.status, Registration::Status::IN_PROCESS); + } else { + EXPECT_EQ(response.status, Registration::Status::COMPLETED); + } + EXPECT_EQ(get_state_set_size(GetParam()), test_keys.size() - (i + 1)); + } + + // app3 reports an error. + m_reg.fetch_registration_info(); + state_event = {"app3", "SET", {{"state", "failed"}}}; + + response = m_reg.handle_state_event(GetParam(), state_event); + EXPECT_EQ(response.status, Registration::Status::FAILURE); + EXPECT_FALSE(response.error_string.empty()); +} + +/* An event whose key ("invalid") does not match the registered app must leave the status IN_PROCESS and the pending set at size 1. */ +TEST_P(RedisTestWithWarmStartState, HandleEventSkipInvalidKey) { + TestUtils::populate_registration_table( + m_db, "docker1|app1", false, + GetParam() == WarmBootStage::STAGE_FREEZE || + GetParam() == WarmBootStage::STAGE_UNFREEZE, + GetParam() == WarmBootStage::STAGE_CHECKPOINT, + GetParam() == WarmBootStage::STAGE_RECONCILIATION); + m_reg.fetch_registration_info(); + + swss::KeyOpFieldsValuesTuple state_event = { + "invalid", "SET", {{"state", "completed"}}}; + + Registration::Response response = + m_reg.handle_state_event(GetParam(), state_event); + EXPECT_EQ(response.status, Registration::Status::IN_PROCESS); + EXPECT_EQ(get_state_set_size(GetParam()), 1); +} + +/* Runs the two parameterized tests above once per warm-boot stage. */ +INSTANTIATE_TEST_SUITE_P(TestOverWarmStateStates, RedisTestWithWarmStartState, + testing::Values(WarmBootStage::STAGE_FREEZE, +
WarmBootStage::STAGE_CHECKPOINT, + WarmBootStage::STAGE_RECONCILIATION, + WarmBootStage::STAGE_UNFREEZE)); + +/* Single-app freeze-stage walk-through: a "DEL" event leaves the status unchanged (IN_PROCESS before the app reports, COMPLETED after), a "SET" to "checkpointed" completes the stage, and a "SET" to "failed" yields FAILURE. */ +TEST_F(RedisTest, TestHandleQuiescenceEvent) { + TestUtils::populate_registration_table(m_db, "docker1|app1", false, true, + false, false); + m_reg.fetch_registration_info(); + + swss::KeyOpFieldsValuesTuple state_event = { + "app1", "DEL", {{"state", "checkpointed"}}}; + Registration::Response response = + m_reg.handle_state_event(WarmBootStage::STAGE_FREEZE, state_event); + EXPECT_EQ(response.status, Registration::Status::IN_PROCESS); + + state_event = {"app1", "SET", {{"state", "checkpointed"}}}; + response = m_reg.handle_state_event(WarmBootStage::STAGE_FREEZE, state_event); + EXPECT_EQ(response.status, Registration::Status::COMPLETED); + + state_event = {"app1", "DEL", {{"state", "completed"}}}; + response = m_reg.handle_state_event(WarmBootStage::STAGE_FREEZE, state_event); + EXPECT_EQ(response.status, Registration::Status::COMPLETED); + + state_event = {"app1", "SET", {{"state", "failed"}}}; + response = m_reg.handle_state_event(WarmBootStage::STAGE_FREEZE, state_event); + EXPECT_EQ(response.status, Registration::Status::FAILURE); +} + +/* Two-app freeze-stage test in which an app leaves its target state again ("replayed", "restored"): the status is expected to drop back to IN_PROCESS and return to COMPLETED once the app re-enters "quiescent" or "checkpointed". */ +TEST_F(RedisTest, TestHandleQuiescenceEnterExitCompleteState) { + TestUtils::populate_registration_table(m_db, "docker1|app1", false, true, + false, false); + TestUtils::populate_registration_table(m_db, "docker2|app2", false, true, + false, false); + m_reg.fetch_registration_info(); + + swss::KeyOpFieldsValuesTuple state_event = { + "app1", "SET", {{"state", "quiescent"}}}; + Registration::Response response = + m_reg.handle_state_event(WarmBootStage::STAGE_FREEZE, state_event); + EXPECT_EQ(response.status, Registration::Status::IN_PROCESS); + + state_event = {"app2", "SET", {{"state", "checkpointed"}}}; + response = m_reg.handle_state_event(WarmBootStage::STAGE_FREEZE, state_event); + EXPECT_EQ(response.status, Registration::Status::COMPLETED); + + state_event = {"app1", "SET", {{"state", "replayed"}}}; + response =
m_reg.handle_state_event(WarmBootStage::STAGE_FREEZE, state_event); + EXPECT_EQ(response.status, Registration::Status::IN_PROCESS); + + state_event = {"app1", "SET", {{"state", "quiescent"}}}; + response = m_reg.handle_state_event(WarmBootStage::STAGE_FREEZE, state_event); + EXPECT_EQ(response.status, Registration::Status::COMPLETED); + + state_event = {"app2", "SET", {{"state", "restored"}}}; + response = m_reg.handle_state_event(WarmBootStage::STAGE_FREEZE, state_event); + EXPECT_EQ(response.status, Registration::Status::IN_PROCESS); + + state_event = {"app2", "SET", {{"state", "checkpointed"}}}; + response = m_reg.handle_state_event(WarmBootStage::STAGE_FREEZE, state_event); + EXPECT_EQ(response.status, Registration::Status::COMPLETED); +} + +/* Populates the registration with three apps, checks the stop-on-freeze set and the four per-stage sets are non-empty, then expects clear_contents() to empty the stop-on-freeze set, the registered-app set and every per-stage set. */ +TEST_F(RedisTest, TestClearContents) { + TestUtils::populate_registration_table(m_db, "docker1|app1", true, false, + false, true); + TestUtils::populate_registration_table(m_db, "docker2|app2", true, false, + false, false); + TestUtils::populate_registration_table(m_db, "docker3|app3", false, true, + true, false); + m_reg.fetch_registration_info(); + Registration::RegistrationSet set = m_reg.get_stop_on_freeze_set(); + EXPECT_EQ(2, set.size()); + EXPECT_EQ(1, get_state_set_size(WarmBootStage::STAGE_FREEZE)); + EXPECT_EQ(1, get_state_set_size(WarmBootStage::STAGE_CHECKPOINT)); + EXPECT_EQ(1, get_state_set_size(WarmBootStage::STAGE_RECONCILIATION)); + EXPECT_EQ(1, get_state_set_size(WarmBootStage::STAGE_UNFREEZE)); + + clear_contents(); + + set = m_reg.get_stop_on_freeze_set(); + EXPECT_TRUE(set.empty()); + + set = m_reg.get_registered_app_set(); + EXPECT_TRUE(set.empty()); + + EXPECT_EQ(0, get_state_set_size(WarmBootStage::STAGE_FREEZE)); + EXPECT_EQ(0, get_state_set_size(WarmBootStage::STAGE_CHECKPOINT)); + EXPECT_EQ(0, get_state_set_size(WarmBootStage::STAGE_RECONCILIATION)); + EXPECT_EQ(0, get_state_set_size(WarmBootStage::STAGE_UNFREEZE)); +} + +/* save_all_init_apps() must write a "timestamp" field for every registered key into STATE_WARM_RESTART_INIT_TABLE_NAME, and clear_all_init_apps() must remove them again. */ +TEST_F(RedisTest, TestClearAllInitApps) { + const std::vector kTestKeys( +
{"docker1|app1", "docker2|app2", "docker3|app3", "docker4|app1"}); + for (const auto &key : kTestKeys) { + TestUtils::populate_registration_table(m_db, key, false, false, false, + true); + } + + m_reg.fetch_registration_info(); + m_reg.save_all_init_apps(); + + swss::Table initTable(&m_db, STATE_WARM_RESTART_INIT_TABLE_NAME); + std::string value; + for (const auto &key : kTestKeys) { + EXPECT_TRUE(initTable.hget(key, "timestamp", value)); + } + + m_reg.clear_all_init_apps(); + + for (const auto &key : kTestKeys) { + EXPECT_FALSE(initTable.hget(key, "timestamp", value)); + } +} + +/* Registers four well-formed keys plus "invalid" and checks that each well-formed key gets a "timestamp" field after save_all_init_apps(). NOTE(review): despite the comment below, nothing here asserts that "invalid" is absent from the init table. */ +TEST_F(RedisTest, TestSaveInitApps) { + const std::vector kTestKeys( + {"docker1|app1", "docker2|app2", "docker3|app3", "docker4|app1"}); + for (const auto &key : kTestKeys) { + TestUtils::populate_registration_table(m_db, key, false, false, false, + true); + } + // The invalid entry should not end up in the table. + TestUtils::populate_registration_table(m_db, "invalid", false, false, false, + true); + + m_reg.fetch_registration_info(); + m_reg.save_all_init_apps(); + + swss::Table initTable(&m_db, STATE_WARM_RESTART_INIT_TABLE_NAME); + std::string value; + + for (const auto &key : kTestKeys) { + EXPECT_TRUE(initTable.hget(key, "timestamp", value)); + } +} + +/* fetch_init_app_info() must pick up only the four well-formed keys written by populate_default_init_table(). */ +TEST_F(RedisTest, TestInitTargetApps) { + // Contains 4 valid apps and 1 invalid app. + populate_default_init_table(); + + m_init_reg.fetch_init_app_info(); + + EXPECT_EQ(get_reregistration_set_size(), 4); +} + +/* Polling variant: check_reregistration_status() is expected to shrink the missing set as matching rows appear in the registration table, ending in COMPLETED once all four keys are registered. */ +TEST_F(RedisTest, TestCheckReregistration) { + populate_default_init_table(); + + // Before reading the init table, we do not know apps need to re-register. + EXPECT_EQ(m_init_reg.check_reregistration_status(), + InitRegistration::Status::COMPLETED); + EXPECT_EQ(get_reregistration_set_size(), 0); + EXPECT_THAT(m_init_reg.join_pending_apps(), StrEq("")); + + // No apps have re-registered. All valid apps are still pending.
+ m_init_reg.fetch_init_app_info(); + + EXPECT_EQ(m_init_reg.check_reregistration_status(), + InitRegistration::Status::IN_PROGRESS); + EXPECT_EQ(get_reregistration_set_size(), 4); + EXPECT_THAT(m_init_reg.join_pending_apps(), + AllOf(HasSubstr("docker1|app1"), HasSubstr("docker2|app2"), + HasSubstr("docker3|app3"), HasSubstr("docker4|app1"))); + + // app1 re-registers. Other apps remain outstanding. + /* Only "docker1|app1" is written here; "docker4|app1" shares the app name but stays pending, as the assertions below show. */ + TestUtils::populate_registration_table(m_db, "docker1|app1", true, false, + false, true); + + EXPECT_EQ(m_init_reg.check_reregistration_status(), + InitRegistration::Status::IN_PROGRESS); + EXPECT_EQ(get_reregistration_set_size(), 3); + EXPECT_THAT(m_init_reg.join_pending_apps(), + AllOf(HasSubstr("docker2|app2"), HasSubstr("docker3|app3"), + HasSubstr("docker4|app1"))); + + // Other outstanding apps re-register + TestUtils::populate_registration_table(m_db, "docker2|app2", true, false, + false, true); + TestUtils::populate_registration_table(m_db, "docker3|app3", true, false, + false, true); + TestUtils::populate_registration_table(m_db, "docker4|app1", true, false, + false, true); + + EXPECT_EQ(m_init_reg.check_reregistration_status(), + InitRegistration::Status::COMPLETED); + EXPECT_EQ(get_reregistration_set_size(), 0); + EXPECT_THAT(m_init_reg.join_pending_apps(), StrEq("")); +} + +/* Event-driven variant of the re-registration test: instead of writing rows to the registration table, events are passed straight to handle_registration_event(). */ +TEST_F(RedisTest, TestHandleRegistrationEvent) { + populate_default_init_table(); + + // No apps have re-registered. All valid apps are still pending. + m_init_reg.fetch_init_app_info(); + + EXPECT_EQ(m_init_reg.check_reregistration_status(), + InitRegistration::Status::IN_PROGRESS); + EXPECT_EQ(get_reregistration_set_size(), 4); + + // Trigger re-registration events for apps one-by-one.
+ const std::vector event_keys( + {"docker1|app1", "docker2|app2", "docker3|app3", "docker4|app1"}); + /* One "HSET" event per key; the status must stay IN_PROGRESS until the last key has been handled, and the missing set shrinks by one each iteration. */ + for (size_t i = 0; i < event_keys.size(); i++) { + const swss::KeyOpFieldsValuesTuple event = { + event_keys[i], "HSET", {{"timestamp", ""}}}; + + m_init_reg.handle_registration_event(event); + + if (i < event_keys.size() - 1) { + EXPECT_EQ(m_init_reg.check_reregistration_status(), + InitRegistration::Status::IN_PROGRESS); + } else { + EXPECT_EQ(m_init_reg.check_reregistration_status(), + InitRegistration::Status::COMPLETED); + } + EXPECT_EQ(get_reregistration_set_size(), event_keys.size() - (i + 1)); + } +} + +} // namespace rebootbackend diff --git a/src/sonic-framework/tests/telemetry_helper_test.cpp b/src/sonic-framework/tests/telemetry_helper_test.cpp new file mode 100644 index 000000000000..d4b14077d136 --- /dev/null +++ b/src/sonic-framework/tests/telemetry_helper_test.cpp @@ -0,0 +1,394 @@ +#include "telemetry_helper.h" + +#include +#include + +#include +#include +#include + +#include "init_thread.h" +#include "reboot_interfaces.h" +#include "redis_utils.h" +#include "table.h" +#include "test_utils_common.h" +#include "warm_restart.h" + +namespace rebootbackend { + +using swss::WarmStart; +using ::testing::_; +using ::testing::Contains; +using ::testing::ExplainMatchResult; +using ::testing::IsEmpty; +using ::testing::StrEq; + +// A fake app class that has methods to trigger telemetry writes. Used to +// abstract some test logic out of the tests themselves. +class FakeApp { + public: + FakeApp(const std::string &app_name) : m_app_name(app_name) {} + + /* Reports the start of nsf_stage for this app via WarmStart::updateAppWarmBootStageStart(). */ + void record_start(WarmStart::WarmBootStage nsf_stage) const { + swss::WarmStart::updateAppWarmBootStageStart(nsf_stage, m_app_name); + } + + // There are two different API calls in warm_restart.cpp to report final + // status.
+ /* On success the end-of-stage call takes a WarmStartState (mapped from the stage below); on failure it takes the stage itself. */ + void record_end(WarmStart::WarmBootStage nsf_stage, bool success) const { + if (success) { + swss::WarmStart::updateAppWarmBootStageEnd(nsf_stage_to_state(nsf_stage), + m_app_name); + } else { + swss::WarmStart::updateAppWarmBootStageEndOnFailure(nsf_stage, + m_app_name); + } + } + + const std::string &get_name() const { return m_app_name; } + + private: + /* Maps each warm-boot stage to the state an app reports when it finishes that stage; any unlisted stage falls back to COMPLETED. */ + static WarmStart::WarmStartState nsf_stage_to_state( + WarmStart::WarmBootStage nsf_stage) { + switch (nsf_stage) { + case WarmStart::WarmBootStage::STAGE_FREEZE: { + return WarmStart::WarmStartState::QUIESCENT; + } + case WarmStart::WarmBootStage::STAGE_CHECKPOINT: { + return WarmStart::WarmStartState::CHECKPOINTED; + } + case WarmStart::WarmBootStage::STAGE_RECONCILIATION: { + return WarmStart::WarmStartState::RECONCILED; + } + case WarmStart::WarmBootStage::STAGE_UNFREEZE: { + return WarmStart::WarmStartState::COMPLETED; + } + default: { + return WarmStart::WarmStartState::COMPLETED; + } + } + } + + std::string m_app_name; +}; + +/* Fixture for the TelemetryHelper tests. The constructor initializes WarmStart as "fake_app"/"fake_docker", clears the tables, and registers every app in kFakeApps under "docker1". */ +class TelemetryHelperTest : public ::testing::Test { + public: + TelemetryHelperTest() + : m_db("STATE_DB", 0), + m_separator(swss::TableBase::getTableSeparator(m_db.getDbId())), + m_telemetry_helper() { + swss::WarmStart::initialize("fake_app", "fake_docker"); + TestUtils::clear_tables(m_db); + + for (const auto &app : kFakeApps) { + TestUtils::populate_registration_table( + m_db, concat_key("docker1", app.get_name()), false, true, true, true); + } + } + + // Checks for DB entries in the form: + // WARM_RESTART_PERFORMANCE_TABLE|system + // WARM_RESTART_PERFORMANCE_HISTORY||system + void check_overall_entries(int count, const std::string &expected_status) { + check_overall_entries(count, expected_status, + default_fields_for_status(expected_status)); + } + + void check_overall_entries( + int count, const std::string &expected_status, + const std::unordered_set &expected_fields) { + check_table_entries(STATE_WARM_RESTART_PERF_TABLE_NAME, "system", + expected_status,
expected_fields); + check_table_entries(STATE_WARM_RESTART_PERF_HISTORY_TABLE_NAME, + concat_key(std::to_string(count), "system"), + expected_status, expected_fields); + } + + // Checks for DB entries in the form: + // WARM_RESTART_PERFORMANCE_TABLE| + // WARM_RESTART_PERFORMANCE_TABLE|| + // WARM_RESTART_PERFORMANCE_HISTORY|| + // WARM_RESTART_PERFORMANCE_HISTORY||| + /* NOTE(review): the key patterns in the comment above (and in the similar one before check_overall_entries) look like they lost their angle-bracket placeholders in this copy of the patch - confirm against the original. */ + void check_stage_start_entries(int count, + WarmStart::WarmBootStage nsf_stage) { + fully_parameterized_stage_entries(count, nsf_stage, {}, "in-progress", "", + default_fields_for_status("in-progress")); + } + + void check_stage_end_entries(int count, WarmStart::WarmBootStage nsf_stage, + bool success) { + std::string expected_status = get_end_status_string(success); + fully_parameterized_stage_entries( + count, nsf_stage, kFakeApps, expected_status, + get_app_end_status_string(nsf_stage, success), + default_fields_for_status(expected_status)); + } + + /* Checks the stage-level entry in both the performance table and the history table (history keys are prefixed with count), then the per-app entries for every app in apps. */ + void fully_parameterized_stage_entries( + int count, WarmStart::WarmBootStage nsf_stage, + const std::vector apps, const std::string &expected_status, + const std::string &expected_app_status, + const std::unordered_set &expected_fields) { + std::string stage_name = + swss::WarmStart::warmBootStageToNameMap()->at(nsf_stage); + check_table_entries(STATE_WARM_RESTART_PERF_TABLE_NAME, stage_name, + expected_status, expected_fields); + check_table_entries(STATE_WARM_RESTART_PERF_HISTORY_TABLE_NAME, + concat_key(std::to_string(count), stage_name), + expected_status, expected_fields); + + for (const auto &app : apps) { + std::string key_to_check = concat_key(stage_name, app.get_name()); + check_table_entries(STATE_WARM_RESTART_PERF_TABLE_NAME, key_to_check, + expected_app_status, expected_fields); + check_table_entries(STATE_WARM_RESTART_PERF_HISTORY_TABLE_NAME, + concat_key(std::to_string(count), key_to_check), + expected_app_status, expected_fields); + } + } + + // A filled out swss::FieldValueTuple with the specified values.
+ /* The finish-time attribute is only appended when end_timestamp is non-empty. */ + std::vector default_fvs( + const std::string &status, const std::string &start_timestamp, + const std::string &end_timestamp) { + std::vector field_values( + {{swss::WarmStart::kPerfTableStatusAttr, status}, + {swss::WarmStart::kPerfTableStartTimeAttr, start_timestamp}}); + if (!end_timestamp.empty()) { + field_values.push_back( + {swss::WarmStart::kPerfTableFinishTimeAttr, end_timestamp}); + } + return field_values; + } + + /* Overall and stage-level end status string: "success" or "failure". */ + std::string get_end_status_string(bool success) { + return success ? "success" : "failure"; + } + + /* Per-app end status string; a successful freeze stage is reported as "quiescent" rather than "success". */ + std::string get_app_end_status_string(WarmStart::WarmBootStage nsf_stage, + bool success) { + if (nsf_stage == WarmStart::WarmBootStage::STAGE_FREEZE) { + return success ? "quiescent" : "failure"; + } + return success ? "success" : "failure"; + } + + private: + // Not to be used directly. + void check_table_entries( + const std::string &table_name, const std::string &key, + const std::string &expected_status, + const std::unordered_set &expected_fields) { + // Check that we have the key. + swss::Table table(&m_db, table_name); + std::vector keys; + table.getKeys(keys); + EXPECT_THAT(keys, Contains(key)) + << "Table: " << table_name << " did not contain key: " << key; + + // Check that we have the right fields for those keys. + std::vector field_values; + bool result = table.get(key, field_values); + EXPECT_TRUE(result); + + /* Every expected field is erased as it is found, so anything left over is a missing field. Extra fields on the entry are not reported. */ + std::unordered_set remaining_expected_fields(expected_fields); + for (const auto &field_value : field_values) { + remaining_expected_fields.erase(fvField(field_value)); + } + std::ostringstream stream; + std::copy(expected_fields.begin(), expected_fields.end(), + std::ostream_iterator(stream, ", ")); + EXPECT_THAT(remaining_expected_fields, IsEmpty()) + << "Table: " << table_name << " had incorrect fields for key: " << key + << ". Expected fields: " << stream.str(); + + // Check that the value of "status" is correct.
+ std::string actual_status; + result = + table.hget(key, swss::WarmStart::kPerfTableStatusAttr, actual_status); + /* Report a missing status field explicitly instead of only through the string mismatch below; the hget result was previously stored and never looked at. */ + EXPECT_TRUE(result) + << "Table: " << table_name << " had no status field for key: " << key; + EXPECT_THAT(actual_status, StrEq(expected_status)) + << "Table: " << table_name << " had incorrect status for key: " << key; + } + + /* Joins two key components with the table separator of the connected DB. */ + std::string concat_key(const std::string &str1, const std::string &str2) { + return str1 + m_separator + str2; + } + + /* Field set expected for an entry with the given status: kStartFields while "in-progress", kEndFields otherwise. The status is taken by const reference to avoid a copy per call. */ + const std::unordered_set &default_fields_for_status( + const std::string &status) { + return status == "in-progress" ? kStartFields : kEndFields; + } + + protected: + const std::unordered_set kStartFields = { + swss::WarmStart::kPerfTableStatusAttr, + swss::WarmStart::kPerfTableStartTimeAttr}; + const std::unordered_set kEndFields = { + swss::WarmStart::kPerfTableStatusAttr, + swss::WarmStart::kPerfTableStartTimeAttr, + swss::WarmStart::kPerfTableFinishTimeAttr}; + const std::unordered_set kEndWithoutStartFields = { + swss::WarmStart::kPerfTableStatusAttr, + swss::WarmStart::kPerfTableFinishTimeAttr}; + const std::vector kStagesInOrder = { + WarmStart::WarmBootStage::STAGE_FREEZE, + WarmStart::WarmBootStage::STAGE_CHECKPOINT, + WarmStart::WarmBootStage::STAGE_RECONCILIATION, + WarmStart::WarmBootStage::STAGE_UNFREEZE}; + const std::vector kFakeApps = {FakeApp("app1"), FakeApp("app2"), + FakeApp("app3")}; + + swss::DBConnector m_db; + std::string m_separator; + TelemetryHelper m_telemetry_helper; +}; + +/* Parameterized variant of the fixture; GetParam() is passed as the success flag to the record_*_end() calls in the tests that use it. */ +class TelemetryHelperWithResultTest + : public TelemetryHelperTest, + public ::testing::WithParamInterface {}; + +/* Each call to record_overall_start() must bump the warm restart counter by one (1..5) and leave "in-progress" overall entries for that count. */ +TEST_F(TelemetryHelperTest, OverallStartInitializesCounter) { + for (int i = 1; i < 6; i++) { + m_telemetry_helper.record_overall_start(); + + EXPECT_THAT(get_warm_restart_counter(m_db), StrEq(std::to_string(i))); + check_overall_entries(i, "in-progress"); + } +} + +TEST_F(TelemetryHelperTest, OverallStartIgnoresFakeCounter) { + swss::Table table(&m_db, "BOOT_INFO"); + table.hset("system", "warmboot-count", "fake_counter"); + + m_telemetry_helper.record_overall_start(); + +
EXPECT_THAT(get_warm_restart_counter(m_db), StrEq("1")); + check_overall_entries(1, "in-progress"); +} + +TEST_F(TelemetryHelperTest, OverallStartClearsPerf) { + swss::Table table(&m_db, STATE_WARM_RESTART_PERF_TABLE_NAME); + table.set("system", + default_fvs("fake_status", "fake_timestamp", "fake_timestamp2")); + table.set("freeze|fake_app", + default_fvs("fake_status", "fake_timestamp", "fake_timestamp2")); + + m_telemetry_helper.record_overall_start(); + + EXPECT_THAT(get_warm_restart_counter(m_db), StrEq("1")); + check_overall_entries(1, "in-progress"); + + std::vector unused; + EXPECT_FALSE(table.get("freeze|fake_app", unused)); +} + +TEST_P(TelemetryHelperWithResultTest, OverallEndWithoutStart) { + m_telemetry_helper.record_overall_end(GetParam()); + + check_overall_entries(0, get_end_status_string(GetParam()), + kEndWithoutStartFields); +} + +TEST_P(TelemetryHelperWithResultTest, OverallEndWorks) { + m_telemetry_helper.record_overall_start(); + + m_telemetry_helper.record_overall_end(GetParam()); + + check_overall_entries(1, get_end_status_string(GetParam())); +} + +TEST_F(TelemetryHelperTest, StageStartWithoutStart) { + for (const auto &nsf_stage : kStagesInOrder) { + m_telemetry_helper.record_stage_start(nsf_stage); + for (const auto &app : kFakeApps) { + app.record_start(nsf_stage); + } + check_stage_start_entries(0, nsf_stage); + } +} + +TEST_F(TelemetryHelperTest, StageStartWorks) { + m_telemetry_helper.record_overall_start(); + + for (const auto &nsf_stage : kStagesInOrder) { + m_telemetry_helper.record_stage_start(nsf_stage); + for (const auto &app : kFakeApps) { + app.record_start(nsf_stage); + } + check_stage_start_entries(1, nsf_stage); + } +} + +TEST_P(TelemetryHelperWithResultTest, StageEndWithoutStart) { + for (const auto &nsf_stage : kStagesInOrder) { + m_telemetry_helper.record_stage_start(nsf_stage); + for (const auto &app : kFakeApps) { + app.record_start(nsf_stage); + app.record_end(nsf_stage, GetParam()); + } + 
m_telemetry_helper.record_stage_end(nsf_stage, GetParam()); + check_stage_end_entries(0, nsf_stage, GetParam()); + } +} + +TEST_P(TelemetryHelperWithResultTest, StageEndWithoutStageStart) { + m_telemetry_helper.record_overall_start(); + + std::string expected_status = get_end_status_string(GetParam()); + + for (const auto &nsf_stage : kStagesInOrder) { + for (const auto &app : kFakeApps) { + app.record_start(nsf_stage); + app.record_end(nsf_stage, GetParam()); + } + m_telemetry_helper.record_stage_end(nsf_stage, GetParam()); + + std::string expected_app_status = + get_app_end_status_string(nsf_stage, GetParam()); + fully_parameterized_stage_entries(1, nsf_stage, kFakeApps, expected_status, + expected_app_status, + kEndWithoutStartFields); + } +} + +TEST_P(TelemetryHelperWithResultTest, StageEndWorks) { + m_telemetry_helper.record_overall_start(); + + for (const auto &nsf_stage : kStagesInOrder) { + m_telemetry_helper.record_stage_start(nsf_stage); + for (const auto &app : kFakeApps) { + app.record_start(nsf_stage); + app.record_end(nsf_stage, GetParam()); + } + m_telemetry_helper.record_stage_end(nsf_stage, GetParam()); + check_stage_end_entries(1, nsf_stage, GetParam()); + } +} + +TEST_P(TelemetryHelperWithResultTest, EndToEndWorks) { + m_telemetry_helper.record_overall_start(); + check_overall_entries(1, "in-progress"); + + for (const auto &nsf_stage : kStagesInOrder) { + m_telemetry_helper.record_stage_start(nsf_stage); + for (const auto &app : kFakeApps) { + app.record_start(nsf_stage); + } + check_stage_start_entries(1, nsf_stage); + + for (const auto &app : kFakeApps) { + app.record_end(nsf_stage, GetParam()); + } + m_telemetry_helper.record_stage_end(nsf_stage, GetParam()); + check_stage_end_entries(1, nsf_stage, GetParam()); + } + + m_telemetry_helper.record_overall_end(GetParam()); + check_overall_entries(1, get_end_status_string(GetParam())); +} + +INSTANTIATE_TEST_SUITE_P(TestWithResultSuite, TelemetryHelperWithResultTest, + testing::Values(true, 
false)); + +} // namespace rebootbackend diff --git a/src/sonic-framework/tests/test_main.cpp b/src/sonic-framework/tests/test_main.cpp new file mode 100644 index 000000000000..693d88f181ed --- /dev/null +++ b/src/sonic-framework/tests/test_main.cpp @@ -0,0 +1,7 @@ + +#include "gtest/gtest.h" + +int main(int argc, char* argv[]) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} \ No newline at end of file diff --git a/src/sonic-framework/tests/test_utils_common.cpp b/src/sonic-framework/tests/test_utils_common.cpp new file mode 100644 index 000000000000..6b5448c41395 --- /dev/null +++ b/src/sonic-framework/tests/test_utils_common.cpp @@ -0,0 +1,143 @@ +#include "test_utils_common.h" + +#include +#include + +#include +#include + +#include "dbconnector.h" +#include "notificationconsumer.h" +#include "redis_utils.h" +#include "select.h" +#include "selectableevent.h" +#include "stateverification.h" +#include "table.h" +#include "timestamp.h" +#include "warm_restart.h" + +namespace rebootbackend { + +void TestUtils::wait_for_finish(swss::Select &s, + swss::SelectableEvent &finished, + uint32_t timeout_seconds) { + swss::Selectable *sel; + int ret; + + ret = s.select(&sel, timeout_seconds * 1000); + EXPECT_EQ(ret, swss::Select::OBJECT); + EXPECT_EQ(sel, &finished); +} + +std::string TestUtils::wait_for_state_verification_trigger( + swss::NotificationConsumer &nc, uint32_t timeout_seconds, bool freeze) { + swss::Select s; + s.addSelectable(&nc); + + swss::Selectable *sel; + int ret; + ret = s.select(&sel, timeout_seconds * 1000); + EXPECT_EQ(ret, swss::Select::OBJECT); + if (ret != swss::Select::OBJECT) { + return ""; + } + + std::string op, timestamp_data; + std::vector values; + nc.pop(op, timestamp_data, values); + auto fv = values[0]; + EXPECT_EQ(op, ALL_COMPONENT); + EXPECT_EQ(fvField(fv), FREEZE_FIELD); + EXPECT_EQ(fvValue(fv), freeze ? 
"true" : "false"); + return timestamp_data; +} + +void TestUtils::confirm_no_state_verification_trigger( + swss::NotificationConsumer &nc, uint32_t timeout_seconds) { + swss::Select s; + s.addSelectable(&nc); + + swss::Selectable *sel; + int ret; + ret = s.select(&sel, timeout_seconds * 1000); + EXPECT_NE(ret, swss::Select::OBJECT); +} + +void TestUtils::populate_registration_table( + swss::DBConnector &db, const std::string &key, const bool &stop_on_freeze, + const bool &freeze, const bool &checkpoint, const bool &reconciliation) { + swss::Table registrationTable(&db, + STATE_WARM_RESTART_REGISTRATION_TABLE_NAME); + + std::string tableName = key; + std::vector values; + + values.push_back(swss::FieldValueTuple("stop_on_freeze", + stop_on_freeze ? "true" : "false")); + values.push_back(swss::FieldValueTuple("freeze", freeze ? "true" : "false")); + values.push_back( + swss::FieldValueTuple("checkpoint", checkpoint ? "true" : "false")); + values.push_back(swss::FieldValueTuple("reconciliation", + reconciliation ? 
"true" : "false")); + values.push_back(swss::FieldValueTuple("timestamp", swss::getTimestamp())); + + registrationTable.set(tableName, values); +} + +void TestUtils::populate_restart_table_state(swss::DBConnector &db, + const std::string &app_name, + const std::string &state) { + swss::Table warmRestartTable(&db, STATE_WARM_RESTART_TABLE_NAME); + warmRestartTable.hset(app_name, "state", state); +} + +void TestUtils::write_state_verification_result(swss::DBConnector &db, + const std::string &key, + const std::string &status, + const std::string ×tamp) { + swss::Table state_verification_table(&db, STATE_VERIFICATION_RESP_TABLE); + std::vector fvs; + fvs.push_back(swss::FieldValueTuple(TIMESTAMP_FIELD, timestamp)); + fvs.push_back(swss::FieldValueTuple(STATUS_FIELD, status)); + state_verification_table.set(key, fvs); +} + +void TestUtils::clear_tables(swss::DBConnector &db) { + const std::vector kTablesToClear = { + "BOOT_INFO", + STATE_WARM_RESTART_TABLE_NAME, + STATE_WARM_RESTART_REGISTRATION_TABLE_NAME, + STATE_WARM_RESTART_INIT_TABLE_NAME, + STATE_VERIFICATION_RESP_TABLE, + STATE_WARM_RESTART_ENABLE_TABLE_NAME, + STATE_WARM_RESTART_PERF_TABLE_NAME, + STATE_WARM_RESTART_PERF_HISTORY_TABLE_NAME}; + + for (const auto &table_name : kTablesToClear) { + swss::Table table(&db, table_name); + std::vector keys; + table.getKeys(keys); + for (const auto &key : keys) { + table.del(key); + } + } +} + +void TestUtils::check_warmboot_enabled(swss::DBConnector &db, + bool expected_state) { + swss::Table warmRestartTable(&db, STATE_WARM_RESTART_ENABLE_TABLE_NAME); + std::string actual_state; + warmRestartTable.hget("system", "enable", actual_state); + EXPECT_EQ(actual_state, expected_state ? "true" : "false"); +} + +void TestUtils::set_state_verification_enable(swss::DBConnector &config_db, + bool bootup, bool enabled) { + swss::Table warmRestartTable(&config_db, CFG_WARM_RESTART_TABLE_NAME); + warmRestartTable.hset( + "system", + bootup ? 
"state_verification_bootup" : "state_verification_shutdown", + enabled ? "true" : "false"); +} + +} // namespace rebootbackend diff --git a/src/sonic-framework/tests/test_utils_common.h b/src/sonic-framework/tests/test_utils_common.h new file mode 100644 index 000000000000..150762e50ad4 --- /dev/null +++ b/src/sonic-framework/tests/test_utils_common.h @@ -0,0 +1,48 @@ +#pragma once +#include +#include + +#include +#include + +#include "dbconnector.h" +#include "notificationconsumer.h" +#include "select.h" +#include "selectableevent.h" + +namespace rebootbackend { + +class TestUtils { + public: + static void wait_for_finish(swss::Select &s, swss::SelectableEvent &finished, + uint32_t timeout_seconds); + + static std::string wait_for_state_verification_trigger( + swss::NotificationConsumer &nc, uint32_t timeout_seconds, bool freeze); + + static void confirm_no_state_verification_trigger( + swss::NotificationConsumer &nc, uint32_t timeout_seconds); + + static void populate_registration_table( + swss::DBConnector &db, const std::string &key, const bool &stop_on_freeze, + const bool &freeze, const bool &checkpoint, const bool &reconciliation); + + static void populate_restart_table_state(swss::DBConnector &db, + const std::string &app_name, + const std::string &state); + + static void write_state_verification_result(swss::DBConnector &db, + const std::string &key, + const std::string &status, + const std::string ×tamp); + + static void clear_tables(swss::DBConnector &db); + + static void check_warmboot_enabled(swss::DBConnector &db, + bool expected_state); + + static void set_state_verification_enable(swss::DBConnector &db, bool bootup, + bool enabled); +}; + +} // namespace rebootbackend