From a051866a066516f5e44f825f5877bb1491b5e4bd Mon Sep 17 00:00:00 2001 From: Maxim Deb Natkh Date: Mon, 25 Nov 2024 17:47:35 +0000 Subject: [PATCH] issue-1918: wip --- cloud/filestore/bin/log | 1 + cloud/filestore/libs/service/auth_scheme.cpp | 1 + .../libs/storage/service/service_actor.cpp | 1 - .../libs/storage/service/service_actor.h | 6 +- .../storage/service/service_actor_actions.cpp | 4 + ...actor_actions_restart_local_filestores.cpp | 168 ++++++++++++++++++ .../service/service_actor_complete.cpp | 1 - .../libs/storage/service/service_private.h | 11 -- cloud/filestore/libs/storage/service/ya.make | 1 + .../libs/storage/tablet/tablet_actor.cpp | 10 +- .../filestore/private/api/protos/tablet.proto | 19 ++ .../test.py | 16 ++ .../ya.make | 33 ++++ cloud/filestore/tests/fio_index/ya.make | 1 + .../tests/recipes/tablets-restarter.inc | 15 ++ .../tests/recipes/tablets-restarter/README.md | 3 + .../recipes/tablets-restarter/__main__.py | 83 +++++++++ .../tests/recipes/tablets-restarter/ya.make | 1 + .../recipes/tablets-restarter/ya.make.inc | 19 ++ cloud/filestore/tests/recipes/ya.make | 1 + 20 files changed, 380 insertions(+), 15 deletions(-) create mode 100644 cloud/filestore/bin/log create mode 100644 cloud/filestore/libs/storage/service/service_actor_actions_restart_local_filestores.cpp create mode 100644 cloud/filestore/tests/fio_index/qemu-kikimr-multishard-tablets-restart-test/test.py create mode 100644 cloud/filestore/tests/fio_index/qemu-kikimr-multishard-tablets-restart-test/ya.make create mode 100644 cloud/filestore/tests/recipes/tablets-restarter.inc create mode 100644 cloud/filestore/tests/recipes/tablets-restarter/README.md create mode 100644 cloud/filestore/tests/recipes/tablets-restarter/__main__.py create mode 100644 cloud/filestore/tests/recipes/tablets-restarter/ya.make create mode 100644 cloud/filestore/tests/recipes/tablets-restarter/ya.make.inc diff --git a/cloud/filestore/bin/log b/cloud/filestore/bin/log new file mode 100644 index 
00000000000..7326d960397 --- /dev/null +++ b/cloud/filestore/bin/log @@ -0,0 +1 @@ +Ok diff --git a/cloud/filestore/libs/service/auth_scheme.cpp b/cloud/filestore/libs/service/auth_scheme.cpp index 53a962cd9ae..ee6470f5fa7 100644 --- a/cloud/filestore/libs/service/auth_scheme.cpp +++ b/cloud/filestore/libs/service/auth_scheme.cpp @@ -121,6 +121,7 @@ TPermissionList GetRequestPermissions( // Update perms("draintablets", CreatePermissionList({EPermission::Update})), + perms("restartlocalfilestores", CreatePermissionList({EPermission::Update})), // Admin perms("changestorageconfig", TPermissionList().Flip()) diff --git a/cloud/filestore/libs/storage/service/service_actor.cpp b/cloud/filestore/libs/storage/service/service_actor.cpp index b25c92887db..7ed05edfb26 100644 --- a/cloud/filestore/libs/storage/service/service_actor.cpp +++ b/cloud/filestore/libs/storage/service/service_actor.cpp @@ -126,7 +126,6 @@ bool TStorageServiceActor::HandleRequests(STFUNC_SIG) FILESTORE_HANDLE_RESPONSE(name, ns) \ FILESTORE_REMOTE_SERVICE(FILESTORE_HANDLE_REQUEST_RESPONSE, TEvService) - FILESTORE_SERVICE_REQUESTS_PRIVATE(FILESTORE_HANDLE_REQUEST_RESPONSE, TEvServicePrivate) #undef FILESTORE_HANDLE_REQUEST_RESPONSE HFunc(NMon::TEvHttpInfo, HandleHttpInfo); diff --git a/cloud/filestore/libs/storage/service/service_actor.h b/cloud/filestore/libs/storage/service/service_actor.h index 15f81aff74f..8637c0b589c 100644 --- a/cloud/filestore/libs/storage/service/service_actor.h +++ b/cloud/filestore/libs/storage/service/service_actor.h @@ -66,6 +66,7 @@ class TStorageServiceActor final void HandleHttpInfo( const NActors::NMon::TEvHttpInfo::TPtr& ev, const NActors::TActorContext& ctx); + void HandleHttpInfo_Search( const NActors::NMon::TEvHttpInfo::TPtr& ev, const TString& filesystemId, @@ -101,7 +102,6 @@ class TStorageServiceActor final const NActors::TActorContext& ctx); \ FILESTORE_REMOTE_SERVICE(FILESTORE_DECLARE_REQUEST_RESPONSE, TEvService) - 
FILESTORE_SERVICE_REQUESTS_PRIVATE(FILESTORE_DECLARE_REQUEST_RESPONSE, TEvServicePrivate) #undef FILESTORE_DECLARE_REQUEST_RESPONSE STFUNC(StateWork); @@ -209,6 +209,10 @@ class TStorageServiceActor final TRequestInfoPtr requestInfo, TString input); + NActors::IActorPtr CreateRestartLocalFileStoresActionActor( + TRequestInfoPtr requestInfo, + TString input); + private: void RenderSessions(IOutputStream& out); void RenderLocalFileStores(IOutputStream& out); diff --git a/cloud/filestore/libs/storage/service/service_actor_actions.cpp b/cloud/filestore/libs/storage/service/service_actor_actions.cpp index bda6adea3f0..d6158ca225d 100644 --- a/cloud/filestore/libs/storage/service/service_actor_actions.cpp +++ b/cloud/filestore/libs/storage/service/service_actor_actions.cpp @@ -92,6 +92,10 @@ void TStorageServiceActor::HandleExecuteAction( "getstoragestats", &TStorageServiceActor::CreateGetStorageStatsActionActor }, + { + "restartlocalfilestores", + &TStorageServiceActor::CreateRestartLocalFileStoresActionActor, + }, }; auto it = actions.find(action); diff --git a/cloud/filestore/libs/storage/service/service_actor_actions_restart_local_filestores.cpp b/cloud/filestore/libs/storage/service/service_actor_actions_restart_local_filestores.cpp new file mode 100644 index 00000000000..39422178543 --- /dev/null +++ b/cloud/filestore/libs/storage/service/service_actor_actions_restart_local_filestores.cpp @@ -0,0 +1,168 @@ +#include "service_actor.h" + +#include "util/string/join.h" + +#include +#include +#include +#include +#include + +#include + +#include + +#include + +namespace NCloud::NFileStore::NStorage { + +using namespace NActors; + +using namespace NKikimr; + +namespace { + +//////////////////////////////////////////////////////////////////////////////// + +class TRestartLocalFileStoresActionActor final + : public TActorBootstrapped +{ +private: + const TRequestInfoPtr RequestInfo; + const TString Input; + const TVector FileSystemIds; + ui32 RemainingRestarts = 0; + 
+public: + TRestartLocalFileStoresActionActor( + TRequestInfoPtr requestInfo, + TString input, + TVector fileSystemIds) + : RequestInfo(std::move(requestInfo)) + , Input(std::move(input)) + , FileSystemIds(std::move(fileSystemIds)) + {} + + void Bootstrap(const TActorContext& ctx) + { + Y_UNUSED(ctx); + NProtoPrivate::TRestartLocalFileStoresRequest request; + if (!google::protobuf::util::JsonStringToMessage(Input, &request).ok()) + { + ReplyAndDie( + ctx, + TErrorResponse(E_ARGUMENT, "Failed to parse input")); + return; + } + + auto rng = CreateDeterministicRandomProvider(request.GetSeed()); + + LOG_INFO( + ctx, + TFileStoreComponents::SERVICE_WORKER, + "Restarting local file stores: seed: %lu", + request.GetSeed()); + + ui32 cookie = 0; + + for (const auto& fileSystemId: FileSystemIds) { + if (rng->GenRand() % 2 == 0) { + auto requestToTablet = + std::make_unique(); + requestToTablet->Record.SetFileSystemId(fileSystemId); + + LOG_INFO( + ctx, + TFileStoreComponents::SERVICE_WORKER, + "Sending WaitReady to %s", + fileSystemId.c_str()); + + NCloud::Send( + ctx, + MakeIndexTabletProxyServiceId(), + std::move(requestToTablet), + cookie); + ++RemainingRestarts; + } + ++cookie; + } + + if (RemainingRestarts == 0) { + return ReplyAndDie(ctx, {}); + } + + Become(&TThis::StateWork); + } + +private: + void ReplyAndDie( + const TActorContext& ctx, + const NProtoPrivate::TRestartLocalFileStoresResponse& response) + { + auto msg = std::make_unique( + response.GetError()); + + google::protobuf::util::MessageToJsonString( + response, + msg->Record.MutableOutput()); + + NCloud::Reply(ctx, *RequestInfo, std::move(msg)); + Die(ctx); + } + + STFUNC(StateWork) + { + switch (ev->GetTypeRewrite()) { + HFunc( + TEvIndexTablet::TEvWaitReadyResponse, + HandleWaitReadyResponse); + + default: + HandleUnexpectedEvent(ev, TFileStoreComponents::SERVICE); + break; + } + } + + void HandleWaitReadyResponse( + const TEvIndexTablet::TEvWaitReadyResponse::TPtr& ev, + const TActorContext& ctx) + { + 
--RemainingRestarts; + Y_UNUSED(ev); + + LOG_INFO( + ctx, + TFileStoreComponents::SERVICE_WORKER, + "Sending poison pill to %s", + FileSystemIds.at(ev->Cookie).c_str()); + NCloud::Send( + ctx, + ev->Sender, + std::make_unique()); + + if (RemainingRestarts == 0) { + ReplyAndDie(ctx, {}); + } + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace + +IActorPtr TStorageServiceActor::CreateRestartLocalFileStoresActionActor( + TRequestInfoPtr requestInfo, + TString input) +{ + TVector fileSystemIds; + for (const auto& [fs, _]: State->GetLocalFileStores()) { + fileSystemIds.push_back(fs); + } + + return std::make_unique( + std::move(requestInfo), + std::move(input), + std::move(fileSystemIds)); +} + +} // namespace NCloud::NFileStore::NStorage diff --git a/cloud/filestore/libs/storage/service/service_actor_complete.cpp b/cloud/filestore/libs/storage/service/service_actor_complete.cpp index 0adc16d33eb..87953d847ba 100644 --- a/cloud/filestore/libs/storage/service/service_actor_complete.cpp +++ b/cloud/filestore/libs/storage/service/service_actor_complete.cpp @@ -72,7 +72,6 @@ void TStorageServiceActor::CompleteRequest( } FILESTORE_REMOTE_SERVICE(FILESTORE_IMPLEMENT_RESPONSE, TEvService) - FILESTORE_SERVICE_REQUESTS_PRIVATE(FILESTORE_IMPLEMENT_RESPONSE, TEvServicePrivate) #undef FILESTORE_IMPLEMENT_RESPONSE diff --git a/cloud/filestore/libs/storage/service/service_private.h b/cloud/filestore/libs/storage/service/service_private.h index 3fc2d7e6e69..7e91d1ce9fc 100644 --- a/cloud/filestore/libs/storage/service/service_private.h +++ b/cloud/filestore/libs/storage/service/service_private.h @@ -11,13 +11,6 @@ namespace NCloud::NFileStore::NStorage { -//////////////////////////////////////////////////////////////////////////////// - -#define FILESTORE_SERVICE_REQUESTS_PRIVATE(xxx, ...) 
\ -// FILESTORE_SERVICE_REQUESTS_PRIVATE - -//////////////////////////////////////////////////////////////////////////////// - struct TEvServicePrivate { // @@ -64,8 +57,6 @@ struct TEvServicePrivate { EvBegin = TFileStoreEventsPrivate::SERVICE_WORKER_START, - FILESTORE_SERVICE_REQUESTS_PRIVATE(FILESTORE_DECLARE_EVENT_IDS) - EvPingSession, EvCreateSession, EvSessionCreated, @@ -78,8 +69,6 @@ struct TEvServicePrivate static_assert(EvEnd < (int)TFileStoreEventsPrivate::SERVICE_WORKER_END, "EvEnd expected to be < TFileStoreEventsPrivate::SERVICE_WORKER_END"); - FILESTORE_SERVICE_REQUESTS_PRIVATE(FILESTORE_DECLARE_EVENTS) - using TEvPingSession = TRequestEvent; using TEvCreateSession = TRequestEvent; using TEvSessionCreated = TResponseEvent; diff --git a/cloud/filestore/libs/storage/service/ya.make b/cloud/filestore/libs/storage/service/ya.make index 35558983449..f1ff837cae3 100644 --- a/cloud/filestore/libs/storage/service/ya.make +++ b/cloud/filestore/libs/storage/service/ya.make @@ -14,6 +14,7 @@ SRCS( service_actor_actions_reassign_tablet.cpp service_actor_actions_tablet_ops.cpp service_actor_actions_write_compaction_map.cpp + service_actor_actions_restart_local_filestores.cpp service_actor_actions.cpp service_actor_alterfs.cpp service_actor_complete.cpp diff --git a/cloud/filestore/libs/storage/tablet/tablet_actor.cpp b/cloud/filestore/libs/storage/tablet/tablet_actor.cpp index 2da3456b9c7..ba026a03891 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_actor.cpp +++ b/cloud/filestore/libs/storage/tablet/tablet_actor.cpp @@ -994,6 +994,12 @@ STFUNC(TIndexTabletActor::StateZombie) HFunc(TEvTablet::TEvTabletDead, HandleTabletDead); HFunc(TEvTabletPipe::TEvServerDisconnected, HandleSessionDisconnected); + // If compaction/cleanup/collectgarbage started before the tablet reload + // and completed during the zombie state, we should ignore it. 
+ IgnoreFunc(TEvIndexTabletPrivate::TEvCompactionResponse); + IgnoreFunc(TEvIndexTabletPrivate::TEvCleanupResponse); + IgnoreFunc(TEvIndexTabletPrivate::TEvCollectGarbageResponse); + IgnoreFunc(TEvFileStore::TEvUpdateConfig); // private api @@ -1022,7 +1028,9 @@ STFUNC(TIndexTabletActor::StateZombie) HandleNodeUnlinkedInShard); default: - HandleUnexpectedEvent(ev, TFileStoreComponents::TABLET); + if (!HandleDefaultEvents(ev, SelfId())) { + HandleUnexpectedEvent(ev, TFileStoreComponents::TABLET); + } break; } } diff --git a/cloud/filestore/private/api/protos/tablet.proto b/cloud/filestore/private/api/protos/tablet.proto index fb5e60af69c..ddfd7536ac1 100644 --- a/cloud/filestore/private/api/protos/tablet.proto +++ b/cloud/filestore/private/api/protos/tablet.proto @@ -700,3 +700,22 @@ message TGetFileSystemTopologyResponse // Shard FileSystem identifiers. repeated string ShardFileSystemIds = 2; } + +//////////////////////////////////////////////////////////////////////////////// +// RestartLocalFileStores request/response. + +message TRestartLocalFileStoresRequest +{ + // Optional request headers. + NProto.THeaders Headers = 1; + + // Seed to randomize which filestores to restart. + uint64 Seed = 2; +} + + +message TRestartLocalFileStoresResponse +{ + // Optional error, set only if error happened. 
+ NCloud.NProto.TError Error = 1; +} diff --git a/cloud/filestore/tests/fio_index/qemu-kikimr-multishard-tablets-restart-test/test.py b/cloud/filestore/tests/fio_index/qemu-kikimr-multishard-tablets-restart-test/test.py new file mode 100644 index 00000000000..d8048e72b11 --- /dev/null +++ b/cloud/filestore/tests/fio_index/qemu-kikimr-multishard-tablets-restart-test/test.py @@ -0,0 +1,16 @@ +import pytest + +import cloud.storage.core.tools.testing.fio.lib as fio + +from cloud.filestore.tests.python.lib.common import get_filestore_mount_path + + +TESTS = fio.generate_index_tests() + + +@pytest.mark.parametrize("name", TESTS.keys()) +def test_fio(name): + mount_dir = get_filestore_mount_path() + dir_name = fio.get_dir_name(mount_dir, name) + + fio.run_index_test(dir_name, TESTS[name], fail_on_errors=True) diff --git a/cloud/filestore/tests/fio_index/qemu-kikimr-multishard-tablets-restart-test/ya.make b/cloud/filestore/tests/fio_index/qemu-kikimr-multishard-tablets-restart-test/ya.make new file mode 100644 index 00000000000..9014e5c277c --- /dev/null +++ b/cloud/filestore/tests/fio_index/qemu-kikimr-multishard-tablets-restart-test/ya.make @@ -0,0 +1,33 @@ +PY3TEST() + +INCLUDE(${ARCADIA_ROOT}/cloud/filestore/tests/recipes/medium.inc) + +DEPENDS( + cloud/storage/core/tools/testing/fio/bin +) + +PEERDIR( + cloud/filestore/tests/python/lib + cloud/storage/core/tools/testing/fio/lib +) + +TEST_SRCS( + test.py +) + +SET(QEMU_VIRTIO fs) +SET(FILESTORE_SHARD_COUNT 5) +SET( + NFS_STORAGE_CONFIG_PATCH + cloud/filestore/tests/loadtest/service-kikimr-newfeatures-test/nfs-storage.txt +) + +INCLUDE(${ARCADIA_ROOT}/cloud/filestore/tests/recipes/service-kikimr.inc) +INCLUDE(${ARCADIA_ROOT}/cloud/filestore/tests/recipes/vhost-kikimr.inc) +INCLUDE(${ARCADIA_ROOT}/cloud/filestore/tests/recipes/vhost-endpoint.inc) +INCLUDE(${ARCADIA_ROOT}/cloud/storage/core/tests/recipes/qemu.inc) + +SET(FILESTORE_TABLETS_RESTART_INTERVAL 5) 
+INCLUDE(${ARCADIA_ROOT}/cloud/filestore/tests/recipes/tablets-restarter.inc) + +END() diff --git a/cloud/filestore/tests/fio_index/ya.make b/cloud/filestore/tests/fio_index/ya.make index 44e2ffe296e..bb66df4c49c 100644 --- a/cloud/filestore/tests/fio_index/ya.make +++ b/cloud/filestore/tests/fio_index/ya.make @@ -2,6 +2,7 @@ RECURSE_FOR_TESTS( mount-kikimr-test mount-local-test qemu-kikimr-multishard-nemesis-test + qemu-kikimr-multishard-tablets-restart-test qemu-kikimr-multishard-test qemu-kikimr-nemesis-test qemu-kikimr-test diff --git a/cloud/filestore/tests/recipes/tablets-restarter.inc b/cloud/filestore/tests/recipes/tablets-restarter.inc new file mode 100644 index 00000000000..2ec8b77f78e --- /dev/null +++ b/cloud/filestore/tests/recipes/tablets-restarter.inc @@ -0,0 +1,15 @@ +DEPENDS( + cloud/filestore/tests/recipes/tablets-restarter +) + + +IF (FILESTORE_TABLETS_RESTART_INTERVAL) + SET(RECIPE_ARGS --restart-interval $FILESTORE_TABLETS_RESTART_INTERVAL) +ELSE() + MESSAGE(FATAL_ERROR FILESTORE_TABLETS_RESTART_INTERVAL should be set for tablets-restarter recipe to work) +ENDIF() + +USE_RECIPE( + cloud/filestore/tests/recipes/tablets-restarter/filestore-tablets-restarter + ${RECIPE_ARGS} +) diff --git a/cloud/filestore/tests/recipes/tablets-restarter/README.md b/cloud/filestore/tests/recipes/tablets-restarter/README.md new file mode 100644 index 00000000000..0e8076d16e8 --- /dev/null +++ b/cloud/filestore/tests/recipes/tablets-restarter/README.md @@ -0,0 +1,3 @@ +### tablets-restarter + +Include this recipe to get a process that regularly restarts random tablets of a filestore using a private `restartlocalfilestores` action. 
\ No newline at end of file
diff --git a/cloud/filestore/tests/recipes/tablets-restarter/__main__.py b/cloud/filestore/tests/recipes/tablets-restarter/__main__.py
new file mode 100644
index 00000000000..cc977c9be48
--- /dev/null
+++ b/cloud/filestore/tests/recipes/tablets-restarter/__main__.py
@@ -0,0 +1,100 @@
+import argparse
+import logging
+import multiprocessing
+import os
+import random
+import datetime
+import sys
+import time
+
+from cloud.filestore.tests.python.lib.client import FilestoreCliClient
+from cloud.filestore.tests.python.lib.common import shutdown
+
+from library.python.testing.recipe import declare_recipe, set_env
+import yatest.common as common
+
+process = None
+
+logger = logging.getLogger(__name__)
+
+PID_FILE_NAME = "tablets_restarter_recipe.pid"
+
+
+def get_client():
+    """Create a FilestoreCliClient for the port given in NFS_SERVER_PORT.
+
+    Raises ValueError if the environment variable is not set.
+    """
+    port = os.getenv("NFS_SERVER_PORT")
+    if port is None:
+        raise ValueError("NFS_SERVER_PORT is not set")
+    binary_path = common.binary_path("cloud/filestore/apps/client/filestore-client")
+    return FilestoreCliClient(binary_path, port, cwd=common.output_path())
+
+
+def restart_tablets(client: FilestoreCliClient, seed: int):
+    """Execute the `restartlocalfilestores` action with the given Seed."""
+    client.execute_action("restartlocalfilestores", {"Seed": seed})
+
+
+def start(argv):
+    """Fork a child that executes the restart action once per interval.
+
+    The parent writes the child's PID to PID_FILE_NAME and exits. The child
+    starts a new session, redirects stdout/stderr to tablets_restarter.log in
+    the directory returned by common.output_path() and loops forever.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--restart-interval', help='restart tablets every N seconds', type=int, default=5)
+    args = parser.parse_args(argv)
+
+    client = get_client()
+
+    start_ts = datetime.datetime.now()
+    interval = datetime.timedelta(seconds=args.restart_interval)
+
+    pid = os.fork()
+    if pid:
+        # Parent: record the child's PID so that stop() can find it.
+        with open(PID_FILE_NAME, "w") as f:
+            f.write(str(pid))
+        logger.info(f"Started tablets restarter process with PID {pid}")
+        sys.exit()
+
+    # Child: run in its own session with output redirected to a log file.
+    os.setsid()
+
+    restarter_log_name = os.path.join(common.output_path(), "tablets_restarter.log")
+    # dup2() leaves fds 1 and 2 pointing at the log, so the original handle
+    # can be closed right away.
+    with open(restarter_log_name, "w") as logfile:
+        os.dup2(logfile.fileno(), sys.stdout.fileno())
+        os.dup2(logfile.fileno(), sys.stderr.fileno())
+
+    while True:
+        now = datetime.datetime.now()
+        deadline = start_ts + interval
+        if now >= deadline:
+            start_ts = now
+
+            seed = random.randint(0, 1000000)
+            logger.info(f"Restarting tablets with seed {seed}")
+            restart_tablets(client, seed)
+        else:
+            # total_seconds() keeps the fraction; .seconds truncates to 0 in
+            # the last second before the deadline and made this a busy loop.
+            time.sleep(min((deadline - now).total_seconds(), 1))
+
+
+def stop(argv):
+    """Call shutdown() for the PID stored in PID_FILE_NAME, if the file exists."""
+    if not os.path.exists(PID_FILE_NAME):
+        return
+
+    with open(PID_FILE_NAME) as f:
+        pid = int(f.read())
+    shutdown(pid)
+
+
+if __name__ == '__main__':
+    declare_recipe(start, stop)
diff --git a/cloud/filestore/tests/recipes/tablets-restarter/ya.make b/cloud/filestore/tests/recipes/tablets-restarter/ya.make
new file mode 100644
index 00000000000..85f643318a8
--- /dev/null
+++ b/cloud/filestore/tests/recipes/tablets-restarter/ya.make
@@ -0,0 +1 @@
+INCLUDE(${ARCADIA_ROOT}/cloud/filestore/tests/recipes/tablets-restarter/ya.make.inc)
diff --git a/cloud/filestore/tests/recipes/tablets-restarter/ya.make.inc b/cloud/filestore/tests/recipes/tablets-restarter/ya.make.inc
new file mode 100644
index 00000000000..3c4a0068458
--- /dev/null
+++ b/cloud/filestore/tests/recipes/tablets-restarter/ya.make.inc
@@ -0,0 +1,19 @@
+PY3_PROGRAM(filestore-tablets-restarter)
+
+
+DEPENDS(
+    cloud/filestore/apps/client
+)
+
+PEERDIR(
+    cloud/filestore/tests/python/lib
+
+    library/python/testing/recipe
+    library/python/testing/yatest_common
+)
+
+PY_SRCS(
+    __main__.py
+)
+
+END()
diff --git a/cloud/filestore/tests/recipes/ya.make b/cloud/filestore/tests/recipes/ya.make
index 22445be3390..2db684d252e 100644
--- a/cloud/filestore/tests/recipes/ya.make
+++ b/cloud/filestore/tests/recipes/ya.make
@@ -4,4 +4,5 @@ RECURSE(
     service-local
     vhost
     vhost-endpoint
+    tablets-restarter
 )