Skip to content

Commit

Permalink
issue-1918: wip
Browse files Browse the repository at this point in the history
  • Loading branch information
Maxim Deb Natkh committed Nov 28, 2024
1 parent 0c825cc commit 5423cba
Show file tree
Hide file tree
Showing 20 changed files with 381 additions and 15 deletions.
1 change: 1 addition & 0 deletions cloud/filestore/bin/log
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Ok
1 change: 1 addition & 0 deletions cloud/filestore/libs/service/auth_scheme.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ TPermissionList GetRequestPermissions(

// Update
perms("draintablets", CreatePermissionList({EPermission::Update})),
perms("restartlocalfilestores", CreatePermissionList({EPermission::Update})),

// Admin
perms("changestorageconfig", TPermissionList().Flip())
Expand Down
1 change: 0 additions & 1 deletion cloud/filestore/libs/storage/service/service_actor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,6 @@ bool TStorageServiceActor::HandleRequests(STFUNC_SIG)
FILESTORE_HANDLE_RESPONSE(name, ns) \

FILESTORE_REMOTE_SERVICE(FILESTORE_HANDLE_REQUEST_RESPONSE, TEvService)
FILESTORE_SERVICE_REQUESTS_PRIVATE(FILESTORE_HANDLE_REQUEST_RESPONSE, TEvServicePrivate)
#undef FILESTORE_HANDLE_REQUEST_RESPONSE

HFunc(NMon::TEvHttpInfo, HandleHttpInfo);
Expand Down
6 changes: 5 additions & 1 deletion cloud/filestore/libs/storage/service/service_actor.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ class TStorageServiceActor final
void HandleHttpInfo(
const NActors::NMon::TEvHttpInfo::TPtr& ev,
const NActors::TActorContext& ctx);

void HandleHttpInfo_Search(
const NActors::NMon::TEvHttpInfo::TPtr& ev,
const TString& filesystemId,
Expand Down Expand Up @@ -101,7 +102,6 @@ class TStorageServiceActor final
const NActors::TActorContext& ctx); \

FILESTORE_REMOTE_SERVICE(FILESTORE_DECLARE_REQUEST_RESPONSE, TEvService)
FILESTORE_SERVICE_REQUESTS_PRIVATE(FILESTORE_DECLARE_REQUEST_RESPONSE, TEvServicePrivate)
#undef FILESTORE_DECLARE_REQUEST_RESPONSE

STFUNC(StateWork);
Expand Down Expand Up @@ -209,6 +209,10 @@ class TStorageServiceActor final
TRequestInfoPtr requestInfo,
TString input);

NActors::IActorPtr CreateRestartLocalFileStoresActionActor(
TRequestInfoPtr requestInfo,
TString input);

private:
void RenderSessions(IOutputStream& out);
void RenderLocalFileStores(IOutputStream& out);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,10 @@ void TStorageServiceActor::HandleExecuteAction(
"getstoragestats",
&TStorageServiceActor::CreateGetStorageStatsActionActor
},
{
"restartlocalfilestores",
&TStorageServiceActor::CreateRestartLocalFileStoresActionActor,
},
};

auto it = actions.find(action);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
#include "service_actor.h"

#include "util/string/join.h"

#include <cloud/filestore/libs/storage/api/service.h>
#include <cloud/filestore/libs/storage/api/tablet.h>
#include <cloud/filestore/libs/storage/api/tablet_proxy.h>
#include <cloud/filestore/libs/storage/core/public.h>
#include <cloud/filestore/private/api/protos/tablet.pb.h>

#include <contrib/ydb/library/actors/core/actor_bootstrapped.h>

#include <library/cpp/random_provider/random_provider.h>

#include <google/protobuf/util/json_util.h>

namespace NCloud::NFileStore::NStorage {

using namespace NActors;

using namespace NKikimr;

namespace {

////////////////////////////////////////////////////////////////////////////////

// Executes the private "restartlocalfilestores" action.
//
// On bootstrap the actor parses the JSON input into
// TRestartLocalFileStoresRequest, creates a random provider from the request
// seed and, for every known filesystem id whose random draw is even, sends a
// TEvWaitReadyRequest via the index tablet proxy. Every successful
// TEvWaitReadyResponse is answered with a TEvPoisonPill sent to the responder.
// When all outstanding responses have arrived, the action response is sent
// and the actor dies.
class TRestartLocalFileStoresActionActor final
    : public TActorBootstrapped<TRestartLocalFileStoresActionActor>
{
private:
    const TRequestInfoPtr RequestInfo;
    const TString Input;
    // Filesystem ids captured when the actor was created. The cookie of each
    // WaitReady request is the index of the filesystem in this vector.
    const TVector<TString> FileSystemIds;
    // Number of WaitReady requests that have not been answered yet.
    ui32 RemainingRestarts = 0;

public:
    TRestartLocalFileStoresActionActor(
            TRequestInfoPtr requestInfo,
            TString input,
            TVector<TString> fileSystemIds)
        : RequestInfo(std::move(requestInfo))
        , Input(std::move(input))
        , FileSystemIds(std::move(fileSystemIds))
    {}

    // Parses the input, picks the filesystems to restart and sends the
    // WaitReady requests. Replies immediately if the input is malformed or
    // if no filesystem was picked.
    void Bootstrap(const TActorContext& ctx)
    {
        NProtoPrivate::TRestartLocalFileStoresRequest request;
        if (!google::protobuf::util::JsonStringToMessage(Input, &request).ok())
        {
            ReplyAndDie(
                ctx,
                TErrorResponse(E_ARGUMENT, "Failed to parse input"));
            return;
        }

        auto rng = CreateDeterministicRandomProvider(request.GetSeed());

        LOG_INFO(
            ctx,
            TFileStoreComponents::SERVICE_WORKER,
            "Restarting local file stores: seed: %lu",
            request.GetSeed());

        // The cookie advances for every filesystem (picked or not) so that it
        // always equals the index into FileSystemIds.
        ui32 cookie = 0;

        for (const auto& fileSystemId: FileSystemIds) {
            if (rng->GenRand() % 2 == 0) {
                auto requestToTablet =
                    std::make_unique<TEvIndexTablet::TEvWaitReadyRequest>();
                requestToTablet->Record.SetFileSystemId(fileSystemId);

                LOG_INFO(
                    ctx,
                    TFileStoreComponents::SERVICE_WORKER,
                    "Sending WaitReady to %s",
                    fileSystemId.c_str());

                NCloud::Send(
                    ctx,
                    MakeIndexTabletProxyServiceId(),
                    std::move(requestToTablet),
                    cookie);
                ++RemainingRestarts;
            }
            ++cookie;
        }

        if (RemainingRestarts == 0) {
            // Nothing was picked: reply with an empty (successful) response.
            ReplyAndDie(ctx, {});
            return;
        }

        Become(&TThis::StateWork);
    }

private:
    // Sends the action response (error plus JSON-serialized body) to the
    // original requester and terminates the actor.
    void ReplyAndDie(
        const TActorContext& ctx,
        const NProtoPrivate::TRestartLocalFileStoresResponse& response)
    {
        auto msg = std::make_unique<TEvService::TEvExecuteActionResponse>(
            response.GetError());

        google::protobuf::util::MessageToJsonString(
            response,
            msg->Record.MutableOutput());

        NCloud::Reply(ctx, *RequestInfo, std::move(msg));
        Die(ctx);
    }

    STFUNC(StateWork)
    {
        switch (ev->GetTypeRewrite()) {
            HFunc(
                TEvIndexTablet::TEvWaitReadyResponse,
                HandleWaitReadyResponse);

            default:
                HandleUnexpectedEvent(ev, TFileStoreComponents::SERVICE);
                break;
        }
    }

    // Handles one WaitReady response. On success the responder receives a
    // poison pill; on failure the error is logged and no poison pill is sent,
    // because the sender of an error reply is not necessarily the tablet.
    // The action itself stays best-effort: a failed WaitReady does not turn
    // the final action response into an error.
    void HandleWaitReadyResponse(
        const TEvIndexTablet::TEvWaitReadyResponse::TPtr& ev,
        const TActorContext& ctx)
    {
        --RemainingRestarts;

        const auto& fileSystemId = FileSystemIds.at(ev->Cookie);
        const auto& error = ev->Get()->GetError();

        if (HasError(error)) {
            LOG_WARN(
                ctx,
                TFileStoreComponents::SERVICE_WORKER,
                "WaitReady failed for %s: %s",
                fileSystemId.c_str(),
                FormatError(error).c_str());
        } else {
            LOG_INFO(
                ctx,
                TFileStoreComponents::SERVICE_WORKER,
                "Sending poison pill to %s",
                fileSystemId.c_str());
            NCloud::Send(
                ctx,
                ev->Sender,
                std::make_unique<TEvents::TEvPoisonPill>());
        }

        if (RemainingRestarts == 0) {
            ReplyAndDie(ctx, {});
        }
    }
};

////////////////////////////////////////////////////////////////////////////////

} // namespace

// Creates the actor that serves the "restartlocalfilestores" action. The ids
// returned by State->GetLocalFileStores() are copied here and handed to the
// actor together with the request info and the raw action input.
IActorPtr TStorageServiceActor::CreateRestartLocalFileStoresActionActor(
    TRequestInfoPtr requestInfo,
    TString input)
{
    TVector<TString> localIds;

    const auto& localFileStores = State->GetLocalFileStores();
    for (const auto& [id, _]: localFileStores) {
        localIds.emplace_back(id);
    }

    return std::make_unique<TRestartLocalFileStoresActionActor>(
        std::move(requestInfo),
        std::move(input),
        std::move(localIds));
}

} // namespace NCloud::NFileStore::NStorage
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ void TStorageServiceActor::CompleteRequest(
}

FILESTORE_REMOTE_SERVICE(FILESTORE_IMPLEMENT_RESPONSE, TEvService)
FILESTORE_SERVICE_REQUESTS_PRIVATE(FILESTORE_IMPLEMENT_RESPONSE, TEvServicePrivate)

#undef FILESTORE_IMPLEMENT_RESPONSE

Expand Down
11 changes: 0 additions & 11 deletions cloud/filestore/libs/storage/service/service_private.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,6 @@

namespace NCloud::NFileStore::NStorage {

////////////////////////////////////////////////////////////////////////////////

#define FILESTORE_SERVICE_REQUESTS_PRIVATE(xxx, ...) \
// FILESTORE_SERVICE_REQUESTS_PRIVATE

////////////////////////////////////////////////////////////////////////////////

struct TEvServicePrivate
{
//
Expand Down Expand Up @@ -64,8 +57,6 @@ struct TEvServicePrivate
{
EvBegin = TFileStoreEventsPrivate::SERVICE_WORKER_START,

FILESTORE_SERVICE_REQUESTS_PRIVATE(FILESTORE_DECLARE_EVENT_IDS)

EvPingSession,
EvCreateSession,
EvSessionCreated,
Expand All @@ -78,8 +69,6 @@ struct TEvServicePrivate
static_assert(EvEnd < (int)TFileStoreEventsPrivate::SERVICE_WORKER_END,
"EvEnd expected to be < TFileStoreEventsPrivate::SERVICE_WORKER_END");

FILESTORE_SERVICE_REQUESTS_PRIVATE(FILESTORE_DECLARE_EVENTS)

using TEvPingSession = TRequestEvent<TEmpty, EvPingSession>;
using TEvCreateSession = TRequestEvent<TCreateSession, EvCreateSession>;
using TEvSessionCreated = TResponseEvent<TSessionCreated, EvSessionCreated>;
Expand Down
1 change: 1 addition & 0 deletions cloud/filestore/libs/storage/service/ya.make
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ SRCS(
service_actor_actions_reassign_tablet.cpp
service_actor_actions_tablet_ops.cpp
service_actor_actions_write_compaction_map.cpp
service_actor_actions_restart_local_filestores.cpp
service_actor_actions.cpp
service_actor_alterfs.cpp
service_actor_complete.cpp
Expand Down
10 changes: 9 additions & 1 deletion cloud/filestore/libs/storage/tablet/tablet_actor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -994,6 +994,12 @@ STFUNC(TIndexTabletActor::StateZombie)
HFunc(TEvTablet::TEvTabletDead, HandleTabletDead);
HFunc(TEvTabletPipe::TEvServerDisconnected, HandleSessionDisconnected);

// If compaction/cleanup/collectgarbage started earlier and completes while
// the tablet is in the zombie state, we should ignore its response.
IgnoreFunc(TEvIndexTabletPrivate::TEvCompactionResponse);
IgnoreFunc(TEvIndexTabletPrivate::TEvCleanupResponse);
IgnoreFunc(TEvIndexTabletPrivate::TEvCollectGarbageResponse);

IgnoreFunc(TEvFileStore::TEvUpdateConfig);

// private api
Expand Down Expand Up @@ -1022,7 +1028,9 @@ STFUNC(TIndexTabletActor::StateZombie)
HandleNodeUnlinkedInShard);

default:
HandleUnexpectedEvent(ev, TFileStoreComponents::TABLET);
if (!HandleDefaultEvents(ev, SelfId())) {
HandleUnexpectedEvent(ev, TFileStoreComponents::TABLET);
}
break;
}
}
Expand Down
19 changes: 19 additions & 0 deletions cloud/filestore/private/api/protos/tablet.proto
Original file line number Diff line number Diff line change
Expand Up @@ -700,3 +700,22 @@ message TGetFileSystemTopologyResponse
// Shard FileSystem identifiers.
repeated string ShardFileSystemIds = 2;
}

////////////////////////////////////////////////////////////////////////////////
// RestartLocalFileStores request/response.

// Input of the private "restartlocalfilestores" action. The action actor
// parses it from the JSON input string of the ExecuteAction request.
message TRestartLocalFileStoresRequest
{
// Optional request headers.
NProto.THeaders Headers = 1;

// Seed to randomize which filestores to restart. The action actor passes it
// to CreateDeterministicRandomProvider and restarts a filestore when the
// corresponding random draw is even.
uint64 Seed = 2;
}


// Output of the private "restartlocalfilestores" action. The action actor
// serializes it to JSON into the output of the ExecuteAction response.
message TRestartLocalFileStoresResponse
{
// Optional error, set only if error happened.
NCloud.NProto.TError Error = 1;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import pytest

import cloud.storage.core.tools.testing.fio.lib as fio

from cloud.filestore.tests.python.lib.common import get_filestore_mount_path


TESTS = fio.generate_index_tests()


@pytest.mark.parametrize("name", TESTS.keys())
def test_fio(name):
    """Run the fio index test registered under `name`.

    The working directory is whatever fio.get_dir_name derives from the
    filestore mount path and the test name; any fio error fails the test.
    """
    target_dir = fio.get_dir_name(get_filestore_mount_path(), name)
    test_case = TESTS[name]

    fio.run_index_test(target_dir, test_case, fail_on_errors=True)
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Python 3 test target for test.py, set up with the filestore service/vhost,
# QEMU and tablets-restarter recipes included below.
PY3TEST()

INCLUDE(${ARCADIA_ROOT}/cloud/filestore/tests/recipes/medium.inc)

DEPENDS(
cloud/storage/core/tools/testing/fio/bin
)

PEERDIR(
cloud/filestore/tests/python/lib
cloud/storage/core/tools/testing/fio/lib
)

TEST_SRCS(
test.py
)

# Variables consumed by the recipe includes below.
SET(QEMU_VIRTIO fs)
SET(FILESTORE_SHARD_COUNT 5)
SET(
NFS_STORAGE_CONFIG_PATCH
cloud/filestore/tests/loadtest/service-kikimr-newfeatures-test/nfs-storage.txt
)

INCLUDE(${ARCADIA_ROOT}/cloud/filestore/tests/recipes/service-kikimr.inc)
INCLUDE(${ARCADIA_ROOT}/cloud/filestore/tests/recipes/vhost-kikimr.inc)
INCLUDE(${ARCADIA_ROOT}/cloud/filestore/tests/recipes/vhost-endpoint.inc)
INCLUDE(${ARCADIA_ROOT}/cloud/storage/core/tests/recipes/qemu.inc)

# tablets-restarter.inc fails with FATAL_ERROR unless this variable is set; it
# is forwarded to the recipe as --restart-interval (units are not visible
# here - TODO confirm).
SET(FILESTORE_TABLETS_RESTART_INTERVAL 5)
INCLUDE(${ARCADIA_ROOT}/cloud/filestore/tests/recipes/tablets-restarter.inc)

END()
1 change: 1 addition & 0 deletions cloud/filestore/tests/fio_index/ya.make
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ RECURSE_FOR_TESTS(
mount-kikimr-test
mount-local-test
qemu-kikimr-multishard-nemesis-test
qemu-kikimr-multishard-tablets-restart-test
qemu-kikimr-multishard-test
qemu-kikimr-nemesis-test
qemu-kikimr-test
Expand Down
15 changes: 15 additions & 0 deletions cloud/filestore/tests/recipes/tablets-restarter.inc
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Include file that attaches the filestore-tablets-restarter recipe to a test.
# The including ya.make must set FILESTORE_TABLETS_RESTART_INTERVAL before
# including this file.
DEPENDS(
cloud/filestore/tests/recipes/tablets-restarter
)


# The interval is mandatory: it is passed to the recipe as --restart-interval,
# and configuration fails if it is not set.
IF (FILESTORE_TABLETS_RESTART_INTERVAL)
SET(RECIPE_ARGS --restart-interval $FILESTORE_TABLETS_RESTART_INTERVAL)
ELSE()
MESSAGE(FATAL_ERROR FILESTORE_TABLETS_RESTART_INTERVAL should be set for tablets-restarter recipe to work)
ENDIF()

USE_RECIPE(
cloud/filestore/tests/recipes/tablets-restarter/filestore-tablets-restarter
${RECIPE_ARGS}
)
3 changes: 3 additions & 0 deletions cloud/filestore/tests/recipes/tablets-restarter/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
### tablets-restarter

Include this recipe to get a process that regularly restarts random tablets of a filestore using a private `restartlocalfilestores` action.
Loading

0 comments on commit 5423cba

Please sign in to comment.