Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

issue-1918: wip #2565

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cloud/filestore/libs/service/auth_scheme.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ TPermissionList GetRequestPermissions(

// Update
perms("draintablets", CreatePermissionList({EPermission::Update})),
perms("restartlocalfilestores", CreatePermissionList({EPermission::Update})),

// Admin
perms("changestorageconfig", TPermissionList().Flip())
Expand Down
1 change: 0 additions & 1 deletion cloud/filestore/libs/storage/service/service_actor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,6 @@ bool TStorageServiceActor::HandleRequests(STFUNC_SIG)
FILESTORE_HANDLE_RESPONSE(name, ns) \

FILESTORE_REMOTE_SERVICE(FILESTORE_HANDLE_REQUEST_RESPONSE, TEvService)
FILESTORE_SERVICE_REQUESTS_PRIVATE(FILESTORE_HANDLE_REQUEST_RESPONSE, TEvServicePrivate)
#undef FILESTORE_HANDLE_REQUEST_RESPONSE

HFunc(NMon::TEvHttpInfo, HandleHttpInfo);
Expand Down
6 changes: 5 additions & 1 deletion cloud/filestore/libs/storage/service/service_actor.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ class TStorageServiceActor final
void HandleHttpInfo(
const NActors::NMon::TEvHttpInfo::TPtr& ev,
const NActors::TActorContext& ctx);

void HandleHttpInfo_Search(
const NActors::NMon::TEvHttpInfo::TPtr& ev,
const TString& filesystemId,
Expand Down Expand Up @@ -101,7 +102,6 @@ class TStorageServiceActor final
const NActors::TActorContext& ctx); \

FILESTORE_REMOTE_SERVICE(FILESTORE_DECLARE_REQUEST_RESPONSE, TEvService)
FILESTORE_SERVICE_REQUESTS_PRIVATE(FILESTORE_DECLARE_REQUEST_RESPONSE, TEvServicePrivate)
#undef FILESTORE_DECLARE_REQUEST_RESPONSE

STFUNC(StateWork);
Expand Down Expand Up @@ -209,6 +209,10 @@ class TStorageServiceActor final
TRequestInfoPtr requestInfo,
TString input);

NActors::IActorPtr CreateRestartLocalFileStoresActionActor(
TRequestInfoPtr requestInfo,
TString input);

private:
void RenderSessions(IOutputStream& out);
void RenderLocalFileStores(IOutputStream& out);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,10 @@ void TStorageServiceActor::HandleExecuteAction(
"getstoragestats",
&TStorageServiceActor::CreateGetStorageStatsActionActor
},
{
"restartlocalfilestores",
&TStorageServiceActor::CreateRestartLocalFileStoresActionActor,
},
};

auto it = actions.find(action);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
#include "service_actor.h"

#include "util/string/join.h"

#include <cloud/filestore/libs/storage/api/service.h>
#include <cloud/filestore/libs/storage/api/tablet.h>
#include <cloud/filestore/libs/storage/api/tablet_proxy.h>
#include <cloud/filestore/libs/storage/core/public.h>
#include <cloud/filestore/private/api/protos/tablet.pb.h>

#include <contrib/ydb/library/actors/core/actor_bootstrapped.h>

#include <library/cpp/random_provider/random_provider.h>

#include <google/protobuf/util/json_util.h>

namespace NCloud::NFileStore::NStorage {

using namespace NActors;

using namespace NKikimr;

namespace {

////////////////////////////////////////////////////////////////////////////////

// Worker actor for the "restart local filestores" action.
//
// On bootstrap it parses the JSON input into TRestartLocalFileStoresRequest,
// seeds a deterministic random provider with the request's Seed and, for every
// candidate filesystem id, flips a coin: selected filesystems get a WaitReady
// request through the index tablet proxy. Every WaitReady response is answered
// with a poison pill sent to the response's sender. Once all responses have
// arrived (or if nothing was selected), the actor replies to the original
// request and dies.
class TRestartLocalFileStoresActionActor final
    : public TActorBootstrapped<TRestartLocalFileStoresActionActor>
{
private:
    // Original request; used to deliver the final reply.
    const TRequestInfoPtr RequestInfo;
    // Raw JSON payload of the action.
    const TString Input;
    // Candidate filesystem ids supplied by the creator. The index of an id in
    // this vector doubles as the cookie of its WaitReady request.
    const TVector<TString> FileSystemIds;
    // Number of WaitReady requests that have not been answered yet.
    ui32 RemainingRestarts = 0;

public:
    TRestartLocalFileStoresActionActor(
            TRequestInfoPtr requestInfo,
            TString input,
            TVector<TString> fileSystemIds)
        : RequestInfo(std::move(requestInfo))
        , Input(std::move(input))
        , FileSystemIds(std::move(fileSystemIds))
    {}

    // Parses the input, picks the filesystems to restart and sends a
    // WaitReady request for each of them. Replies with E_ARGUMENT if the
    // input is not valid JSON for TRestartLocalFileStoresRequest.
    void Bootstrap(const TActorContext& ctx)
    {
        NProtoPrivate::TRestartLocalFileStoresRequest request;
        if (!google::protobuf::util::JsonStringToMessage(Input, &request).ok())
        {
            ReplyAndDie(
                ctx,
                TErrorResponse(E_ARGUMENT, "Failed to parse input"));
            return;
        }

        auto rng = CreateDeterministicRandomProvider(request.GetSeed());

        LOG_INFO(
            ctx,
            TFileStoreComponents::SERVICE_WORKER,
            "Restarting local file stores: seed: %lu",
            request.GetSeed());

        // The cookie advances for every candidate, selected or not, so that it
        // always equals the candidate's index in FileSystemIds.
        ui32 cookie = 0;

        for (const auto& fileSystemId: FileSystemIds) {
            // One random draw per candidate: even values select the filestore.
            if (rng->GenRand() % 2 == 0) {
                auto requestToTablet =
                    std::make_unique<TEvIndexTablet::TEvWaitReadyRequest>();
                requestToTablet->Record.SetFileSystemId(fileSystemId);

                LOG_INFO(
                    ctx,
                    TFileStoreComponents::SERVICE_WORKER,
                    "Sending WaitReady to %s",
                    fileSystemId.c_str());

                NCloud::Send(
                    ctx,
                    MakeIndexTabletProxyServiceId(),
                    std::move(requestToTablet),
                    cookie);
                ++RemainingRestarts;
            }
            ++cookie;
        }

        // Nothing was selected (or there were no candidates): reply right away.
        if (RemainingRestarts == 0) {
            return ReplyAndDie(ctx, {});
        }

        Become(&TThis::StateWork);
    }

private:
    // Sends the action response (error + JSON-serialized `response` as output)
    // to the original requester and terminates this actor.
    void ReplyAndDie(
        const TActorContext& ctx,
        const NProtoPrivate::TRestartLocalFileStoresResponse& response)
    {
        auto msg = std::make_unique<TEvService::TEvExecuteActionResponse>(
            response.GetError());

        google::protobuf::util::MessageToJsonString(
            response,
            msg->Record.MutableOutput());

        NCloud::Reply(ctx, *RequestInfo, std::move(msg));
        Die(ctx);
    }

    STFUNC(StateWork)
    {
        switch (ev->GetTypeRewrite()) {
            HFunc(
                TEvIndexTablet::TEvWaitReadyResponse,
                HandleWaitReadyResponse);

            default:
                HandleUnexpectedEvent(ev, TFileStoreComponents::SERVICE);
                break;
        }
    }

    // Handles one WaitReady response: sends a poison pill to the responder and
    // finishes the action once every pending response has been received.
    //
    // NOTE(review): the error carried by the response is not inspected, so the
    // poison pill goes to whoever sent the response even if WaitReady failed -
    // confirm that the sender is always the tablet itself.
    void HandleWaitReadyResponse(
        const TEvIndexTablet::TEvWaitReadyResponse::TPtr& ev,
        const TActorContext& ctx)
    {
        --RemainingRestarts;

        // The cookie is the index of the filesystem in FileSystemIds.
        LOG_INFO(
            ctx,
            TFileStoreComponents::SERVICE_WORKER,
            "Sending poison pill to %s",
            FileSystemIds.at(ev->Cookie).c_str());
        NCloud::Send(
            ctx,
            ev->Sender,
            std::make_unique<TEvents::TEvPoisonPill>());

        if (RemainingRestarts == 0) {
            ReplyAndDie(ctx, {});
        }
    }
};

////////////////////////////////////////////////////////////////////////////////

} // namespace

// Factory for the "restartlocalfilestores" action: captures the ids of the
// filestores currently reported by State->GetLocalFileStores() and hands them,
// together with the request info and the raw input, to a new
// TRestartLocalFileStoresActionActor.
IActorPtr TStorageServiceActor::CreateRestartLocalFileStoresActionActor(
    TRequestInfoPtr requestInfo,
    TString input)
{
    TVector<TString> localFileSystemIds;

    // Only the keys (filesystem ids) are needed; the mapped values are ignored.
    for (const auto& [fileSystemId, unusedFileStore]:
         State->GetLocalFileStores())
    {
        localFileSystemIds.emplace_back(fileSystemId);
    }

    return std::make_unique<TRestartLocalFileStoresActionActor>(
        std::move(requestInfo),
        std::move(input),
        std::move(localFileSystemIds));
}

} // namespace NCloud::NFileStore::NStorage
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ void TStorageServiceActor::CompleteRequest(
}

FILESTORE_REMOTE_SERVICE(FILESTORE_IMPLEMENT_RESPONSE, TEvService)
FILESTORE_SERVICE_REQUESTS_PRIVATE(FILESTORE_IMPLEMENT_RESPONSE, TEvServicePrivate)

#undef FILESTORE_IMPLEMENT_RESPONSE

Expand Down
11 changes: 0 additions & 11 deletions cloud/filestore/libs/storage/service/service_private.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,6 @@

namespace NCloud::NFileStore::NStorage {

////////////////////////////////////////////////////////////////////////////////

#define FILESTORE_SERVICE_REQUESTS_PRIVATE(xxx, ...) \
// FILESTORE_SERVICE_REQUESTS_PRIVATE

////////////////////////////////////////////////////////////////////////////////

struct TEvServicePrivate
{
//
Expand Down Expand Up @@ -64,8 +57,6 @@ struct TEvServicePrivate
{
EvBegin = TFileStoreEventsPrivate::SERVICE_WORKER_START,

FILESTORE_SERVICE_REQUESTS_PRIVATE(FILESTORE_DECLARE_EVENT_IDS)

EvPingSession,
EvCreateSession,
EvSessionCreated,
Expand All @@ -78,8 +69,6 @@ struct TEvServicePrivate
static_assert(EvEnd < (int)TFileStoreEventsPrivate::SERVICE_WORKER_END,
"EvEnd expected to be < TFileStoreEventsPrivate::SERVICE_WORKER_END");

FILESTORE_SERVICE_REQUESTS_PRIVATE(FILESTORE_DECLARE_EVENTS)

using TEvPingSession = TRequestEvent<TEmpty, EvPingSession>;
using TEvCreateSession = TRequestEvent<TCreateSession, EvCreateSession>;
using TEvSessionCreated = TResponseEvent<TSessionCreated, EvSessionCreated>;
Expand Down
1 change: 1 addition & 0 deletions cloud/filestore/libs/storage/service/ya.make
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ SRCS(
service_actor_actions_reassign_tablet.cpp
service_actor_actions_tablet_ops.cpp
service_actor_actions_write_compaction_map.cpp
service_actor_actions_restart_local_filestores.cpp
service_actor_actions.cpp
service_actor_alterfs.cpp
service_actor_complete.cpp
Expand Down
10 changes: 9 additions & 1 deletion cloud/filestore/libs/storage/tablet/tablet_actor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -994,6 +994,12 @@ STFUNC(TIndexTabletActor::StateZombie)
HFunc(TEvTablet::TEvTabletDead, HandleTabletDead);
HFunc(TEvTabletPipe::TEvServerDisconnected, HandleSessionDisconnected);

// If compaction/cleanup/collectgarbage started before the tablet reload
// and completed during the init state, we should ignore it.
IgnoreFunc(TEvIndexTabletPrivate::TEvCompactionResponse);
IgnoreFunc(TEvIndexTabletPrivate::TEvCleanupResponse);
IgnoreFunc(TEvIndexTabletPrivate::TEvCollectGarbageResponse);

IgnoreFunc(TEvFileStore::TEvUpdateConfig);

// private api
Expand Down Expand Up @@ -1022,7 +1028,9 @@ STFUNC(TIndexTabletActor::StateZombie)
HandleNodeUnlinkedInShard);

default:
HandleUnexpectedEvent(ev, TFileStoreComponents::TABLET);
if (!HandleDefaultEvents(ev, SelfId())) {
HandleUnexpectedEvent(ev, TFileStoreComponents::TABLET);
}
break;
}
}
Expand Down
19 changes: 19 additions & 0 deletions cloud/filestore/private/api/protos/tablet.proto
Original file line number Diff line number Diff line change
Expand Up @@ -700,3 +700,22 @@ message TGetFileSystemTopologyResponse
// Shard FileSystem identifiers.
repeated string ShardFileSystemIds = 2;
}

////////////////////////////////////////////////////////////////////////////////
// RestartLocalFileStores request/response.

// Input of the private "restartlocalfilestores" action (passed as JSON).
message TRestartLocalFileStoresRequest
{
// Optional request headers.
NProto.THeaders Headers = 1;

// Seed for the deterministic random provider that decides, for each local
// filestore, whether it gets restarted.
uint64 Seed = 2;
}


// Output of the private "restartlocalfilestores" action (returned as JSON).
message TRestartLocalFileStoresResponse
{
// Optional error, set only if error happened.
NCloud.NProto.TError Error = 1;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import pytest

import cloud.storage.core.tools.testing.fio.lib as fio

from cloud.filestore.tests.python.lib.common import get_filestore_mount_path


TESTS = fio.generate_index_tests()


@pytest.mark.parametrize("name", TESTS.keys())
def test_fio(name):
    """Run the generated fio index test `name` in its own directory under the
    filestore mount path, failing on any fio error."""
    test_dir = fio.get_dir_name(get_filestore_mount_path(), name)
    fio.run_index_test(test_dir, TESTS[name], fail_on_errors=True)
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# fio index test (test.py) executed with the QEMU/vhost/service-kikimr recipes
# against a filestore with 5 shards while the tablets-restarter recipe is
# active.
PY3TEST()

INCLUDE(${ARCADIA_ROOT}/cloud/filestore/tests/recipes/medium.inc)

DEPENDS(
cloud/storage/core/tools/testing/fio/bin
)

PEERDIR(
cloud/filestore/tests/python/lib
cloud/storage/core/tools/testing/fio/lib
)

TEST_SRCS(
test.py
)

SET(QEMU_VIRTIO fs)
SET(FILESTORE_SHARD_COUNT 5)
SET(
NFS_STORAGE_CONFIG_PATCH
cloud/filestore/tests/loadtest/service-kikimr-newfeatures-test/nfs-storage.txt
)

INCLUDE(${ARCADIA_ROOT}/cloud/filestore/tests/recipes/service-kikimr.inc)
INCLUDE(${ARCADIA_ROOT}/cloud/filestore/tests/recipes/vhost-kikimr.inc)
INCLUDE(${ARCADIA_ROOT}/cloud/filestore/tests/recipes/vhost-endpoint.inc)
INCLUDE(${ARCADIA_ROOT}/cloud/storage/core/tests/recipes/qemu.inc)

# Must be set before including tablets-restarter.inc, which reads it and
# passes it to the recipe as --restart-interval (units are not visible here -
# TODO confirm).
SET(FILESTORE_TABLETS_RESTART_INTERVAL 5)
INCLUDE(${ARCADIA_ROOT}/cloud/filestore/tests/recipes/tablets-restarter.inc)

END()
1 change: 1 addition & 0 deletions cloud/filestore/tests/fio_index/ya.make
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ RECURSE_FOR_TESTS(
mount-kikimr-test
mount-local-test
qemu-kikimr-multishard-nemesis-test
qemu-kikimr-multishard-tablets-restart-test
qemu-kikimr-multishard-test
qemu-kikimr-nemesis-test
qemu-kikimr-test
Expand Down
15 changes: 15 additions & 0 deletions cloud/filestore/tests/recipes/tablets-restarter.inc
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Plugs the tablets-restarter recipe into the including test.
# The including ya.make must SET(FILESTORE_TABLETS_RESTART_INTERVAL ...) before
# this file is included.
DEPENDS(
cloud/filestore/tests/recipes/tablets-restarter
)


# The interval is mandatory: it is forwarded to the recipe as
# --restart-interval, and configuration fails if it is not set.
IF (FILESTORE_TABLETS_RESTART_INTERVAL)
SET(RECIPE_ARGS --restart-interval $FILESTORE_TABLETS_RESTART_INTERVAL)
ELSE()
MESSAGE(FATAL_ERROR FILESTORE_TABLETS_RESTART_INTERVAL should be set for tablets-restarter recipe to work)
ENDIF()

USE_RECIPE(
cloud/filestore/tests/recipes/tablets-restarter/filestore-tablets-restarter
${RECIPE_ARGS}
)
3 changes: 3 additions & 0 deletions cloud/filestore/tests/recipes/tablets-restarter/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
### tablets-restarter

Include this recipe to get a process that regularly restarts random tablets of a filestore using a private `restartlocalfilestores` action.
Loading