From 97c41771bf0edc88f78ee8586cf5b7407d56082d Mon Sep 17 00:00:00 2001 From: Yurii Oleksyshyn Date: Wed, 13 Nov 2024 19:45:48 +0200 Subject: [PATCH 1/6] Work in progress on adding integration test --- cmd/bootstrap/run/epochs.go | 134 ++++++++++ .../cohort2/epoch_recover_from_efm_test.go | 231 ++++++++++++++++++ 2 files changed, 365 insertions(+) diff --git a/cmd/bootstrap/run/epochs.go b/cmd/bootstrap/run/epochs.go index b8e006e01c7..a40bdf9a5fb 100644 --- a/cmd/bootstrap/run/epochs.go +++ b/cmd/bootstrap/run/epochs.go @@ -3,6 +3,7 @@ package run import ( "encoding/hex" "fmt" + "github.com/onflow/crypto" "github.com/rs/zerolog" @@ -172,6 +173,139 @@ func GenerateRecoverEpochTxArgs(log zerolog.Logger, return args, nil } +func GenerateRecoverTxArgsWithDKG(log zerolog.Logger, + internalNodes []bootstrap.NodeInfo, + collectionClusters int, + recoveryEpochCounter uint64, + rootChainID flow.ChainID, + numViewsInStakingAuction uint64, + numViewsInEpoch uint64, + targetDuration uint64, + unsafeAllowOverWrite bool, + dkgIndexMap flow.DKGIndexMap, + dkgParticipantKeys []crypto.PublicKey, + dkgGroupKey crypto.PublicKey, + snapshot *inmem.Snapshot, +) ([]cadence.Value, error) { + epoch := snapshot.Epochs().Current() + + currentEpochIdentities, err := snapshot.Identities(filter.IsValidProtocolParticipant) + if err != nil { + return nil, fmt.Errorf("failed to get valid protocol participants from snapshot: %w", err) + } + // We need canonical ordering here; sanity check to enforce this: + if !currentEpochIdentities.Sorted(flow.Canonical[flow.Identity]) { + return nil, fmt.Errorf("identies from snapshot not in canonical order") + } + + // separate collector nodes by internal and partner nodes + collectors := currentEpochIdentities.Filter(filter.HasRole[flow.Identity](flow.RoleCollection)) + internalCollectors := make(flow.IdentityList, 0) + partnerCollectors := make(flow.IdentityList, 0) + + internalNodesMap := make(map[flow.Identifier]struct{}) + for _, node := range internalNodes { + internalNodesMap[node.NodeID] = struct{}{} + } + + for _, collector := range collectors { + if _, ok := internalNodesMap[collector.NodeID]; ok { + internalCollectors = append(internalCollectors, collector) + } else { + partnerCollectors = append(partnerCollectors, collector) + } + } + + assignments, clusters, err := common.ConstructClusterAssignment(log, partnerCollectors, internalCollectors, collectionClusters) + if err != nil { + return nil, fmt.Errorf("unable to generate cluster assignment: %w", err) + } + + clusterBlocks := GenerateRootClusterBlocks(recoveryEpochCounter, clusters) + clusterQCs := ConstructRootQCsForClusters(log, clusters, internalNodes, clusterBlocks) + + // NOTE: The RecoveryEpoch will re-use the last successful DKG output. This means that the random beacon committee can be + // different from the consensus committee. This could happen if the node was ejected from the consensus committee, but it still has to be + // included in the DKG committee since the threshold signature scheme operates on pre-defined number of participants and cannot be changed. 
+ dkgGroupKeyCdc, cdcErr := cadence.NewString(hex.EncodeToString(dkgGroupKey.Encode())) + if cdcErr != nil { + return nil, fmt.Errorf("failed to get dkg group key cadence string: %w", cdcErr) + } + + // copy DKG index map from the current epoch + dkgIndexMapPairs := make([]cadence.KeyValuePair, 0) + for nodeID, index := range dkgIndexMap { + dkgIndexMapPairs = append(dkgIndexMapPairs, cadence.KeyValuePair{ + Key: cadence.String(nodeID.String()), + Value: cadence.NewInt(index), + }) + } + // copy DKG public keys from the current epoch + dkgPubKeys := make([]cadence.Value, 0) + for _, dkgPubKey := range dkgParticipantKeys { + dkgPubKeyCdc, cdcErr := cadence.NewString(hex.EncodeToString(dkgPubKey.Encode())) + if cdcErr != nil { + return nil, fmt.Errorf("failed to get dkg pub key cadence string for node: %w", cdcErr) + } + dkgPubKeys = append(dkgPubKeys, dkgPubKeyCdc) + } + // fill node IDs + nodeIds := make([]cadence.Value, 0) + for _, id := range currentEpochIdentities { + nodeIdCdc, err := cadence.NewString(id.GetNodeID().String()) + if err != nil { + return nil, fmt.Errorf("failed to convert node ID to cadence string %s: %w", id.GetNodeID(), err) + } + nodeIds = append(nodeIds, nodeIdCdc) + } + + clusterQCAddress := systemcontracts.SystemContractsForChain(rootChainID).ClusterQC.Address.String() + qcVoteData, err := common.ConvertClusterQcsCdc(clusterQCs, clusters, clusterQCAddress) + if err != nil { + return nil, fmt.Errorf("failed to convert cluster qcs to cadence type") + } + currEpochFinalView, err := epoch.FinalView() + if err != nil { + return nil, fmt.Errorf("failed to get final view of current epoch") + } + currEpochTargetEndTime, err := epoch.TargetEndTime() + if err != nil { + return nil, fmt.Errorf("failed to get target end time of current epoch") + } + + args := []cadence.Value{ + // recovery epoch counter + cadence.NewUInt64(recoveryEpochCounter), + // epoch start view + cadence.NewUInt64(currEpochFinalView + 1), + // staking phase end view + cadence.NewUInt64(currEpochFinalView + numViewsInStakingAuction), + // epoch end view + cadence.NewUInt64(currEpochFinalView + numViewsInEpoch), + // target duration + cadence.NewUInt64(targetDuration), + // target end time + cadence.NewUInt64(currEpochTargetEndTime), + // clusters, + common.ConvertClusterAssignmentsCdc(assignments), + // qcVoteData + cadence.NewArray(qcVoteData), + // dkg pub keys + cadence.NewArray(dkgPubKeys), + // dkg group key, + dkgGroupKeyCdc, + // dkg index map + cadence.NewDictionary(dkgIndexMapPairs), + // node ids + cadence.NewArray(nodeIds), + // recover the network by initializing a new recover epoch which will increment the smart contract epoch counter + // or overwrite the epoch metadata for the current epoch + cadence.NewBool(unsafeAllowOverWrite), + } + + return args, nil +} + // ConstructRootQCsForClusters constructs a root QC for each cluster in the list. // Args: // - log: the logger instance. 
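Editor's note: the argument list assembled above is mostly arithmetic on the current epoch's final view plus conversions into Cadence values (hex-encoded keys, an index-map dictionary, a node-ID array). Below is a standalone sketch of that construction using only the github.com/onflow/cadence package; the view numbers and node IDs are placeholders, not values from this patch.

package main

import (
	"fmt"

	"github.com/onflow/cadence"
)

func main() {
	// Placeholder inputs mirroring the recovery-epoch parameters used above.
	currEpochFinalView := uint64(100_000)
	numViewsInStakingAuction := uint64(100)
	numViewsInEpoch := uint64(4_000)

	// The recovery epoch starts one view after the current epoch's final view; the staking
	// phase end and the epoch end are offsets from that same final view.
	startView := cadence.NewUInt64(currEpochFinalView + 1)
	stakingEndView := cadence.NewUInt64(currEpochFinalView + numViewsInStakingAuction)
	endView := cadence.NewUInt64(currEpochFinalView + numViewsInEpoch)

	// The DKG index map becomes a Cadence dictionary keyed by node ID (placeholder IDs here).
	indexMap := map[string]int{"nodeA": 0, "nodeB": 1}
	pairs := make([]cadence.KeyValuePair, 0, len(indexMap))
	for nodeID, index := range indexMap {
		pairs = append(pairs, cadence.KeyValuePair{
			Key:   cadence.String(nodeID),
			Value: cadence.NewInt(index),
		})
	}
	dkgIndexMapCdc := cadence.NewDictionary(pairs)

	fmt.Println(startView, stakingEndView, endView, dkgIndexMapCdc)
}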
diff --git a/integration/tests/epochs/cohort2/epoch_recover_from_efm_test.go b/integration/tests/epochs/cohort2/epoch_recover_from_efm_test.go index 0ddf82ceadc..b28bffb2f46 100644 --- a/integration/tests/epochs/cohort2/epoch_recover_from_efm_test.go +++ b/integration/tests/epochs/cohort2/epoch_recover_from_efm_test.go @@ -1,7 +1,12 @@ package cohort2 import ( + "encoding/hex" "fmt" + "github.com/onflow/crypto" + "github.com/onflow/flow-go/cmd/util/cmd/common" + "github.com/onflow/flow-go/fvm/systemcontracts" + "github.com/rs/zerolog" "strings" "testing" @@ -292,3 +297,229 @@ func (s *RecoverEpochSuite) TestRecoverEpochNodeEjected() { s.AssertInEpoch(s.Ctx, 1) } + +// TestRecoverEpochNodeEjected ensures that the recover epoch governance transaction flow works as expected, and a network that +// enters Epoch Fallback Mode can successfully recover. +// For this specific scenario, we are testing a scenario where the consensus committee is a subset of the DKG committee, i.e., +// a node was ejected between epoch start and submitting the recover epoch transaction. +// This test will do the following: +// 1. Triggers EFM by turning off the sole collection node before the end of the DKG forcing the DKG to fail. +// 2. Generates epoch recover transaction args using the epoch efm-recover-tx-args. +// 3. Eject consensus node by modifying the snapshot before generating the recover epoch transaction args. +// 4. Submit recover epoch transaction. +// 5. Ensure expected EpochRecover event is emitted. +// 6. Ensure the network transitions into the recovery epoch and finalizes the first view of the recovery epoch. +func (s *RecoverEpochSuite) TestRecoverEpochEjectNodeDifferentDKG() { + // 1. Manually trigger EFM + // wait until the epoch setup phase to force network into EFM + s.AwaitEpochPhase(s.Ctx, 0, flow.EpochPhaseSetup, 10*time.Second, 500*time.Millisecond) + + // pause the collection node to trigger EFM by failing DKG + ln := s.GetContainersByRole(flow.RoleCollection)[0] + require.NoError(s.T(), ln.Pause()) + s.AwaitFinalizedView(s.Ctx, s.GetDKGEndView(), 2*time.Minute, 500*time.Millisecond) + // start the paused collection node now that we are in EFM + require.NoError(s.T(), ln.Start()) + + // get final view form the latest snapshot + epoch1FinalView, err := s.Net.BootstrapSnapshot.Epochs().Current().FinalView() + require.NoError(s.T(), err) + + // Wait for at least the first view past the current epoch's original FinalView to be finalized. + // At this point we can observe that an extension has been added to the current epoch, indicating EFM. + s.TimedLogf("waiting for epoch transition (finalized view %d)", epoch1FinalView+1) + s.AwaitFinalizedView(s.Ctx, epoch1FinalView+1, 2*time.Minute, 500*time.Millisecond) + s.TimedLogf("observed finalized view %d", epoch1FinalView+1) + + // assert transition to second epoch did not happen + // if counter is still 0, epoch emergency fallback was triggered as expected + s.AssertInEpoch(s.Ctx, 0) + + // 2. Generate transaction arguments for epoch recover transaction. + collectionClusters := s.NumOfCollectionClusters + recoveryEpochCounter := uint64(1) + + // read internal node info from one of the consensus nodes + internalNodePrivInfoDir, nodeConfigJson := s.getNodeInfoDirs(flow.RoleConsensus) + snapshot := s.GetLatestProtocolSnapshot(s.Ctx).Encodable() + // 3. Eject consensus node by modifying the snapshot before generating the recover epoch transaction args. + snapshot.SealingSegment.LatestProtocolStateEntry().EpochEntry.CurrentEpochIdentityTable. 
+ Filter(filter.HasRole[flow.Identity](flow.RoleConsensus))[0].EpochParticipationStatus = flow.EpochParticipationStatusEjected + + txArgs, err := run.GenerateRecoverEpochTxArgs( + s.Log, + internalNodePrivInfoDir, + nodeConfigJson, + collectionClusters, + recoveryEpochCounter, + flow.Localnet, + s.StakingAuctionLen, + s.EpochLen, + 3000, + false, + inmem.SnapshotFromEncodable(snapshot), + ) + require.NoError(s.T(), err) + + // 4. Submit recover epoch transaction to the network. + env := utils.LocalnetEnv() + result := s.recoverEpoch(env, txArgs) + require.NoError(s.T(), result.Error) + require.Equal(s.T(), result.Status, sdk.TransactionStatusSealed) + + // 5. Ensure expected EpochRecover event is emitted. + eventType := "" + for _, evt := range result.Events { + if strings.Contains(evt.Type, "FlowEpoch.EpochRecover") { + eventType = evt.Type + break + } + } + require.NotEmpty(s.T(), eventType, "expected FlowEpoch.EpochRecover event type") + events, err := s.Client.GetEventsForBlockIDs(s.Ctx, eventType, []sdk.Identifier{result.BlockID}) + require.NoError(s.T(), err) + require.Equal(s.T(), events[0].Events[0].Type, eventType) + + // 6. Ensure the network transitions into the recovery epoch and finalizes the first view of the recovery epoch. + startViewOfNextEpoch := uint64(txArgs[1].(cadence.UInt64)) + s.TimedLogf("waiting to transition into recovery epoch (finalized view %d)", startViewOfNextEpoch) + s.AwaitFinalizedView(s.Ctx, startViewOfNextEpoch, 2*time.Minute, 500*time.Millisecond) + s.TimedLogf("observed finalized first view of recovery epoch %d", startViewOfNextEpoch) + + s.AssertInEpoch(s.Ctx, 1) +} + +// generateRecoverTxArgsWithCustomDKG generates the required transaction arguments for the `recoverEpoch` transaction. +// No errors are expected during normal operation. 
+func GenerateRecoverTxArgsWithDKG(log zerolog.Logger, + internalNodes []bootstrap.NodeInfo, + collectionClusters int, + recoveryEpochCounter uint64, + rootChainID flow.ChainID, + numViewsInStakingAuction uint64, + numViewsInEpoch uint64, + targetDuration uint64, + unsafeAllowOverWrite bool, + dkgIndexMap flow.DKGIndexMap, + dkgParticipantKeys []crypto.PublicKey, + dkgGroupKey crypto.PublicKey, + snapshot *inmem.Snapshot, +) ([]cadence.Value, error) { + epoch := snapshot.Epochs().Current() + + currentEpochIdentities, err := snapshot.Identities(filter.IsValidProtocolParticipant) + if err != nil { + return nil, fmt.Errorf("failed to get valid protocol participants from snapshot: %w", err) + } + // We need canonical ordering here; sanity check to enforce this: + if !currentEpochIdentities.Sorted(flow.Canonical[flow.Identity]) { + return nil, fmt.Errorf("identies from snapshot not in canonical order") + } + + // separate collector nodes by internal and partner nodes + collectors := currentEpochIdentities.Filter(filter.HasRole[flow.Identity](flow.RoleCollection)) + internalCollectors := make(flow.IdentityList, 0) + partnerCollectors := make(flow.IdentityList, 0) + + internalNodesMap := make(map[flow.Identifier]struct{}) + for _, node := range internalNodes { + internalNodesMap[node.NodeID] = struct{}{} + } + + for _, collector := range collectors { + if _, ok := internalNodesMap[collector.NodeID]; ok { + internalCollectors = append(internalCollectors, collector) + } else { + partnerCollectors = append(partnerCollectors, collector) + } + } + + assignments, clusters, err := common.ConstructClusterAssignment(log, partnerCollectors, internalCollectors, collectionClusters) + if err != nil { + return nil, fmt.Errorf("unable to generate cluster assignment: %w", err) + } + + clusterBlocks := run.GenerateRootClusterBlocks(recoveryEpochCounter, clusters) + clusterQCs := run.ConstructRootQCsForClusters(log, clusters, internalNodes, clusterBlocks) + + // NOTE: The RecoveryEpoch will re-use the last successful DKG output. This means that the random beacon committee can be + // different from the consensus committee. This could happen if the node was ejected from the consensus committee, but it still has to be + // included in the DKG committee since the threshold signature scheme operates on pre-defined number of participants and cannot be changed. 
+ dkgGroupKeyCdc, cdcErr := cadence.NewString(hex.EncodeToString(dkgGroupKey.Encode())) + if cdcErr != nil { + return nil, fmt.Errorf("failed to get dkg group key cadence string: %w", cdcErr) + } + + // copy DKG index map from the current epoch + dkgIndexMapPairs := make([]cadence.KeyValuePair, 0) + for nodeID, index := range dkgIndexMap { + dkgIndexMapPairs = append(dkgIndexMapPairs, cadence.KeyValuePair{ + Key: cadence.String(nodeID.String()), + Value: cadence.NewInt(index), + }) + } + // copy DKG public keys from the current epoch + dkgPubKeys := make([]cadence.Value, 0) + for _, dkgPubKey := range dkgParticipantKeys { + dkgPubKeyCdc, cdcErr := cadence.NewString(hex.EncodeToString(dkgPubKey.Encode())) + if cdcErr != nil { + return nil, fmt.Errorf("failed to get dkg pub key cadence string for node: %w", cdcErr) + } + dkgPubKeys = append(dkgPubKeys, dkgPubKeyCdc) + } + // fill node IDs + nodeIds := make([]cadence.Value, 0) + for _, id := range currentEpochIdentities { + nodeIdCdc, err := cadence.NewString(id.GetNodeID().String()) + if err != nil { + return nil, fmt.Errorf("failed to convert node ID to cadence string %s: %w", id.GetNodeID(), err) + } + nodeIds = append(nodeIds, nodeIdCdc) + } + + clusterQCAddress := systemcontracts.SystemContractsForChain(rootChainID).ClusterQC.Address.String() + qcVoteData, err := common.ConvertClusterQcsCdc(clusterQCs, clusters, clusterQCAddress) + if err != nil { + return nil, fmt.Errorf("failed to convert cluster qcs to cadence type") + } + currEpochFinalView, err := epoch.FinalView() + if err != nil { + return nil, fmt.Errorf("failed to get final view of current epoch") + } + currEpochTargetEndTime, err := epoch.TargetEndTime() + if err != nil { + return nil, fmt.Errorf("failed to get target end time of current epoch") + } + + args := []cadence.Value{ + // recovery epoch counter + cadence.NewUInt64(recoveryEpochCounter), + // epoch start view + cadence.NewUInt64(currEpochFinalView + 1), + // staking phase end view + cadence.NewUInt64(currEpochFinalView + numViewsInStakingAuction), + // epoch end view + cadence.NewUInt64(currEpochFinalView + numViewsInEpoch), + // target duration + cadence.NewUInt64(targetDuration), + // target end time + cadence.NewUInt64(currEpochTargetEndTime), + // clusters, + common.ConvertClusterAssignmentsCdc(assignments), + // qcVoteData + cadence.NewArray(qcVoteData), + // dkg pub keys + cadence.NewArray(dkgPubKeys), + // dkg group key, + dkgGroupKeyCdc, + // dkg index map + cadence.NewDictionary(dkgIndexMapPairs), + // node ids + cadence.NewArray(nodeIds), + // recover the network by initializing a new recover epoch which will increment the smart contract epoch counter + // or overwrite the epoch metadata for the current epoch + cadence.NewBool(unsafeAllowOverWrite), + } + + return args, nil +} From a4ee5a15baf128e5d1bb8749a342096d43b35650 Mon Sep 17 00:00:00 2001 From: Yurii Oleksyshyn Date: Thu, 19 Dec 2024 16:41:27 +0200 Subject: [PATCH 2/6] Implemented a test for epoch recovery for the most general case. 
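Editor's note: the first hunk below drops the redundant int conversions around signature.RandomBeaconThreshold, and the test added later in this series relies on the property that an even-sized threshold committee can lose one participant without the threshold changing. The following minimal sketch shows that arithmetic, assuming the threshold is ⌊(n-1)/2⌋ as that remark implies; the authoritative formula is RandomBeaconThreshold in module/signature and should be checked there.

package main

import "fmt"

// threshold mirrors the ⌊(n-1)/2⌋ rule implied by the even-size remark in the test comments;
// this is an assumption here — the real definition is signature.RandomBeaconThreshold.
func threshold(n int) int {
	return (n - 1) / 2
}

func main() {
	// With 4 consensus nodes, pretending one participant never took part leaves the threshold
	// unchanged (1 in both cases), so threshold+1 = 2 shares still reconstruct the group signature.
	for _, n := range []int{4, 3} {
		fmt.Printf("n=%d threshold=%d shares required=%d\n", n, threshold(n), threshold(n)+1)
	}
}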
--- cmd/bootstrap/dkg/dkg.go | 4 +- cmd/bootstrap/run/epochs.go | 195 ++++------------ .../cohort2/epoch_recover_from_efm_test.go | 219 +++++------------- 3 files changed, 103 insertions(+), 315 deletions(-) diff --git a/cmd/bootstrap/dkg/dkg.go b/cmd/bootstrap/dkg/dkg.go index e6dfe1788d4..79bacac26d9 100644 --- a/cmd/bootstrap/dkg/dkg.go +++ b/cmd/bootstrap/dkg/dkg.go @@ -26,8 +26,8 @@ func RandomBeaconKG(n int, seed []byte) (model.ThresholdKeySet, error) { return dkgData, nil } - skShares, pkShares, pkGroup, err := crypto.BLSThresholdKeyGen(int(n), - signature.RandomBeaconThreshold(int(n)), seed) + skShares, pkShares, pkGroup, err := crypto.BLSThresholdKeyGen(n, + signature.RandomBeaconThreshold(n), seed) if err != nil { return model.ThresholdKeySet{}, fmt.Errorf("Beacon KeyGen failed: %w", err) } diff --git a/cmd/bootstrap/run/epochs.go b/cmd/bootstrap/run/epochs.go index f7579b1b7a1..16eb416c408 100644 --- a/cmd/bootstrap/run/epochs.go +++ b/cmd/bootstrap/run/epochs.go @@ -4,7 +4,6 @@ import ( "encoding/hex" "fmt" "github.com/onflow/crypto" - "github.com/rs/zerolog" "github.com/onflow/cadence" @@ -32,6 +31,51 @@ func GenerateRecoverEpochTxArgs(log zerolog.Logger, recoveryEpochTargetDuration uint64, unsafeAllowOverWrite bool, snapshot *inmem.Snapshot, +) ([]cadence.Value, error) { + log.Info().Msg("collecting internal node network and staking keys") + internalNodes, err := common.ReadFullInternalNodeInfos(log, internalNodePrivInfoDir, nodeConfigJson) + if err != nil { + return nil, fmt.Errorf("failed to read full internal node infos: %w", err) + } + + epochProtocolState, err := snapshot.EpochProtocolState() + if err != nil { + return nil, fmt.Errorf("failed to get epoch protocol state from snapshot: %w", err) + } + currentEpochCommit := epochProtocolState.EpochCommit() + + return GenerateRecoverTxArgsWithDKG( + log, + internalNodes, + collectionClusters, + recoveryEpochCounter, + rootChainID, + numViewsInStakingAuction, + numViewsInEpoch, + recoveryEpochTargetDuration, + unsafeAllowOverWrite, + currentEpochCommit.DKGIndexMap, + currentEpochCommit.DKGParticipantKeys, + currentEpochCommit.DKGGroupKey, + snapshot, + ) +} + +// GenerateRecoverTxArgsWithDKG generates the required transaction arguments for the `recoverEpoch` transaction. +// No errors are expected during normal operation. 
+func GenerateRecoverTxArgsWithDKG(log zerolog.Logger, + internalNodes []bootstrap.NodeInfo, + collectionClusters int, + recoveryEpochCounter uint64, + rootChainID flow.ChainID, + numViewsInStakingAuction uint64, + numViewsInEpoch uint64, + recoveryEpochTargetDuration uint64, + unsafeAllowOverWrite bool, + dkgIndexMap flow.DKGIndexMap, + dkgParticipantKeys []crypto.PublicKey, + dkgGroupKey crypto.PublicKey, + snapshot *inmem.Snapshot, ) ([]cadence.Value, error) { epoch := snapshot.Epochs().Current() currentEpochCounter, err := epoch.Counter() @@ -82,10 +126,6 @@ func GenerateRecoverEpochTxArgs(log zerolog.Logger, partnerCollectors := make(flow.IdentityList, 0) log.Info().Msg("collecting internal node network and staking keys") - internalNodes, err := common.ReadFullInternalNodeInfos(log, internalNodePrivInfoDir, nodeConfigJson) - if err != nil { - return nil, fmt.Errorf("failed to read full internal node infos: %w", err) - } internalNodesMap := make(map[flow.Identifier]struct{}) for _, node := range internalNodes { @@ -135,23 +175,17 @@ func GenerateRecoverEpochTxArgs(log zerolog.Logger, // The EFM Recovery State Machine will heuristically reject recovery attempts (specifically reject EpochRecover Service // events, when the intersection between consensus and random beacon committees is too small. - epochProtocolState, err := snapshot.EpochProtocolState() - if err != nil { - return nil, fmt.Errorf("failed to get epoch protocol state from snapshot: %w", err) - } - currentEpochCommit := epochProtocolState.EpochCommit() - // NOTE: The RecoveryEpoch will re-use the last successful DKG output. This means that the random beacon committee can be // different from the consensus committee. This could happen if the node was ejected from the consensus committee, but it still has to be // included in the DKG committee since the threshold signature scheme operates on pre-defined number of participants and cannot be changed. 
- dkgGroupKeyCdc, cdcErr := cadence.NewString(hex.EncodeToString(currentEpochCommit.DKGGroupKey.Encode())) + dkgGroupKeyCdc, cdcErr := cadence.NewString(hex.EncodeToString(dkgGroupKey.Encode())) if cdcErr != nil { return nil, fmt.Errorf("failed to convert Random Beacon group key to cadence representation: %w", cdcErr) } // copy DKG index map from the current epoch dkgIndexMapPairs := make([]cadence.KeyValuePair, 0) - for nodeID, index := range currentEpochCommit.DKGIndexMap { + for nodeID, index := range dkgIndexMap { dkgIndexMapPairs = append(dkgIndexMapPairs, cadence.KeyValuePair{ Key: cadence.String(nodeID.String()), Value: cadence.NewInt(index), @@ -159,7 +193,7 @@ func GenerateRecoverEpochTxArgs(log zerolog.Logger, } // copy DKG public keys from the current epoch dkgPubKeys := make([]cadence.Value, 0) - for k, dkgPubKey := range currentEpochCommit.DKGParticipantKeys { + for k, dkgPubKey := range dkgParticipantKeys { dkgPubKeyCdc, cdcErr := cadence.NewString(hex.EncodeToString(dkgPubKey.Encode())) if cdcErr != nil { return nil, fmt.Errorf("failed convert public beacon key of participant %d to cadence representation: %w", k, cdcErr) @@ -226,139 +260,6 @@ func GenerateRecoverEpochTxArgs(log zerolog.Logger, return args, nil } -func GenerateRecoverTxArgsWithDKG(log zerolog.Logger, - internalNodes []bootstrap.NodeInfo, - collectionClusters int, - recoveryEpochCounter uint64, - rootChainID flow.ChainID, - numViewsInStakingAuction uint64, - numViewsInEpoch uint64, - targetDuration uint64, - unsafeAllowOverWrite bool, - dkgIndexMap flow.DKGIndexMap, - dkgParticipantKeys []crypto.PublicKey, - dkgGroupKey crypto.PublicKey, - snapshot *inmem.Snapshot, -) ([]cadence.Value, error) { - epoch := snapshot.Epochs().Current() - - currentEpochIdentities, err := snapshot.Identities(filter.IsValidProtocolParticipant) - if err != nil { - return nil, fmt.Errorf("failed to get valid protocol participants from snapshot: %w", err) - } - // We need canonical ordering here; sanity check to enforce this: - if !currentEpochIdentities.Sorted(flow.Canonical[flow.Identity]) { - return nil, fmt.Errorf("identies from snapshot not in canonical order") - } - - // separate collector nodes by internal and partner nodes - collectors := currentEpochIdentities.Filter(filter.HasRole[flow.Identity](flow.RoleCollection)) - internalCollectors := make(flow.IdentityList, 0) - partnerCollectors := make(flow.IdentityList, 0) - - internalNodesMap := make(map[flow.Identifier]struct{}) - for _, node := range internalNodes { - internalNodesMap[node.NodeID] = struct{}{} - } - - for _, collector := range collectors { - if _, ok := internalNodesMap[collector.NodeID]; ok { - internalCollectors = append(internalCollectors, collector) - } else { - partnerCollectors = append(partnerCollectors, collector) - } - } - - assignments, clusters, err := common.ConstructClusterAssignment(log, partnerCollectors, internalCollectors, collectionClusters) - if err != nil { - return nil, fmt.Errorf("unable to generate cluster assignment: %w", err) - } - - clusterBlocks := GenerateRootClusterBlocks(recoveryEpochCounter, clusters) - clusterQCs := ConstructRootQCsForClusters(log, clusters, internalNodes, clusterBlocks) - - // NOTE: The RecoveryEpoch will re-use the last successful DKG output. This means that the random beacon committee can be - // different from the consensus committee. 
This could happen if the node was ejected from the consensus committee, but it still has to be - // included in the DKG committee since the threshold signature scheme operates on pre-defined number of participants and cannot be changed. - dkgGroupKeyCdc, cdcErr := cadence.NewString(hex.EncodeToString(dkgGroupKey.Encode())) - if cdcErr != nil { - return nil, fmt.Errorf("failed to get dkg group key cadence string: %w", cdcErr) - } - - // copy DKG index map from the current epoch - dkgIndexMapPairs := make([]cadence.KeyValuePair, 0) - for nodeID, index := range dkgIndexMap { - dkgIndexMapPairs = append(dkgIndexMapPairs, cadence.KeyValuePair{ - Key: cadence.String(nodeID.String()), - Value: cadence.NewInt(index), - }) - } - // copy DKG public keys from the current epoch - dkgPubKeys := make([]cadence.Value, 0) - for _, dkgPubKey := range dkgParticipantKeys { - dkgPubKeyCdc, cdcErr := cadence.NewString(hex.EncodeToString(dkgPubKey.Encode())) - if cdcErr != nil { - return nil, fmt.Errorf("failed to get dkg pub key cadence string for node: %w", cdcErr) - } - dkgPubKeys = append(dkgPubKeys, dkgPubKeyCdc) - } - // fill node IDs - nodeIds := make([]cadence.Value, 0) - for _, id := range currentEpochIdentities { - nodeIdCdc, err := cadence.NewString(id.GetNodeID().String()) - if err != nil { - return nil, fmt.Errorf("failed to convert node ID to cadence string %s: %w", id.GetNodeID(), err) - } - nodeIds = append(nodeIds, nodeIdCdc) - } - - clusterQCAddress := systemcontracts.SystemContractsForChain(rootChainID).ClusterQC.Address.String() - qcVoteData, err := common.ConvertClusterQcsCdc(clusterQCs, clusters, clusterQCAddress) - if err != nil { - return nil, fmt.Errorf("failed to convert cluster qcs to cadence type") - } - currEpochFinalView, err := epoch.FinalView() - if err != nil { - return nil, fmt.Errorf("failed to get final view of current epoch") - } - currEpochTargetEndTime, err := epoch.TargetEndTime() - if err != nil { - return nil, fmt.Errorf("failed to get target end time of current epoch") - } - - args := []cadence.Value{ - // recovery epoch counter - cadence.NewUInt64(recoveryEpochCounter), - // epoch start view - cadence.NewUInt64(currEpochFinalView + 1), - // staking phase end view - cadence.NewUInt64(currEpochFinalView + numViewsInStakingAuction), - // epoch end view - cadence.NewUInt64(currEpochFinalView + numViewsInEpoch), - // target duration - cadence.NewUInt64(targetDuration), - // target end time - cadence.NewUInt64(currEpochTargetEndTime), - // clusters, - common.ConvertClusterAssignmentsCdc(assignments), - // qcVoteData - cadence.NewArray(qcVoteData), - // dkg pub keys - cadence.NewArray(dkgPubKeys), - // dkg group key, - dkgGroupKeyCdc, - // dkg index map - cadence.NewDictionary(dkgIndexMapPairs), - // node ids - cadence.NewArray(nodeIds), - // recover the network by initializing a new recover epoch which will increment the smart contract epoch counter - // or overwrite the epoch metadata for the current epoch - cadence.NewBool(unsafeAllowOverWrite), - } - - return args, nil -} - // ConstructRootQCsForClusters constructs a root QC for each cluster in the list. // Args: // - log: the logger instance. 
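Editor's note: this patch splits the original function into a production wrapper (GenerateRecoverEpochTxArgs, which reads the DKG output from the snapshot's epoch protocol state) and a parameterized core (GenerateRecoverTxArgsWithDKG, which takes the DKG index map, key shares and group key as explicit inputs). The toy sketch below uses hypothetical types, not the flow-go ones, purely to make that injection seam explicit.

package main

import "fmt"

// recoveryDKG stands in for the DKG-related inputs (index map, participant key shares, group key)
// that the production path reads from the snapshot's epoch protocol state.
type recoveryDKG struct {
	indexMap  map[string]int
	keyShares []string
	groupKey  string
}

// generateRecoverArgsWithDKG is the parameterized core: the DKG committee is an explicit input,
// so a test can pass a trimmed or otherwise customized committee.
func generateRecoverArgsWithDKG(recoveryCounter uint64, dkg recoveryDKG) []string {
	args := []string{fmt.Sprintf("counter=%d", recoveryCounter), fmt.Sprintf("groupKey=%s", dkg.groupKey)}
	for nodeID, index := range dkg.indexMap {
		args = append(args, fmt.Sprintf("dkgIndex[%s]=%d", nodeID, index))
	}
	return args
}

// generateRecoverArgs is the production wrapper: it derives the DKG data from the latest
// snapshot (hard-coded here for the sketch) and delegates to the core.
func generateRecoverArgs(recoveryCounter uint64) []string {
	fromSnapshot := recoveryDKG{
		indexMap:  map[string]int{"nodeA": 0, "nodeB": 1},
		keyShares: []string{"pkA", "pkB"},
		groupKey:  "groupPK",
	}
	return generateRecoverArgsWithDKG(recoveryCounter, fromSnapshot)
}

func main() {
	fmt.Println(generateRecoverArgs(1))
	// A test exercising a mismatched committee can drop the last participant before delegating.
	trimmed := recoveryDKG{indexMap: map[string]int{"nodeA": 0}, keyShares: []string{"pkA"}, groupKey: "groupPK"}
	fmt.Println(generateRecoverArgsWithDKG(1, trimmed))
}

The integration test in the next file uses the same seam: it trims the committee read from the snapshot and passes the result to run.GenerateRecoverTxArgsWithDKG directly.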
diff --git a/integration/tests/epochs/cohort2/epoch_recover_from_efm_test.go b/integration/tests/epochs/cohort2/epoch_recover_from_efm_test.go index 56fbf3d40a1..0052b9f52ea 100644 --- a/integration/tests/epochs/cohort2/epoch_recover_from_efm_test.go +++ b/integration/tests/epochs/cohort2/epoch_recover_from_efm_test.go @@ -1,13 +1,9 @@ package cohort2 import ( - "encoding/hex" "fmt" - "github.com/onflow/crypto" "github.com/onflow/flow-go/cmd/util/cmd/common" - "github.com/onflow/flow-go/fvm/systemcontracts" - "github.com/rs/zerolog" - + "github.com/onflow/flow-go/utils/unittest" "strings" "testing" "time" @@ -44,9 +40,9 @@ func (s *RecoverEpochSuite) SetupTest() { s.EpochLen = 150 s.FinalizationSafetyThreshold = 20 s.NumOfCollectionClusters = 1 - // we need to use 3 consensus nodes to be able to eject a single node from the consensus committee - // and still have a Random Beacon committee which meets the protocol.RandomBeaconSafetyThreshold - s.NumOfConsensusNodes = 3 + // we need to use 4 consensus nodes to be able to eject a single node and still have a super-majority and + // have a Random Beacon committee which meets the protocol.RandomBeaconSafetyThreshold. + s.NumOfConsensusNodes = 4 // run the generic setup, which starts up the network s.BaseSuite.SetupTest() @@ -296,21 +292,21 @@ func (s *RecoverEpochSuite) TestRecoverEpochNodeEjected() { s.AssertInEpoch(s.Ctx, 1) } -// TestRecoverEpochNodeEjected ensures that the recover epoch governance transaction flow works as expected, and a network that +// TestRecoverEpochEjectNodeDifferentDKG ensures that the recover epoch governance transaction flow works as expected, and a network that // enters Epoch Fallback Mode can successfully recover. -// For this specific scenario, we are testing a scenario where the consensus committee is a subset of the DKG committee, i.e., -// a node was ejected between epoch start and submitting the recover epoch transaction. +// For this specific scenario, we are testing a scenario where the consensus committee and Random Beacon committee form a symmetric difference with +// cardinality 1. In other words, there is a node which is part of the consensus committee but not part of the Random Beacon committee and +// another node which is part of the Random Beacon committee but not part of the consensus committee. // This test will do the following: // 1. Triggers EFM by turning off the sole collection node before the end of the DKG forcing the DKG to fail. // 2. Generates epoch recover transaction args using the epoch efm-recover-tx-args. // 3. Eject consensus node by modifying the snapshot before generating the recover epoch transaction args. -// 4. Submit recover epoch transaction. -// 5. Ensure expected EpochRecover event is emitted. -// 6. Ensure the network transitions into the recovery epoch and finalizes the first view of the recovery epoch. +// 4. Eject consensus node from the Random Beacon committee by modifying the snapshot before generating the recover epoch transaction args. +// 5. Submit recover epoch transaction. +// 6. Ensure expected EpochRecover event is emitted. +// 7. Ensure the network transitions into the recovery epoch and finalizes the first view of the recovery epoch. func (s *RecoverEpochSuite) TestRecoverEpochEjectNodeDifferentDKG() { // 1. 
Manually trigger EFM - // wait until the epoch setup phase to force network into EFM - s.AwaitEpochPhase(s.Ctx, 0, flow.EpochPhaseSetup, 10*time.Second, 500*time.Millisecond) // pause the collection node to trigger EFM by failing DKG ln := s.GetContainersByRole(flow.RoleCollection)[0] @@ -319,19 +315,21 @@ func (s *RecoverEpochSuite) TestRecoverEpochEjectNodeDifferentDKG() { // start the paused collection node now that we are in EFM require.NoError(s.T(), ln.Start()) - // get final view form the latest snapshot + // get final view from the latest snapshot epoch1FinalView, err := s.Net.BootstrapSnapshot.Epochs().Current().FinalView() require.NoError(s.T(), err) // Wait for at least the first view past the current epoch's original FinalView to be finalized. - // At this point we can observe that an extension has been added to the current epoch, indicating EFM. s.TimedLogf("waiting for epoch transition (finalized view %d)", epoch1FinalView+1) s.AwaitFinalizedView(s.Ctx, epoch1FinalView+1, 2*time.Minute, 500*time.Millisecond) s.TimedLogf("observed finalized view %d", epoch1FinalView+1) - // assert transition to second epoch did not happen - // if counter is still 0, epoch emergency fallback was triggered as expected - s.AssertInEpoch(s.Ctx, 0) + // assert that we are in EFM + snapshot, err := s.Client.GetLatestProtocolSnapshot(s.Ctx) + require.NoError(s.T(), err) + epochPhase, err := snapshot.EpochPhase() + require.NoError(s.T(), err) + require.Equal(s.T(), flow.EpochPhaseFallback, epochPhase, "network must enter EFM by this point") // 2. Generate transaction arguments for epoch recover transaction. collectionClusters := s.NumOfCollectionClusters @@ -339,15 +337,36 @@ func (s *RecoverEpochSuite) TestRecoverEpochEjectNodeDifferentDKG() { // read internal node info from one of the consensus nodes internalNodePrivInfoDir, nodeConfigJson := s.getNodeInfoDirs(flow.RoleConsensus) - snapshot := s.GetLatestProtocolSnapshot(s.Ctx).Encodable() + internalNodes, err := common.ReadFullInternalNodeInfos(unittest.Logger(), internalNodePrivInfoDir, nodeConfigJson) + require.NoError(s.T(), err) // 3. Eject consensus node by modifying the snapshot before generating the recover epoch transaction args. - snapshot.SealingSegment.LatestProtocolStateEntry().EpochEntry.CurrentEpochIdentityTable. - Filter(filter.HasRole[flow.Identity](flow.RoleConsensus))[0].EpochParticipationStatus = flow.EpochParticipationStatusEjected + // By ejecting a node from the consensus committee but keeping it in the Random Beacon committee, we ensure that the there is a node + // which is not part of the consensus committee but is part of the Random Beacon committee. + currentIdentityTable := snapshot.Encodable().SealingSegment.LatestProtocolStateEntry().EpochEntry.CurrentEpochIdentityTable + ejectedIdentity := currentIdentityTable.Filter(filter.HasRole[flow.Identity](flow.RoleConsensus))[0] + ejectedIdentity.EpochParticipationStatus = flow.EpochParticipationStatusEjected + + // 4. Modify DKG data by removing the last node of the consensus committee from DKG committee. We can do this + // by altering the DKG index map and DKG public key shares. + // This way we ensure that consensus committee has a node which is not part of the Random Beacon committee. 
+ randomBeaconParticipants := currentIdentityTable.Filter(filter.HasRole[flow.Identity](flow.RoleConsensus)) + nConsensusNodes := len(randomBeaconParticipants) - 1 + + dkgIndexMap := make(flow.DKGIndexMap, nConsensusNodes) + for i, participant := range randomBeaconParticipants[:nConsensusNodes] { + dkgIndexMap[participant.NodeID] = i + } - txArgs, err := run.GenerateRecoverEpochTxArgs( + epochProtocolState, err := snapshot.EpochProtocolState() + require.NoError(s.T(), err) + dkg, err := epochProtocolState.DKG() + require.NoError(s.T(), err) + + // At this point we have a node which is part of the consensus committee but not part of the Random Beacon committee and + // another node which is part of the Random Beacon committee but not part of the consensus committee. + txArgs, err := run.GenerateRecoverTxArgsWithDKG( s.Log, - internalNodePrivInfoDir, - nodeConfigJson, + internalNodes, collectionClusters, recoveryEpochCounter, flow.Localnet, @@ -355,17 +374,20 @@ func (s *RecoverEpochSuite) TestRecoverEpochEjectNodeDifferentDKG() { s.EpochLen, 3000, false, - inmem.SnapshotFromEncodable(snapshot), + dkgIndexMap, + dkg.KeyShares()[:nConsensusNodes], + dkg.GroupKey(), + snapshot, ) require.NoError(s.T(), err) - // 4. Submit recover epoch transaction to the network. + // 5. Submit recover epoch transaction to the network. env := utils.LocalnetEnv() result := s.recoverEpoch(env, txArgs) require.NoError(s.T(), result.Error) require.Equal(s.T(), result.Status, sdk.TransactionStatusSealed) - // 5. Ensure expected EpochRecover event is emitted. + // 6. Ensure expected EpochRecover event is emitted. eventType := "" for _, evt := range result.Events { if strings.Contains(evt.Type, "FlowEpoch.EpochRecover") { @@ -378,7 +400,7 @@ func (s *RecoverEpochSuite) TestRecoverEpochEjectNodeDifferentDKG() { require.NoError(s.T(), err) require.Equal(s.T(), events[0].Events[0].Type, eventType) - // 6. Ensure the network transitions into the recovery epoch and finalizes the first view of the recovery epoch. + // 7. Ensure the network transitions into the recovery epoch and finalizes the first view of the recovery epoch. startViewOfNextEpoch := uint64(txArgs[1].(cadence.UInt64)) s.TimedLogf("waiting to transition into recovery epoch (finalized view %d)", startViewOfNextEpoch) s.AwaitFinalizedView(s.Ctx, startViewOfNextEpoch, 2*time.Minute, 500*time.Millisecond) @@ -386,138 +408,3 @@ func (s *RecoverEpochSuite) TestRecoverEpochEjectNodeDifferentDKG() { s.AssertInEpoch(s.Ctx, 1) } - -// generateRecoverTxArgsWithCustomDKG generates the required transaction arguments for the `recoverEpoch` transaction. -// No errors are expected during normal operation. 
-func GenerateRecoverTxArgsWithDKG(log zerolog.Logger, - internalNodes []bootstrap.NodeInfo, - collectionClusters int, - recoveryEpochCounter uint64, - rootChainID flow.ChainID, - numViewsInStakingAuction uint64, - numViewsInEpoch uint64, - targetDuration uint64, - unsafeAllowOverWrite bool, - dkgIndexMap flow.DKGIndexMap, - dkgParticipantKeys []crypto.PublicKey, - dkgGroupKey crypto.PublicKey, - snapshot *inmem.Snapshot, -) ([]cadence.Value, error) { - epoch := snapshot.Epochs().Current() - - currentEpochIdentities, err := snapshot.Identities(filter.IsValidProtocolParticipant) - if err != nil { - return nil, fmt.Errorf("failed to get valid protocol participants from snapshot: %w", err) - } - // We need canonical ordering here; sanity check to enforce this: - if !currentEpochIdentities.Sorted(flow.Canonical[flow.Identity]) { - return nil, fmt.Errorf("identies from snapshot not in canonical order") - } - - // separate collector nodes by internal and partner nodes - collectors := currentEpochIdentities.Filter(filter.HasRole[flow.Identity](flow.RoleCollection)) - internalCollectors := make(flow.IdentityList, 0) - partnerCollectors := make(flow.IdentityList, 0) - - internalNodesMap := make(map[flow.Identifier]struct{}) - for _, node := range internalNodes { - internalNodesMap[node.NodeID] = struct{}{} - } - - for _, collector := range collectors { - if _, ok := internalNodesMap[collector.NodeID]; ok { - internalCollectors = append(internalCollectors, collector) - } else { - partnerCollectors = append(partnerCollectors, collector) - } - } - - assignments, clusters, err := common.ConstructClusterAssignment(log, partnerCollectors, internalCollectors, collectionClusters) - if err != nil { - return nil, fmt.Errorf("unable to generate cluster assignment: %w", err) - } - - clusterBlocks := run.GenerateRootClusterBlocks(recoveryEpochCounter, clusters) - clusterQCs := run.ConstructRootQCsForClusters(log, clusters, internalNodes, clusterBlocks) - - // NOTE: The RecoveryEpoch will re-use the last successful DKG output. This means that the random beacon committee can be - // different from the consensus committee. This could happen if the node was ejected from the consensus committee, but it still has to be - // included in the DKG committee since the threshold signature scheme operates on pre-defined number of participants and cannot be changed. 
- dkgGroupKeyCdc, cdcErr := cadence.NewString(hex.EncodeToString(dkgGroupKey.Encode())) - if cdcErr != nil { - return nil, fmt.Errorf("failed to get dkg group key cadence string: %w", cdcErr) - } - - // copy DKG index map from the current epoch - dkgIndexMapPairs := make([]cadence.KeyValuePair, 0) - for nodeID, index := range dkgIndexMap { - dkgIndexMapPairs = append(dkgIndexMapPairs, cadence.KeyValuePair{ - Key: cadence.String(nodeID.String()), - Value: cadence.NewInt(index), - }) - } - // copy DKG public keys from the current epoch - dkgPubKeys := make([]cadence.Value, 0) - for _, dkgPubKey := range dkgParticipantKeys { - dkgPubKeyCdc, cdcErr := cadence.NewString(hex.EncodeToString(dkgPubKey.Encode())) - if cdcErr != nil { - return nil, fmt.Errorf("failed to get dkg pub key cadence string for node: %w", cdcErr) - } - dkgPubKeys = append(dkgPubKeys, dkgPubKeyCdc) - } - // fill node IDs - nodeIds := make([]cadence.Value, 0) - for _, id := range currentEpochIdentities { - nodeIdCdc, err := cadence.NewString(id.GetNodeID().String()) - if err != nil { - return nil, fmt.Errorf("failed to convert node ID to cadence string %s: %w", id.GetNodeID(), err) - } - nodeIds = append(nodeIds, nodeIdCdc) - } - - clusterQCAddress := systemcontracts.SystemContractsForChain(rootChainID).ClusterQC.Address.String() - qcVoteData, err := common.ConvertClusterQcsCdc(clusterQCs, clusters, clusterQCAddress) - if err != nil { - return nil, fmt.Errorf("failed to convert cluster qcs to cadence type") - } - currEpochFinalView, err := epoch.FinalView() - if err != nil { - return nil, fmt.Errorf("failed to get final view of current epoch") - } - currEpochTargetEndTime, err := epoch.TargetEndTime() - if err != nil { - return nil, fmt.Errorf("failed to get target end time of current epoch") - } - - args := []cadence.Value{ - // recovery epoch counter - cadence.NewUInt64(recoveryEpochCounter), - // epoch start view - cadence.NewUInt64(currEpochFinalView + 1), - // staking phase end view - cadence.NewUInt64(currEpochFinalView + numViewsInStakingAuction), - // epoch end view - cadence.NewUInt64(currEpochFinalView + numViewsInEpoch), - // target duration - cadence.NewUInt64(targetDuration), - // target end time - cadence.NewUInt64(currEpochTargetEndTime), - // clusters, - common.ConvertClusterAssignmentsCdc(assignments), - // qcVoteData - cadence.NewArray(qcVoteData), - // dkg pub keys - cadence.NewArray(dkgPubKeys), - // dkg group key, - dkgGroupKeyCdc, - // dkg index map - cadence.NewDictionary(dkgIndexMapPairs), - // node ids - cadence.NewArray(nodeIds), - // recover the network by initializing a new recover epoch which will increment the smart contract epoch counter - // or overwrite the epoch metadata for the current epoch - cadence.NewBool(unsafeAllowOverWrite), - } - - return args, nil -} From eea38469003655f7740cd400f3602daa6c6dbdbc Mon Sep 17 00:00:00 2001 From: Yurii Oleksyshyn Date: Thu, 19 Dec 2024 16:42:54 +0200 Subject: [PATCH 3/6] Linted --- cmd/bootstrap/run/epochs.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/bootstrap/run/epochs.go b/cmd/bootstrap/run/epochs.go index 16eb416c408..f3ead512e4f 100644 --- a/cmd/bootstrap/run/epochs.go +++ b/cmd/bootstrap/run/epochs.go @@ -3,10 +3,10 @@ package run import ( "encoding/hex" "fmt" - "github.com/onflow/crypto" - "github.com/rs/zerolog" "github.com/onflow/cadence" + "github.com/onflow/crypto" + "github.com/rs/zerolog" "github.com/onflow/flow-go/cmd/util/cmd/common" "github.com/onflow/flow-go/fvm/systemcontracts" From 
cbed9833bf9426dc3ffc860c6fc0636bfe5a3959 Mon Sep 17 00:00:00 2001 From: Yurii Oleksyshyn Date: Thu, 19 Dec 2024 17:27:44 +0200 Subject: [PATCH 4/6] Linted --- .../tests/epochs/cohort2/epoch_recover_from_efm_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integration/tests/epochs/cohort2/epoch_recover_from_efm_test.go b/integration/tests/epochs/cohort2/epoch_recover_from_efm_test.go index 0052b9f52ea..3167f67c9f1 100644 --- a/integration/tests/epochs/cohort2/epoch_recover_from_efm_test.go +++ b/integration/tests/epochs/cohort2/epoch_recover_from_efm_test.go @@ -2,8 +2,6 @@ package cohort2 import ( "fmt" - "github.com/onflow/flow-go/cmd/util/cmd/common" - "github.com/onflow/flow-go/utils/unittest" "strings" "testing" "time" @@ -16,11 +14,13 @@ import ( sdk "github.com/onflow/flow-go-sdk" "github.com/onflow/flow-go/cmd/bootstrap/run" + "github.com/onflow/flow-go/cmd/util/cmd/common" "github.com/onflow/flow-go/integration/tests/epochs" "github.com/onflow/flow-go/integration/utils" "github.com/onflow/flow-go/model/bootstrap" "github.com/onflow/flow-go/model/flow" "github.com/onflow/flow-go/model/flow/filter" + "github.com/onflow/flow-go/utils/unittest" ) func TestRecoverEpoch(t *testing.T) { From b83f76a14aae5e58adb0862cc614ef1bc5e5a6de Mon Sep 17 00:00:00 2001 From: Yurii Oleksyshyn Date: Mon, 6 Jan 2025 13:03:05 +0200 Subject: [PATCH 5/6] Apply suggestions from code review Co-authored-by: Alexander Hentschel Co-authored-by: Jordan Schalm --- cmd/bootstrap/run/epochs.go | 1 - .../cohort2/epoch_recover_from_efm_test.go | 28 +++++++++++-------- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/cmd/bootstrap/run/epochs.go b/cmd/bootstrap/run/epochs.go index f3ead512e4f..31070d52655 100644 --- a/cmd/bootstrap/run/epochs.go +++ b/cmd/bootstrap/run/epochs.go @@ -126,7 +126,6 @@ func GenerateRecoverTxArgsWithDKG(log zerolog.Logger, partnerCollectors := make(flow.IdentityList, 0) log.Info().Msg("collecting internal node network and staking keys") - internalNodesMap := make(map[flow.Identifier]struct{}) for _, node := range internalNodes { if !currentEpochIdentities.Exists(node.Identity()) { diff --git a/integration/tests/epochs/cohort2/epoch_recover_from_efm_test.go b/integration/tests/epochs/cohort2/epoch_recover_from_efm_test.go index 3167f67c9f1..05e7428ff10 100644 --- a/integration/tests/epochs/cohort2/epoch_recover_from_efm_test.go +++ b/integration/tests/epochs/cohort2/epoch_recover_from_efm_test.go @@ -299,7 +299,7 @@ func (s *RecoverEpochSuite) TestRecoverEpochNodeEjected() { // another node which is part of the Random Beacon committee but not part of the consensus committee. // This test will do the following: // 1. Triggers EFM by turning off the sole collection node before the end of the DKG forcing the DKG to fail. -// 2. Generates epoch recover transaction args using the epoch efm-recover-tx-args. +// 2. Generates epoch recover transaction args using the tooling [run.GenerateRecoverTxArgsWithDKG], which would also be used for mainnet recovery. // 3. Eject consensus node by modifying the snapshot before generating the recover epoch transaction args. // 4. Eject consensus node from the Random Beacon committee by modifying the snapshot before generating the recover epoch transaction args. // 5. Submit recover epoch transaction. 
@@ -339,20 +339,24 @@ func (s *RecoverEpochSuite) TestRecoverEpochEjectNodeDifferentDKG() { internalNodePrivInfoDir, nodeConfigJson := s.getNodeInfoDirs(flow.RoleConsensus) internalNodes, err := common.ReadFullInternalNodeInfos(unittest.Logger(), internalNodePrivInfoDir, nodeConfigJson) require.NoError(s.T(), err) - // 3. Eject consensus node by modifying the snapshot before generating the recover epoch transaction args. + // 3. Eject the FIRST consensus node by modifying the snapshot before generating the recover epoch transaction args. // By ejecting a node from the consensus committee but keeping it in the Random Beacon committee, we ensure that the there is a node // which is not part of the consensus committee but is part of the Random Beacon committee. currentIdentityTable := snapshot.Encodable().SealingSegment.LatestProtocolStateEntry().EpochEntry.CurrentEpochIdentityTable ejectedIdentity := currentIdentityTable.Filter(filter.HasRole[flow.Identity](flow.RoleConsensus))[0] - ejectedIdentity.EpochParticipationStatus = flow.EpochParticipationStatusEjected - - // 4. Modify DKG data by removing the last node of the consensus committee from DKG committee. We can do this - // by altering the DKG index map and DKG public key shares. - // This way we ensure that consensus committee has a node which is not part of the Random Beacon committee. + ejectedIdentity.EpochParticipationStatus = flow.EpochParticipationStatusEjected // writes through to `currentIdentityTable` + + // 4. Modify DKG data by removing the last node of the consensus committee from DKG committee. This way we ensure that consensus + // committee has a node which is not part of the Random Beacon committee. For threshold committees of *even size*, we can remove a + // single node without changing the threshold (see [ref. 1] for details). In other words, we can just pretend that there was originally + // one node less in the DKG, while the same number of signatures (threshold +1) are sufficient to construct a group signature. + // + // [ref. 1] function `RandomBeaconThreshold` for computing the threshold in package module/signature; note + // that for reconstructing the group sig, _strictly more_ than `threshold` sig shares are required. randomBeaconParticipants := currentIdentityTable.Filter(filter.HasRole[flow.Identity](flow.RoleConsensus)) nConsensusNodes := len(randomBeaconParticipants) - 1 - dkgIndexMap := make(flow.DKGIndexMap, nConsensusNodes) + recoveryDkgIndexMap := make(flow.DKGIndexMap, nConsensusNodes) for i, participant := range randomBeaconParticipants[:nConsensusNodes] { dkgIndexMap[participant.NodeID] = i } @@ -361,6 +365,8 @@ func (s *RecoverEpochSuite) TestRecoverEpochEjectNodeDifferentDKG() { require.NoError(s.T(), err) dkg, err := epochProtocolState.DKG() require.NoError(s.T(), err) + recoveryThresholdKeyShares := dkg.KeyShares()[:nConsensusNodes] + recoveryThresholdGroupKey := dkg.GroupKey() // At this point we have a node which is part of the consensus committee but not part of the Random Beacon committee and // another node which is part of the Random Beacon committee but not part of the consensus committee. 
@@ -374,9 +380,9 @@ func (s *RecoverEpochSuite) TestRecoverEpochEjectNodeDifferentDKG() { s.EpochLen, 3000, false, - dkgIndexMap, - dkg.KeyShares()[:nConsensusNodes], - dkg.GroupKey(), + recoveryDkgIndexMap, + recoveryThresholdKeyShares, + recoveryThresholdGroupKey, snapshot, ) require.NoError(s.T(), err) From 4603a9810c97a43a4c27f7b18900b8afd53af5a8 Mon Sep 17 00:00:00 2001 From: Yurii Oleksyshyn Date: Mon, 6 Jan 2025 13:33:18 +0200 Subject: [PATCH 6/6] Apply suggestions from PR review --- cmd/bootstrap/run/epochs.go | 2 - .../cohort2/epoch_recover_from_efm_test.go | 54 ++++++++++--------- 2 files changed, 30 insertions(+), 26 deletions(-) diff --git a/cmd/bootstrap/run/epochs.go b/cmd/bootstrap/run/epochs.go index 31070d52655..177482c4bb3 100644 --- a/cmd/bootstrap/run/epochs.go +++ b/cmd/bootstrap/run/epochs.go @@ -125,7 +125,6 @@ func GenerateRecoverTxArgsWithDKG(log zerolog.Logger, internalCollectors := make(flow.IdentityList, 0) partnerCollectors := make(flow.IdentityList, 0) - log.Info().Msg("collecting internal node network and staking keys") internalNodesMap := make(map[flow.Identifier]struct{}) for _, node := range internalNodes { if !currentEpochIdentities.Exists(node.Identity()) { @@ -133,7 +132,6 @@ func GenerateRecoverTxArgsWithDKG(log zerolog.Logger, } internalNodesMap[node.NodeID] = struct{}{} } - log.Info().Msg("") for _, collector := range collectors { if _, ok := internalNodesMap[collector.NodeID]; ok { diff --git a/integration/tests/epochs/cohort2/epoch_recover_from_efm_test.go b/integration/tests/epochs/cohort2/epoch_recover_from_efm_test.go index 05e7428ff10..ac311c230cc 100644 --- a/integration/tests/epochs/cohort2/epoch_recover_from_efm_test.go +++ b/integration/tests/epochs/cohort2/epoch_recover_from_efm_test.go @@ -294,19 +294,25 @@ func (s *RecoverEpochSuite) TestRecoverEpochNodeEjected() { // TestRecoverEpochEjectNodeDifferentDKG ensures that the recover epoch governance transaction flow works as expected, and a network that // enters Epoch Fallback Mode can successfully recover. -// For this specific scenario, we are testing a scenario where the consensus committee and Random Beacon committee form a symmetric difference with -// cardinality 1. In other words, there is a node which is part of the consensus committee but not part of the Random Beacon committee and -// another node which is part of the Random Beacon committee but not part of the consensus committee. +// Here, we are testing a scenario where the consensus committee 𝒞 and Random Beacon committee 𝒟 form a symmetric difference with +// cardinality 1. Formally, |𝒞 ∖ 𝒟| = 1 and |𝒟 \ 𝒞| = 1. In other words, there is a node which is part of the consensus committee but not +// part of the Random Beacon committee and another node which is part of the Random Beacon committee but not part of the consensus committee. +// We remove the first consensus node from the Consensus Committee, and the last consensus node from the Random Beacon Committee. For example, +// if the original consensus set is {A, B, C, D} then: +// - the post-recovery consensus committee is {B, C, D} +// - the post-recovery random beacon committee is {A, B, C} +// // This test will do the following: -// 1. Triggers EFM by turning off the sole collection node before the end of the DKG forcing the DKG to fail. -// 2. Generates epoch recover transaction args using the tooling [run.GenerateRecoverTxArgsWithDKG], which would also be used for mainnet recovery. -// 3. 
Eject consensus node by modifying the snapshot before generating the recover epoch transaction args. -// 4. Eject consensus node from the Random Beacon committee by modifying the snapshot before generating the recover epoch transaction args. -// 5. Submit recover epoch transaction. -// 6. Ensure expected EpochRecover event is emitted. -// 7. Ensure the network transitions into the recovery epoch and finalizes the first view of the recovery epoch. +// 1. Triggers EFM by turning off the sole collection node before the end of the DKG forcing the DKG to fail. +// 2. Eject the first consensus node by modifying the epoch snapshot. +// 3. Drop the last consensus node from the Random Beacon committee. This hack works only for threshold systems with an even number of participants, +// without changing the threshold - hence we need to start this test with 4 consensus nodes. +// 4. Generates epoch recover transaction args using the tooling [run.GenerateRecoverTxArgsWithDKG] provided for the governance committee. +// 5. Submit recover epoch transaction. +// 6. Ensure expected EpochRecover event is emitted. +// 7. Ensure the network transitions into the recovery epoch and finalizes the first view of the recovery epoch. func (s *RecoverEpochSuite) TestRecoverEpochEjectNodeDifferentDKG() { - // 1. Manually trigger EFM + // 1. Triggers EFM by turning off the sole collection node before the end of the DKG forcing the DKG to fail. // pause the collection node to trigger EFM by failing DKG ln := s.GetContainersByRole(flow.RoleCollection)[0] @@ -331,22 +337,14 @@ func (s *RecoverEpochSuite) TestRecoverEpochEjectNodeDifferentDKG() { require.NoError(s.T(), err) require.Equal(s.T(), flow.EpochPhaseFallback, epochPhase, "network must enter EFM by this point") - // 2. Generate transaction arguments for epoch recover transaction. - collectionClusters := s.NumOfCollectionClusters - recoveryEpochCounter := uint64(1) - - // read internal node info from one of the consensus nodes - internalNodePrivInfoDir, nodeConfigJson := s.getNodeInfoDirs(flow.RoleConsensus) - internalNodes, err := common.ReadFullInternalNodeInfos(unittest.Logger(), internalNodePrivInfoDir, nodeConfigJson) - require.NoError(s.T(), err) - // 3. Eject the FIRST consensus node by modifying the snapshot before generating the recover epoch transaction args. + // 2. Eject the FIRST consensus node by modifying the snapshot before generating the recover epoch transaction args. // By ejecting a node from the consensus committee but keeping it in the Random Beacon committee, we ensure that the there is a node // which is not part of the consensus committee but is part of the Random Beacon committee. currentIdentityTable := snapshot.Encodable().SealingSegment.LatestProtocolStateEntry().EpochEntry.CurrentEpochIdentityTable ejectedIdentity := currentIdentityTable.Filter(filter.HasRole[flow.Identity](flow.RoleConsensus))[0] - ejectedIdentity.EpochParticipationStatus = flow.EpochParticipationStatusEjected // writes through to `currentIdentityTable` + ejectedIdentity.EpochParticipationStatus = flow.EpochParticipationStatusEjected // writes through to `currentIdentityTable` - // 4. Modify DKG data by removing the last node of the consensus committee from DKG committee. This way we ensure that consensus + // 3. Modify DKG data by removing the last node of the consensus committee from DKG committee. This way we ensure that consensus // committee has a node which is not part of the Random Beacon committee. 
For threshold committees of *even size*, we can remove a // single node without changing the threshold (see [ref. 1] for details). In other words, we can just pretend that there was originally // one node less in the DKG, while the same number of signatures (threshold +1) are sufficient to construct a group signature. @@ -356,9 +354,10 @@ func (s *RecoverEpochSuite) TestRecoverEpochEjectNodeDifferentDKG() { randomBeaconParticipants := currentIdentityTable.Filter(filter.HasRole[flow.Identity](flow.RoleConsensus)) nConsensusNodes := len(randomBeaconParticipants) - 1 + // 4. Generates epoch recover transaction args using the tooling [run.GenerateRecoverTxArgsWithDKG] provided for the governance committee. recoveryDkgIndexMap := make(flow.DKGIndexMap, nConsensusNodes) for i, participant := range randomBeaconParticipants[:nConsensusNodes] { - dkgIndexMap[participant.NodeID] = i + recoveryDkgIndexMap[participant.NodeID] = i } epochProtocolState, err := snapshot.EpochProtocolState() @@ -366,10 +365,17 @@ func (s *RecoverEpochSuite) TestRecoverEpochEjectNodeDifferentDKG() { dkg, err := epochProtocolState.DKG() require.NoError(s.T(), err) recoveryThresholdKeyShares := dkg.KeyShares()[:nConsensusNodes] - recoveryThresholdGroupKey := dkg.GroupKey() + recoveryThresholdGroupKey := dkg.GroupKey() + + // read internal node info from one of the consensus nodes + internalNodePrivInfoDir, nodeConfigJson := s.getNodeInfoDirs(flow.RoleConsensus) + internalNodes, err := common.ReadFullInternalNodeInfos(unittest.Logger(), internalNodePrivInfoDir, nodeConfigJson) + require.NoError(s.T(), err) // At this point we have a node which is part of the consensus committee but not part of the Random Beacon committee and // another node which is part of the Random Beacon committee but not part of the consensus committee. + collectionClusters := s.NumOfCollectionClusters + recoveryEpochCounter := uint64(1) txArgs, err := run.GenerateRecoverTxArgsWithDKG( s.Log, internalNodes,
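Editor's note: as a closing sanity check on the scenario described above (original consensus committee {A, B, C, D}; the first node ejected from consensus, the last node dropped from the Random Beacon committee), the standalone sketch below recomputes the two post-recovery committees, their overlap, and the share count needed under the assumed ⌊(n-1)/2⌋ threshold. Node names are hypothetical single letters, and the actual safety check (protocol.RandomBeaconSafetyThreshold) is defined elsewhere in flow-go.

package main

import "fmt"

func main() {
	consensus := []string{"A", "B", "C", "D"} // original consensus committee

	// Post-recovery committees per the scenario above: eject the first node from the consensus
	// committee, drop the last node from the Random Beacon (DKG) committee.
	recoveredConsensus := consensus[1:]          // {B, C, D}
	randomBeacon := consensus[:len(consensus)-1] // {A, B, C}

	// Overlap between the two committees.
	inBeacon := make(map[string]struct{}, len(randomBeacon))
	for _, n := range randomBeacon {
		inBeacon[n] = struct{}{}
	}
	var overlap []string
	for _, n := range recoveredConsensus {
		if _, ok := inBeacon[n]; ok {
			overlap = append(overlap, n)
		}
	}

	// Under the assumed ⌊(n-1)/2⌋ rule, the 3-node beacon committee has threshold 1, so 2 shares
	// reconstruct the group signature; the overlap {B, C} provides exactly that many.
	threshold := (len(randomBeacon) - 1) / 2
	fmt.Printf("consensus=%v beacon=%v overlap=%v sharesNeeded=%d\n",
		recoveredConsensus, randomBeacon, overlap, threshold+1)
}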