diff --git a/cmd/bootstrap/dkg/dkg.go b/cmd/bootstrap/dkg/dkg.go
index e6dfe1788d4..79bacac26d9 100644
--- a/cmd/bootstrap/dkg/dkg.go
+++ b/cmd/bootstrap/dkg/dkg.go
@@ -26,8 +26,8 @@ func RandomBeaconKG(n int, seed []byte) (model.ThresholdKeySet, error) {
 		return dkgData, nil
 	}
 
-	skShares, pkShares, pkGroup, err := crypto.BLSThresholdKeyGen(int(n),
-		signature.RandomBeaconThreshold(int(n)), seed)
+	skShares, pkShares, pkGroup, err := crypto.BLSThresholdKeyGen(n,
+		signature.RandomBeaconThreshold(n), seed)
 	if err != nil {
 		return model.ThresholdKeySet{}, fmt.Errorf("Beacon KeyGen failed: %w", err)
 	}
diff --git a/cmd/bootstrap/run/epochs.go b/cmd/bootstrap/run/epochs.go
index e242e4cf41b..177482c4bb3 100644
--- a/cmd/bootstrap/run/epochs.go
+++ b/cmd/bootstrap/run/epochs.go
@@ -4,9 +4,9 @@ import (
 	"encoding/hex"
 	"fmt"
 
-	"github.com/rs/zerolog"
-
 	"github.com/onflow/cadence"
+	"github.com/onflow/crypto"
+	"github.com/rs/zerolog"
 
 	"github.com/onflow/flow-go/cmd/util/cmd/common"
 	"github.com/onflow/flow-go/fvm/systemcontracts"
@@ -31,6 +31,51 @@ func GenerateRecoverEpochTxArgs(log zerolog.Logger,
 	recoveryEpochTargetDuration uint64,
 	unsafeAllowOverWrite bool,
 	snapshot *inmem.Snapshot,
+) ([]cadence.Value, error) {
+	log.Info().Msg("collecting internal node network and staking keys")
+	internalNodes, err := common.ReadFullInternalNodeInfos(log, internalNodePrivInfoDir, nodeConfigJson)
+	if err != nil {
+		return nil, fmt.Errorf("failed to read full internal node infos: %w", err)
+	}
+
+	epochProtocolState, err := snapshot.EpochProtocolState()
+	if err != nil {
+		return nil, fmt.Errorf("failed to get epoch protocol state from snapshot: %w", err)
+	}
+	currentEpochCommit := epochProtocolState.EpochCommit()
+
+	return GenerateRecoverTxArgsWithDKG(
+		log,
+		internalNodes,
+		collectionClusters,
+		recoveryEpochCounter,
+		rootChainID,
+		numViewsInStakingAuction,
+		numViewsInEpoch,
+		recoveryEpochTargetDuration,
+		unsafeAllowOverWrite,
+		currentEpochCommit.DKGIndexMap,
+		currentEpochCommit.DKGParticipantKeys,
+		currentEpochCommit.DKGGroupKey,
+		snapshot,
+	)
+}
+
+// GenerateRecoverTxArgsWithDKG generates the required transaction arguments for the `recoverEpoch` transaction.
+// No errors are expected during normal operation.
+func GenerateRecoverTxArgsWithDKG(log zerolog.Logger,
+	internalNodes []bootstrap.NodeInfo,
+	collectionClusters int,
+	recoveryEpochCounter uint64,
+	rootChainID flow.ChainID,
+	numViewsInStakingAuction uint64,
+	numViewsInEpoch uint64,
+	recoveryEpochTargetDuration uint64,
+	unsafeAllowOverWrite bool,
+	dkgIndexMap flow.DKGIndexMap,
+	dkgParticipantKeys []crypto.PublicKey,
+	dkgGroupKey crypto.PublicKey,
+	snapshot *inmem.Snapshot,
 ) ([]cadence.Value, error) {
 	epoch := snapshot.Epochs().Current()
 	currentEpochCounter, err := epoch.Counter()
@@ -80,12 +125,6 @@ func GenerateRecoverEpochTxArgs(log zerolog.Logger,
 	internalCollectors := make(flow.IdentityList, 0)
 	partnerCollectors := make(flow.IdentityList, 0)
 
-	log.Info().Msg("collecting internal node network and staking keys")
-	internalNodes, err := common.ReadFullInternalNodeInfos(log, internalNodePrivInfoDir, nodeConfigJson)
-	if err != nil {
-		return nil, fmt.Errorf("failed to read full internal node infos: %w", err)
-	}
-
 	internalNodesMap := make(map[flow.Identifier]struct{})
 	for _, node := range internalNodes {
 		if !currentEpochIdentities.Exists(node.Identity()) {
@@ -93,7 +132,6 @@ func GenerateRecoverEpochTxArgs(log zerolog.Logger,
 		}
 		internalNodesMap[node.NodeID] = struct{}{}
 	}
-	log.Info().Msg("")
 
 	for _, collector := range collectors {
 		if _, ok := internalNodesMap[collector.NodeID]; ok {
@@ -134,23 +172,17 @@ func GenerateRecoverEpochTxArgs(log zerolog.Logger,
 	// The EFM Recovery State Machine will heuristically reject recovery attempts (specifically reject EpochRecover Service
 	// events, when the intersection between consensus and random beacon committees is too small.
 
-	epochProtocolState, err := snapshot.EpochProtocolState()
-	if err != nil {
-		return nil, fmt.Errorf("failed to get epoch protocol state from snapshot: %w", err)
-	}
-	currentEpochCommit := epochProtocolState.EpochCommit()
-
 	// NOTE: The RecoveryEpoch will re-use the last successful DKG output. This means that the random beacon committee can be
 	// different from the consensus committee. This could happen if the node was ejected from the consensus committee, but it still has to be
 	// included in the DKG committee since the threshold signature scheme operates on pre-defined number of participants and cannot be changed.
-	dkgGroupKeyCdc, cdcErr := cadence.NewString(hex.EncodeToString(currentEpochCommit.DKGGroupKey.Encode()))
+	dkgGroupKeyCdc, cdcErr := cadence.NewString(hex.EncodeToString(dkgGroupKey.Encode()))
 	if cdcErr != nil {
 		return nil, fmt.Errorf("failed to convert Random Beacon group key to cadence representation: %w", cdcErr)
 	}
 
 	// copy DKG index map from the current epoch
 	dkgIndexMapPairs := make([]cadence.KeyValuePair, 0)
-	for nodeID, index := range currentEpochCommit.DKGIndexMap {
+	for nodeID, index := range dkgIndexMap {
 		dkgIndexMapPairs = append(dkgIndexMapPairs, cadence.KeyValuePair{
 			Key:   cadence.String(nodeID.String()),
 			Value: cadence.NewInt(index),
@@ -158,7 +190,7 @@
 	}
 	// copy DKG public keys from the current epoch
 	dkgPubKeys := make([]cadence.Value, 0)
-	for k, dkgPubKey := range currentEpochCommit.DKGParticipantKeys {
+	for k, dkgPubKey := range dkgParticipantKeys {
 		dkgPubKeyCdc, cdcErr := cadence.NewString(hex.EncodeToString(dkgPubKey.Encode()))
 		if cdcErr != nil {
 			return nil, fmt.Errorf("failed convert public beacon key of participant %d to cadence representation: %w", k, cdcErr)
diff --git a/integration/tests/epochs/cohort2/epoch_recover_from_efm_test.go b/integration/tests/epochs/cohort2/epoch_recover_from_efm_test.go
index 385c42c5c88..ac311c230cc 100644
--- a/integration/tests/epochs/cohort2/epoch_recover_from_efm_test.go
+++ b/integration/tests/epochs/cohort2/epoch_recover_from_efm_test.go
@@ -15,11 +15,13 @@ import (
 	sdk "github.com/onflow/flow-go-sdk"
 
 	"github.com/onflow/flow-go/cmd/bootstrap/run"
+	"github.com/onflow/flow-go/cmd/util/cmd/common"
 	"github.com/onflow/flow-go/integration/tests/epochs"
 	"github.com/onflow/flow-go/integration/utils"
 	"github.com/onflow/flow-go/model/bootstrap"
 	"github.com/onflow/flow-go/model/flow"
 	"github.com/onflow/flow-go/model/flow/filter"
+	"github.com/onflow/flow-go/utils/unittest"
 )
 
 func TestRecoverEpoch(t *testing.T) {
@@ -39,9 +41,9 @@ func (s *RecoverEpochSuite) SetupTest() {
 	s.EpochLen = 150
 	s.FinalizationSafetyThreshold = 20
 	s.NumOfCollectionClusters = 1
-	// we need to use 3 consensus nodes to be able to eject a single node from the consensus committee
-	// and still have a Random Beacon committee which meets the protocol.RandomBeaconSafetyThreshold
-	s.NumOfConsensusNodes = 3
+	// we need 4 consensus nodes so that we can eject a single node and still have both a super-majority and
+	// a Random Beacon committee which meets the protocol.RandomBeaconSafetyThreshold
+	s.NumOfConsensusNodes = 4
 
 	// run the generic setup, which starts up the network
 	s.BaseSuite.SetupTest()
@@ -290,3 +292,132 @@ func (s *RecoverEpochSuite) TestRecoverEpochNodeEjected() {
 
 	s.AssertInEpoch(s.Ctx, 1)
 }
+
+// TestRecoverEpochEjectNodeDifferentDKG ensures that the recover epoch governance transaction flow works as expected, and that a network
+// which enters Epoch Fallback Mode can successfully recover.
+// Here, we are testing a scenario where the consensus committee 𝒞 and the Random Beacon committee 𝒟 each contain one node that the other
+// does not. Formally, |𝒞 ∖ 𝒟| = 1 and |𝒟 ∖ 𝒞| = 1. In other words, there is a node which is part of the consensus committee but not
+// part of the Random Beacon committee, and another node which is part of the Random Beacon committee but not part of the consensus committee.
+// We remove the first consensus node from the Consensus Committee, and the last consensus node from the Random Beacon Committee.
+// For example, if the original consensus set is {A, B, C, D} then:
+//   - the post-recovery consensus committee is {B, C, D}
+//   - the post-recovery random beacon committee is {A, B, C}
+//
+// This test will do the following:
+//  1. Trigger EFM by turning off the sole collection node before the end of the DKG, forcing the DKG to fail.
+//  2. Eject the first consensus node by modifying the epoch snapshot.
+//  3. Drop the last consensus node from the Random Beacon committee. Without changing the threshold, this hack works only for
+//     threshold systems with an even number of participants - hence we need to start this test with 4 consensus nodes.
+//  4. Generate the epoch recover transaction args using the tooling [run.GenerateRecoverTxArgsWithDKG] provided for the governance committee.
+//  5. Submit the recover epoch transaction.
+//  6. Ensure the expected EpochRecover event is emitted.
+//  7. Ensure the network transitions into the recovery epoch and finalizes the first view of the recovery epoch.
+func (s *RecoverEpochSuite) TestRecoverEpochEjectNodeDifferentDKG() {
+	// 1. Trigger EFM by turning off the sole collection node before the end of the DKG, forcing the DKG to fail.
+
+	// pause the collection node to trigger EFM by failing DKG
+	ln := s.GetContainersByRole(flow.RoleCollection)[0]
+	require.NoError(s.T(), ln.Pause())
+	s.AwaitFinalizedView(s.Ctx, s.GetDKGEndView(), 2*time.Minute, 500*time.Millisecond)
+	// start the paused collection node now that we are in EFM
+	require.NoError(s.T(), ln.Start())
+
+	// get the final view of the current epoch from the bootstrap snapshot
+	epoch1FinalView, err := s.Net.BootstrapSnapshot.Epochs().Current().FinalView()
+	require.NoError(s.T(), err)
+
+	// Wait for at least the first view past the current epoch's original FinalView to be finalized.
+	s.TimedLogf("waiting for epoch transition (finalized view %d)", epoch1FinalView+1)
+	s.AwaitFinalizedView(s.Ctx, epoch1FinalView+1, 2*time.Minute, 500*time.Millisecond)
+	s.TimedLogf("observed finalized view %d", epoch1FinalView+1)
+
+	// assert that we are in EFM
+	snapshot, err := s.Client.GetLatestProtocolSnapshot(s.Ctx)
+	require.NoError(s.T(), err)
+	epochPhase, err := snapshot.EpochPhase()
+	require.NoError(s.T(), err)
+	require.Equal(s.T(), flow.EpochPhaseFallback, epochPhase, "network must enter EFM by this point")
+
+	// 2. Eject the FIRST consensus node by modifying the snapshot before generating the recover epoch transaction args.
+	// By ejecting a node from the consensus committee but keeping it in the Random Beacon committee, we ensure that there is a node
+	// which is not part of the consensus committee but is part of the Random Beacon committee.
+	currentIdentityTable := snapshot.Encodable().SealingSegment.LatestProtocolStateEntry().EpochEntry.CurrentEpochIdentityTable
+	ejectedIdentity := currentIdentityTable.Filter(filter.HasRole[flow.Identity](flow.RoleConsensus))[0]
+	ejectedIdentity.EpochParticipationStatus = flow.EpochParticipationStatusEjected // writes through to `currentIdentityTable`
+
+	// 3. Modify the DKG data by removing the last node of the consensus committee from the DKG committee. This way we ensure that the
+	// consensus committee has a node which is not part of the Random Beacon committee. For threshold committees of *even size*, we can
+	// remove a single node without changing the threshold (see [ref. 1] for details). In other words, we can just pretend that there was
+	// originally one node less in the DKG, while the same number of signatures (threshold + 1) suffices to construct a group signature.
+	//
+	// [ref. 1] see the function `RandomBeaconThreshold` in package module/signature, which computes the threshold; note
+	// that for reconstructing the group sig, _strictly more_ than `threshold` sig shares are required.
+	randomBeaconParticipants := currentIdentityTable.Filter(filter.HasRole[flow.Identity](flow.RoleConsensus))
+	nConsensusNodes := len(randomBeaconParticipants) - 1
+
+	// 4. Generate the epoch recover transaction args using the tooling [run.GenerateRecoverTxArgsWithDKG] provided for the governance committee.
+	recoveryDkgIndexMap := make(flow.DKGIndexMap, nConsensusNodes)
+	for i, participant := range randomBeaconParticipants[:nConsensusNodes] {
+		recoveryDkgIndexMap[participant.NodeID] = i
+	}
+
+	epochProtocolState, err := snapshot.EpochProtocolState()
+	require.NoError(s.T(), err)
+	dkg, err := epochProtocolState.DKG()
+	require.NoError(s.T(), err)
+	recoveryThresholdKeyShares := dkg.KeyShares()[:nConsensusNodes]
+	recoveryThresholdGroupKey := dkg.GroupKey()
+
+	// read internal node info from one of the consensus nodes
+	internalNodePrivInfoDir, nodeConfigJson := s.getNodeInfoDirs(flow.RoleConsensus)
+	internalNodes, err := common.ReadFullInternalNodeInfos(unittest.Logger(), internalNodePrivInfoDir, nodeConfigJson)
+	require.NoError(s.T(), err)
+
+	// At this point we have a node which is part of the consensus committee but not part of the Random Beacon committee and
+	// another node which is part of the Random Beacon committee but not part of the consensus committee.
+	collectionClusters := s.NumOfCollectionClusters
+	recoveryEpochCounter := uint64(1)
+	txArgs, err := run.GenerateRecoverTxArgsWithDKG(
+		s.Log,
+		internalNodes,
+		collectionClusters,
+		recoveryEpochCounter,
+		flow.Localnet,
+		s.StakingAuctionLen,
+		s.EpochLen,
+		3000,
+		false,
+		recoveryDkgIndexMap,
+		recoveryThresholdKeyShares,
+		recoveryThresholdGroupKey,
+		snapshot,
+	)
+	require.NoError(s.T(), err)
+
+	// 5. Submit the recover epoch transaction to the network.
+	env := utils.LocalnetEnv()
+	result := s.recoverEpoch(env, txArgs)
+	require.NoError(s.T(), result.Error)
+	require.Equal(s.T(), result.Status, sdk.TransactionStatusSealed)
+
+	// 6. Ensure the expected EpochRecover event is emitted.
+	eventType := ""
+	for _, evt := range result.Events {
+		if strings.Contains(evt.Type, "FlowEpoch.EpochRecover") {
+			eventType = evt.Type
+			break
+		}
+	}
+	require.NotEmpty(s.T(), eventType, "expected FlowEpoch.EpochRecover event type")
+	events, err := s.Client.GetEventsForBlockIDs(s.Ctx, eventType, []sdk.Identifier{result.BlockID})
+	require.NoError(s.T(), err)
+	require.Equal(s.T(), events[0].Events[0].Type, eventType)
+
+	// 7. Ensure the network transitions into the recovery epoch and finalizes the first view of the recovery epoch.
+	startViewOfNextEpoch := uint64(txArgs[1].(cadence.UInt64))
+	s.TimedLogf("waiting to transition into recovery epoch (finalized view %d)", startViewOfNextEpoch)
+	s.AwaitFinalizedView(s.Ctx, startViewOfNextEpoch, 2*time.Minute, 500*time.Millisecond)
+	s.TimedLogf("observed finalized first view of recovery epoch %d", startViewOfNextEpoch)
+
+	s.AssertInEpoch(s.Ctx, 1)
+}
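The step-3 comment above relies on how the Random Beacon threshold scales with committee size: for an even-sized committee, dropping one participant leaves the threshold unchanged. Below is a minimal, self-contained sketch of that arithmetic; it assumes the common t = (n-1)/2 rule, and the helper is illustrative only, not the actual `module/signature.RandomBeaconThreshold` implementation.

```go
package main

import "fmt"

// randomBeaconThreshold sketches the assumed threshold rule t = (n-1)/2 (integer division).
// Reconstructing a group signature requires strictly more than t shares, i.e. t+1.
func randomBeaconThreshold(n int) int {
	return (n - 1) / 2
}

func main() {
	for _, n := range []int{4, 3} {
		t := randomBeaconThreshold(n)
		fmt.Printf("committee size %d: threshold %d, shares required %d\n", n, t, t+1)
	}
	// committee size 4: threshold 1, shares required 2
	// committee size 3: threshold 1, shares required 2
	//
	// Under this assumed rule, shrinking the DKG committee from 4 to 3 keeps both numbers
	// unchanged, so the recovery epoch can pretend one participant never existed without
	// re-running the DKG. Starting from an odd size (e.g. 5 -> 4) would change the implied
	// threshold, so the truncated committee would no longer match the keys produced by the
	// original DKG - which is why the test uses 4 consensus nodes.
}
```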
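Step 6 matches the EpochRecover event with `strings.Contains` because emitted Flow event types are fully qualified with the address of the account that deployed the contract (in the form `A.<address>.FlowEpoch.EpochRecover`), and that address differs between networks. A small sketch of an equivalent, slightly stricter suffix match; the helper name and the sample address are illustrative, not taken from the diff.

```go
package main

import (
	"fmt"
	"strings"
)

// isEpochRecoverEvent reports whether a fully-qualified Flow event type refers to
// FlowEpoch.EpochRecover, regardless of the address the FlowEpoch contract is deployed at.
func isEpochRecoverEvent(eventType string) bool {
	return strings.HasSuffix(eventType, ".FlowEpoch.EpochRecover")
}

func main() {
	// Hypothetical qualified event types; the address portion depends on the network under test.
	fmt.Println(isEpochRecoverEvent("A.f8d6e0586b0a20c7.FlowEpoch.EpochRecover")) // true
	fmt.Println(isEpochRecoverEvent("A.f8d6e0586b0a20c7.FlowEpoch.EpochCommit"))  // false
}
```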