Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[BFT] Epoch Recovery integration test #6823

Open
wants to merge 6 commits into
base: feature/efm-recovery
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cmd/bootstrap/dkg/dkg.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ func RandomBeaconKG(n int, seed []byte) (model.ThresholdKeySet, error) {
return dkgData, nil
}

skShares, pkShares, pkGroup, err := crypto.BLSThresholdKeyGen(int(n),
signature.RandomBeaconThreshold(int(n)), seed)
skShares, pkShares, pkGroup, err := crypto.BLSThresholdKeyGen(n,
signature.RandomBeaconThreshold(n), seed)
if err != nil {
return model.ThresholdKeySet{}, fmt.Errorf("Beacon KeyGen failed: %w", err)
}
Expand Down
65 changes: 50 additions & 15 deletions cmd/bootstrap/run/epochs.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@ import (
"encoding/hex"
"fmt"

"github.com/rs/zerolog"

"github.com/onflow/cadence"
"github.com/onflow/crypto"
"github.com/rs/zerolog"

"github.com/onflow/flow-go/cmd/util/cmd/common"
"github.com/onflow/flow-go/fvm/systemcontracts"
Expand All @@ -31,6 +31,51 @@ func GenerateRecoverEpochTxArgs(log zerolog.Logger,
recoveryEpochTargetDuration uint64,
unsafeAllowOverWrite bool,
snapshot *inmem.Snapshot,
) ([]cadence.Value, error) {
log.Info().Msg("collecting internal node network and staking keys")
internalNodes, err := common.ReadFullInternalNodeInfos(log, internalNodePrivInfoDir, nodeConfigJson)
if err != nil {
return nil, fmt.Errorf("failed to read full internal node infos: %w", err)
}

epochProtocolState, err := snapshot.EpochProtocolState()
if err != nil {
return nil, fmt.Errorf("failed to get epoch protocol state from snapshot: %w", err)
}
currentEpochCommit := epochProtocolState.EpochCommit()

return GenerateRecoverTxArgsWithDKG(
log,
internalNodes,
collectionClusters,
recoveryEpochCounter,
rootChainID,
numViewsInStakingAuction,
numViewsInEpoch,
recoveryEpochTargetDuration,
unsafeAllowOverWrite,
currentEpochCommit.DKGIndexMap,
currentEpochCommit.DKGParticipantKeys,
currentEpochCommit.DKGGroupKey,
snapshot,
)
}

// GenerateRecoverTxArgsWithDKG generates the required transaction arguments for the `recoverEpoch` transaction.
// No errors are expected during normal operation.
func GenerateRecoverTxArgsWithDKG(log zerolog.Logger,
internalNodes []bootstrap.NodeInfo,
collectionClusters int,
recoveryEpochCounter uint64,
rootChainID flow.ChainID,
numViewsInStakingAuction uint64,
numViewsInEpoch uint64,
recoveryEpochTargetDuration uint64,
unsafeAllowOverWrite bool,
dkgIndexMap flow.DKGIndexMap,
dkgParticipantKeys []crypto.PublicKey,
dkgGroupKey crypto.PublicKey,
snapshot *inmem.Snapshot,
) ([]cadence.Value, error) {
epoch := snapshot.Epochs().Current()
currentEpochCounter, err := epoch.Counter()
Expand Down Expand Up @@ -81,10 +126,6 @@ func GenerateRecoverEpochTxArgs(log zerolog.Logger,
partnerCollectors := make(flow.IdentityList, 0)

log.Info().Msg("collecting internal node network and staking keys")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would suggest to remove this progress message (and the corresponding one in line 137), because the step of reading from disk (which could potentially fail) has been moved elsewhere.

internalNodes, err := common.ReadFullInternalNodeInfos(log, internalNodePrivInfoDir, nodeConfigJson)
if err != nil {
return nil, fmt.Errorf("failed to read full internal node infos: %w", err)
}

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change

internalNodesMap := make(map[flow.Identifier]struct{})
for _, node := range internalNodes {
Expand Down Expand Up @@ -134,31 +175,25 @@ func GenerateRecoverEpochTxArgs(log zerolog.Logger,
// The EFM Recovery State Machine will heuristically reject recovery attempts (specifically reject EpochRecover Service
// events, when the intersection between consensus and random beacon committees is too small.

epochProtocolState, err := snapshot.EpochProtocolState()
if err != nil {
return nil, fmt.Errorf("failed to get epoch protocol state from snapshot: %w", err)
}
currentEpochCommit := epochProtocolState.EpochCommit()

// NOTE: The RecoveryEpoch will re-use the last successful DKG output. This means that the random beacon committee can be
// different from the consensus committee. This could happen if the node was ejected from the consensus committee, but it still has to be
// included in the DKG committee since the threshold signature scheme operates on pre-defined number of participants and cannot be changed.
dkgGroupKeyCdc, cdcErr := cadence.NewString(hex.EncodeToString(currentEpochCommit.DKGGroupKey.Encode()))
dkgGroupKeyCdc, cdcErr := cadence.NewString(hex.EncodeToString(dkgGroupKey.Encode()))
if cdcErr != nil {
return nil, fmt.Errorf("failed to convert Random Beacon group key to cadence representation: %w", cdcErr)
}

// copy DKG index map from the current epoch
dkgIndexMapPairs := make([]cadence.KeyValuePair, 0)
for nodeID, index := range currentEpochCommit.DKGIndexMap {
for nodeID, index := range dkgIndexMap {
dkgIndexMapPairs = append(dkgIndexMapPairs, cadence.KeyValuePair{
Key: cadence.String(nodeID.String()),
Value: cadence.NewInt(index),
})
}
// copy DKG public keys from the current epoch
dkgPubKeys := make([]cadence.Value, 0)
for k, dkgPubKey := range currentEpochCommit.DKGParticipantKeys {
for k, dkgPubKey := range dkgParticipantKeys {
dkgPubKeyCdc, cdcErr := cadence.NewString(hex.EncodeToString(dkgPubKey.Encode()))
if cdcErr != nil {
return nil, fmt.Errorf("failed convert public beacon key of participant %d to cadence representation: %w", k, cdcErr)
Expand Down
126 changes: 122 additions & 4 deletions integration/tests/epochs/cohort2/epoch_recover_from_efm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package cohort2

import (
"fmt"

"strings"
"testing"
"time"
Expand All @@ -15,11 +14,13 @@ import (
sdk "github.com/onflow/flow-go-sdk"

"github.com/onflow/flow-go/cmd/bootstrap/run"
"github.com/onflow/flow-go/cmd/util/cmd/common"
"github.com/onflow/flow-go/integration/tests/epochs"
"github.com/onflow/flow-go/integration/utils"
"github.com/onflow/flow-go/model/bootstrap"
"github.com/onflow/flow-go/model/flow"
"github.com/onflow/flow-go/model/flow/filter"
"github.com/onflow/flow-go/utils/unittest"
)

func TestRecoverEpoch(t *testing.T) {
Expand All @@ -39,9 +40,9 @@ func (s *RecoverEpochSuite) SetupTest() {
s.EpochLen = 150
s.FinalizationSafetyThreshold = 20
s.NumOfCollectionClusters = 1
// we need to use 3 consensus nodes to be able to eject a single node from the consensus committee
// and still have a Random Beacon committee which meets the protocol.RandomBeaconSafetyThreshold
s.NumOfConsensusNodes = 3
// we need to use 4 consensus nodes to be able to eject a single node and still have a super-majority and
// have a Random Beacon committee which meets the protocol.RandomBeaconSafetyThreshold.
s.NumOfConsensusNodes = 4

// run the generic setup, which starts up the network
s.BaseSuite.SetupTest()
Expand Down Expand Up @@ -290,3 +291,120 @@ func (s *RecoverEpochSuite) TestRecoverEpochNodeEjected() {

s.AssertInEpoch(s.Ctx, 1)
}

// TestRecoverEpochEjectNodeDifferentDKG ensures that the recover epoch governance transaction flow works as expected, and a network that
// enters Epoch Fallback Mode can successfully recover.
// For this specific scenario, we are testing a scenario where the consensus committee and Random Beacon committee form a symmetric difference with
// cardinality 1. In other words, there is a node which is part of the consensus committee but not part of the Random Beacon committee and
// another node which is part of the Random Beacon committee but not part of the consensus committee.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// another node which is part of the Random Beacon committee but not part of the consensus committee.
// another node which is part of the Random Beacon committee but not part of the consensus committee.
// We remove the first consensus node from the Consensus Committee, and the last consensus node from the Random Beacon Committee:
// If the original consensus set is {A, B, C, D} then:
// - the post-recovery consensus committee is {B, C, D}
// - the post-recovery random beacon committee is {A, B, C}
//

Comment on lines +297 to +299
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I liked the formal description in the issue. Don't think it hurts to include it here.

Suggested change
// For this specific scenario, we are testing a scenario where the consensus committee and Random Beacon committee form a symmetric difference with
// cardinality 1. In other words, there is a node which is part of the consensus committee but not part of the Random Beacon committee and
// another node which is part of the Random Beacon committee but not part of the consensus committee.
// Here, we are testing a scenario where the consensus committee 𝒞 and Random Beacon committee 𝒟 form a symmetric difference with
// cardinality 1. Formally, |𝒞 ∖ 𝒟| = 1 and |𝒟 \ 𝒞| = 1. In other words, there is a node which is part of the consensus committee but not
// part of the Random Beacon committee and another node which is part of the Random Beacon committee but not part of the consensus committee.

followed by Jordan's suggestion (marginally updated):

// We remove the first consensus node from the Consensus Committee, and the last consensus node from the Random Beacon Committee. For example,
// if the original consensus set is {A, B, C, D} then:
//   - the post-recovery consensus committee is {B, C, D}
//   - the post-recovery random beacon committee is {A, B, C}

// This test will do the following:
// 1. Triggers EFM by turning off the sole collection node before the end of the DKG forcing the DKG to fail.
// 2. Generates epoch recover transaction args using the epoch efm-recover-tx-args.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am unsure what the "epoch efm-recover-tx-args" are ...

Suggested change
// 2. Generates epoch recover transaction args using the epoch efm-recover-tx-args.
// 2. Generates epoch recover transaction args using the tooling [run.GenerateRecoverTxArgsWithDKG], which would also be used for mainnet recovery.

// 3. Eject consensus node by modifying the snapshot before generating the recover epoch transaction args.
// 4. Eject consensus node from the Random Beacon committee by modifying the snapshot before generating the recover epoch transaction args.
// 5. Submit recover epoch transaction.
// 6. Ensure expected EpochRecover event is emitted.
// 7. Ensure the network transitions into the recovery epoch and finalizes the first view of the recovery epoch.
func (s *RecoverEpochSuite) TestRecoverEpochEjectNodeDifferentDKG() {
// 1. Manually trigger EFM

// pause the collection node to trigger EFM by failing DKG
ln := s.GetContainersByRole(flow.RoleCollection)[0]
require.NoError(s.T(), ln.Pause())
s.AwaitFinalizedView(s.Ctx, s.GetDKGEndView(), 2*time.Minute, 500*time.Millisecond)
// start the paused collection node now that we are in EFM
require.NoError(s.T(), ln.Start())

// get final view from the latest snapshot
epoch1FinalView, err := s.Net.BootstrapSnapshot.Epochs().Current().FinalView()
require.NoError(s.T(), err)

// Wait for at least the first view past the current epoch's original FinalView to be finalized.
s.TimedLogf("waiting for epoch transition (finalized view %d)", epoch1FinalView+1)
s.AwaitFinalizedView(s.Ctx, epoch1FinalView+1, 2*time.Minute, 500*time.Millisecond)
s.TimedLogf("observed finalized view %d", epoch1FinalView+1)

// assert that we are in EFM
snapshot, err := s.Client.GetLatestProtocolSnapshot(s.Ctx)
require.NoError(s.T(), err)
epochPhase, err := snapshot.EpochPhase()
require.NoError(s.T(), err)
require.Equal(s.T(), flow.EpochPhaseFallback, epochPhase, "network must enter EFM by this point")

// 2. Generate transaction arguments for epoch recover transaction.
collectionClusters := s.NumOfCollectionClusters
recoveryEpochCounter := uint64(1)

// read internal node info from one of the consensus nodes
internalNodePrivInfoDir, nodeConfigJson := s.getNodeInfoDirs(flow.RoleConsensus)
internalNodes, err := common.ReadFullInternalNodeInfos(unittest.Logger(), internalNodePrivInfoDir, nodeConfigJson)
require.NoError(s.T(), err)
Comment on lines +334 to +341
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand why we need to do step 2 so early. For me, it would be much more intuitive if that happened right before line 367, because all its outputs are only needed then.

// 3. Eject consensus node by modifying the snapshot before generating the recover epoch transaction args.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// 3. Eject consensus node by modifying the snapshot before generating the recover epoch transaction args.
// 3. Eject the FIRST consensus node by modifying the snapshot before generating the recover epoch transaction args.

// By ejecting a node from the consensus committee but keeping it in the Random Beacon committee, we ensure that the there is a node
// which is not part of the consensus committee but is part of the Random Beacon committee.
currentIdentityTable := snapshot.Encodable().SealingSegment.LatestProtocolStateEntry().EpochEntry.CurrentEpochIdentityTable
ejectedIdentity := currentIdentityTable.Filter(filter.HasRole[flow.Identity](flow.RoleConsensus))[0]
ejectedIdentity.EpochParticipationStatus = flow.EpochParticipationStatusEjected
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My understanding is that this write will also modify currentIdentityTable (?) Is that the correct behaviour?

Suggested change
ejectedIdentity.EpochParticipationStatus = flow.EpochParticipationStatusEjected
ejectedIdentity.EpochParticipationStatus = flow.EpochParticipationStatusEjected // writes through to `currentIdentityTable`


// 4. Modify DKG data by removing the last node of the consensus committee from DKG committee. We can do this
// by altering the DKG index map and DKG public key shares.
// This way we ensure that consensus committee has a node which is not part of the Random Beacon committee.
Comment on lines +349 to +351
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was very confused why we could just remove a node from the DKG and the resulting setup would still work. In general, threshold key sets do not allow that. We are kind of using a hack here, which I think is fine for the test. Nevertheless, we should document it accordingly.

Suggested change
// 4. Modify DKG data by removing the last node of the consensus committee from DKG committee. We can do this
// by altering the DKG index map and DKG public key shares.
// This way we ensure that consensus committee has a node which is not part of the Random Beacon committee.
// 4. Modify DKG data by removing the last node of the consensus committee from DKG committee. This way we ensure that consensus
// committee has a node which is not part of the Random Beacon committee. For threshold committees of *even size*, we can remove a
// single node without changing the threshold (see [ref. 1] for details). In other words, we can just pretend that there was originally
// one node less in the DKG, while the same number of signatures (threshold +1) are sufficient to construct a group signature.
//
// [ref. 1] function `RandomBeaconThreshold` for computing the threshold in package module/signature; note
// that for reconstructing the group sig, _strictly more_ than `threshold` sig shares are required.

randomBeaconParticipants := currentIdentityTable.Filter(filter.HasRole[flow.Identity](flow.RoleConsensus))
nConsensusNodes := len(randomBeaconParticipants) - 1

dkgIndexMap := make(flow.DKGIndexMap, nConsensusNodes)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

lets clearly distinguish the modified threshold key system

Suggested change
dkgIndexMap := make(flow.DKGIndexMap, nConsensusNodes)
recoveryDkgIndexMap := make(flow.DKGIndexMap, nConsensusNodes)

for i, participant := range randomBeaconParticipants[:nConsensusNodes] {
dkgIndexMap[participant.NodeID] = i
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
}
}
dkgKeyShares := dkg.KeyShares()[:nConsensusNodes]

Putting this next to the corresponding index map construction logic


epochProtocolState, err := snapshot.EpochProtocolState()
require.NoError(s.T(), err)
dkg, err := epochProtocolState.DKG()
require.NoError(s.T(), err)
Comment on lines +360 to +363
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

and here (incorporating Jordan's suggestion)

Suggested change
epochProtocolState, err := snapshot.EpochProtocolState()
require.NoError(s.T(), err)
dkg, err := epochProtocolState.DKG()
require.NoError(s.T(), err)
epochProtocolState, err := snapshot.EpochProtocolState()
require.NoError(s.T(), err)
dkg, err := epochProtocolState.DKG()
require.NoError(s.T(), err)
recoveryThresholdKeyShares := dkg.KeyShares()[:nConsensusNodes]
recoveryThresholdGroupKey := dkg.GroupKey()


// At this point we have a node which is part of the consensus committee but not part of the Random Beacon committee and
// another node which is part of the Random Beacon committee but not part of the consensus committee.
txArgs, err := run.GenerateRecoverTxArgsWithDKG(
Comment on lines +365 to +367
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we move step 2 to here, the test would be easier for me to understand as it would more closely emulate a real-world scenario.

If that is possible, we could summarize the test as follows in the documentation (changes highlighted in bold font)

This test will do the following:

  1. Triggers EFM by turning off the sole collection node before the end of the DKG forcing the DKG to fail.
  2. Eject the first consensus node by modifying the epoch snapshot.
  3. Drop the last consensus node from the Random Beacon committee. This hack works only for threshold systems with an even number of participants,
    without changing the threshold - hence we need to start this test with 4 consensus nodes.
  4. Generates epoch recover transaction args using the tooling [run.GenerateRecoverTxArgsWithDKG] provided for the governance committee.
  5. Submit recover epoch transaction.
  6. Ensure expected EpochRecover event is emitted.
  7. Ensure the network transitions into the recovery epoch and finalizes the first view of the recovery epoch.

s.Log,
internalNodes,
collectionClusters,
recoveryEpochCounter,
flow.Localnet,
s.StakingAuctionLen,
s.EpochLen,
3000,
false,
dkgIndexMap,
dkg.KeyShares()[:nConsensusNodes],
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
dkg.KeyShares()[:nConsensusNodes],
dkgKeyShares,

dkg.GroupKey(),
Comment on lines +377 to +379
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
dkgIndexMap,
dkg.KeyShares()[:nConsensusNodes],
dkg.GroupKey(),
recoveryDkgIndexMap,
recoveryThresholdKeyShares,
recoveryThresholdGroupKey,

snapshot,
)
require.NoError(s.T(), err)

// 5. Submit recover epoch transaction to the network.
env := utils.LocalnetEnv()
result := s.recoverEpoch(env, txArgs)
require.NoError(s.T(), result.Error)
require.Equal(s.T(), result.Status, sdk.TransactionStatusSealed)

// 6. Ensure expected EpochRecover event is emitted.
eventType := ""
for _, evt := range result.Events {
if strings.Contains(evt.Type, "FlowEpoch.EpochRecover") {
eventType = evt.Type
break
}
}
require.NotEmpty(s.T(), eventType, "expected FlowEpoch.EpochRecover event type")
events, err := s.Client.GetEventsForBlockIDs(s.Ctx, eventType, []sdk.Identifier{result.BlockID})
require.NoError(s.T(), err)
require.Equal(s.T(), events[0].Events[0].Type, eventType)

// 7. Ensure the network transitions into the recovery epoch and finalizes the first view of the recovery epoch.
startViewOfNextEpoch := uint64(txArgs[1].(cadence.UInt64))
s.TimedLogf("waiting to transition into recovery epoch (finalized view %d)", startViewOfNextEpoch)
s.AwaitFinalizedView(s.Ctx, startViewOfNextEpoch, 2*time.Minute, 500*time.Millisecond)
s.TimedLogf("observed finalized first view of recovery epoch %d", startViewOfNextEpoch)

s.AssertInEpoch(s.Ctx, 1)
}
Loading