diff --git a/tests/raft_scenarios/startup_2nodes b/tests/raft_scenarios/startup_2nodes new file mode 100644 index 000000000000..8e1fec5062c5 --- /dev/null +++ b/tests/raft_scenarios/startup_2nodes @@ -0,0 +1,14 @@ +start_node,0 +assert_is_primary,0 +emit_signature,2 + +assert_is_primary,0 +assert_commit_idx,0,2 + +trust_node,2,1 +emit_signature,2 + +dispatch_all +# periodic_all,10 +# dispatch_all +# assert_commit_idx,0,4 \ No newline at end of file diff --git a/tla/consensus/MCccfraftAtomicReconfig.cfg b/tla/consensus/MCccfraftAtomicReconfig.cfg index c4b365e8b972..7bef71862d18 100644 --- a/tla/consensus/MCccfraftAtomicReconfig.cfg +++ b/tla/consensus/MCccfraftAtomicReconfig.cfg @@ -77,6 +77,5 @@ INVARIANTS MonoLogInv LogConfigurationConsistentInv NoLeaderBeforeInitialTerm - MatchIndexLowerBoundNextIndexInv CommitCommittableIndices CommittableIndicesAreKnownSignaturesInv \ No newline at end of file diff --git a/tla/consensus/Traceccfraft.tla b/tla/consensus/Traceccfraft.tla index 4c4c2d78b8ba..f4d6e6fc0f27 100644 --- a/tla/consensus/Traceccfraft.tla +++ b/tla/consensus/Traceccfraft.tla @@ -80,7 +80,7 @@ ASSUME TraceServers \subseteq Servers TraceAppendEntriesBatchsize(i, j) == \* -1) .. to explicitly model heartbeats, i.e. a message with zero entries. - (nextIndex[i][j] - 1) .. Len(log[i]) + sentIndex[i][j] .. Len(log[i]) TraceInitReconfigurationVars == /\ reconfigurationCount = 0 @@ -176,7 +176,8 @@ IsSendAppendEntries == /\ IsAppendEntriesRequest(msg, j, i, logline) \* There is now one more message of this type. /\ Network!OneMoreMessage(msg) - /\ logline.msg.sent_idx + 1 = nextIndex[i][j] +\* TODO revisit once nextIndex-related changes are merged in the spec +\* /\ logline.msg.sent_idx = nextIndex[i][j] /\ logline.msg.match_idx = matchIndex[i][j] /\ committableIndices[logline.msg.state.node_id] = Range(logline.msg.state.committable_indices) @@ -243,7 +244,7 @@ IsRcvAppendEntriesResponse == /\ IsEvent("recv_append_entries_response") /\ LET i == logline.msg.state.node_id j == logline.msg.from_node_id - IN /\ logline.msg.sent_idx + 1 = nextIndex[i][j] + IN /\ logline.msg.sent_idx = sentIndex[i][j] /\ logline.msg.match_idx = matchIndex[i][j] /\ \E m \in Network!MessagesTo(i, j): /\ m.type = AppendEntriesResponse @@ -435,7 +436,7 @@ TraceDifferentialInv == \* /\ d.clientRequests = clientRequests \* /\ d.votesGranted = votesGranted \* /\ d.votesRequested = votesRequested - \* /\ d.nextIndex = nextIndex + \* /\ d.sentIndex = sentIndex \* /\ d.matchIndex = matchIndex ------------------------------------------------------------------------------------- diff --git a/tla/consensus/ccfraft.tla b/tla/consensus/ccfraft.tla index 6b03b3a76e21..7cc431cec042 100644 --- a/tla/consensus/ccfraft.tla +++ b/tla/consensus/ccfraft.tla @@ -274,16 +274,16 @@ CandidateVarsTypeInv == \* The following variables are used only on leaders: -\* The next entry to send to each follower, called sent_idx in raft.h +\* The last entry sent to each follower. +\* sentIndex in CCF is similar in function to nextIndex in Raft \* In CCF, the leader updates nextIndex optimically when an AE message is dispatched \* In contrast, in Raft the leader only updates nextIndex when an AE response is received -VARIABLE nextIndex +VARIABLE sentIndex -\* nextIndex cannot be zero as its the index of the first log -\* entry in the AE message (recalling that TLA+ is 1-indexed). -NextIndexTypeInv == + +SentIndexTypeInv == \A i, j \in Servers : i /= j => - /\ nextIndex[i][j] \in Nat \ {0} + /\ sentIndex[i][j] \in Nat \* The latest entry that each follower has acknowledged is the same as the \* leader's. This is used to calculate commitIndex on the leader. @@ -293,10 +293,10 @@ MatchIndexTypeInv == \A i, j \in Servers : i /= j => matchIndex[i][j] \in Nat -leaderVars == <> +leaderVars == <> LeaderVarsTypeInv == - /\ NextIndexTypeInv + /\ SentIndexTypeInv /\ MatchIndexTypeInv \* End of per server variables. @@ -452,7 +452,7 @@ CommittedTermPrefix(i, x) == AppendEntriesBatchsize(i, j) == \* The Leader is modeled to send zero to one entries per AppendEntriesRequest. \* This can be redefined to send bigger batches of entries. - {nextIndex[i][j]} + {sentIndex[i][j] + 1} PlausibleSucessorNodes(i) == @@ -496,11 +496,11 @@ InitMessagesVars == InitCandidateVars == /\ votesGranted = [i \in Servers |-> {}] -\* The values nextIndex[i][i] and matchIndex[i][i] are never read, since the +\* The values sentIndex[i][i] and matchIndex[i][i] are never read, since the \* leader does not send itself messages. It's still easier to include these \* in the functions. InitLeaderVars == - /\ nextIndex = [i \in Servers |-> [j \in Servers |-> 1]] + /\ sentIndex = [i \in Servers |-> [j \in Servers |-> 0]] /\ matchIndex = [i \in Servers |-> [j \in Servers |-> 0]] Init == @@ -557,15 +557,15 @@ AppendEntries(i, j) == \* AppendEntries must be sent for historical entries, unless \* snapshots are used. Whether the node is in configuration at \* that index makes no difference. - \* /\ IsInServerSetForIndex(j, i, nextIndex[i][j]) - /\ LET prevLogIndex == nextIndex[i][j] - 1 + \* /\ IsInServerSetForIndex(j, i, sentIndex[i][j]) + /\ LET prevLogIndex == sentIndex[i][j] prevLogTerm == IF prevLogIndex \in DOMAIN log[i] THEN log[i][prevLogIndex].term ELSE StartTerm \* Send a number of entries (constrained by the end of the log). lastEntry(idx) == min(Len(log[i]), idx) - index == nextIndex[i][j] + index == sentIndex[i][j] + 1 msg(idx) == [type |-> AppendEntriesRequest, term |-> currentTerm[i], @@ -581,7 +581,7 @@ AppendEntries(i, j) == /\ Send(m) \* Record the most recent index we have sent to this node. \* (see https://github.com/microsoft/CCF/blob/9fbde45bf5ab856ca7bcf655e8811dc7baf1e8a3/src/consensus/aft/raft.h#L935-L936) - /\ nextIndex' = [nextIndex EXCEPT ![i][j] = @ + Len(m.entries)] + /\ sentIndex' = [sentIndex EXCEPT ![i][j] = @ + Len(m.entries)] /\ UNCHANGED <> \* Candidate i transitions to leader. @@ -594,8 +594,8 @@ BecomeLeader(i) == \* all unsigned log entries of the previous leader. \* See occurrence of last_committable_index() in raft.h::become_leader. /\ log' = [log EXCEPT ![i] = SubSeq(log[i],1, MaxCommittableIndex(log[i]))] - \* Reset our nextIndex to the end of the *new* log. - /\ nextIndex' = [nextIndex EXCEPT ![i] = [j \in Servers |-> Len(log'[i]) + 1]] + \* Reset our sentIndex to the end of the *new* log. + /\ sentIndex' = [sentIndex EXCEPT ![i] = [j \in Servers |-> Len(log'[i])]] /\ matchIndex' = [matchIndex EXCEPT ![i] = [j \in Servers |-> 0]] \* Shorten the configurations if the removed txs contained reconfigurations /\ configurations' = [configurations EXCEPT ![i] = ConfigurationsToIndex(i, Len(log'[i]))] @@ -645,6 +645,12 @@ ChangeConfigurationInt(i, newConfiguration) == \* i.e., be re-added to a new configuration. Instead, the node has to rejoin with a \* "fresh" identity (compare sec 6.2, page 8, https://arxiv.org/abs/2310.11559). /\ \A s \in newConfiguration: s \notin removedFromConfiguration + \* See raft.h:2401, nodes are only sent future entries initially, they will NACK if necessary. + \* This is because they are expected to start from a fairly recent snapshot, not from scratch. + /\ LET + addedNodes == newConfiguration \ CurrentConfiguration(i) + newSentIndex == [ k \in Servers |-> IF k \in addedNodes THEN Len(log[i]) ELSE sentIndex[i][k]] + IN sentIndex' = [sentIndex EXCEPT ![i] = newSentIndex] \* Keep track of running reconfigurations to limit state space /\ reconfigurationCount' = reconfigurationCount + 1 /\ removedFromConfiguration' = removedFromConfiguration \cup (CurrentConfiguration(i) \ newConfiguration) @@ -653,7 +659,7 @@ ChangeConfigurationInt(i, newConfiguration) == configuration |-> newConfiguration, contentType |-> TypeReconfiguration])] /\ configurations' = [configurations EXCEPT ![i] = configurations[i] @@ Len(log'[i]) :> newConfiguration] - /\ UNCHANGED <> + /\ UNCHANGED <> ChangeConfiguration(i) == \* Reconfigure to any *non-empty* subset of servers. ChangeConfigurationInt checks that the new @@ -925,13 +931,13 @@ HandleAppendEntriesResponse(i, j, m) == /\ m.success \* successful \* max(...) because why would we ever want to go backwards on a success response?! /\ matchIndex' = [matchIndex EXCEPT ![i][j] = max(@, m.lastLogIndex)] - \* nextIndex is unchanged on successful AE response as it was already updated when the AE was dispatched - /\ UNCHANGED nextIndex + \* sentIndex is unchanged on successful AE response as it was already updated when the AE was dispatched + /\ UNCHANGED sentIndex \/ /\ \lnot m.success \* not successful /\ LET tm == FindHighestPossibleMatch(log[i], m.lastLogIndex, m.term) - IN nextIndex' = [nextIndex EXCEPT ![i][j] = max(min(tm, nextIndex[i][j]-1), matchIndex[i][j]) + 1 ] + IN sentIndex' = [sentIndex EXCEPT ![i][j] = max(min(tm, sentIndex[i][j]), matchIndex[i][j])] \* UNCHANGED matchIndex is implied by the following statement in figure 2, page 4 in the raft paper: - \* "If AppendEntries fails because of log inconsistency: decrement nextIndex and retry" + \* "If AppendEntries fails because of log inconsistency: decrement nextIndex (aka sentIndex +1) and retry" /\ UNCHANGED matchIndex /\ Discard(m) /\ UNCHANGED <> @@ -940,7 +946,8 @@ HandleAppendEntriesResponse(i, j, m) == UpdateTerm(i, j, m) == /\ m.term > currentTerm[i] /\ currentTerm' = [currentTerm EXCEPT ![i] = m.term] - /\ state' = [state EXCEPT ![i] = IF @ \in {Leader, Candidate} THEN Follower ELSE @] + \* See become_aware_of_new_term() in raft.h:1915 + /\ state' = [state EXCEPT ![i] = IF @ \in {Leader, Candidate, None} THEN Follower ELSE @] /\ votedFor' = [votedFor EXCEPT ![i] = Nil] \* See rollback(last_committable_index()) in raft::become_follower /\ log' = [log EXCEPT ![i] = SubSeq(@, 1, LastCommittableIndex(i))] @@ -1224,13 +1231,14 @@ MonoLogInv == LogConfigurationConsistentInv == \A i \in Servers : \/ state[i] = None + \* Follower, but no known configurations yet + \/ /\ state[i] = Follower + /\ Cardinality(DOMAIN configurations[i]) = 0 \/ \* Configurations should have associated reconfiguration txs in the log /\ \A idx \in DOMAIN (configurations[i]) : /\ log[i][idx].contentType = TypeReconfiguration /\ log[i][idx].configuration = configurations[i][idx] - \* Current configuration should be committed - /\ commitIndex[i] >= CurrentConfigurationIndex(i) \* Pending configurations should not be committed yet /\ Cardinality(DOMAIN configurations[i]) > 1 => commitIndex[i] < NextConfigurationIndex(i) @@ -1251,10 +1259,10 @@ NoLeaderBeforeInitialTerm == \* MatchIndexLowerBoundNextIndexInv is not currently an invariant but \* we might wish to add it in the future. This could be achieved by updating \* nextIndex to max of current nextIndex and matchIndex when an AE ACK is received. -MatchIndexLowerBoundNextIndexInv == +MatchIndexLowerBoundSentIndexInv == \A i,j \in Servers : state[i] = Leader => - nextIndex[i][j] > matchIndex[i][j] + sentIndex[i][j] >= matchIndex[i][j] CommitCommittableIndices == \A i \in Servers : @@ -1399,7 +1407,7 @@ DebugAlias == commitIndex |-> commitIndex, committableIndices |-> committableIndices, votesGranted |-> votesGranted, - nextIndex |-> nextIndex, + sentIndex |-> sentIndex, matchIndex |-> matchIndex, _MessagesTo |-> [ s, t \in Servers |-> Network!MessagesTo(s, t) ]