Skip to content

Commit

Permalink
Fixes synchronisation problem when forgetting peer
Browse files Browse the repository at this point in the history
  • Loading branch information
bolt12 committed Oct 11, 2023
1 parent 202ab34 commit 3052c1f
Show file tree
Hide file tree
Showing 12 changed files with 179 additions and 85 deletions.
2 changes: 2 additions & 0 deletions ouroboros-network-framework/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

* Split `test` component into `io-tests` and `sim-tests`.

* Change connection manager `readState` function to be in `STM` instead of `m`

## 0.9.0.0 -- 2023-08-21

### Breaking changes
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -613,15 +613,14 @@ withConnectionManager ConnectionManagerArguments {
return (freshIdSupply, v)

let readState
:: m (Map peerAddr AbstractState)
readState =
atomically $ do
state <- readTMVar stateVar
traverse ( fmap (abstractState . Known)
. readTVar
. connVar
)
state
:: STM m (Map peerAddr AbstractState)
readState = do
state <- readTMVar stateVar
traverse ( fmap (abstractState . Known)
. readTVar
. connVar
)
state

connectionManager :: ConnectionManager muxMode socket peerAddr
handle handleError m
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -541,7 +541,7 @@ data ConnectionManager (muxMode :: MuxMode) socket peerAddr handle handleError m
(InboundConnectionManager muxMode socket peerAddr handle handleError m),

readState
:: m (Map peerAddr AbstractState)
:: STM m (Map peerAddr AbstractState)
}

--
Expand Down
13 changes: 13 additions & 0 deletions ouroboros-network/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,19 @@

* Fix diffusion tests

* Adds new constructor to `PeerStatus`: `PeerReallyCold`. This is in order to
fix a possible race where a node would be asynchronously demoted to cold
state, put into the cold set and wrongly forgotten, while its connection
was not yet cleaned up. This could lead to the peer being added to the known
set and promoted leading to a `TrConnectionExists` trace in the logs, which
is disallowed. With this being said, `PeerCold` status is now an
intermediate status that means the peer is cold but its connection still
lingers. This extra information warns the governor not to move the peer to
the cold set just yet.

Yes it can mean that the governor could promote the peer to hot, however it
will promptly fail since no mux will be running anyway.

## 0.9.1.0 -- 2023-08-22

### Breaking changes
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -559,9 +559,10 @@ envEventCredits (TraceEnvSetTargets PeerSelectionTargets {
+ targetNumberOfEstablishedPeers
+ targetNumberOfActivePeers)

envEventCredits (TraceEnvPeersDemote Noop _) = 10
envEventCredits (TraceEnvPeersDemote ToWarm _) = 30
envEventCredits (TraceEnvPeersDemote ToCold _) = 30
envEventCredits (TraceEnvPeersDemote Noop _) = 10
envEventCredits (TraceEnvPeersDemote ToWarm _) = 30
envEventCredits (TraceEnvPeersDemote ToCold _) = 30
envEventCredits (TraceEnvPeersDemote ToReallyCold _) = 30

envEventCredits TraceEnvPeersStatus{} = 0
-- These events are visible in the environment but are the result of actions
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -414,14 +414,23 @@ mockPeerSelectionActions' tracer
PeerHot -> writeTVar v PeerWarm
>> return False
PeerWarm -> return False
PeerCold -> return True
PeerCold -> return False
PeerReallyCold -> return True
ToCold -> do
threadDelay (interpretScriptDelay delay)
atomically $ do
s <- readTVar v
case s of
PeerCold -> return True
PeerCold -> return False
_ -> writeTVar v PeerCold
>> return False
ToReallyCold -> do
threadDelay (interpretScriptDelay delay)
atomically $ do
s <- readTVar v
case s of
PeerReallyCold -> return True
_ -> writeTVar v PeerReallyCold
>> return True

traceWith tracer (TraceEnvPeersDemote demotion peeraddr)
Expand All @@ -439,8 +448,8 @@ mockPeerSelectionActions' tracer
atomically $ do
status <- readTVar conn
case status of
PeerHot -> error "activatePeerConnection of hot peer"
PeerWarm -> writeTVar conn PeerHot
PeerHot -> error "activatePeerConnection of hot peer"
PeerWarm -> writeTVar conn PeerHot
--TODO: check it's just a race condition and not just wrong:
--
-- We throw 'ActivationError' for the following reason:
Expand All @@ -450,31 +459,34 @@ mockPeerSelectionActions' tracer
-- errored. Otherwise 'jobPromoteWarmPeer' will try to update the
-- state as if the transition went fine which will violate
-- 'invariantPeerSelectionState'.
PeerCold -> throwIO ActivationError
PeerCold -> throwIO ActivationError
PeerReallyCold -> throwIO ActivationError

deactivatePeerConnection :: PeerConn m -> m ()
deactivatePeerConnection (PeerConn peeraddr _ conn) = do
traceWith tracer (TraceEnvDeactivatePeer peeraddr)
atomically $ do
status <- readTVar conn
case status of
PeerHot -> writeTVar conn PeerWarm
PeerHot -> writeTVar conn PeerWarm
--TODO: check it's just a race condition and not just wrong:
PeerWarm -> return ()
PeerWarm -> return ()
-- See the note in 'activatePeerConnection' why we throw an exception
-- here.
PeerCold -> throwIO DeactivationError
PeerCold -> throwIO DeactivationError
PeerReallyCold -> throwIO DeactivationError

closePeerConnection :: PeerConn m -> m ()
closePeerConnection (PeerConn peeraddr _ conn) = do
traceWith tracer (TraceEnvCloseConn peeraddr)
atomically $ do
status <- readTVar conn
case status of
PeerHot -> writeTVar conn PeerCold
PeerHot -> writeTVar conn PeerReallyCold
--TODO: check it's just a race condition and not just wrong:
PeerWarm -> writeTVar conn PeerCold
PeerCold -> return ()
PeerWarm -> writeTVar conn PeerReallyCold
PeerCold -> writeTVar conn PeerReallyCold
PeerReallyCold -> return ()
conns <- readTVar connsVar
let !conns' = Map.delete peeraddr conns
writeTVar connsVar conns'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ type ConnectionScript = TimedScript AsyncDemotion

data AsyncDemotion = ToWarm
| ToCold
| ToReallyCold
| Noop
deriving (Eq, Show)

Expand Down Expand Up @@ -204,11 +205,13 @@ simpleGraphRep (graph, vertexInfo, lookupVertex) =
instance Arbitrary AsyncDemotion where
arbitrary = frequency [ (2, pure ToWarm)
, (2, pure ToCold)
, (2, pure ToReallyCold)
, (6, pure Noop)
]
shrink ToWarm = [ToCold, Noop]
shrink ToCold = [Noop]
shrink Noop = []
shrink ToWarm = [ToCold, Noop]
shrink ToCold = [ToReallyCold, Noop]
shrink ToReallyCold = [Noop]
shrink Noop = []


instance Arbitrary GovernorScripts where
Expand Down
2 changes: 1 addition & 1 deletion ouroboros-network/src/Ouroboros/Network/Diffusion/P2P.hs
Original file line number Diff line number Diff line change
Expand Up @@ -1171,7 +1171,7 @@ run tracers tracersExtra args argsExtra apps appsExtra = do
_ <- Signals.installHandler
Signals.sigUSR1
(Signals.Catch
(do state <- readState connectionManager
(do state <- atomically $ readState connectionManager
traceWith (dtConnectionManagerTracer tracersExtra)
(TrState state)
)
Expand Down
28 changes: 18 additions & 10 deletions ouroboros-network/src/Ouroboros/Network/PeerSelection/Governor.hs
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ import Control.Monad.Class.MonadTimer.SI
import Control.Tracer (Tracer (..), traceWith)
import System.Random

import Data.Set (Set)
import qualified Data.Set as Set
import Ouroboros.Network.PeerSelection.Churn (peerChurnGovernor)
import qualified Ouroboros.Network.PeerSelection.Governor.ActivePeers as ActivePeers
import qualified Ouroboros.Network.PeerSelection.Governor.BigLedgerPeers as BigLedgerPeers
Expand Down Expand Up @@ -503,11 +505,15 @@ peerSelectionGovernorLoop tracer
stateVar
actions
policy
jobPool =
loop
jobPool
pst = do
cacheTVar <- newTVarIO Set.empty
loop cacheTVar pst
where
loop :: PeerSelectionState peeraddr peerconn -> m Void
loop !st = assertPeerSelectionState st $ do
loop :: StrictTVar m (Set peeraddr)
-> PeerSelectionState peeraddr peerconn
-> m Void
loop cacheTVar !st = assertPeerSelectionState st $ do
-- Update public state using 'toPublicState' to compute available peers
-- to share for peer sharing
atomically $ writeTVar stateVar (toPublicState st)
Expand All @@ -517,7 +523,7 @@ peerSelectionGovernorLoop tracer
st' = st { knownPeers = knownPeers',
establishedPeers = establishedPeers' }

timedDecision <- evalGuardedDecisions blockedAt (peerSharing actions) st'
timedDecision <- evalGuardedDecisions blockedAt (peerSharing actions) cacheTVar st'

-- get the current time after the governor returned from the blocking
-- 'evalGuardedDecisions' call.
Expand All @@ -531,14 +537,15 @@ peerSelectionGovernorLoop tracer
newCounters

mapM_ (JobPool.forkJob jobPool) decisionJobs
loop (decisionState { countersCache = Cache newCounters })
loop cacheTVar (decisionState { countersCache = Cache newCounters })

evalGuardedDecisions :: Time
-> PeerSharing
-> StrictTVar m (Set peeraddr)
-> PeerSelectionState peeraddr peerconn
-> m (TimedDecision m peeraddr peerconn)
evalGuardedDecisions blockedAt peerSharing st =
case guardedDecisions blockedAt peerSharing st of
evalGuardedDecisions blockedAt peerSharing cacheTVar st =
case guardedDecisions blockedAt peerSharing cacheTVar st of
GuardedSkip _ ->
-- impossible since guardedDecisions always has something to wait for
error "peerSelectionGovernorLoop: impossible: nothing to do"
Expand All @@ -560,11 +567,12 @@ peerSelectionGovernorLoop tracer

guardedDecisions :: Time
-> PeerSharing
-> StrictTVar m (Set peeraddr)
-> PeerSelectionState peeraddr peerconn
-> Guarded (STM m) (TimedDecision m peeraddr peerconn)
guardedDecisions blockedAt peerSharing st =
guardedDecisions blockedAt peerSharing cacheTVar st =
-- All the alternative potentially-blocking decisions.
Monitor.connections actions st
Monitor.connections cacheTVar actions st
<> Monitor.jobs jobPool st
<> Monitor.targetPeers actions st
<> Monitor.localRoots actions policy st
Expand Down
Loading

0 comments on commit 3052c1f

Please sign in to comment.