Skip to content

Commit 1db5ee7

Browse files
authored
Last revision reset (#2137)
> [!NOTE] > This is currently on top of the 0.22.2 tag and I could not find a 0.22 release branch to open the PR against. We should probably not merge it like this. Instead I should rebase it onto master and we cherry-pick it back to do a 0.22.3 release. The last-known-revision kept by the Etcd network component may be incorrect if the etcd cluster compacted this revision while the node was offline or if the last-known-revision state file was removed. Both cases can be handled by detecting a failing watch request and at least using the compactRevision from the response. This is a somewhat exceptional situation and the node state may be inconsistent because of this. Hence we also log a warning when this happens. --- * [x] CHANGELOG updated * [ ] Documentation updated or not needed * [x] Haddocks updated * [x] No new TODOs introduced or explained hereafter
2 parents b984abc + 51c82d3 commit 1db5ee7

File tree

5 files changed

+117
-41
lines changed

5 files changed

+117
-41
lines changed

CHANGELOG.md

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,23 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
88
As a minor extension, we also keep a semantic version for the `UNRELEASED`
99
changes.
1010

11-
## [0.22.2] - 2025.06.30
11+
## [0.22.3] - UNRELEASED
12+
13+
* Change behavior of `Hydra.Network.Etcd` to fall back to the earliest possible
14+
revision if `last-known-revision` is missing or too old. This can happen if a
15+
node is down for a long time and the `etcd` cluster compacted the last known
16+
revision in the meantime
17+
[#2136](https://github.com/cardano-scaling/hydra/issues/2136).
18+
19+
## [0.22.2] - 2025-06-30
1220

1321
* Fix wrong hydra-script-tx-ids in networks.json
1422

15-
## [0.22.1] - 2025.06.27
23+
## [0.22.1] - 2025-06-27
1624

1725
* Fix for bug where node got stalled at `ReplayingState` [#2089](https://github.com/cardano-scaling/hydra/issues/2089)
1826

19-
## [0.22.0] - 2025.06.17
27+
## [0.22.0] - 2025-06-17
2028

2129
- Tested with `cardano-node 10.1.2` and `cardano-cli 10.1.1.0`.
2230

hydra-node/hydra-node.cabal

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
cabal-version: 3.0
22
name: hydra-node
3-
version: 0.22.2
3+
version: 0.22.3
44
synopsis: The Hydra node
55
author: IOG
66
copyright: 2022 IOG
@@ -399,6 +399,7 @@ test-suite tests
399399
, text
400400
, time
401401
, tls
402+
, typed-process
402403
, websockets
403404

404405
build-tool-depends: hspec-discover:hspec-discover

hydra-node/src/Hydra/Network/Etcd.hs

Lines changed: 49 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,7 @@
2222
-- only deliver messages that were not seen before. In case we are not connected
2323
-- to our 'etcd' instance or not enough peers (= on a minority cluster), we
2424
-- retry sending, but also store messages to broadcast in a 'PersistentQueue',
25-
-- which makes the node resilient against crashes while sending. TODO: Is this
26-
-- needed? performance limitation?
25+
-- which makes the node resilient against crashes while sending.
2726
--
2827
-- Connectivity and compatibility with other nodes on the cluster is tracked
2928
-- using the key-value service as well:
@@ -92,7 +91,6 @@ import Network.GRPC.Client (
9291
)
9392
import Network.GRPC.Client.StreamType.IO (biDiStreaming, nonStreaming)
9493
import Network.GRPC.Common (GrpcError (..), GrpcException (..), HTTP2Settings (..), NextElem (..), def, defaultHTTP2Settings)
95-
import Network.GRPC.Common.NextElem (whileNext_)
9694
import Network.GRPC.Common.Protobuf (Proto (..), Protobuf, defMessage, (.~))
9795
import Network.GRPC.Etcd (
9896
Compare'CompareResult (..),
@@ -101,6 +99,7 @@ import Network.GRPC.Etcd (
10199
Lease,
102100
Watch,
103101
)
102+
import Network.Socket (PortNumber)
104103
import System.Directory (createDirectoryIfMissing, listDirectory, removeFile)
105104
import System.Environment.Blank (getEnvironment)
106105
import System.FilePath (takeDirectory, (</>))
@@ -175,7 +174,7 @@ withEtcdNetwork tracer protocolVersion config callback action = do
175174
traceWith tracer Reconnecting
176175
pure $ reconnectPolicy doneVar
177176

178-
clientHost = Host{hostname = "127.0.0.1", port = clientPort}
177+
clientHost = Host{hostname = "127.0.0.1", port = getClientPort config}
179178

180179
grpcServer =
181180
ServerInsecure $
@@ -185,11 +184,6 @@ withEtcdNetwork tracer protocolVersion config callback action = do
185184
, addressAuthority = Nothing
186185
}
187186

188-
-- NOTE: Offset client port by the same amount as configured 'port' is offset
189-
-- from the default '5001'. This will result in the default client port 2379
190-
-- be used by default still.
191-
clientPort = 2379 + port listen - 5001
192-
193187
traceStderr p NetworkCallback{onConnectivity} =
194188
forever $ do
195189
bs <- BS.hGetLine (getStderr p)
@@ -249,6 +243,14 @@ withEtcdNetwork tracer protocolVersion config callback action = do
249243

250244
NetworkConfiguration{persistenceDir, listen, advertise, peers, whichEtcd} = config
251245

246+
-- | Get the client port corresponding to a listen address.
247+
--
248+
-- The client port used by the started etcd instance is offset by the same amount as
249+
-- the listen address is offset from the default port 5001. This will result in
250+
-- the default client port 2379 still being used by default.
251+
getClientPort :: NetworkConfiguration -> PortNumber
252+
getClientPort NetworkConfiguration{listen} = 2379 + port listen - 5001
253+
252254
-- | Return the path of the etcd binary. Will either install it first, or just
253255
-- assume there is one available on the system path.
254256
getEtcdBinary :: FilePath -> WhichEtcd -> IO FilePath
@@ -297,8 +299,7 @@ checkVersion tracer conn ourVersion NetworkCallback{onConnectivity} = do
297299
Right theirVersion ->
298300
unless (theirVersion == ourVersion) $
299301
onConnectivity VersionMismatch{ourVersion, theirVersion = Just theirVersion}
300-
else
301-
traceWith tracer $ MatchingProtocolVersion{version = ourVersion}
302+
else traceWith tracer $ MatchingProtocolVersion{version = ourVersion}
302303
where
303304
versionKey = "version"
304305

@@ -371,11 +372,13 @@ waitMessages ::
371372
NetworkCallback msg IO ->
372373
IO ()
373374
waitMessages tracer conn directory NetworkCallback{deliver} = do
374-
revision <- getLastKnownRevision directory
375375
withGrpcContext "waitMessages" . forever $ do
376376
-- NOTE: We have not observed the watch (subscription) fail even when peers
377377
-- leave and we end up on a minority cluster.
378378
biDiStreaming conn (rpc @(Protobuf Watch "watch")) $ \send recv -> do
379+
revision <- getLastKnownRevision directory
380+
let startRevision = fromIntegral (revision + 1)
381+
traceWith tracer WatchMessagesStartRevision{startRevision}
379382
-- NOTE: Request all keys starting with 'msg'. See also section KeyRanges
380383
-- in https://etcd.io/docs/v3.5/learning/api/#key-value-api
381384
let watchRequest =
@@ -384,34 +387,48 @@ waitMessages tracer conn directory NetworkCallback{deliver} = do
384387
& #rangeEnd .~ "msh" -- NOTE: g+1 to query prefixes
385388
& #startRevision .~ fromIntegral (revision + 1)
386389
send . NextElem $ defMessage & #createRequest .~ watchRequest
387-
whileNext_ recv process
390+
loop send recv
388391
-- Wait before re-trying
389392
threadDelay 1
390393
where
391-
process res = do
392-
let revision = fromIntegral $ res ^. #header . #revision
393-
putLastKnownRevision directory revision
394-
forM_ (res ^. #events) $ \event -> do
395-
let value = event ^. #kv . #value
396-
case decodeFull' value of
397-
Left err ->
398-
traceWith
399-
tracer
400-
FailedToDecodeValue
401-
{ key = decodeUtf8 $ event ^. #kv . #key
402-
, value = encodeBase16 value
403-
, reason = show err
404-
}
405-
Right msg -> deliver msg
394+
loop send recv =
395+
recv >>= \case
396+
NoNextElem -> pure ()
397+
NextElem res ->
398+
if res ^. #canceled
399+
then do
400+
let compactRevision = res ^. #compactRevision
401+
traceWith tracer WatchMessagesFallbackTo{compactRevision}
402+
putLastKnownRevision directory . fromIntegral $ (compactRevision - 1) `max` 0
403+
-- Gracefully close watch stream
404+
send NoNextElem
405+
else do
406+
let revision = res ^. #header . #revision
407+
putLastKnownRevision directory . fromIntegral $ revision `max` 0
408+
forM_ (res ^. #events) process
409+
loop send recv
410+
411+
process event = do
412+
let value = event ^. #kv . #value
413+
case decodeFull' value of
414+
Left err ->
415+
traceWith
416+
tracer
417+
FailedToDecodeValue
418+
{ key = decodeUtf8 $ event ^. #kv . #key
419+
, value = encodeBase16 value
420+
, reason = show err
421+
}
422+
Right msg -> deliver msg
406423

407424
getLastKnownRevision :: MonadIO m => FilePath -> m Natural
408425
getLastKnownRevision directory = do
409426
liftIO $
410427
try (decodeFileStrict' $ directory </> "last-known-revision") >>= \case
411428
Right rev -> do
412-
pure $ fromMaybe 1 rev
429+
pure $ fromMaybe 0 rev
413430
Left (e :: IOException)
414-
| isDoesNotExistError e -> pure 1
431+
| isDoesNotExistError e -> pure 0
415432
| otherwise -> do
416433
fail $ "Failed to load last known revision: " <> show e
417434

@@ -614,5 +631,7 @@ data EtcdLog
614631
| LowLeaseTTL {ttlRemaining :: DiffTime}
615632
| NoKeepAliveResponse
616633
| MatchingProtocolVersion {version :: ProtocolVersion}
634+
| WatchMessagesStartRevision {startRevision :: Int64}
635+
| WatchMessagesFallbackTo {compactRevision :: Int64}
617636
deriving stock (Eq, Show, Generic)
618637
deriving anyclass (ToJSON)

hydra-node/test/Hydra/NetworkSpec.hs

Lines changed: 54 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,12 @@ import Hydra.Network (
2323
ProtocolVersion (..),
2424
WhichEtcd (..),
2525
)
26-
import Hydra.Network.Etcd (withEtcdNetwork)
26+
import Hydra.Network.Etcd (getClientPort, withEtcdNetwork)
2727
import Hydra.Network.Message (Message (..))
2828
import Hydra.Node.Network (NetworkConfiguration (..))
2929
import System.Directory (removeFile)
3030
import System.FilePath ((</>))
31+
import System.Process.Typed (runProcess_, shell)
3132
import Test.Aeson.GenericSpecs (Settings (..), defaultSettings, roundtripAndGoldenADTSpecsWithSettings)
3233
import Test.Hydra.Node.Fixture (alice, aliceSk, bob, bobSk, carol, carolSk)
3334
import Test.Network.Ports (randomUnusedTCPPorts, withFreePort)
@@ -177,14 +178,60 @@ spec = do
177178
withEtcdNetwork @Int tracer v1 carolConfig recordCarol $ \_ -> do
178179
broadcast n1 1001
179180
waitCarol `shouldReturn` 1001
180-
-- We can reset the last known view (internal implementation detail)
181+
182+
it "handles compaction and lost local state" $ \tracer -> do
183+
withTempDir "test-etcd" $ \tmp -> do
184+
failAfter 20 $ do
185+
PeerConfig3{aliceConfig, bobConfig, carolConfig} <- setup3Peers tmp
186+
(recordBob, waitBob, _) <- newRecordingCallback
187+
(recordCarol, waitCarol, _) <- newRecordingCallback
188+
withEtcdNetwork @Int tracer v1 aliceConfig noopCallback $ \n1 ->
189+
withEtcdNetwork @Int tracer v1 bobConfig recordBob $ \_ -> do
190+
-- First we send 5 messages with carol online
191+
withEtcdNetwork @Int tracer v1 carolConfig recordCarol $ \_ -> do
192+
forM_ [1 .. 5] $ \msg -> do
193+
broadcast n1 msg
194+
waitBob `shouldReturn` msg
195+
waitCarol `shouldReturn` msg
196+
-- Carol stopped and we continue sending messages
197+
forM_ [5 .. 100] $ \msg -> do
198+
broadcast n1 msg
199+
waitBob `shouldReturn` msg
200+
-- Even while carol is down, the etcd component would
201+
-- "auto-compact" messages. By default down to 1000 messages
202+
-- after/every 5 minutes. This is interesting as it should
203+
-- result in carol never seeing some messages, but is hard to test
204+
-- (without waiting 5 minutes). Instead we issue a direct etcd
205+
-- command to compact everything before revision 50.
206+
runProcess_ . shell $
207+
"etcdctl compact 50 --endpoints=127.0.0.1:" <> show (getClientPort aliceConfig)
208+
-- When carol starts now we would expect it to start catching up
209+
-- from the earliest possible revision 50. While missing some
210+
-- messages.
211+
withEtcdNetwork @Int tracer v1 carolConfig recordCarol $ \_ -> do
212+
-- NOTE: Revision 50 may not correspond to message 50, so we
213+
-- only assert it's some message bigger than 25 and expect to
214+
-- see all further messages to 100.
215+
firstMsg <- waitCarol
216+
firstMsg `shouldSatisfy` (> 25)
217+
forM_ [firstMsg + 1 .. 100] $ \msg ->
218+
waitCarol `shouldReturn` msg
219+
-- Carol should be able to receive new messages just fine.
220+
forM_ [101 .. 105] $ \msg -> do
221+
broadcast n1 msg
222+
waitCarol `shouldReturn` msg
223+
-- Similarly, should carol lose its local state, we expect it to
224+
-- see everything from the last compacted revision 50. We can
225+
-- enforce this by removing the corresponding file (an internal
226+
-- implementation detail)
181227
removeFile (persistenceDir carolConfig </> "last-known-revision")
182228
withEtcdNetwork @Int tracer v1 carolConfig recordCarol $ \_ -> do
183-
-- NOTE: The etcd component would "auto-compact" messages down
184-
-- to 1000 messages after 5 minutes. This would result in
185-
-- starting at 1001 here, but is hard to test (without waiting
186-
-- 5 minutes).
187-
forM_ messages $ \msg ->
229+
-- NOTE: Revision 50 may not correspond to message 50, so we
230+
-- only assert it's some message bigger than 25 and expect to
231+
-- see all further messages to 105.
232+
firstMsg <- waitCarol
233+
firstMsg `shouldSatisfy` (> 25)
234+
forM_ [firstMsg + 1 .. 105] $ \msg -> do
188235
waitCarol `shouldReturn` msg
189236

190237
it "emits cluster id mismatch" $ \tracer -> do

nix/hydra/packages.nix

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,7 @@ rec {
142142
buildInputs = [
143143
nativePkgs.hydra-node.components.tests.tests
144144
pkgs.check-jsonschema
145+
pkgs.etcd # For etcdctl command in tests
145146
];
146147
};
147148
hydra-cluster-tests = pkgs.mkShellNoCC {

0 commit comments

Comments
 (0)