Skip to content

Commit 3158365

Browse files
shumkovQuantumExplorer
authored andcommitted
fix(drive): uncommitted state if db transaction fails (#2305)
1 parent feacde2 commit 3158365

File tree

4 files changed

+157
-7
lines changed

4 files changed

+157
-7
lines changed

packages/rs-drive-abci/src/abci/handler/finalize_block.rs

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ use crate::execution::types::block_execution_context::v0::BlockExecutionContextV
55
use crate::platform_types::cleaned_abci_messages::finalized_block_cleaned_request::v0::FinalizeBlockCleanedRequest;
66
use crate::platform_types::platform_state::v0::PlatformStateV0Methods;
77
use crate::rpc::core::CoreRPCLike;
8+
use dpp::dashcore::Network;
89
use std::sync::atomic::Ordering;
910
use tenderdash_abci::proto::abci as proto;
1011

@@ -66,7 +67,30 @@ where
6667
));
6768
}
6869

69-
app.commit_transaction(platform_version)?;
70+
let result = app.commit_transaction(platform_version);
71+
72+
// We had a sequence of errors on the mainnet starting at block 32326.
73+
// We got RocksDB's "transaction is busy" error because of a bug (https://github.com/dashpay/platform/pull/2309).
74+
// Due to another bug in Tenderdash (https://github.com/dashpay/tenderdash/pull/966),
75+
// validators just proceeded to the next block partially committing the state and updating the cache.
76+
// Full nodes got stuck and proceeded only after a re-sync.
77+
// For the mainnet chain, we enable these fixes at the block when we consider the state is consistent.
78+
let config = &app.platform().config;
79+
80+
if app.platform().config.network == Network::Dash
81+
&& config.abci.chain_id == "evo1"
82+
&& block_height < 33000
83+
{
84+
// Old behavior on mainnet below block 33000
85+
result?;
86+
} else {
87+
// If the transaction commit fails, we still have caches in memory that
88+
// correspond to the data that we weren't able to commit.
89+
// The simplified solution is to restart the Drive, so all caches
90+
// will be restored from the disk and try to process this block again.
91+
// TODO: We need better handling of the "transaction is busy" error, with retry logic.
92+
result.expect("commit transaction");
93+
}
7094

7195
app.platform()
7296
.committed_block_height_guard

packages/rs-drive-abci/src/abci/handler/info.rs

Lines changed: 46 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ use crate::abci::AbciError;
33
use crate::error::Error;
44
use crate::platform_types::platform_state::v0::PlatformStateV0Methods;
55
use crate::rpc::core::CoreRPCLike;
6+
use dpp::dashcore::Network;
67
use dpp::version::DESIRED_PLATFORM_VERSION;
78
use tenderdash_abci::proto::abci as proto;
89

@@ -21,28 +22,67 @@ where
2122

2223
let platform_state = app.platform().state.load();
2324

24-
let state_app_hash = platform_state
25+
let last_block_height = platform_state.last_committed_block_height() as i64;
26+
27+
// Verify that Platform State corresponds to Drive committed state
28+
let platform_state_app_hash = platform_state
2529
.last_committed_block_app_hash()
26-
.map(|app_hash| app_hash.to_vec())
2730
.unwrap_or_default();
2831

32+
let grove_version = &platform_state
33+
.current_platform_version()?
34+
.drive
35+
.grove_version;
36+
37+
let drive_storage_root_hash = app
38+
.platform()
39+
.drive
40+
.grove
41+
.root_hash(None, grove_version)
42+
.unwrap()?;
43+
44+
// We had a sequence of errors on the mainnet starting at block 32326.
45+
// We got RocksDB's "transaction is busy" error because of a bug (https://github.com/dashpay/platform/pull/2309).
46+
// Due to another bug in Tenderdash (https://github.com/dashpay/tenderdash/pull/966),
47+
// validators just proceeded to the next block partially committing the state and updating the cache.
48+
// Full nodes got stuck and proceeded only after a re-sync.
49+
// For the mainnet chain, we enable these fixes at the block when we consider the state is consistent.
50+
let config = &app.platform().config;
51+
52+
#[allow(clippy::collapsible_if)]
53+
if !(config.network == Network::Dash
54+
&& config.abci.chain_id == "evo1"
55+
&& last_block_height < 33000)
56+
{
57+
// App hash in memory must be equal to app hash on disk
58+
if drive_storage_root_hash != platform_state_app_hash {
59+
// We panic because we can't recover from this situation.
60+
// Better to restart the Drive, so we might self-heal the node
61+
// reloading state from the disk
62+
panic!(
63+
"drive and platform state app hash mismatch: drive_storage_root_hash: {:?}, platform_state_app_hash: {:?}",
64+
drive_storage_root_hash, platform_state_app_hash
65+
);
66+
}
67+
}
68+
2969
let desired_protocol_version = DESIRED_PLATFORM_VERSION.protocol_version;
3070

3171
let response = proto::ResponseInfo {
3272
data: "".to_string(),
3373
app_version: desired_protocol_version as u64,
34-
last_block_height: platform_state.last_committed_block_height() as i64,
74+
last_block_height,
3575
version: env!("CARGO_PKG_VERSION").to_string(),
36-
last_block_app_hash: state_app_hash.clone(),
76+
last_block_app_hash: platform_state_app_hash.to_vec(),
3777
};
3878

3979
tracing::debug!(
4080
desired_protocol_version,
4181
software_version = env!("CARGO_PKG_VERSION"),
4282
block_version = request.block_version,
4383
p2p_version = request.p2p_version,
44-
app_hash = hex::encode(state_app_hash),
45-
height = platform_state.last_committed_block_height(),
84+
app_hash = hex::encode(platform_state_app_hash),
85+
last_block_height,
4686
"Handshake with consensus engine",
4787
);
4888

packages/rs-drive-abci/src/abci/handler/prepare_proposal.rs

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ use crate::platform_types::platform_state::v0::PlatformStateV0Methods;
1111
use crate::platform_types::state_transitions_processing_result::StateTransitionExecutionResult;
1212
use crate::rpc::core::CoreRPCLike;
1313
use dpp::dashcore::hashes::Hash;
14+
use dpp::dashcore::Network;
1415
use dpp::version::TryIntoPlatformVersioned;
1516
use drive::grovedb_storage::Error::RocksDBError;
1617
use tenderdash_abci::proto::abci as proto;
@@ -35,6 +36,48 @@ where
3536

3637
let platform_state = app.platform().state.load();
3738

39+
// Verify that Platform State corresponds to Drive committed state
40+
let platform_state_app_hash = platform_state
41+
.last_committed_block_app_hash()
42+
.unwrap_or_default();
43+
44+
let grove_version = &platform_state
45+
.current_platform_version()?
46+
.drive
47+
.grove_version;
48+
49+
let drive_storage_root_hash = app
50+
.platform()
51+
.drive
52+
.grove
53+
.root_hash(None, grove_version)
54+
.unwrap()?;
55+
56+
// We had a sequence of errors on the mainnet starting at block 32326.
57+
// We got RocksDB's "transaction is busy" error because of a bug (https://github.com/dashpay/platform/pull/2309).
58+
// Due to another bug in Tenderdash (https://github.com/dashpay/tenderdash/pull/966),
59+
// validators just proceeded to the next block partially committing the state and updating the cache.
60+
// Full nodes got stuck and proceeded only after a re-sync.
61+
// For the mainnet chain, we enable these fixes at the block when we consider the state is consistent.
62+
let config = &app.platform().config;
63+
64+
#[allow(clippy::collapsible_if)]
65+
if !(config.network == Network::Dash
66+
&& config.abci.chain_id == "evo1"
67+
&& request.height < 33000)
68+
{
69+
// App hash in memory must be equal to app hash on disk
70+
if drive_storage_root_hash != platform_state_app_hash {
71+
// We panic because we can't recover from this situation.
72+
// Better to restart the Drive, so we might self-heal the node
73+
// reloading state from the disk
74+
panic!(
75+
"drive and platform state app hash mismatch: drive_storage_root_hash: {:?}, platform_state_app_hash: {:?}",
76+
drive_storage_root_hash, platform_state_app_hash
77+
);
78+
}
79+
}
80+
3881
let last_committed_core_height = platform_state.last_committed_core_height();
3982

4083
let starting_platform_version = platform_state.current_platform_version()?;

packages/rs-drive-abci/src/abci/handler/process_proposal.rs

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ use crate::platform_types::block_execution_outcome;
1212
use crate::platform_types::platform_state::v0::PlatformStateV0Methods;
1313
use crate::platform_types::state_transitions_processing_result::StateTransitionExecutionResult;
1414
use crate::rpc::core::CoreRPCLike;
15+
use dpp::dashcore::Network;
1516
use dpp::version::TryIntoPlatformVersioned;
1617
use drive::grovedb_storage::Error::RocksDBError;
1718
use tenderdash_abci::proto::abci as proto;
@@ -179,6 +180,48 @@ where
179180

180181
let platform_state = app.platform().state.load();
181182

183+
// Verify that Platform State corresponds to Drive committed state
184+
let platform_state_app_hash = platform_state
185+
.last_committed_block_app_hash()
186+
.unwrap_or_default();
187+
188+
let grove_version = &platform_state
189+
.current_platform_version()?
190+
.drive
191+
.grove_version;
192+
193+
let drive_storage_root_hash = app
194+
.platform()
195+
.drive
196+
.grove
197+
.root_hash(None, grove_version)
198+
.unwrap()?;
199+
200+
// We had a sequence of errors on the mainnet starting at block 32326.
201+
// We got RocksDB's "transaction is busy" error because of a bug (https://github.com/dashpay/platform/pull/2309).
202+
// Due to another bug in Tenderdash (https://github.com/dashpay/tenderdash/pull/966),
203+
// validators just proceeded to the next block partially committing the state and updating the cache.
204+
// Full nodes got stuck and proceeded only after a re-sync.
205+
// For the mainnet chain, we enable these fixes at the block when we consider the state is consistent.
206+
let config = &app.platform().config;
207+
208+
#[allow(clippy::collapsible_if)]
209+
if !(app.platform().config.network == Network::Dash
210+
&& config.abci.chain_id == "evo1"
211+
&& request.height < 33000)
212+
{
213+
// App hash in memory must be equal to app hash on disk
214+
if drive_storage_root_hash != platform_state_app_hash {
215+
// We panic because we can't recover from this situation.
216+
// Better to restart the Drive, so we might self-heal the node
217+
// reloading state from the disk
218+
panic!(
219+
"drive and platform state app hash mismatch: drive_storage_root_hash: {:?}, platform_state_app_hash: {:?}",
220+
drive_storage_root_hash, platform_state_app_hash
221+
);
222+
}
223+
}
224+
182225
let starting_platform_version = platform_state.current_platform_version()?;
183226

184227
// Running the proposal executes all the state transitions for the block

0 commit comments

Comments
 (0)