From 3bd185f82481b070e3f8431aef38c92221a081f3 Mon Sep 17 00:00:00 2001 From: wileyj <2847772+wileyj@users.noreply.github.com> Date: Mon, 12 Aug 2024 17:32:11 -0700 Subject: [PATCH 01/13] Adding changes for 2.5.0.0.6 --- CHANGELOG.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index eb6061cb9d..2cca1d273a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,27 @@ and this project adheres to the versioning scheme outlined in the [README.md](RE - `get-tenure-info?` added - `get-block-info?` removed +## [2.5.0.0.7] + +### Added + +feat: Neon mock miner replay (#5060) +chore: add warn logs for block validate rejections (#5079) + +### Changed + +- bugfix/boot_to_epoch_3 in SignerTest should wait for a new commit (#5087) +- Fix: Revert BurnchainHeaderHash serialization change (#5094) +- Feat/mock signing revamp (#5070) +- Fix block proposal rejection test (#5084) +- Fix/multi miner fixes jude (#5040) + +## [2.5.0.0.6] + +### Changed + +- If there is a getchunk/putchunk that fails due to a stale (or future) version NACK, the StackerDB sync state machine should immediately retry sync (#5066) + ## [2.5.0.0.5] ### Added From 88c95c6817f3412d5dc7b41c3d0dc8a9b3070ed3 Mon Sep 17 00:00:00 2001 From: wileyj <2847772+wileyj@users.noreply.github.com> Date: Wed, 28 Aug 2024 10:51:47 -0700 Subject: [PATCH 02/13] Update changelog for release --- CHANGELOG.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2cca1d273a..0f9a419f1d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,16 +21,16 @@ and this project adheres to the versioning scheme outlined in the [README.md](RE ### Added -feat: Neon mock miner replay (#5060) -chore: add warn logs for block validate rejections (#5079) +- Neon mock miner replay (#5060) +- Add warn logs for block validate rejections (#5079) ### Changed -- bugfix/boot_to_epoch_3 in SignerTest should wait for a new commit (#5087) -- Fix: Revert BurnchainHeaderHash serialization change (#5094) -- Feat/mock signing revamp (#5070) +- boot_to_epoch_3 in SignerTest should wait for a new commit (#5087) +- Revert BurnchainHeaderHash serialization change (#5094) +- Mock signing revamp (#5070) - Fix block proposal rejection test (#5084) -- Fix/multi miner fixes jude (#5040) +- Multi miner fixes jude (#5040) ## [2.5.0.0.6] From 1584e9c422782e6e5e3f342f2e334f0e3d9e41b1 Mon Sep 17 00:00:00 2001 From: wileyj <2847772+wileyj@users.noreply.github.com> Date: Wed, 28 Aug 2024 10:52:51 -0700 Subject: [PATCH 03/13] Update changelog for release --- CHANGELOG.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0f9a419f1d..2add5b99f6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,15 +21,15 @@ and this project adheres to the versioning scheme outlined in the [README.md](RE ### Added -- Neon mock miner replay (#5060) - Add warn logs for block validate rejections (#5079) +- Neon mock miner replay (#5060) ### Changed -- boot_to_epoch_3 in SignerTest should wait for a new commit (#5087) - Revert BurnchainHeaderHash serialization change (#5094) -- Mock signing revamp (#5070) +- boot_to_epoch_3 in SignerTest should wait for a new commit (#5087) - Fix block proposal rejection test (#5084) +- Mock signing revamp (#5070) - Multi miner fixes jude (#5040) ## [2.5.0.0.6] From 9eef4066cce42414a0d0690de4da23027b8292ba Mon Sep 17 00:00:00 2001 From: Jude Nelson Date: Wed, 28 Aug 2024 14:49:59 -0400 Subject: [PATCH 04/13] chore: remove spurious deadlock condition arising from needlessly opening a transaction whenever we open the sortition DB --- stackslib/src/chainstate/burn/db/sortdb.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/stackslib/src/chainstate/burn/db/sortdb.rs b/stackslib/src/chainstate/burn/db/sortdb.rs index 15a3bf5641..b538ae17fa 100644 --- a/stackslib/src/chainstate/burn/db/sortdb.rs +++ b/stackslib/src/chainstate/burn/db/sortdb.rs @@ -3456,6 +3456,14 @@ impl SortitionDB { SortitionDB::apply_schema_9(&tx.deref(), epochs)?; tx.commit()?; } else if version == expected_version { + // this transaction is almost never needed + let validated_epochs = StacksEpoch::validate_epochs(epochs); + let existing_epochs = Self::get_stacks_epochs(self.conn())?; + if existing_epochs == validated_epochs { + return Ok(()); + } + + // epochs are out of date let tx = self.tx_begin()?; SortitionDB::validate_and_replace_epochs(&tx, epochs)?; tx.commit()?; From afdaaf6b04d72b1d3c58f42bf7644b975ef1ded1 Mon Sep 17 00:00:00 2001 From: wileyj <2847772+wileyj@users.noreply.github.com> Date: Wed, 4 Sep 2024 10:47:07 -0700 Subject: [PATCH 05/13] Add deadlock fix addtion to changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2add5b99f6..5302fc60c2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,6 +31,7 @@ and this project adheres to the versioning scheme outlined in the [README.md](RE - Fix block proposal rejection test (#5084) - Mock signing revamp (#5070) - Multi miner fixes jude (#5040) +- Remove spurious deadlock condition whenever the sortition DB is opened ## [2.5.0.0.6] From 47c562a9d586a325a28629b5b9ede0203d0be8c5 Mon Sep 17 00:00:00 2001 From: Jude Nelson Date: Thu, 5 Sep 2024 22:20:25 -0400 Subject: [PATCH 06/13] fix: make a stackerdb shrink if its signer list becomes smaller than that in the DB --- stackslib/src/net/stackerdb/db.rs | 11 +++ stackslib/src/net/stackerdb/mod.rs | 8 ++- stackslib/src/net/stackerdb/tests/db.rs | 96 +++++++++++++++++++++++++ 3 files changed, 114 insertions(+), 1 deletion(-) diff --git a/stackslib/src/net/stackerdb/db.rs b/stackslib/src/net/stackerdb/db.rs index 1dab3f4052..2b735668ac 100644 --- a/stackslib/src/net/stackerdb/db.rs +++ b/stackslib/src/net/stackerdb/db.rs @@ -293,6 +293,15 @@ impl<'a> StackerDBTx<'a> { Ok(()) } + /// Shrink a StackerDB. Remove all slots at and beyond a particular slot ID. + fn shrink_stackerdb(&self, stackerdb_id: i64, first_slot_id: u32) -> Result<(), net_error> { + let qry = "DELETE FROM chunks WHERE stackerdb_id = ?1 AND slot_id >= ?2"; + let args = params![&stackerdb_id, &first_slot_id]; + let mut stmt = self.sql_tx.prepare(&qry)?; + stmt.execute(args)?; + Ok(()) + } + /// Update a database's storage slots, e.g. from new configuration state in its smart contract. /// Chunk data for slots that no longer exist will be dropped. /// Newly-created slots will be instantiated with empty data. @@ -343,6 +352,8 @@ impl<'a> StackerDBTx<'a> { stmt.execute(args)?; } } + debug!("Shrink {} to {} slots", smart_contract, total_slots_read); + self.shrink_stackerdb(stackerdb_id, total_slots_read)?; Ok(()) } diff --git a/stackslib/src/net/stackerdb/mod.rs b/stackslib/src/net/stackerdb/mod.rs index d310998a19..ea01b1b22a 100644 --- a/stackslib/src/net/stackerdb/mod.rs +++ b/stackslib/src/net/stackerdb/mod.rs @@ -341,8 +341,14 @@ impl StackerDBs { &e ); } - } else if new_config != stackerdb_config && new_config.signers.len() > 0 { + } else if (new_config != stackerdb_config && new_config.signers.len() > 0) + || (new_config == stackerdb_config + && new_config.signers.len() + != self.get_slot_versions(&stackerdb_contract_id)?.len()) + { // only reconfigure if the config has changed + // (that second check on the length is needed in case the node is a victim of + // #5142, which was a bug whereby a stackerdb could never shrink) if let Err(e) = self.reconfigure_stackerdb(&stackerdb_contract_id, &new_config) { warn!( "Failed to create or reconfigure StackerDB {stackerdb_contract_id}: DB error {:?}", diff --git a/stackslib/src/net/stackerdb/tests/db.rs b/stackslib/src/net/stackerdb/tests/db.rs index 7371b6b9c5..9bcf800529 100644 --- a/stackslib/src/net/stackerdb/tests/db.rs +++ b/stackslib/src/net/stackerdb/tests/db.rs @@ -20,6 +20,7 @@ use std::path::Path; use clarity::vm::types::QualifiedContractIdentifier; use clarity::vm::ContractName; use libstackerdb::SlotMetadata; +use rusqlite::params; use stacks_common::address::{ AddressHashMode, C32_ADDRESS_VERSION_MAINNET_MULTISIG, C32_ADDRESS_VERSION_MAINNET_SINGLESIG, }; @@ -649,6 +650,16 @@ fn test_reconfigure_stackerdb() { initial_metadata.push((slot_metadata, chunk_data)); } + tx.commit().unwrap(); + + let db_slot_metadata = db.get_db_slot_metadata(&sc).unwrap(); + assert_eq!(db_slot_metadata.len(), pks.len()); + for (i, slot_md) in db_slot_metadata.iter().enumerate() { + let slot_metadata = db.get_slot_metadata(&sc, i as u32).unwrap().unwrap(); + assert_eq!(slot_metadata, *slot_md); + } + + let tx = db.tx_begin(StackerDBConfig::noop()).unwrap(); let new_pks: Vec<_> = (0..10).map(|_| StacksPrivateKey::new()).collect(); let reconfigured_pks = vec![ // first five slots are unchanged @@ -722,6 +733,91 @@ fn test_reconfigure_stackerdb() { assert_eq!(chunk.len(), 0); } } + + let db_slot_metadata = db.get_db_slot_metadata(&sc).unwrap(); + assert_eq!(db_slot_metadata.len(), reconfigured_pks.len()); + for (i, slot_md) in db_slot_metadata.iter().enumerate() { + let slot_metadata = db.get_slot_metadata(&sc, i as u32).unwrap().unwrap(); + assert_eq!(slot_metadata, *slot_md); + } + + // reconfigure with fewer slots + let new_pks: Vec<_> = (0..10).map(|_| StacksPrivateKey::new()).collect(); + let reconfigured_pks = vec![ + // first five slots are unchanged + pks[0], pks[1], pks[2], pks[3], pks[4], + // next five slots are different, so their contents will be dropped and versions and write + // timestamps reset + new_pks[0], new_pks[1], new_pks[2], new_pks[3], + new_pks[4], + // slots 10-15 will disappear + ]; + let reconfigured_addrs: Vec<_> = reconfigured_pks + .iter() + .map(|pk| { + StacksAddress::from_public_keys( + C32_ADDRESS_VERSION_MAINNET_SINGLESIG, + &AddressHashMode::SerializeP2PKH, + 1, + &vec![StacksPublicKey::from_private(&pk)], + ) + .unwrap() + }) + .collect(); + + let tx = db.tx_begin(StackerDBConfig::noop()).unwrap(); + + // reconfigure + tx.reconfigure_stackerdb( + &sc, + &reconfigured_addrs + .clone() + .into_iter() + .map(|addr| (addr, 1)) + .collect::>(), + ) + .unwrap(); + + tx.commit().unwrap(); + + for (i, pk) in new_pks.iter().enumerate() { + if i < 5 { + // first five are unchanged + let chunk_data = StackerDBChunkData { + slot_id: i as u32, + slot_version: 1, + sig: MessageSignature::empty(), + data: vec![i as u8; 128], + }; + + let slot_metadata = db.get_slot_metadata(&sc, i as u32).unwrap().unwrap(); + let chunk = db.get_latest_chunk(&sc, i as u32).unwrap().unwrap(); + + assert_eq!(initial_metadata[i].0, slot_metadata); + assert_eq!(initial_metadata[i].1.data, chunk); + } else if i < 10 { + // next five are wiped + let slot_metadata = db.get_slot_metadata(&sc, i as u32).unwrap().unwrap(); + assert_eq!(slot_metadata.slot_id, i as u32); + assert_eq!(slot_metadata.slot_version, 0); + assert_eq!(slot_metadata.data_hash, Sha512Trunc256Sum([0x00; 32])); + assert_eq!(slot_metadata.signature, MessageSignature::empty()); + + let chunk = db.get_latest_chunk(&sc, i as u32).unwrap().unwrap(); + assert_eq!(chunk.len(), 0); + } else { + // final five are gone + let slot_metadata_opt = db.get_slot_metadata(&sc, i as u32).unwrap(); + assert!(slot_metadata_opt.is_none()); + } + } + + let db_slot_metadata = db.get_db_slot_metadata(&sc).unwrap(); + assert_eq!(db_slot_metadata.len(), reconfigured_pks.len()); + for (i, slot_md) in db_slot_metadata.iter().enumerate() { + let slot_metadata = db.get_slot_metadata(&sc, i as u32).unwrap().unwrap(); + assert_eq!(slot_metadata, *slot_md); + } } // TODO: max chunk size From 47436e7ffed5dadff50739cd5ffbdcd962f2cb84 Mon Sep 17 00:00:00 2001 From: Jeff Bencin Date: Tue, 3 Sep 2024 17:17:35 -0400 Subject: [PATCH 07/13] fix: Improve logging in rusqlite busy handler, and fail eventually if probable deadlock detected --- stackslib/src/util_lib/db.rs | 54 ++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/stackslib/src/util_lib/db.rs b/stackslib/src/util_lib/db.rs index f54a9c97ec..be09de2556 100644 --- a/stackslib/src/util_lib/db.rs +++ b/stackslib/src/util_lib/db.rs @@ -655,31 +655,44 @@ impl<'a, C: Clone, T: MarfTrieId> DerefMut for IndexDBTx<'a, C, T> { } } +/// Called by `rusqlite` if we are waiting too long on a database lock pub fn tx_busy_handler(run_count: i32) -> bool { - let mut sleep_count = 2; - if run_count > 0 { - sleep_count = 2u64.saturating_pow(run_count as u32); + const TIMEOUT: Duration = Duration::from_secs(60); + const AVG_SLEEP_TIME_MS: u64 = 100; + + // First, check if this is taking unreasonably long. If so, it's probably a deadlock + let run_count = run_count.unsigned_abs(); + let approx_time_elapsed = + Duration::from_millis(AVG_SLEEP_TIME_MS.saturating_mul(u64::from(run_count))); + if approx_time_elapsed > TIMEOUT { + error!("Probable deadlock detected. Waited {} seconds (estimated) for database lock. Giving up", approx_time_elapsed.as_secs(); + "run_count" => run_count, + "backtrace" => ?Backtrace::capture() + ); + return false; } - sleep_count = sleep_count.saturating_add(thread_rng().gen::() % sleep_count); - if sleep_count > 100 { - let jitter = thread_rng().gen::() % 20; - sleep_count = 100 - jitter; + let mut sleep_time_ms = 2u64.saturating_pow(run_count); + + sleep_time_ms = sleep_time_ms.saturating_add(thread_rng().gen_range(0..sleep_time_ms)); + + if sleep_time_ms > AVG_SLEEP_TIME_MS { + let bound = 10; + let jitter = thread_rng().gen_range(0..bound * 2); + sleep_time_ms = (AVG_SLEEP_TIME_MS - bound) + jitter; } - debug!( - "Database is locked; sleeping {}ms and trying again", - &sleep_count; - "backtrace" => ?{ - if run_count > 10 && run_count % 10 == 0 { - Some(Backtrace::capture()) - } else { - None - } - }, - ); + let msg = format!("Database is locked; sleeping {sleep_time_ms}ms and trying again"); + if run_count > 10 && run_count % 10 == 0 { + warn!("{msg}"; + "run_count" => run_count, + "backtrace" => ?Backtrace::capture() + ); + } else { + debug!("{msg}"); + } - sleep_ms(sleep_count); + sleep_ms(sleep_time_ms); true } @@ -696,8 +709,7 @@ pub fn tx_begin_immediate<'a>(conn: &'a mut Connection) -> Result, Erro /// Sames as `tx_begin_immediate` except that it returns a rusqlite error. pub fn tx_begin_immediate_sqlite<'a>(conn: &'a mut Connection) -> Result, sqlite_error> { conn.busy_handler(Some(tx_busy_handler))?; - let tx = Transaction::new(conn, TransactionBehavior::Immediate)?; - Ok(tx) + Transaction::new(conn, TransactionBehavior::Immediate) } #[cfg(feature = "profile-sqlite")] From 3b24bd34c15f00ae0549a69717d5e2624ea816e0 Mon Sep 17 00:00:00 2001 From: Jeff Bencin Date: Wed, 4 Sep 2024 13:24:07 -0400 Subject: [PATCH 08/13] chore: Address PR comment from Brice --- stackslib/src/util_lib/db.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/stackslib/src/util_lib/db.rs b/stackslib/src/util_lib/db.rs index be09de2556..4728b83e73 100644 --- a/stackslib/src/util_lib/db.rs +++ b/stackslib/src/util_lib/db.rs @@ -677,9 +677,9 @@ pub fn tx_busy_handler(run_count: i32) -> bool { sleep_time_ms = sleep_time_ms.saturating_add(thread_rng().gen_range(0..sleep_time_ms)); if sleep_time_ms > AVG_SLEEP_TIME_MS { - let bound = 10; - let jitter = thread_rng().gen_range(0..bound * 2); - sleep_time_ms = (AVG_SLEEP_TIME_MS - bound) + jitter; + let jitter = 10; + sleep_time_ms = + thread_rng().gen_range((AVG_SLEEP_TIME_MS - jitter)..(AVG_SLEEP_TIME_MS + jitter)); } let msg = format!("Database is locked; sleeping {sleep_time_ms}ms and trying again"); From 8dcfadf8c205c720a3b16086c195c9c2617bcf4e Mon Sep 17 00:00:00 2001 From: Jeff Bencin Date: Thu, 5 Sep 2024 08:56:27 -0400 Subject: [PATCH 09/13] chore: Update the `tx_busy_handler()` in stacks-common also --- clarity/src/vm/database/sqlite.rs | 2 +- stacks-common/src/util/db.rs | 64 +++++++++++++++++++++++++++++++ stacks-common/src/util/mod.rs | 27 +------------ stackslib/src/util_lib/db.rs | 38 +----------------- 4 files changed, 67 insertions(+), 64 deletions(-) create mode 100644 stacks-common/src/util/db.rs diff --git a/clarity/src/vm/database/sqlite.rs b/clarity/src/vm/database/sqlite.rs index dc3ad4f5bd..7d2af59eb5 100644 --- a/clarity/src/vm/database/sqlite.rs +++ b/clarity/src/vm/database/sqlite.rs @@ -21,7 +21,7 @@ use rusqlite::{ }; use stacks_common::types::chainstate::{BlockHeaderHash, StacksBlockId}; use stacks_common::types::sqlite::NO_PARAMS; -use stacks_common::util::db_common::tx_busy_handler; +use stacks_common::util::db::tx_busy_handler; use stacks_common::util::hash::Sha512Trunc256Sum; use super::clarity_store::{make_contract_hash_key, ContractCommitment}; diff --git a/stacks-common/src/util/db.rs b/stacks-common/src/util/db.rs new file mode 100644 index 0000000000..29ebdd0bb0 --- /dev/null +++ b/stacks-common/src/util/db.rs @@ -0,0 +1,64 @@ +// Copyright (C) 2013-2020 Blockstack PBC, a public benefit corporation +// Copyright (C) 2020 Stacks Open Internet Foundation +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + +use std::backtrace::Backtrace; +use std::time::Duration; + +use rand::{thread_rng, Rng}; + +use crate::util::sleep_ms; + +/// Called by `rusqlite` if we are waiting too long on a database lock +/// If called too many times, will fail to avoid deadlocks +pub fn tx_busy_handler(run_count: i32) -> bool { + const TIMEOUT: Duration = Duration::from_secs(60); + const AVG_SLEEP_TIME_MS: u64 = 100; + + // First, check if this is taking unreasonably long. If so, it's probably a deadlock + let run_count = run_count.unsigned_abs(); + let approx_time_elapsed = + Duration::from_millis(AVG_SLEEP_TIME_MS.saturating_mul(u64::from(run_count))); + if approx_time_elapsed > TIMEOUT { + error!("Probable deadlock detected. Waited {} seconds (estimated) for database lock. Giving up", approx_time_elapsed.as_secs(); + "run_count" => run_count, + "backtrace" => ?Backtrace::capture() + ); + return false; + } + + let mut sleep_time_ms = 2u64.saturating_pow(run_count); + + sleep_time_ms = sleep_time_ms.saturating_add(thread_rng().gen_range(0..sleep_time_ms)); + + if sleep_time_ms > AVG_SLEEP_TIME_MS { + let jitter = 10; + sleep_time_ms = + thread_rng().gen_range((AVG_SLEEP_TIME_MS - jitter)..(AVG_SLEEP_TIME_MS + jitter)); + } + + let msg = format!("Database is locked; sleeping {sleep_time_ms}ms and trying again"); + if run_count > 10 && run_count % 10 == 0 { + warn!("{msg}"; + "run_count" => run_count, + "backtrace" => ?Backtrace::capture() + ); + } else { + debug!("{msg}"); + } + + sleep_ms(sleep_time_ms); + true +} diff --git a/stacks-common/src/util/mod.rs b/stacks-common/src/util/mod.rs index 13ab79dcb3..a9dfc47806 100644 --- a/stacks-common/src/util/mod.rs +++ b/stacks-common/src/util/mod.rs @@ -19,6 +19,7 @@ pub mod log; #[macro_use] pub mod macros; pub mod chunked_encoding; +pub mod db; pub mod hash; pub mod pair; pub mod pipe; @@ -85,32 +86,6 @@ impl error::Error for HexError { } } -pub mod db_common { - use std::{thread, time}; - - use rand::{thread_rng, Rng}; - - pub fn tx_busy_handler(run_count: i32) -> bool { - let mut sleep_count = 10; - if run_count > 0 { - sleep_count = 2u64.saturating_pow(run_count as u32); - } - sleep_count = sleep_count.saturating_add(thread_rng().gen::() % sleep_count); - - if sleep_count > 5000 { - sleep_count = 5000; - } - - debug!( - "Database is locked; sleeping {}ms and trying again", - &sleep_count - ); - - thread::sleep(time::Duration::from_millis(sleep_count)); - true - } -} - /// Write any `serde_json` object directly to a file pub fn serialize_json_to_file(json: &J, path: P) -> Result<(), std::io::Error> where diff --git a/stackslib/src/util_lib/db.rs b/stackslib/src/util_lib/db.rs index 4728b83e73..a0496d3bfc 100644 --- a/stackslib/src/util_lib/db.rs +++ b/stackslib/src/util_lib/db.rs @@ -657,43 +657,7 @@ impl<'a, C: Clone, T: MarfTrieId> DerefMut for IndexDBTx<'a, C, T> { /// Called by `rusqlite` if we are waiting too long on a database lock pub fn tx_busy_handler(run_count: i32) -> bool { - const TIMEOUT: Duration = Duration::from_secs(60); - const AVG_SLEEP_TIME_MS: u64 = 100; - - // First, check if this is taking unreasonably long. If so, it's probably a deadlock - let run_count = run_count.unsigned_abs(); - let approx_time_elapsed = - Duration::from_millis(AVG_SLEEP_TIME_MS.saturating_mul(u64::from(run_count))); - if approx_time_elapsed > TIMEOUT { - error!("Probable deadlock detected. Waited {} seconds (estimated) for database lock. Giving up", approx_time_elapsed.as_secs(); - "run_count" => run_count, - "backtrace" => ?Backtrace::capture() - ); - return false; - } - - let mut sleep_time_ms = 2u64.saturating_pow(run_count); - - sleep_time_ms = sleep_time_ms.saturating_add(thread_rng().gen_range(0..sleep_time_ms)); - - if sleep_time_ms > AVG_SLEEP_TIME_MS { - let jitter = 10; - sleep_time_ms = - thread_rng().gen_range((AVG_SLEEP_TIME_MS - jitter)..(AVG_SLEEP_TIME_MS + jitter)); - } - - let msg = format!("Database is locked; sleeping {sleep_time_ms}ms and trying again"); - if run_count > 10 && run_count % 10 == 0 { - warn!("{msg}"; - "run_count" => run_count, - "backtrace" => ?Backtrace::capture() - ); - } else { - debug!("{msg}"); - } - - sleep_ms(sleep_time_ms); - true + stacks_common::util::db::tx_busy_handler(run_count) } /// Begin an immediate-mode transaction, and handle busy errors with exponential backoff. From 021ab583d2bbfde9e8e93de5b6674a7bc62d6091 Mon Sep 17 00:00:00 2001 From: Jeff Bencin Date: Wed, 11 Sep 2024 12:31:31 -0400 Subject: [PATCH 10/13] chore: Add `LOCK_TABLE` to print out all DB locks if deadlock detected --- stacks-common/src/util/db.rs | 29 ++++++++++++++++++++++++----- stackslib/src/util_lib/db.rs | 10 ++++++++-- 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/stacks-common/src/util/db.rs b/stacks-common/src/util/db.rs index 29ebdd0bb0..fff39a3297 100644 --- a/stacks-common/src/util/db.rs +++ b/stacks-common/src/util/db.rs @@ -15,16 +15,32 @@ // along with this program. If not, see . use std::backtrace::Backtrace; -use std::time::Duration; +use std::sync::{LazyLock, Mutex}; +use std::thread; +use std::time::{Duration, Instant}; +use hashbrown::HashMap; use rand::{thread_rng, Rng}; use crate::util::sleep_ms; +/// Keep track of DB locks, for deadlock debugging +/// - **key:** `rusqlite::Connection` debug print +/// - **value:** Lock holder (thread name + timestamp) +/// +/// This uses a `Mutex` inside of `LazyLock` because: +/// - Using `Mutex` alone, it can't be statically initialized because `HashMap::new()` isn't `const` +/// - Using `LazyLock` alone doesn't allow interior mutability +pub static LOCK_TABLE: LazyLock>> = + LazyLock::new(|| Mutex::new(HashMap::new())); +/// Generate timestanps for use in `LOCK_TABLE` +/// `Instant` is preferable to `SystemTime` because it uses `CLOCK_MONOTONIC` and is not affected by NTP adjustments +pub static LOCK_TABLE_TIMER: LazyLock = LazyLock::new(Instant::now); + /// Called by `rusqlite` if we are waiting too long on a database lock -/// If called too many times, will fail to avoid deadlocks +/// If called too many times, will assume a deadlock and panic pub fn tx_busy_handler(run_count: i32) -> bool { - const TIMEOUT: Duration = Duration::from_secs(60); + const TIMEOUT: Duration = Duration::from_secs(300); const AVG_SLEEP_TIME_MS: u64 = 100; // First, check if this is taking unreasonably long. If so, it's probably a deadlock @@ -32,11 +48,14 @@ pub fn tx_busy_handler(run_count: i32) -> bool { let approx_time_elapsed = Duration::from_millis(AVG_SLEEP_TIME_MS.saturating_mul(u64::from(run_count))); if approx_time_elapsed > TIMEOUT { - error!("Probable deadlock detected. Waited {} seconds (estimated) for database lock. Giving up", approx_time_elapsed.as_secs(); + error!("Deadlock detected. Waited {} seconds (estimated) for database lock. Giving up", approx_time_elapsed.as_secs(); "run_count" => run_count, "backtrace" => ?Backtrace::capture() ); - return false; + for (k, v) in LOCK_TABLE.lock().unwrap().iter() { + error!("Database '{k}' last locked by {v}"); + } + panic!("Deadlock in thread {:?}", thread::current().name()); } let mut sleep_time_ms = 2u64.saturating_pow(run_count); diff --git a/stackslib/src/util_lib/db.rs b/stackslib/src/util_lib/db.rs index a0496d3bfc..70850d372c 100644 --- a/stackslib/src/util_lib/db.rs +++ b/stackslib/src/util_lib/db.rs @@ -18,7 +18,7 @@ use std::backtrace::Backtrace; use std::io::Error as IOError; use std::ops::{Deref, DerefMut}; use std::path::{Path, PathBuf}; -use std::time::Duration; +use std::time::{Duration, SystemTime}; use std::{error, fmt, fs, io}; use clarity::vm::types::QualifiedContractIdentifier; @@ -32,6 +32,7 @@ use serde_json::Error as serde_error; use stacks_common::types::chainstate::{SortitionId, StacksAddress, StacksBlockId, TrieHash}; use stacks_common::types::sqlite::NO_PARAMS; use stacks_common::types::Address; +use stacks_common::util::db::{LOCK_TABLE, LOCK_TABLE_TIMER}; use stacks_common::util::hash::to_hex; use stacks_common::util::secp256k1::{Secp256k1PrivateKey, Secp256k1PublicKey}; use stacks_common::util::sleep_ms; @@ -673,7 +674,12 @@ pub fn tx_begin_immediate<'a>(conn: &'a mut Connection) -> Result, Erro /// Sames as `tx_begin_immediate` except that it returns a rusqlite error. pub fn tx_begin_immediate_sqlite<'a>(conn: &'a mut Connection) -> Result, sqlite_error> { conn.busy_handler(Some(tx_busy_handler))?; - Transaction::new(conn, TransactionBehavior::Immediate) + let tx = Transaction::new(conn, TransactionBehavior::Immediate)?; + let time = LOCK_TABLE_TIMER.elapsed().as_millis(); + let k = format!("{:?}", tx.deref()); + let v = format!("{:?}@{time}", std::thread::current().name()); + LOCK_TABLE.lock().unwrap().insert(k, v); + Ok(tx) } #[cfg(feature = "profile-sqlite")] From 67d0f308edd39a7270ea327c92c510fe59fd27ff Mon Sep 17 00:00:00 2001 From: Jeff Bencin Date: Wed, 11 Sep 2024 12:45:16 -0400 Subject: [PATCH 11/13] refactor: Move logic for updating `LOCK_TABLE` into stacks-common --- stacks-common/src/util/db.rs | 14 ++++++++++++-- stackslib/src/util_lib/db.rs | 7 ++----- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/stacks-common/src/util/db.rs b/stacks-common/src/util/db.rs index fff39a3297..257a98aab9 100644 --- a/stacks-common/src/util/db.rs +++ b/stacks-common/src/util/db.rs @@ -21,6 +21,7 @@ use std::time::{Duration, Instant}; use hashbrown::HashMap; use rand::{thread_rng, Rng}; +use rusqlite::Connection; use crate::util::sleep_ms; @@ -31,11 +32,20 @@ use crate::util::sleep_ms; /// This uses a `Mutex` inside of `LazyLock` because: /// - Using `Mutex` alone, it can't be statically initialized because `HashMap::new()` isn't `const` /// - Using `LazyLock` alone doesn't allow interior mutability -pub static LOCK_TABLE: LazyLock>> = +static LOCK_TABLE: LazyLock>> = LazyLock::new(|| Mutex::new(HashMap::new())); /// Generate timestanps for use in `LOCK_TABLE` /// `Instant` is preferable to `SystemTime` because it uses `CLOCK_MONOTONIC` and is not affected by NTP adjustments -pub static LOCK_TABLE_TIMER: LazyLock = LazyLock::new(Instant::now); +static LOCK_TABLE_TIMER: LazyLock = LazyLock::new(Instant::now); + +/// Call when using an operation which locks a database +/// Updates `LOCK_TABLE` +pub fn update_lock_table(conn: &Connection) { + let timestamp = LOCK_TABLE_TIMER.elapsed().as_millis(); + let k = format!("{conn:?}"); + let v = format!("{:?}@{timestamp}", std::thread::current().name()); + LOCK_TABLE.lock().unwrap().insert(k, v); +} /// Called by `rusqlite` if we are waiting too long on a database lock /// If called too many times, will assume a deadlock and panic diff --git a/stackslib/src/util_lib/db.rs b/stackslib/src/util_lib/db.rs index 70850d372c..53f597daa2 100644 --- a/stackslib/src/util_lib/db.rs +++ b/stackslib/src/util_lib/db.rs @@ -32,7 +32,7 @@ use serde_json::Error as serde_error; use stacks_common::types::chainstate::{SortitionId, StacksAddress, StacksBlockId, TrieHash}; use stacks_common::types::sqlite::NO_PARAMS; use stacks_common::types::Address; -use stacks_common::util::db::{LOCK_TABLE, LOCK_TABLE_TIMER}; +use stacks_common::util::db::update_lock_table; use stacks_common::util::hash::to_hex; use stacks_common::util::secp256k1::{Secp256k1PrivateKey, Secp256k1PublicKey}; use stacks_common::util::sleep_ms; @@ -675,10 +675,7 @@ pub fn tx_begin_immediate<'a>(conn: &'a mut Connection) -> Result, Erro pub fn tx_begin_immediate_sqlite<'a>(conn: &'a mut Connection) -> Result, sqlite_error> { conn.busy_handler(Some(tx_busy_handler))?; let tx = Transaction::new(conn, TransactionBehavior::Immediate)?; - let time = LOCK_TABLE_TIMER.elapsed().as_millis(); - let k = format!("{:?}", tx.deref()); - let v = format!("{:?}@{time}", std::thread::current().name()); - LOCK_TABLE.lock().unwrap().insert(k, v); + update_lock_table(tx.deref()); Ok(tx) } From 292cd8902286c3b9224d31c2050cff2de7049774 Mon Sep 17 00:00:00 2001 From: Jeff Bencin Date: Wed, 11 Sep 2024 12:50:04 -0400 Subject: [PATCH 12/13] chore: Add comment --- stacks-common/src/util/db.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/stacks-common/src/util/db.rs b/stacks-common/src/util/db.rs index 257a98aab9..89fe4677c7 100644 --- a/stacks-common/src/util/db.rs +++ b/stacks-common/src/util/db.rs @@ -42,8 +42,9 @@ static LOCK_TABLE_TIMER: LazyLock = LazyLock::new(Instant::now); /// Updates `LOCK_TABLE` pub fn update_lock_table(conn: &Connection) { let timestamp = LOCK_TABLE_TIMER.elapsed().as_millis(); + // The debug format for `Connection` includes the path let k = format!("{conn:?}"); - let v = format!("{:?}@{timestamp}", std::thread::current().name()); + let v = format!("{:?}@{timestamp}", thread::current().name()); LOCK_TABLE.lock().unwrap().insert(k, v); } From c77c024db6192cf8bbbac99249fd985418d246d5 Mon Sep 17 00:00:00 2001 From: Brice Dobry Date: Fri, 13 Sep 2024 10:52:47 -0400 Subject: [PATCH 13/13] chore: add info to signature gathering failure logs --- .../stacks-node/src/nakamoto_node/miner.rs | 40 ++++++++++++------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/testnet/stacks-node/src/nakamoto_node/miner.rs b/testnet/stacks-node/src/nakamoto_node/miner.rs index c3a1baea60..ef2a5e609d 100644 --- a/testnet/stacks-node/src/nakamoto_node/miner.rs +++ b/testnet/stacks-node/src/nakamoto_node/miner.rs @@ -358,22 +358,32 @@ impl BlockMinerThread { &mut attempts, ) { Ok(x) => x, - Err(e) => { - match e { - NakamotoNodeError::StacksTipChanged => { - info!("Stacks tip changed while waiting for signatures"); - return Err(e); - } - NakamotoNodeError::BurnchainTipChanged => { - info!("Burnchain tip changed while waiting for signatures"); - return Err(e); - } - _ => { - error!("Error while gathering signatures: {e:?}. Will try mining again."); - continue; - } + Err(e) => match e { + NakamotoNodeError::StacksTipChanged => { + info!("Stacks tip changed while waiting for signatures"; + "signer_sighash" => %new_block.header.signer_signature_hash(), + "block_height" => new_block.header.chain_length, + "consensus_hash" => %new_block.header.consensus_hash, + ); + return Err(e); } - } + NakamotoNodeError::BurnchainTipChanged => { + info!("Burnchain tip changed while waiting for signatures"; + "signer_sighash" => %new_block.header.signer_signature_hash(), + "block_height" => new_block.header.chain_length, + "consensus_hash" => %new_block.header.consensus_hash, + ); + return Err(e); + } + _ => { + error!("Error while gathering signatures: {e:?}. Will try mining again."; + "signer_sighash" => %new_block.header.signer_signature_hash(), + "block_height" => new_block.header.chain_length, + "consensus_hash" => %new_block.header.consensus_hash, + ); + continue; + } + }, }; new_block.header.signer_signature = signer_signature;