diff --git a/CHANGELOG.md b/CHANGELOG.md index eb6061cb9d..5302fc60c2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,28 @@ and this project adheres to the versioning scheme outlined in the [README.md](RE - `get-tenure-info?` added - `get-block-info?` removed +## [2.5.0.0.7] + +### Added + +- Add warn logs for block validate rejections (#5079) +- Neon mock miner replay (#5060) + +### Changed + +- Revert BurnchainHeaderHash serialization change (#5094) +- boot_to_epoch_3 in SignerTest should wait for a new commit (#5087) +- Fix block proposal rejection test (#5084) +- Mock signing revamp (#5070) +- Multi miner fixes jude (#5040) +- Remove spurious deadlock condition whenever the sortition DB is opened + +## [2.5.0.0.6] + +### Changed + +- If there is a getchunk/putchunk that fails due to a stale (or future) version NACK, the StackerDB sync state machine should immediately retry sync (#5066) + ## [2.5.0.0.5] ### Added diff --git a/clarity/src/vm/database/sqlite.rs b/clarity/src/vm/database/sqlite.rs index dc3ad4f5bd..7d2af59eb5 100644 --- a/clarity/src/vm/database/sqlite.rs +++ b/clarity/src/vm/database/sqlite.rs @@ -21,7 +21,7 @@ use rusqlite::{ }; use stacks_common::types::chainstate::{BlockHeaderHash, StacksBlockId}; use stacks_common::types::sqlite::NO_PARAMS; -use stacks_common::util::db_common::tx_busy_handler; +use stacks_common::util::db::tx_busy_handler; use stacks_common::util::hash::Sha512Trunc256Sum; use super::clarity_store::{make_contract_hash_key, ContractCommitment}; diff --git a/stacks-common/src/util/db.rs b/stacks-common/src/util/db.rs new file mode 100644 index 0000000000..89fe4677c7 --- /dev/null +++ b/stacks-common/src/util/db.rs @@ -0,0 +1,94 @@ +// Copyright (C) 2013-2020 Blockstack PBC, a public benefit corporation +// Copyright (C) 2020 Stacks Open Internet Foundation +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free 
Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see <http://www.gnu.org/licenses/>. + +use std::backtrace::Backtrace; +use std::sync::{LazyLock, Mutex}; +use std::thread; +use std::time::{Duration, Instant}; + +use hashbrown::HashMap; +use rand::{thread_rng, Rng}; +use rusqlite::Connection; + +use crate::util::sleep_ms; + +/// Keep track of DB locks, for deadlock debugging +/// - **key:** `rusqlite::Connection` debug print +/// - **value:** Lock holder (thread name + timestamp) +/// +/// This uses a `Mutex` inside of `LazyLock` because: +/// - Using `Mutex` alone, it can't be statically initialized because `HashMap::new()` isn't `const` +/// - Using `LazyLock` alone doesn't allow interior mutability +static LOCK_TABLE: LazyLock<Mutex<HashMap<String, String>>> = + LazyLock::new(|| Mutex::new(HashMap::new())); +/// Generate timestamps for use in `LOCK_TABLE` +/// `Instant` is preferable to `SystemTime` because it uses `CLOCK_MONOTONIC` and is not affected by NTP adjustments +static LOCK_TABLE_TIMER: LazyLock<Instant> = LazyLock::new(Instant::now); + +/// Call when using an operation which locks a database +/// Updates `LOCK_TABLE` +pub fn update_lock_table(conn: &Connection) { + let timestamp = LOCK_TABLE_TIMER.elapsed().as_millis(); + // The debug format for `Connection` includes the path + let k = format!("{conn:?}"); + let v = format!("{:?}@{timestamp}", thread::current().name()); + LOCK_TABLE.lock().unwrap().insert(k, v); +} + +/// Called by `rusqlite` if we are waiting too long on a database lock +/// If called too many times, will assume a deadlock and panic +pub fn tx_busy_handler(run_count: i32) -> bool { + const TIMEOUT: 
Duration = Duration::from_secs(300); + const AVG_SLEEP_TIME_MS: u64 = 100; + + // First, check if this is taking unreasonably long. If so, it's probably a deadlock + let run_count = run_count.unsigned_abs(); + let approx_time_elapsed = + Duration::from_millis(AVG_SLEEP_TIME_MS.saturating_mul(u64::from(run_count))); + if approx_time_elapsed > TIMEOUT { + error!("Deadlock detected. Waited {} seconds (estimated) for database lock. Giving up", approx_time_elapsed.as_secs(); + "run_count" => run_count, + "backtrace" => ?Backtrace::capture() + ); + for (k, v) in LOCK_TABLE.lock().unwrap().iter() { + error!("Database '{k}' last locked by {v}"); + } + panic!("Deadlock in thread {:?}", thread::current().name()); + } + + let mut sleep_time_ms = 2u64.saturating_pow(run_count); + + sleep_time_ms = sleep_time_ms.saturating_add(thread_rng().gen_range(0..sleep_time_ms)); + + if sleep_time_ms > AVG_SLEEP_TIME_MS { + let jitter = 10; + sleep_time_ms = + thread_rng().gen_range((AVG_SLEEP_TIME_MS - jitter)..(AVG_SLEEP_TIME_MS + jitter)); + } + + let msg = format!("Database is locked; sleeping {sleep_time_ms}ms and trying again"); + if run_count > 10 && run_count % 10 == 0 { + warn!("{msg}"; + "run_count" => run_count, + "backtrace" => ?Backtrace::capture() + ); + } else { + debug!("{msg}"); + } + + sleep_ms(sleep_time_ms); + true +} diff --git a/stacks-common/src/util/mod.rs b/stacks-common/src/util/mod.rs index 13ab79dcb3..a9dfc47806 100644 --- a/stacks-common/src/util/mod.rs +++ b/stacks-common/src/util/mod.rs @@ -19,6 +19,7 @@ pub mod log; #[macro_use] pub mod macros; pub mod chunked_encoding; +pub mod db; pub mod hash; pub mod pair; pub mod pipe; @@ -85,32 +86,6 @@ impl error::Error for HexError { } } -pub mod db_common { - use std::{thread, time}; - - use rand::{thread_rng, Rng}; - - pub fn tx_busy_handler(run_count: i32) -> bool { - let mut sleep_count = 10; - if run_count > 0 { - sleep_count = 2u64.saturating_pow(run_count as u32); - } - sleep_count = 
sleep_count.saturating_add(thread_rng().gen::<u64>() % sleep_count); - - if sleep_count > 5000 { - sleep_count = 5000; - } - - debug!( - "Database is locked; sleeping {}ms and trying again", - &sleep_count - ); - - thread::sleep(time::Duration::from_millis(sleep_count)); - true - } -} - /// Write any `serde_json` object directly to a file pub fn serialize_json_to_file<J, P>(json: &J, path: P) -> Result<(), std::io::Error> where diff --git a/stackslib/src/util_lib/db.rs index f54a9c97ec..53f597daa2 100644 --- a/stackslib/src/util_lib/db.rs +++ b/stackslib/src/util_lib/db.rs @@ -18,7 +18,7 @@ use std::backtrace::Backtrace; use std::io::Error as IOError; use std::ops::{Deref, DerefMut}; use std::path::{Path, PathBuf}; -use std::time::Duration; +use std::time::{Duration, SystemTime}; use std::{error, fmt, fs, io}; use clarity::vm::types::QualifiedContractIdentifier; @@ -32,6 +32,7 @@ use serde_json::Error as serde_error; use stacks_common::types::chainstate::{SortitionId, StacksAddress, StacksBlockId, TrieHash}; use stacks_common::types::sqlite::NO_PARAMS; use stacks_common::types::Address; +use stacks_common::util::db::update_lock_table; use stacks_common::util::hash::to_hex; use stacks_common::util::secp256k1::{Secp256k1PrivateKey, Secp256k1PublicKey}; use stacks_common::util::sleep_ms; @@ -655,32 +656,9 @@ impl<'a, C: Clone, T: MarfTrieId> DerefMut for IndexDBTx<'a, C, T> { } } +/// Called by `rusqlite` if we are waiting too long on a database lock pub fn tx_busy_handler(run_count: i32) -> bool { - let mut sleep_count = 2; - if run_count > 0 { - sleep_count = 2u64.saturating_pow(run_count as u32); - } - sleep_count = sleep_count.saturating_add(thread_rng().gen::<u64>() % sleep_count); - - if sleep_count > 100 { - let jitter = thread_rng().gen::<u64>() % 20; - sleep_count = 100 - jitter; - } - - debug!( - "Database is locked; sleeping {}ms and trying again", - &sleep_count; - "backtrace" => ?{ - if run_count > 10 && run_count % 10 == 0 { 
Some(Backtrace::capture()) - } else { - None - } - }, - ); - - sleep_ms(sleep_count); - true + stacks_common::util::db::tx_busy_handler(run_count) } /// Begin an immediate-mode transaction, and handle busy errors with exponential backoff. @@ -697,6 +675,7 @@ pub fn tx_begin_immediate<'a>(conn: &'a mut Connection) -> Result<DBTx<'a>, Erro pub fn tx_begin_immediate_sqlite<'a>(conn: &'a mut Connection) -> Result<Transaction<'a>, sqlite_error> { conn.busy_handler(Some(tx_busy_handler))?; let tx = Transaction::new(conn, TransactionBehavior::Immediate)?; + update_lock_table(tx.deref()); Ok(tx) } diff --git a/testnet/stacks-node/src/nakamoto_node/miner.rs index e57e7cc177..da1c75c708 100644 --- a/testnet/stacks-node/src/nakamoto_node/miner.rs +++ b/testnet/stacks-node/src/nakamoto_node/miner.rs @@ -366,22 +366,32 @@ impl BlockMinerThread { &mut attempts, ) { Ok(x) => x, - Err(e) => { - match e { - NakamotoNodeError::StacksTipChanged => { - info!("Stacks tip changed while waiting for signatures"); - return Err(e); - } - NakamotoNodeError::BurnchainTipChanged => { - info!("Burnchain tip changed while waiting for signatures"); - return Err(e); - } - _ => { - error!("Error while gathering signatures: {e:?}. Will try mining again."); - continue; - } + Err(e) => match e { + NakamotoNodeError::StacksTipChanged => { + info!("Stacks tip changed while waiting for signatures"; + "signer_sighash" => %new_block.header.signer_signature_hash(), + "block_height" => new_block.header.chain_length, + "consensus_hash" => %new_block.header.consensus_hash, + ); + return Err(e); } - } + NakamotoNodeError::BurnchainTipChanged => { + info!("Burnchain tip changed while waiting for signatures"; + "signer_sighash" => %new_block.header.signer_signature_hash(), + "block_height" => new_block.header.chain_length, + "consensus_hash" => %new_block.header.consensus_hash, + ); + return Err(e); } + _ => { + error!("Error while gathering signatures: {e:?}. 
Will try mining again."; + "signer_sighash" => %new_block.header.signer_signature_hash(), + "block_height" => new_block.header.chain_length, + "consensus_hash" => %new_block.header.consensus_hash, + ); + continue; + } + }, }; new_block.header.signer_signature = signer_signature;