Merge remote-tracking branch 'origin/develop' into fix/5136-node-and-miner

kantai committed Sep 13, 2024
2 parents 02cc0c5 + 1fa594a commit 8efaed0
Showing 6 changed files with 148 additions and 68 deletions.
22 changes: 22 additions & 0 deletions CHANGELOG.md
@@ -17,6 +17,28 @@ and this project adheres to the versioning scheme outlined in the [README.md](README.md).
- `get-tenure-info?` added
- `get-block-info?` removed

## [2.5.0.0.7]

### Added

- Add warn logs for block validate rejections (#5079)
- Neon mock miner replay (#5060)

### Changed

- Revert BurnchainHeaderHash serialization change (#5094)
- boot_to_epoch_3 in SignerTest should wait for a new commit (#5087)
- Fix block proposal rejection test (#5084)
- Mock signing revamp (#5070)
- Multi miner fixes jude (#5040)
- Remove spurious deadlock condition whenever the sortition DB is opened

## [2.5.0.0.6]

### Changed

- If there is a getchunk/putchunk that fails due to a stale (or future) version NACK, the StackerDB sync state machine should immediately retry sync (#5066)

## [2.5.0.0.5]

### Added
2 changes: 1 addition & 1 deletion clarity/src/vm/database/sqlite.rs
@@ -21,7 +21,7 @@ use rusqlite::{
};
use stacks_common::types::chainstate::{BlockHeaderHash, StacksBlockId};
use stacks_common::types::sqlite::NO_PARAMS;
use stacks_common::util::db_common::tx_busy_handler;
use stacks_common::util::db::tx_busy_handler;
use stacks_common::util::hash::Sha512Trunc256Sum;

use super::clarity_store::{make_contract_hash_key, ContractCommitment};
94 changes: 94 additions & 0 deletions stacks-common/src/util/db.rs
@@ -0,0 +1,94 @@
// Copyright (C) 2013-2020 Blockstack PBC, a public benefit corporation
// Copyright (C) 2020 Stacks Open Internet Foundation
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

use std::backtrace::Backtrace;
use std::sync::{LazyLock, Mutex};
use std::thread;
use std::time::{Duration, Instant};

use hashbrown::HashMap;
use rand::{thread_rng, Rng};
use rusqlite::Connection;

use crate::util::sleep_ms;

/// Keep track of DB locks, for deadlock debugging
/// - **key:** `rusqlite::Connection` debug print
/// - **value:** Lock holder (thread name + timestamp)
///
/// This uses a `Mutex` inside of `LazyLock` because:
/// - Using `Mutex` alone, it can't be statically initialized because `HashMap::new()` isn't `const`
/// - Using `LazyLock` alone doesn't allow interior mutability
static LOCK_TABLE: LazyLock<Mutex<HashMap<String, String>>> =
    LazyLock::new(|| Mutex::new(HashMap::new()));
/// Generate timestamps for use in `LOCK_TABLE`
/// `Instant` is preferable to `SystemTime` because it uses `CLOCK_MONOTONIC` and is not affected by NTP adjustments
static LOCK_TABLE_TIMER: LazyLock<Instant> = LazyLock::new(Instant::now);

/// Call when using an operation which locks a database
/// Updates `LOCK_TABLE`
pub fn update_lock_table(conn: &Connection) {
    let timestamp = LOCK_TABLE_TIMER.elapsed().as_millis();
    // The debug format for `Connection` includes the path
    let k = format!("{conn:?}");
    let v = format!("{:?}@{timestamp}", thread::current().name());
    LOCK_TABLE.lock().unwrap().insert(k, v);
}

/// Called by `rusqlite` if we are waiting too long on a database lock
/// If called too many times, will assume a deadlock and panic
pub fn tx_busy_handler(run_count: i32) -> bool {
    const TIMEOUT: Duration = Duration::from_secs(300);
    const AVG_SLEEP_TIME_MS: u64 = 100;

    // First, check if this is taking unreasonably long. If so, it's probably a deadlock
    let run_count = run_count.unsigned_abs();
    let approx_time_elapsed =
        Duration::from_millis(AVG_SLEEP_TIME_MS.saturating_mul(u64::from(run_count)));
    if approx_time_elapsed > TIMEOUT {
        error!(
            "Deadlock detected. Waited {} seconds (estimated) for database lock. Giving up",
            approx_time_elapsed.as_secs();
            "run_count" => run_count,
            "backtrace" => ?Backtrace::capture()
        );
        for (k, v) in LOCK_TABLE.lock().unwrap().iter() {
            error!("Database '{k}' last locked by {v}");
        }
        panic!("Deadlock in thread {:?}", thread::current().name());
    }

    let mut sleep_time_ms = 2u64.saturating_pow(run_count);

    sleep_time_ms = sleep_time_ms.saturating_add(thread_rng().gen_range(0..sleep_time_ms));

    if sleep_time_ms > AVG_SLEEP_TIME_MS {
        let jitter = 10;
        sleep_time_ms =
            thread_rng().gen_range((AVG_SLEEP_TIME_MS - jitter)..(AVG_SLEEP_TIME_MS + jitter));
    }

    let msg = format!("Database is locked; sleeping {sleep_time_ms}ms and trying again");
    if run_count > 10 && run_count % 10 == 0 {
        warn!("{msg}";
            "run_count" => run_count,
            "backtrace" => ?Backtrace::capture()
        );
    } else {
        debug!("{msg}");
    }

    sleep_ms(sleep_time_ms);
    true
}
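
For intuition: the schedule above doubles the sleep (plus up to 100% jitter) on each retry until it crosses `AVG_SLEEP_TIME_MS`, then settles into a 90-110ms band, so the 300-second `TIMEOUT` corresponds to roughly 3,000 retries. A standalone sketch, mirroring only the sleep-time math from the handler above (not part of the commit):

```rust
use rand::{thread_rng, Rng};

// Mirrors the sleep-time computation in `tx_busy_handler` above, so the
// schedule can be inspected without touching a database.
fn sleep_time_ms_for(run_count: u32) -> u64 {
    const AVG_SLEEP_TIME_MS: u64 = 100;
    // Exponential base: 1ms, 2ms, 4ms, ... (saturating at u64::MAX).
    let mut sleep_time_ms = 2u64.saturating_pow(run_count);
    // Add up to 100% jitter, as the handler does.
    sleep_time_ms = sleep_time_ms.saturating_add(thread_rng().gen_range(0..sleep_time_ms));
    if sleep_time_ms > AVG_SLEEP_TIME_MS {
        // Past the average, clamp to a 90..110ms band instead of growing further.
        let jitter = 10;
        sleep_time_ms =
            thread_rng().gen_range((AVG_SLEEP_TIME_MS - jitter)..(AVG_SLEEP_TIME_MS + jitter));
    }
    sleep_time_ms
}

fn main() {
    // Retries 0..=6 ramp up quickly; from about retry 7 onward the sleep
    // settles near 100ms per retry.
    for run_count in 0..12 {
        println!("retry {run_count}: sleep {}ms", sleep_time_ms_for(run_count));
    }
}
```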
27 changes: 1 addition & 26 deletions stacks-common/src/util/mod.rs
@@ -19,6 +19,7 @@ pub mod log;
#[macro_use]
pub mod macros;
pub mod chunked_encoding;
pub mod db;
pub mod hash;
pub mod pair;
pub mod pipe;
@@ -85,32 +86,6 @@ impl error::Error for HexError {
    }
}

pub mod db_common {
    use std::{thread, time};

    use rand::{thread_rng, Rng};

    pub fn tx_busy_handler(run_count: i32) -> bool {
        let mut sleep_count = 10;
        if run_count > 0 {
            sleep_count = 2u64.saturating_pow(run_count as u32);
        }
        sleep_count = sleep_count.saturating_add(thread_rng().gen::<u64>() % sleep_count);

        if sleep_count > 5000 {
            sleep_count = 5000;
        }

        debug!(
            "Database is locked; sleeping {}ms and trying again",
            &sleep_count
        );

        thread::sleep(time::Duration::from_millis(sleep_count));
        true
    }
}

/// Write any `serde_json` object directly to a file
pub fn serialize_json_to_file<J, P>(json: &J, path: P) -> Result<(), std::io::Error>
where
31 changes: 5 additions & 26 deletions stackslib/src/util_lib/db.rs
@@ -18,7 +18,7 @@ use std::backtrace::Backtrace;
use std::io::Error as IOError;
use std::ops::{Deref, DerefMut};
use std::path::{Path, PathBuf};
use std::time::Duration;
use std::time::{Duration, SystemTime};
use std::{error, fmt, fs, io};

use clarity::vm::types::QualifiedContractIdentifier;
@@ -32,6 +32,7 @@ use serde_json::Error as serde_error;
use stacks_common::types::chainstate::{SortitionId, StacksAddress, StacksBlockId, TrieHash};
use stacks_common::types::sqlite::NO_PARAMS;
use stacks_common::types::Address;
use stacks_common::util::db::update_lock_table;
use stacks_common::util::hash::to_hex;
use stacks_common::util::secp256k1::{Secp256k1PrivateKey, Secp256k1PublicKey};
use stacks_common::util::sleep_ms;
@@ -655,32 +656,9 @@ impl<'a, C: Clone, T: MarfTrieId> DerefMut for IndexDBTx<'a, C, T> {
    }
}

/// Called by `rusqlite` if we are waiting too long on a database lock
pub fn tx_busy_handler(run_count: i32) -> bool {
    let mut sleep_count = 2;
    if run_count > 0 {
        sleep_count = 2u64.saturating_pow(run_count as u32);
    }
    sleep_count = sleep_count.saturating_add(thread_rng().gen::<u64>() % sleep_count);

    if sleep_count > 100 {
        let jitter = thread_rng().gen::<u64>() % 20;
        sleep_count = 100 - jitter;
    }

    debug!(
        "Database is locked; sleeping {}ms and trying again",
        &sleep_count;
        "backtrace" => ?{
            if run_count > 10 && run_count % 10 == 0 {
                Some(Backtrace::capture())
            } else {
                None
            }
        },
    );

    sleep_ms(sleep_count);
    true
    stacks_common::util::db::tx_busy_handler(run_count)
}

/// Begin an immediate-mode transaction, and handle busy errors with exponential backoff.
@@ -697,6 +675,7 @@ pub fn tx_begin_immediate<'a>(conn: &'a mut Connection) -> Result<DBTx<'a>, Error> {
pub fn tx_begin_immediate_sqlite<'a>(conn: &'a mut Connection) -> Result<DBTx<'a>, sqlite_error> {
    conn.busy_handler(Some(tx_busy_handler))?;
    let tx = Transaction::new(conn, TransactionBehavior::Immediate)?;
    update_lock_table(tx.deref());
    Ok(tx)
}
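
For reference, a minimal usage sketch of the pattern `tx_begin_immediate_sqlite` implements (not from the diff; the `begin_tracked` wrapper and the table are illustrative):

```rust
use std::ops::Deref;

use rusqlite::{Connection, Transaction, TransactionBehavior};
use stacks_common::util::db::{tx_busy_handler, update_lock_table};

// Sketch: same pattern as `tx_begin_immediate_sqlite` above -- register the
// shared busy handler, open an immediate-mode transaction, and record the
// lock holder for deadlock debugging.
fn begin_tracked(conn: &mut Connection) -> rusqlite::Result<Transaction<'_>> {
    conn.busy_handler(Some(tx_busy_handler))?;
    let tx = Transaction::new(conn, TransactionBehavior::Immediate)?;
    update_lock_table(tx.deref());
    Ok(tx)
}

fn main() -> rusqlite::Result<()> {
    let mut conn = Connection::open_in_memory()?;
    let tx = begin_tracked(&mut conn)?;
    tx.execute("CREATE TABLE t (x INTEGER)", [])?; // illustrative table
    tx.commit()
}
```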

40 changes: 25 additions & 15 deletions testnet/stacks-node/src/nakamoto_node/miner.rs
@@ -366,22 +366,32 @@ impl BlockMinerThread {
                &mut attempts,
            ) {
                Ok(x) => x,
                Err(e) => {
                    match e {
                        NakamotoNodeError::StacksTipChanged => {
                            info!("Stacks tip changed while waiting for signatures");
                            return Err(e);
                        }
                        NakamotoNodeError::BurnchainTipChanged => {
                            info!("Burnchain tip changed while waiting for signatures");
                            return Err(e);
                        }
                        _ => {
                            error!("Error while gathering signatures: {e:?}. Will try mining again.");
                            continue;
                        }
                Err(e) => match e {
                    NakamotoNodeError::StacksTipChanged => {
                        info!("Stacks tip changed while waiting for signatures";
                            "signer_sighash" => %new_block.header.signer_signature_hash(),
                            "block_height" => new_block.header.chain_length,
                            "consensus_hash" => %new_block.header.consensus_hash,
                        );
                        return Err(e);
                    }
                }
                    NakamotoNodeError::BurnchainTipChanged => {
                        info!("Burnchain tip changed while waiting for signatures";
                            "signer_sighash" => %new_block.header.signer_signature_hash(),
                            "block_height" => new_block.header.chain_length,
                            "consensus_hash" => %new_block.header.consensus_hash,
                        );
                        return Err(e);
                    }
                    _ => {
                        error!("Error while gathering signatures: {e:?}. Will try mining again.";
                            "signer_sighash" => %new_block.header.signer_signature_hash(),
                            "block_height" => new_block.header.chain_length,
                            "consensus_hash" => %new_block.header.consensus_hash,
                        );
                        continue;
                    }
                },
            };

            new_block.header.signer_signature = signer_signature;
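
The rewritten arms also show the slog-style syntax these logging macros accept: key/value fields follow a `;` after the message, `%` requests `Display` formatting, and `?` requests `Debug`. A hedged illustration, assuming stacks-common's `info!` macro is in scope as it is in stacks-node (the function and fields are stand-ins, not the miner's actual locals):

```rust
// Illustrative fragment, not from the commit: the structured-log form used above.
fn log_tip_change(height: u64, hash: &str) {
    info!("Stacks tip changed while waiting for signatures";
        "block_height" => height,  // plain value, logged as a structured field
        "consensus_hash" => %hash, // `%` formats with Display
        "backtrace" => ?std::backtrace::Backtrace::capture(), // `?` formats with Debug
    );
}
```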