-
Notifications
You must be signed in to change notification settings - Fork 679
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add script to recover missing data in archival nodes (#12007)
This script contains the minimal set of commands to restore missing data on archival nodes. The data loss occurred in three time periods: (1) early January, (2) after the first resharding, and (3) after the second resharding. Because of how trie nodes are stored in the database, a single node missing in the past may cause errors when querying accounts at much later heights. Recovery takes many days and requires SSD-backed cold storage. I've tested the procedure's outcome with manual queries and by iterating tries from different state roots.
- Loading branch information
Showing
1 changed file
with
80 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
#!/bin/bash
# -e: stop at the first failing command; -x: trace every command so progress
# is visible in the logs; pipefail: a pipeline fails if any stage fails.
set -exo pipefail

# This script can be used on archival nodes to recover missing data from early 2024.
#
# Requirements:
# - The cold store must be mounted on an SSD to make the operation faster.
# - The config 'resharding_config.batch_delay' must be set to 0.
# - neard service must be stopped for the entire duration.
#
# The script is idempotent. If interrupted, it can be restarted from the beginning,
# although in the interest of time it's advised to continue from the last successful step.
#
# On top of every command, where applicable, we include the number of expected database writes.
# The estimated number of batches assumes a batch size of 500.0 KB.
#
# The entire procedure is expected to take several days to complete.

##########
# neard must be stopped
##########
# `systemctl is-active --quiet` exits 0 when the unit is active, so a zero
# status here means the node is still running and we must refuse to continue.
# The diagnostic goes to stderr so it is not lost if stdout is redirected.
if systemctl is-active --quiet neard; then
  echo "neard must be stopped" >&2
  exit 1
fi


##########
# Recover missing trie nodes before resharding by re-applying blocks
##########
export RUST_LOG=info

# Path to the neard binary. Defaults to the location used on the archival
# nodes; set NEARD_BIN in the environment to point at a different build.
neard_bin="${NEARD_BIN:-/home/ubuntu/neard}"

# Block height range and shard to re-apply. Kept in variables so the log
# message and the command below always agree.
start_height=109913254
end_height=110050000
shard_id=2

echo "Regenerating trie nodes between heights ${start_height} - ${end_height} in shard ${shard_id}"
# DB writes = 377624
"${neard_bin}" view-state -t cold --readwrite \
  apply-range \
  --start-index "${start_height}" \
  --end-index "${end_height}" \
  --shard-id "${shard_id}" \
  --storage trie-free \
  --save-state cold \
  sequential


##########
# Perform first resharding
##########
export RUST_LOG=debug

# Path to the neard binary. Defaults to the location used on the archival
# nodes; set NEARD_BIN in the environment to point at a different build.
neard_bin="${NEARD_BIN:-/home/ubuntu/neard}"

# Block height at which the first resharding is restored.
resharding_height=114580307

# Expected database writes per shard:
#   shard 0: DB writes >= 133646770, batch_count = 16569
#   shard 1: DB writes = 79328587, batch_count = 13437
#   shard 2: DB writes >= 92619516, batch_count = 16467
#   shard 3: DB writes = 258676854, batch_count = 28279
#
# Shards are processed one after another, in order. To resume after an
# interruption, remove the already completed shard ids from the list.
for shard_id in 0 1 2 3; do
  echo "Resharding database at height ${resharding_height} and shard ${shard_id}"
  "${neard_bin}" database resharding \
    --height "${resharding_height}" \
    --shard-id "${shard_id}" \
    --restore
done


##########
# Perform second resharding
##########
export RUST_LOG=debug

# Path to the neard binary. Defaults to the location used on the archival
# nodes; set NEARD_BIN in the environment to point at a different build.
neard_bin="${NEARD_BIN:-/home/ubuntu/neard}"

# Block height at which the second resharding is restored.
resharding_height=115185107

# Expected database writes per shard:
#   shard 0: DB writes = 135273932, batch_count = 16856
#   shard 1: DB writes = 79959560, batch_count = 13549
#   shard 2: DB writes = 188940809, batch_count = 19209
#   shard 3: DB writes = 66932493, batch_count = 9886
#   shard 4: DB writes = 194985099, batch_count = 18833
#
# Shards are processed one after another, in order. To resume after an
# interruption, remove the already completed shard ids from the list.
for shard_id in 0 1 2 3 4; do
  echo "Resharding database at height ${resharding_height} and shard ${shard_id}"
  "${neard_bin}" database resharding \
    --height "${resharding_height}" \
    --shard-id "${shard_id}" \
    --restore
done