From 44dd90cd12b07c5bc7c54be32f2822f8cd5485b8 Mon Sep 17 00:00:00 2001 From: ljedrz Date: Mon, 12 May 2025 15:00:15 +0200 Subject: [PATCH 1/6] feat: allow making RocksDB checkpoints Signed-off-by: ljedrz --- node/rest/src/lib.rs | 1 + node/rest/src/routes.rs | 15 +++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/node/rest/src/lib.rs b/node/rest/src/lib.rs index b45a659bde..ffbaded391 100644 --- a/node/rest/src/lib.rs +++ b/node/rest/src/lib.rs @@ -154,6 +154,7 @@ impl, R: Routing> Rest { // All the endpoints before the call to `route_layer` are protected with JWT auth. .route(&format!("/{network}/node/address"), get(Self::get_node_address)) .route(&format!("/{network}/program/{{id}}/mapping/{{name}}"), get(Self::get_mapping_values)) + .route(&format!("/{network}/db_backup"), post(Self::db_backup)) .route_layer(middleware::from_fn(auth_middleware)) // Get ../consensus_version diff --git a/node/rest/src/routes.rs b/node/rest/src/routes.rs index 742f99f4b1..ae57f25636 100644 --- a/node/rest/src/routes.rs +++ b/node/rest/src/routes.rs @@ -35,6 +35,11 @@ pub(crate) struct BlockRange { end: u32, } +#[derive(Deserialize, Serialize)] +pub(crate) struct BackupPath { + path: std::path::PathBuf, +} + /// The query object for `get_mapping_value` and `get_mapping_values`. #[derive(Copy, Clone, Deserialize, Serialize)] pub(crate) struct Metadata { @@ -583,6 +588,16 @@ impl, R: Routing> Rest { Ok(ErasedJson::pretty(solution_id)) } + // POST /{network}/db_backup?path=new_fs_path + pub(crate) async fn db_backup( + State(rest): State, + backup_path: Query, + ) -> Result { + rest.ledger.backup_database(&backup_path.path).map_err(RestError::from)?; + + Ok(ErasedJson::pretty(())) + } + // GET /{network}/block/{blockHeight}/history/{mapping} #[cfg(feature = "history")] pub(crate) async fn get_history( From 2b3e779c1b1a96672d54d82b17a9051931a00509 Mon Sep 17 00:00:00 2001 From: ljedrz Date: Thu, 5 Jun 2025 11:31:58 +0200 Subject: [PATCH 2/6] tests: checkpoint-backed db rollbacks Signed-off-by: ljedrz --- .circleci/db_backup_ci.sh | 145 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 145 insertions(+) create mode 100755 .circleci/db_backup_ci.sh diff --git a/.circleci/db_backup_ci.sh b/.circleci/db_backup_ci.sh new file mode 100755 index 0000000000..67464ecce9 --- /dev/null +++ b/.circleci/db_backup_ci.sh @@ -0,0 +1,145 @@ +#!/bin/bash + +# Network parameters +total_validators=4 +network_id=0 +network_name="mainnet" + +# Stopping conditions +checkpoint_height=3 +rollback_height=10 + +# Use fixed JWT values in order to be able to create checkpoints +jwt_secret="ZGJjaGVja3BvaW50dGVzdA==" +jwt_ts=1749116345 +jwt[0]="eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJhbGVvMXJoZ2R1NzdoZ3lxZDN4amo4dWN1M2pqOXIya3J3ejZtbnp5ZDgwZ25jcjVmeGN3bGg1cnN2enA5cHgiLCJpYXQiOjE3NDkxMTYzNDUsImV4cCI6MjA2NDQ3NjM0NX0.qm2idfIm4ZTFOsyT19lH9pcWzzAtP5mbymkN4oL6_sc" +jwt[1]="eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJhbGVvMXMzd3M1dHJhODdmanljbmpyd3NqY3JudzJxeHI4amZxcWR1Z25mMHh6cXF3MjlxOW01cHFlbTJ1NHQiLCJpYXQiOjE3NDkxMTYzNDUsImV4cCI6MjA2NDQ3NjM0NX0.4efs4qWJuG0Lm2CxrLMIKrrbJiGD-XNqHlk_AUaXOBo" +jwt[2]="eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJhbGVvMWFzaHl1OTZ0andlNjN1MGd0bm52OHo1bGhhcGR1NGw1cGpzbDJraGE3ZnY3aHZ6MmVxeHM1ZHowcmciLCJpYXQiOjE3NDkxMTYzNDUsImV4cCI6MjA2NDQ3NjM0NX0.zxO1ajmQ0Wqr1gg4NuRzH4i_hiUBt7_fP9WP3KHbp4c" +jwt[3]="eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJhbGVvMTJ1eDNnZGF1Y2swdjYwd2VzdGdjcHFqN3Y4cnJjcjN2MzQ2ZTRqdHEwNHE3a2t0MjJjenNoODA4djIiLCJpYXQiOjE3NDkxMTYzNDUsImV4cCI6MjA2NDQ3NjM0NX0.bJZ-fcrJwaI5YdPXDQ1nySV-jmxeABQCSvL1Ag9CSpo" + +# Array to store PIDs of all processes +declare -a PIDS + +# Start all validator nodes in the background +for ((validator_index = 0; validator_index < $total_validators; validator_index++)); do + snarkos start --nodisplay --network $network_id --dev $validator_index --dev-num-validators $total_validators --validator --jwt-secret $jwt_secret --jwt-timestamp $jwt_ts & + PIDS[$validator_index]=$! + echo "Started validator $validator_index with PID ${PIDS[$validator_index]}" + # Add 1-second delay between starting nodes to avoid hitting rate limits + sleep 1 +done + +# Function to check block heights; the 1st parameter is the desired height +check_heights() { + echo "Checking block heights on all nodes..." + num_done=0 + for ((node_index = 0; node_index < $total_validators; node_index++)); do + port=$((3030 + node_index)) + height=$(curl -s "http://127.0.0.1:$port/$network_name/block/height/latest" || echo "0") + + # Track highest height for reporting + if [[ "$height" =~ ^[0-9]+$ ]] && [ $height -ge $1 ]; then + num_done=$((num_done + 1)) + fi + done + + if [ $num_done -eq $total_validators ]; then + echo "All nodes reached the height of $1" + return 0 + else + return 1 + fi +} + +# Create database checkpoints +create_checkpoints() { + for ((node_index = 0; node_index < $total_validators; node_index++)); do + port=$((3030 + node_index)) + result=$(curl -s -X "POST" -H "Authorization: Bearer ${jwt[node_index]}" "http://127.0.0.1:$port/$network_name/db_backup?path=/tmp/checkpoint$node_index" || echo "fail") + + # Track highest height for reporting + if [ "$result" = "fail" ]; then + return 1 + fi + done + + echo "All nodes created a checkpoint" + return 0 +} + +# Wait for 15 seconds to let the network start +echo "Waiting 15 seconds for network to start up..." +sleep 15 + +# Check heights periodically with a timeout +total_wait=0 +checkpoints_created=false +while [ $total_wait -lt 300 ]; do # 5 minutes max + # Apply short-circuiting + if [[ $checkpoints_created = true ]] || check_heights "$checkpoint_height"; then + if [[ $checkpoints_created = false ]]; then + # Create checkpoints at the specified height + create_checkpoints + checkpoints_created=true + fi + + # Wait until the specified rollback height is reached + if check_heights "$rollback_height"; then + echo "All nodes reached rollback height." + + # Gracefully shut down the validators + for pid in "${PIDS[@]}"; do + kill -15 $pid 2>/dev/null || true + done + # Wait until the shutdown concludes. + sleep 5 + + for ((validator_index = 0; validator_index < $total_validators; validator_index++)); do + # Remove the original ledger + snarkos clean --network $network_id --dev $validator_index + # Wait until the cleanup concludes + sleep 1 + # Restart using the checkpoint + snarkos start --nodisplay --network $network_id --dev $validator_index --dev-num-validators $total_validators --validator --storage /tmp/checkpoint$validator_index & + PIDS[$validator_index]=$! + echo "Restarted validator $validator_index with PID ${PIDS[$validator_index]}" + # Add 1-second delay between starting nodes to avoid hitting rate limits + sleep 1 + + port=$((3030 + validator_index)) + height=$(curl -s "http://127.0.0.1:$port/$network_name/block/height/latest" || echo "0") + echo "Node height after restart: $height" + + # Ensure that the height is below the rollback height + if [[ "$height" =~ ^[0-9]+$ ]] && [ $height -ge $rollback_height ]; then + echo "❌ Test failed!" + exit 1 + fi + done + + echo "SUCCESS!" + + # Cleanup: kill all processes + for pid in "${PIDS[@]}"; do + kill -9 $pid 2>/dev/null || true + done + + exit 0 + fi + fi + + # Continue waiting + sleep 3 + total_wait=$((total_wait + 3)) + echo "Waited $total_wait seconds so far..." +done + +# The main loop has expired by now +echo "❌ Test failed!" + +# Cleanup: kill all processes +for pid in "${PIDS[@]}"; do + kill -9 $pid 2>/dev/null || true +done + +exit 1 From d92b68f3c5c3765c48136bf4481ed68c62ee4bbc Mon Sep 17 00:00:00 2001 From: ljedrz Date: Fri, 6 Jun 2025 11:34:54 +0200 Subject: [PATCH 3/6] fix: don't assume that --dev nodes may only ever use StorageMode::Development Signed-off-by: ljedrz --- cli/src/commands/start.rs | 6 +++--- node/bft/examples/simple_node.rs | 14 +++++++++++--- node/bft/src/bft.rs | 15 +++++++++++++-- node/bft/src/primary.rs | 7 ++++--- node/bft/tests/common/primary.rs | 2 ++ node/consensus/src/lib.rs | 5 ++++- node/src/client/mod.rs | 3 ++- node/src/node.rs | 8 ++++++-- node/src/prover/mod.rs | 5 ++--- node/src/validator/mod.rs | 13 ++++++++----- node/tests/common/node.rs | 4 +++- 11 files changed, 58 insertions(+), 24 deletions(-) diff --git a/cli/src/commands/start.rs b/cli/src/commands/start.rs index 8909870616..5bd158198a 100644 --- a/cli/src/commands/start.rs +++ b/cli/src/commands/start.rs @@ -708,9 +708,9 @@ impl Start { // Initialize the node. match node_type { - NodeType::Validator => Node::new_validator(node_ip, self.bft, rest_ip, self.rest_rps, account, &trusted_peers, &trusted_validators, genesis, cdn, storage_mode, self.allow_external_peers, dev_txs, shutdown.clone()).await, - NodeType::Prover => Node::new_prover(node_ip, account, &trusted_peers, genesis, storage_mode, shutdown.clone()).await, - NodeType::Client => Node::new_client(node_ip, rest_ip, self.rest_rps, account, &trusted_peers, genesis, cdn, storage_mode, self.rotate_external_peers, shutdown).await, + NodeType::Validator => Node::new_validator(node_ip, self.bft, rest_ip, self.rest_rps, account, &trusted_peers, &trusted_validators, genesis, cdn, storage_mode, self.allow_external_peers, dev_txs, self.dev, shutdown.clone()).await, + NodeType::Prover => Node::new_prover(node_ip, account, &trusted_peers, genesis, self.dev, shutdown.clone()).await, + NodeType::Client => Node::new_client(node_ip, rest_ip, self.rest_rps, account, &trusted_peers, genesis, cdn, storage_mode, self.rotate_external_peers, self.dev, shutdown).await, } } diff --git a/node/bft/examples/simple_node.rs b/node/bft/examples/simple_node.rs index cf0b71e23c..fc82ea9136 100644 --- a/node/bft/examples/simple_node.rs +++ b/node/bft/examples/simple_node.rs @@ -145,7 +145,7 @@ pub async fn start_bft( // Initialize the BFT instance. let block_sync = Arc::new(BlockSync::new(ledger.clone())); let mut bft = - BFT::::new(account, storage, ledger, block_sync, ip, &trusted_validators, storage_mode)?; + BFT::::new(account, storage, ledger, block_sync, ip, &trusted_validators, storage_mode, None)?; // Run the BFT instance. bft.run(None, Some(consensus_sender), sender.clone(), receiver).await?; // Retrieve the BFT's primary. @@ -184,8 +184,16 @@ pub async fn start_primary( let trusted_validators = trusted_validators(node_id, num_nodes, peers); // Initialize the primary instance. let block_sync = Arc::new(BlockSync::new(ledger.clone())); - let mut primary = - Primary::::new(account, storage, ledger, block_sync, ip, &trusted_validators, storage_mode)?; + let mut primary = Primary::::new( + account, + storage, + ledger, + block_sync, + ip, + &trusted_validators, + storage_mode, + None, + )?; // Run the primary instance. primary.run(None, None, sender.clone(), receiver).await?; // Handle OS signals. diff --git a/node/bft/src/bft.rs b/node/bft/src/bft.rs index 3dd4b6f5cc..2d878d7bfb 100644 --- a/node/bft/src/bft.rs +++ b/node/bft/src/bft.rs @@ -88,6 +88,7 @@ pub struct BFT { impl BFT { /// Initializes a new instance of the BFT. + #[allow(clippy::too_many_arguments)] pub fn new( account: Account, storage: Storage, @@ -96,9 +97,10 @@ impl BFT { ip: Option, trusted_validators: &[SocketAddr], storage_mode: StorageMode, + dev: Option, ) -> Result { Ok(Self { - primary: Primary::new(account, storage, ledger, block_sync, ip, trusted_validators, storage_mode)?, + primary: Primary::new(account, storage, ledger, block_sync, ip, trusted_validators, storage_mode, dev)?, dag: Default::default(), leader_certificate: Default::default(), leader_certificate_timer: Default::default(), @@ -991,7 +993,16 @@ mod tests { // Create the block synchronization logic. let block_sync = Arc::new(BlockSync::new(ledger.clone())); // Initialize the BFT. - BFT::new(account.clone(), storage.clone(), ledger.clone(), block_sync, None, &[], StorageMode::new_test(None)) + BFT::new( + account.clone(), + storage.clone(), + ledger.clone(), + block_sync, + None, + &[], + StorageMode::new_test(None), + None, + ) } #[test] diff --git a/node/bft/src/primary.rs b/node/bft/src/primary.rs index 779a371325..45b7917e5c 100644 --- a/node/bft/src/primary.rs +++ b/node/bft/src/primary.rs @@ -119,6 +119,7 @@ impl Primary { pub const MAX_TRANSMISSIONS_TOLERANCE: usize = BatchHeader::::MAX_TRANSMISSIONS_PER_BATCH * 2; /// Initializes a new primary instance. + #[allow(clippy::too_many_arguments)] pub fn new( account: Account, storage: Storage, @@ -127,10 +128,10 @@ impl Primary { ip: Option, trusted_validators: &[SocketAddr], storage_mode: StorageMode, + dev: Option, ) -> Result { // Initialize the gateway. - let gateway = - Gateway::new(account, storage.clone(), ledger.clone(), ip, trusted_validators, storage_mode.dev())?; + let gateway = Gateway::new(account, storage.clone(), ledger.clone(), ip, trusted_validators, dev)?; // Initialize the sync module. let sync = Sync::new(gateway.clone(), storage.clone(), ledger.clone(), block_sync); @@ -1996,7 +1997,7 @@ mod tests { let account = accounts[account_index].1.clone(); let block_sync = Arc::new(BlockSync::new(ledger.clone())); let mut primary = - Primary::new(account, storage, ledger, block_sync, None, &[], StorageMode::Test(None)).unwrap(); + Primary::new(account, storage, ledger, block_sync, None, &[], StorageMode::Test(None), None).unwrap(); // Construct a worker instance. primary.workers = Arc::from([Worker::new( diff --git a/node/bft/tests/common/primary.rs b/node/bft/tests/common/primary.rs index 42714c4436..3af72f1da7 100644 --- a/node/bft/tests/common/primary.rs +++ b/node/bft/tests/common/primary.rs @@ -176,6 +176,7 @@ impl TestNetwork { Some(SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), MEMORY_POOL_PORT + id as u16)), &[], StorageMode::new_test(None), + None, ) .unwrap(); (bft.primary().clone(), Some(bft)) @@ -188,6 +189,7 @@ impl TestNetwork { Some(SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), MEMORY_POOL_PORT + id as u16)), &[], StorageMode::new_test(None), + None, ) .unwrap(); (primary, None) diff --git a/node/consensus/src/lib.rs b/node/consensus/src/lib.rs index dfacfb39b1..3e523925af 100644 --- a/node/consensus/src/lib.rs +++ b/node/consensus/src/lib.rs @@ -123,6 +123,7 @@ pub struct Consensus { impl Consensus { /// Initializes a new instance of consensus and spawn its background tasks. + #[allow(clippy::too_many_arguments)] pub async fn new( account: Account, ledger: Arc>, @@ -131,6 +132,7 @@ impl Consensus { trusted_validators: &[SocketAddr], storage_mode: StorageMode, ping: Arc>, + dev: Option, ) -> Result { // Initialize the primary channels. let (primary_sender, primary_receiver) = init_primary_channels::(); @@ -139,7 +141,8 @@ impl Consensus { // Initialize the Narwhal storage. let storage = NarwhalStorage::new(ledger.clone(), transmissions, BatchHeader::::MAX_GC_ROUNDS as u64); // Initialize the BFT. - let bft = BFT::new(account, storage, ledger.clone(), block_sync.clone(), ip, trusted_validators, storage_mode)?; + let bft = + BFT::new(account, storage, ledger.clone(), block_sync.clone(), ip, trusted_validators, storage_mode, dev)?; // Create a new instance of Consensus. let mut _self = Self { ledger, diff --git a/node/src/client/mod.rs b/node/src/client/mod.rs index 7d00893bf4..30b84d5b22 100644 --- a/node/src/client/mod.rs +++ b/node/src/client/mod.rs @@ -144,6 +144,7 @@ impl> Client { cdn: Option, storage_mode: StorageMode, rotate_external_peers: bool, + dev: Option, shutdown: Arc, ) -> Result { // Initialize the signal handler. @@ -167,7 +168,7 @@ impl> Client { Self::MAXIMUM_NUMBER_OF_PEERS as u16, rotate_external_peers, allow_external_peers, - matches!(storage_mode, StorageMode::Development(_)), + dev.is_some(), ) .await?; diff --git a/node/src/node.rs b/node/src/node.rs index d992ff97cd..1d920822bc 100644 --- a/node/src/node.rs +++ b/node/src/node.rs @@ -56,6 +56,7 @@ impl Node { storage_mode: StorageMode, allow_external_peers: bool, dev_txs: bool, + dev: Option, shutdown: Arc, ) -> Result { Ok(Self::Validator(Arc::new( @@ -72,6 +73,7 @@ impl Node { storage_mode, allow_external_peers, dev_txs, + dev, shutdown, ) .await?, @@ -84,10 +86,10 @@ impl Node { account: Account, trusted_peers: &[SocketAddr], genesis: Block, - storage_mode: StorageMode, + dev: Option, shutdown: Arc, ) -> Result { - Ok(Self::Prover(Arc::new(Prover::new(node_ip, account, trusted_peers, genesis, storage_mode, shutdown).await?))) + Ok(Self::Prover(Arc::new(Prover::new(node_ip, account, trusted_peers, genesis, dev, shutdown).await?))) } /// Initializes a new client node. @@ -101,6 +103,7 @@ impl Node { cdn: Option, storage_mode: StorageMode, rotate_external_peers: bool, + dev: Option, shutdown: Arc, ) -> Result { Ok(Self::Client(Arc::new( @@ -114,6 +117,7 @@ impl Node { cdn, storage_mode, rotate_external_peers, + dev, shutdown, ) .await?, diff --git a/node/src/prover/mod.rs b/node/src/prover/mod.rs index 3884849d43..94bd1be6be 100644 --- a/node/src/prover/mod.rs +++ b/node/src/prover/mod.rs @@ -42,7 +42,6 @@ use snarkvm::{ synthesizer::VM, }; -use aleo_std::StorageMode; use anyhow::Result; use colored::Colorize; use core::{marker::PhantomData, time::Duration}; @@ -97,7 +96,7 @@ impl> Prover { account: Account, trusted_peers: &[SocketAddr], genesis: Block, - storage_mode: StorageMode, + dev: Option, shutdown: Arc, ) -> Result { // Initialize the signal handler. @@ -120,7 +119,7 @@ impl> Prover { Self::MAXIMUM_NUMBER_OF_PEERS as u16, rotate_external_peers, allow_external_peers, - matches!(storage_mode, StorageMode::Development(_)), + dev.is_some(), ) .await?; diff --git a/node/src/validator/mod.rs b/node/src/validator/mod.rs index 4ea9f17ec6..c0d82b01ca 100644 --- a/node/src/validator/mod.rs +++ b/node/src/validator/mod.rs @@ -93,6 +93,7 @@ impl> Validator { storage_mode: StorageMode, allow_external_peers: bool, dev_txs: bool, + dev: Option, shutdown: Arc, ) -> Result { // Initialize the signal handler. @@ -117,7 +118,7 @@ impl> Validator { Self::MAXIMUM_NUMBER_OF_PEERS as u16, rotate_external_peers, allow_external_peers, - matches!(storage_mode, StorageMode::Development(_)), + dev.is_some(), ) .await?; @@ -135,6 +136,7 @@ impl> Validator { trusted_validators, storage_mode.clone(), ping.clone(), + dev, ) .await?; @@ -154,7 +156,7 @@ impl> Validator { let cdn_sync = cdn.map(|base_url| Arc::new(CdnBlockSync::new(base_url, ledger.clone(), shutdown))); // Initialize the transaction pool. - node.initialize_transaction_pool(storage_mode.clone(), dev_txs)?; + node.initialize_transaction_pool(dev, dev_txs)?; // Initialize the REST server. if let Some(rest_ip) = rest_ip { @@ -373,7 +375,7 @@ impl> Validator { // } /// Initialize the transaction pool. - fn initialize_transaction_pool(&self, storage_mode: StorageMode, dev_txs: bool) -> Result<()> { + fn initialize_transaction_pool(&self, dev: Option, dev_txs: bool) -> Result<()> { use snarkvm::console::{ program::{Identifier, Literal, ProgramID, Value}, types::U64, @@ -384,9 +386,9 @@ impl> Validator { let locator = (ProgramID::from_str("credits.aleo")?, Identifier::from_str("transfer_public")?); // Determine whether to start the loop. - match storage_mode { + match dev { // If the node is running in development mode, only generate if you are allowed. - StorageMode::Development(id) => { + Some(id) => { // If the node is not the first node, or if we should not create dev traffic, do not start the loop. if id != 0 || !dev_txs { return Ok(()); @@ -523,6 +525,7 @@ mod tests { storage_mode, false, dev_txs, + None, Default::default(), ) .await diff --git a/node/tests/common/node.rs b/node/tests/common/node.rs index 6e30fae4d8..e068a4720c 100644 --- a/node/tests/common/node.rs +++ b/node/tests/common/node.rs @@ -32,6 +32,7 @@ pub async fn client() -> Client> None, // No CDN. StorageMode::new_test(None), false, // No extra peer rotation. + None, Default::default(), ) .await @@ -44,7 +45,7 @@ pub async fn prover() -> Prover> Account::::from_str("APrivateKey1zkp2oVPTci9kKcUprnbzMwq95Di1MQERpYBhEeqvkrDirK1").unwrap(), &[], sample_genesis_block(), - StorageMode::new_test(None), + None, Default::default(), ) .await @@ -65,6 +66,7 @@ pub async fn validator() -> Validator Date: Mon, 9 Jun 2025 12:32:06 +0200 Subject: [PATCH 4/6] tests: test 2 separate db rollbacks Signed-off-by: ljedrz --- .circleci/db_backup_ci.sh | 51 +++++++++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 15 deletions(-) diff --git a/.circleci/db_backup_ci.sh b/.circleci/db_backup_ci.sh index 67464ecce9..74e92809e5 100755 --- a/.circleci/db_backup_ci.sh +++ b/.circleci/db_backup_ci.sh @@ -8,6 +8,8 @@ network_name="mainnet" # Stopping conditions checkpoint_height=3 rollback_height=10 +num_checkpoints=0 +remaining_checkpoints=2 # Use fixed JWT values in order to be able to create checkpoints jwt_secret="ZGJjaGVja3BvaW50dGVzdA==" @@ -55,7 +57,8 @@ check_heights() { create_checkpoints() { for ((node_index = 0; node_index < $total_validators; node_index++)); do port=$((3030 + node_index)) - result=$(curl -s -X "POST" -H "Authorization: Bearer ${jwt[node_index]}" "http://127.0.0.1:$port/$network_name/db_backup?path=/tmp/checkpoint$node_index" || echo "fail") + suffix="${node_index}_$1" + result=$(curl -s -X "POST" -H "Authorization: Bearer ${jwt[node_index]}" "http://127.0.0.1:$port/$network_name/db_backup?path=/tmp/checkpoint_$suffix" || echo "fail") # Track highest height for reporting if [ "$result" = "fail" ]; then @@ -73,20 +76,27 @@ sleep 15 # Check heights periodically with a timeout total_wait=0 -checkpoints_created=false +checkpoint_created=false while [ $total_wait -lt 300 ]; do # 5 minutes max # Apply short-circuiting - if [[ $checkpoints_created = true ]] || check_heights "$checkpoint_height"; then - if [[ $checkpoints_created = false ]]; then + if [[ $checkpoint_created = true ]] || check_heights "$checkpoint_height"; then + if [[ $checkpoint_created = false ]]; then # Create checkpoints at the specified height - create_checkpoints - checkpoints_created=true + create_checkpoints $num_checkpoints + checkpoint_created=true + checkpoint_height=$((checkpoint_height+2)) + num_checkpoints=$((num_checkpoints+1)) + + echo "num_checkpoints: $num_checkpoints" + sleep 2 fi # Wait until the specified rollback height is reached if check_heights "$rollback_height"; then echo "All nodes reached rollback height." + checkpoint_created=false + # Gracefully shut down the validators for pid in "${PIDS[@]}"; do kill -15 $pid 2>/dev/null || true @@ -96,11 +106,17 @@ while [ $total_wait -lt 300 ]; do # 5 minutes max for ((validator_index = 0; validator_index < $total_validators; validator_index++)); do # Remove the original ledger - snarkos clean --network $network_id --dev $validator_index + if (( num_checkpoints == 1 )); then + snarkos clean --network $network_id --dev $validator_index + else + suffix="${validator_index}_$((num_checkpoints-2))" + snarkos clean --network $network_id --dev $validator_index --path=/tmp/checkpoint_$suffix + fi # Wait until the cleanup concludes sleep 1 # Restart using the checkpoint - snarkos start --nodisplay --network $network_id --dev $validator_index --dev-num-validators $total_validators --validator --storage /tmp/checkpoint$validator_index & + suffix="${validator_index}_$((num_checkpoints-1))" + snarkos start --nodisplay --network $network_id --dev $validator_index --dev-num-validators $total_validators --validator --jwt-secret $jwt_secret --jwt-timestamp $jwt_ts --storage /tmp/checkpoint_$suffix & PIDS[$validator_index]=$! echo "Restarted validator $validator_index with PID ${PIDS[$validator_index]}" # Add 1-second delay between starting nodes to avoid hitting rate limits @@ -111,20 +127,25 @@ while [ $total_wait -lt 300 ]; do # 5 minutes max echo "Node height after restart: $height" # Ensure that the height is below the rollback height - if [[ "$height" =~ ^[0-9]+$ ]] && [ $height -ge $rollback_height ]; then + if [[ "$height" =~ ^[0-9]+$ ]] && (( height >= rollback_height )) && (( height < checkpoint_height )); then echo "❌ Test failed!" exit 1 fi done - echo "SUCCESS!" + if (( remaining_checkpoints == 0 )); then + echo "SUCCESS!" - # Cleanup: kill all processes - for pid in "${PIDS[@]}"; do - kill -9 $pid 2>/dev/null || true - done + # Cleanup: kill all processes + for pid in "${PIDS[@]}"; do + kill -9 $pid 2>/dev/null || true + done + + exit 0 + fi + + remaining_checkpoints=$((remaining_checkpoints-1)) - exit 0 fi fi From 80beb2f9e6c297701a42784a663ac8e0c55c858c Mon Sep 17 00:00:00 2001 From: ljedrz Date: Thu, 10 Jul 2025 10:27:11 +0200 Subject: [PATCH 5/6] deps: update snarkVM Signed-off-by: ljedrz --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 5f9837df12..0157d69c72 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -46,7 +46,7 @@ default-features = false [workspace.dependencies.snarkvm] #path = "../snarkVM" git = "https://github.com/ProvableHQ/snarkVM.git" -rev = "668b72b" +rev = "0e0e3c7" #version = "=3.8.0" default-features = false #features = [ "circuit", "console", "rocks" ] From 0394ea6cb1d3d1a00913652adf6263659c27b6c4 Mon Sep 17 00:00:00 2001 From: ljedrz Date: Fri, 11 Jul 2025 10:16:00 +0200 Subject: [PATCH 6/6] ci: run the db checkpoint test script before the devnet one Signed-off-by: ljedrz --- .circleci/config.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 5d00dcbafc..c8402b5797 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -195,6 +195,11 @@ commands: name: "Run devnet test" timeout: 20m # Allow 20 minutes total command: | + ./.circleci/db_backup_ci.sh # run the db checkpoint test script first, and clean the dev ledgers afterwards + snarkos clean --dev 0 + snarkos clean --dev 1 + snarkos clean --dev 2 + snarkos clean --dev 3 ./.circleci/devnet_ci.sh << parameters.validators >> << parameters.clients >> << parameters.network_id >> << parameters.min_height >> - clear_environment: cache_key: << parameters.cache_key >>