From 01faa668fb77099d69f0fd64da8bb7c4bd76743d Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Sat, 10 Dec 2022 22:04:02 +0900 Subject: [PATCH 01/43] Add fully-reproducible online tracer for banking --- Cargo.lock | 9 + banking-bench/src/main.rs | 27 +- core/Cargo.toml | 1 + core/benches/banking_stage.rs | 25 +- core/benches/banking_trace.rs | 148 ++++++ core/benches/sigverify_stage.rs | 16 +- core/src/banking_stage.rs | 110 ++-- core/src/banking_trace.rs | 497 ++++++++++++++++++ core/src/cluster_info_vote_listener.rs | 9 +- core/src/leader_slot_banking_stage_metrics.rs | 57 +- core/src/lib.rs | 1 + core/src/packet_deserializer.rs | 75 ++- core/src/sigverify.rs | 18 +- core/src/sigverify_stage.rs | 20 +- core/src/tpu.rs | 10 +- core/src/validator.rs | 11 + ledger/src/blockstore.rs | 5 +- local-cluster/src/validator_configs.rs | 1 + perf/src/cuda_runtime.rs | 4 +- perf/src/packet.rs | 4 +- sdk/src/packet.rs | 35 +- validator/src/cli.rs | 13 + validator/src/main.rs | 10 + 23 files changed, 985 insertions(+), 121 deletions(-) create mode 100644 core/benches/banking_trace.rs create mode 100644 core/src/banking_trace.rs diff --git a/Cargo.lock b/Cargo.lock index a6b82eecf00..a7da1106b1c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4081,6 +4081,14 @@ dependencies = [ "librocksdb-sys", ] +[[package]] +name = "rolling-file" +version = "0.1.0" +source = "git+https://github.com/ryoqun/rolling-file-rs?rev=664504e77fad7292dedf5a119d950b80aef67934#664504e77fad7292dedf5a119d950b80aef67934" +dependencies = [ + "chrono", +] + [[package]] name = "rpassword" version = "7.0.0" @@ -5190,6 +5198,7 @@ dependencies = [ "rand_chacha 0.2.2", "raptorq", "rayon", + "rolling-file", "rustc_version 0.4.0", "serde", "serde_derive", diff --git a/banking-bench/src/main.rs b/banking-bench/src/main.rs index afeec1dfa56..3eb411209fd 100644 --- a/banking-bench/src/main.rs +++ b/banking-bench/src/main.rs @@ -6,7 +6,10 @@ use { rand::{thread_rng, Rng}, rayon::prelude::*, solana_client::connection_cache::ConnectionCache, - solana_core::banking_stage::BankingStage, + solana_core::{ + banking_stage::BankingStage, + banking_trace::{BankingPacketBatch, BankingTracer, DEFAULT_BANKING_TRACE_SIZE}, + }, solana_gossip::cluster_info::{ClusterInfo, Node}, solana_ledger::{ blockstore::Blockstore, @@ -255,6 +258,12 @@ fn main() { .takes_value(false) .help("Skip transaction sanity execution"), ) + .arg( + Arg::new("trace_banking") + .long("trace-banking") + .takes_value(false) + .help("Enable banking tracing"), + ) .arg( Arg::new("write_lock_contention") .long("write-lock-contention") @@ -407,9 +416,16 @@ fn main() { let leader_schedule_cache = Arc::new(LeaderScheduleCache::new_from_bank(&bank)); let (exit, poh_recorder, poh_service, signal_receiver) = create_test_recorder(&bank, &blockstore, None, Some(leader_schedule_cache)); - let (non_vote_sender, non_vote_receiver) = unbounded(); - let (tpu_vote_sender, tpu_vote_receiver) = unbounded(); - let (gossip_vote_sender, gossip_vote_receiver) = unbounded(); + let banking_tracer = BankingTracer::new(matches.is_present("trace_banking").then_some(( + blockstore.banking_tracer_path(), + exit.clone(), + DEFAULT_BANKING_TRACE_SIZE, + ))) + .unwrap(); + let (non_vote_sender, non_vote_receiver) = banking_tracer.create_channel_non_vote(); + let (tpu_vote_sender, tpu_vote_receiver) = banking_tracer.create_channel_tpu_vote(); + let (gossip_vote_sender, gossip_vote_receiver) = + banking_tracer.create_channel_gossip_vote(); let cluster_info = { let keypair = Arc::new(Keypair::new()); let node = Node::new_localhost_with_pubkey(&keypair.pubkey()); @@ -433,6 +449,7 @@ fn main() { None, Arc::new(connection_cache), bank_forks.clone(), + banking_tracer, ); poh_recorder.write().unwrap().set_bank(&bank, false); @@ -462,7 +479,7 @@ fn main() { timestamp(), ); non_vote_sender - .send((vec![packet_batch.clone()], None)) + .send(BankingPacketBatch::new((vec![packet_batch.clone()], None))) .unwrap(); } diff --git a/core/Cargo.toml b/core/Cargo.toml index ef8bf57f2a0..363235ee26a 100644 --- a/core/Cargo.toml +++ b/core/Cargo.toml @@ -34,6 +34,7 @@ num_enum = "0.5.7" rand = "0.7.0" rand_chacha = "0.2.2" rayon = "1.5.3" +rolling-file = { git = "https://github.com/ryoqun/rolling-file-rs", rev = "664504e77fad7292dedf5a119d950b80aef67934" } serde = "1.0.144" serde_derive = "1.0.103" solana-address-lookup-table-program = { path = "../programs/address-lookup-table", version = "=1.15.0" } diff --git a/core/benches/banking_stage.rs b/core/benches/banking_stage.rs index bc4539c2d9f..f1a31e135bc 100644 --- a/core/benches/banking_stage.rs +++ b/core/benches/banking_stage.rs @@ -11,6 +11,7 @@ use { solana_client::connection_cache::ConnectionCache, solana_core::{ banking_stage::{BankingStage, BankingStageStats}, + banking_trace::{BankingPacketBatch, BankingTracer}, leader_slot_banking_stage_metrics::LeaderSlotMetricsTracker, qos_service::QosService, unprocessed_packet_batches::*, @@ -102,7 +103,7 @@ fn bench_consume_buffered(bencher: &mut Bencher) { &BankingStageStats::default(), &recorder, &QosService::new(1), - &mut LeaderSlotMetricsTracker::new(0), + &mut LeaderSlotMetricsTracker::new_for_test(), None, ); }); @@ -197,9 +198,10 @@ fn bench_banking(bencher: &mut Bencher, tx_type: TransactionType) { // during the benchmark genesis_config.ticks_per_slot = 10_000; - let (non_vote_sender, non_vote_receiver) = unbounded(); - let (tpu_vote_sender, tpu_vote_receiver) = unbounded(); - let (gossip_vote_sender, gossip_vote_receiver) = unbounded(); + let banking_tracer = BankingTracer::new_disabled(); + let (non_vote_sender, non_vote_receiver) = banking_tracer.create_channel_non_vote(); + let (tpu_vote_sender, tpu_vote_receiver) = banking_tracer.create_channel_tpu_vote(); + let (gossip_vote_sender, gossip_vote_receiver) = banking_tracer.create_channel_gossip_vote(); let mut bank = Bank::new_for_benches(&genesis_config); // Allow arbitrary transaction processing time for the purposes of this bench @@ -288,6 +290,7 @@ fn bench_banking(bencher: &mut Bencher, tx_type: TransactionType) { None, Arc::new(ConnectionCache::default()), bank_forks, + banking_tracer, ); poh_recorder.write().unwrap().set_bank(&bank, false); @@ -304,10 +307,16 @@ fn bench_banking(bencher: &mut Bencher, tx_type: TransactionType) { let mut sent = 0; if let Some(vote_packets) = &vote_packets { tpu_vote_sender - .send((vote_packets[start..start + chunk_len].to_vec(), None)) + .send(BankingPacketBatch::new(( + vote_packets[start..start + chunk_len].to_vec(), + None, + ))) .unwrap(); gossip_vote_sender - .send((vote_packets[start..start + chunk_len].to_vec(), None)) + .send(BankingPacketBatch::new(( + vote_packets[start..start + chunk_len].to_vec(), + None, + ))) .unwrap(); } for v in verified[start..start + chunk_len].chunks(chunk_len / num_threads) { @@ -321,7 +330,9 @@ fn bench_banking(bencher: &mut Bencher, tx_type: TransactionType) { for xv in v { sent += xv.len(); } - non_vote_sender.send((v.to_vec(), None)).unwrap(); + non_vote_sender + .send(BankingPacketBatch::new((v.to_vec(), None))) + .unwrap(); } check_txs(&signal_receiver2, txes / CHUNKS); diff --git a/core/benches/banking_trace.rs b/core/benches/banking_trace.rs new file mode 100644 index 00000000000..54224bf6f6a --- /dev/null +++ b/core/benches/banking_trace.rs @@ -0,0 +1,148 @@ +#![feature(test)] + +extern crate test; + +use { + solana_core::banking_trace::{ + drop_and_clean_temp_dir_unless_suppressed, sample_packet_batch, + sender_overhead_minimized_receiver_loop, terminate_tracer, BankingPacketBatch, + BankingTracer, TraceError, TracerThreadResult, DEFAULT_BANKING_TRACE_SIZE, + }, + std::{ + path::PathBuf, + sync::{atomic::AtomicBool, Arc}, + thread, + }, + tempfile::TempDir, + test::Bencher, +}; + +fn ensure_fresh_setup_to_benchmark(path: &PathBuf) { + // make sure fresh setup; otherwise banking tracer appends and rotates + // trace files created by prior bench iterations, slightly skewing perf + // result... + BankingTracer::ensure_cleanup_path(path).unwrap(); +} + +fn black_box_packet_batch(packet_batch: BankingPacketBatch) -> TracerThreadResult { + test::black_box(packet_batch); + Ok(()) +} + +#[bench] +fn bench_banking_tracer_main_thread_overhead_noop_baseline(bencher: &mut Bencher) { + let exit = Arc::::default(); + let tracer = BankingTracer::new_disabled(); + let (non_vote_sender, non_vote_receiver) = tracer.create_channel_non_vote(); + + let exit_for_dummy_thread = exit.clone(); + let dummy_main_thread = thread::spawn(move || { + sender_overhead_minimized_receiver_loop::<_, TraceError, 0>( + exit_for_dummy_thread, + non_vote_receiver, + black_box_packet_batch, + ) + }); + + let packet_batch = sample_packet_batch(); + bencher.iter(|| { + non_vote_sender.send(packet_batch.clone()).unwrap(); + }); + terminate_tracer(tracer, dummy_main_thread, non_vote_sender, Some(exit)); +} + +#[bench] +fn bench_banking_tracer_main_thread_overhead_under_peak_write(bencher: &mut Bencher) { + let temp_dir = TempDir::new().unwrap(); + + let exit = Arc::::default(); + let tracer = BankingTracer::new(Some(( + temp_dir.path().join("banking-trace"), + exit.clone(), + DEFAULT_BANKING_TRACE_SIZE, + ))) + .unwrap(); + let (non_vote_sender, non_vote_receiver) = tracer.create_channel_non_vote(); + + let exit_for_dummy_thread = exit.clone(); + let dummy_main_thread = thread::spawn(move || { + sender_overhead_minimized_receiver_loop::<_, TraceError, 0>( + exit_for_dummy_thread, + non_vote_receiver, + black_box_packet_batch, + ) + }); + + let packet_batch = sample_packet_batch(); + bencher.iter(|| { + non_vote_sender.send(packet_batch.clone()).unwrap(); + }); + + terminate_tracer(tracer, dummy_main_thread, non_vote_sender, Some(exit)); + drop_and_clean_temp_dir_unless_suppressed(temp_dir); +} + +#[bench] +fn bench_banking_tracer_main_thread_overhead_under_sustained_write(bencher: &mut Bencher) { + let temp_dir = TempDir::new().unwrap(); + + let exit = Arc::::default(); + let tracer = BankingTracer::new(Some(( + temp_dir.path().join("banking-trace"), + exit.clone(), + 1024 * 1024, // cause more frequent trace file rotation + ))) + .unwrap(); + let (non_vote_sender, non_vote_receiver) = tracer.create_channel_non_vote(); + + let exit_for_dummy_thread = exit.clone(); + let dummy_main_thread = thread::spawn(move || { + sender_overhead_minimized_receiver_loop::<_, TraceError, 0>( + exit_for_dummy_thread, + non_vote_receiver, + black_box_packet_batch, + ) + }); + + let packet_batch = sample_packet_batch(); + bencher.iter(|| { + non_vote_sender.send(packet_batch.clone()).unwrap(); + }); + + terminate_tracer(tracer, dummy_main_thread, non_vote_sender, Some(exit)); + drop_and_clean_temp_dir_unless_suppressed(temp_dir); +} + +#[bench] +fn bench_banking_tracer_background_thread_throughput(bencher: &mut Bencher) { + let temp_dir = TempDir::new().unwrap(); + let base_path = temp_dir.path(); + + let packet_batch = sample_packet_batch(); + + bencher.iter(move || { + let path = base_path.join("banking-trace"); + ensure_fresh_setup_to_benchmark(&path); + + let exit = Arc::::default(); + + let tracer = BankingTracer::new(Some((path, exit.clone(), 50 * 1024 * 1024))).unwrap(); + let (non_vote_sender, non_vote_receiver) = tracer.create_channel_non_vote(); + + let dummy_main_thread = thread::spawn(move || { + sender_overhead_minimized_receiver_loop::<_, TraceError, 0>( + exit.clone(), + non_vote_receiver, + black_box_packet_batch, + ) + }); + + for _ in 0..1000 { + non_vote_sender.send(packet_batch.clone()).unwrap(); + } + + terminate_tracer(tracer, dummy_main_thread, non_vote_sender, None); + }); + + drop_and_clean_temp_dir_unless_suppressed(temp_dir); +} diff --git a/core/benches/sigverify_stage.rs b/core/benches/sigverify_stage.rs index 90903889480..2eb03c52395 100644 --- a/core/benches/sigverify_stage.rs +++ b/core/benches/sigverify_stage.rs @@ -12,6 +12,7 @@ use { thread_rng, Rng, }, solana_core::{ + banking_trace::BankingTracer, sigverify::TransactionSigVerifier, sigverify_stage::{SigVerifier, SigVerifyStage}, }, @@ -147,7 +148,7 @@ fn bench_sigverify_stage(bencher: &mut Bencher) { solana_logger::setup(); trace!("start"); let (packet_s, packet_r) = unbounded(); - let (verified_s, verified_r) = unbounded(); + let (verified_s, verified_r) = BankingTracer::channel_for_test(); let verifier = TransactionSigVerifier::new(verified_s); let stage = SigVerifyStage::new(packet_r, verifier, "bench"); @@ -170,18 +171,21 @@ fn bench_sigverify_stage(bencher: &mut Bencher) { } let mut received = 0; trace!("sent: {}", sent_len); + let mut messages = vec![]; loop { - if let Ok((mut verifieds, _)) = verified_r.recv_timeout(Duration::from_millis(10)) { - while let Some(v) = verifieds.pop() { + if let Ok(message) = verified_r.recv_timeout(Duration::from_millis(10)) { + let (verifieds, _) = &*message; + for v in verifieds.iter().rev() { received += v.len(); - batches.push(v); } + messages.push(message); if use_same_tx || received >= sent_len { break; } } } trace!("received: {}", received); + test::black_box(messages); }); stage.join().unwrap(); } @@ -224,8 +228,8 @@ fn prepare_batches(discard_factor: i32) -> (Vec, usize) { fn bench_shrink_sigverify_stage_core(bencher: &mut Bencher, discard_factor: i32) { let (batches0, num_valid_packets) = prepare_batches(discard_factor); - let (_verified_s, _verified_r) = unbounded(); - let verifier = TransactionSigVerifier::new(_verified_s); + let (verified_s, _verified_r) = BankingTracer::channel_for_test(); + let verifier = TransactionSigVerifier::new(verified_s); let mut c = 0; let mut total_shrink_time = 0; diff --git a/core/src/banking_stage.rs b/core/src/banking_stage.rs index 7ec397acb56..38d0745b929 100644 --- a/core/src/banking_stage.rs +++ b/core/src/banking_stage.rs @@ -8,6 +8,7 @@ use { packet_receiver::PacketReceiver, }, crate::{ + banking_trace::{BankingPacketReceiver, BankingTracer, TracerThreadResult}, forward_packet_batches_by_accounts::ForwardPacketBatchesByAccounts, immutable_deserialized_packet::ImmutableDeserializedPacket, latest_unprocessed_votes::{LatestUnprocessedVotes, VoteSource}, @@ -18,7 +19,6 @@ use { next_leader::{next_leader_tpu_forwards, next_leader_tpu_vote}, packet_deserializer::PacketDeserializer, qos_service::QosService, - sigverify::SigverifyTracerPacketStats, tracer_packet_stats::TracerPacketStats, unprocessed_packet_batches::*, unprocessed_transaction_storage::{ @@ -26,9 +26,7 @@ use { }, }, core::iter::repeat, - crossbeam_channel::{ - Receiver as CrossbeamReceiver, RecvTimeoutError, Sender as CrossbeamSender, - }, + crossbeam_channel::RecvTimeoutError, histogram::Histogram, itertools::Itertools, solana_client::{connection_cache::ConnectionCache, tpu_connection::TpuConnection}, @@ -41,7 +39,7 @@ use { solana_metrics::inc_new_counter_info, solana_perf::{ data_budget::DataBudget, - packet::{Packet, PacketBatch, PACKETS_PER_BATCH}, + packet::{Packet, PACKETS_PER_BATCH}, }, solana_poh::poh_recorder::{BankStart, PohRecorder, PohRecorderError, TransactionRecorder}, solana_program_runtime::timings::ExecuteTimings, @@ -100,9 +98,6 @@ const MIN_THREADS_BANKING: u32 = 1; const MIN_TOTAL_THREADS: u32 = NUM_VOTE_PROCESSING_THREADS + MIN_THREADS_BANKING; const SLOT_BOUNDARY_CHECK_PERIOD: Duration = Duration::from_millis(10); -pub type BankingPacketBatch = (Vec, Option); -pub type BankingPacketSender = CrossbeamSender; -pub type BankingPacketReceiver = CrossbeamReceiver; pub struct ProcessTransactionBatchOutput { // The number of transactions filtered out by the cost model @@ -354,6 +349,7 @@ pub struct BatchedTransactionErrorDetails { /// Stores the stage's thread handle and output receiver. pub struct BankingStage { bank_thread_hdls: Vec>, + tracer_thread_hdl: Option>, } #[derive(Debug, Clone)] @@ -386,6 +382,7 @@ impl BankingStage { log_messages_bytes_limit: Option, connection_cache: Arc, bank_forks: Arc>, + banking_tracer: BankingTracer, ) -> Self { Self::new_num_threads( cluster_info, @@ -399,6 +396,7 @@ impl BankingStage { log_messages_bytes_limit, connection_cache, bank_forks, + banking_tracer, ) } @@ -415,6 +413,7 @@ impl BankingStage { log_messages_bytes_limit: Option, connection_cache: Arc, bank_forks: Arc>, + banking_tracer: BankingTracer, ) -> Self { assert!(num_threads >= MIN_TOTAL_THREADS); // Single thread to generate entries from many banks. @@ -433,6 +432,7 @@ impl BankingStage { .is_active(&allow_votes_to_directly_update_vote_state::id()) }) .unwrap_or(false); + let (tracer_thread_hdl, banking_tracer) = banking_tracer.finalize_under_arc(); // Many banks that process transactions in parallel. let bank_thread_hdls: Vec> = (0..num_threads) .map(|i| { @@ -484,6 +484,7 @@ impl BankingStage { let data_budget = data_budget.clone(); let connection_cache = connection_cache.clone(); let bank_forks = bank_forks.clone(); + let banking_tracer = banking_tracer.clone(); Builder::new() .name(format!("solBanknStgTx{i:02}")) .spawn(move || { @@ -500,12 +501,16 @@ impl BankingStage { connection_cache, &bank_forks, unprocessed_transaction_storage, + banking_tracer, ); }) .unwrap() }) .collect(); - Self { bank_thread_hdls } + Self { + bank_thread_hdls, + tracer_thread_hdl, + } } /// Forwards all valid, unprocessed packets in the buffer, up to a rate limit. Returns @@ -765,6 +770,7 @@ impl BankingStage { connection_cache: &ConnectionCache, tracer_packet_stats: &mut TracerPacketStats, bank_forks: &Arc>, + unreceived_batch_count: usize, ) { if unprocessed_transaction_storage.should_not_process() { return; @@ -960,6 +966,7 @@ impl BankingStage { connection_cache: Arc, bank_forks: &Arc>, mut unprocessed_transaction_storage: UnprocessedTransactionStorage, + banking_tracer: Arc, ) { let recorder = poh_recorder.read().unwrap().recorder(); let socket = UdpSocket::bind("0.0.0.0:0").unwrap(); @@ -967,7 +974,7 @@ impl BankingStage { let mut tracer_packet_stats = TracerPacketStats::new(id); let qos_service = QosService::new(id); - let mut slot_metrics_tracker = LeaderSlotMetricsTracker::new(id); + let mut slot_metrics_tracker = LeaderSlotMetricsTracker::new(id, banking_tracer); let mut last_metrics_update = Instant::now(); loop { @@ -993,6 +1000,7 @@ impl BankingStage { &connection_cache, &mut tracer_packet_stats, bank_forks, + packet_deserializer.unreceived_batch_count(), ), "process_buffered_packets", ); @@ -1740,6 +1748,11 @@ impl BankingStage { for bank_thread_hdl in self.bank_thread_hdls { bank_thread_hdl.join()?; } + if let Some(tracer_thread_hdl) = self.tracer_thread_hdl { + if let Err(tracer_result) = tracer_thread_hdl.join()? { + error!("tracer thread error: {:?}", tracer_result); + } + } Ok(()) } } @@ -1748,7 +1761,7 @@ impl BankingStage { mod tests { use { super::*, - crate::unprocessed_packet_batches, + crate::{banking_trace::BankingPacketBatch, unprocessed_packet_batches}, crossbeam_channel::{unbounded, Receiver}, solana_address_lookup_table_program::state::{AddressLookupTable, LookupTableMeta}, solana_entry::entry::{next_entry, next_versioned_entry, Entry, EntrySlice}, @@ -1761,7 +1774,7 @@ mod tests { get_tmp_ledger_path_auto_delete, leader_schedule_cache::LeaderScheduleCache, }, - solana_perf::packet::{to_packet_batches, PacketFlags}, + solana_perf::packet::{to_packet_batches, PacketBatch, PacketFlags}, solana_poh::{ poh_recorder::{create_test_recorder, Record, WorkingBankEntry}, poh_service::PohService, @@ -1812,9 +1825,11 @@ mod tests { let bank = Bank::new_no_wallclock_throttle_for_tests(&genesis_config); let bank_forks = Arc::new(RwLock::new(BankForks::new(bank))); let bank = Arc::new(bank_forks.read().unwrap().get(0).unwrap()); - let (non_vote_sender, non_vote_receiver) = unbounded(); - let (tpu_vote_sender, tpu_vote_receiver) = unbounded(); - let (gossip_vote_sender, gossip_vote_receiver) = unbounded(); + let banking_tracer = BankingTracer::new_disabled(); + let (non_vote_sender, non_vote_receiver) = banking_tracer.create_channel_non_vote(); + let (tpu_vote_sender, tpu_vote_receiver) = banking_tracer.create_channel_tpu_vote(); + let (gossip_vote_sender, gossip_vote_receiver) = + banking_tracer.create_channel_gossip_vote(); let ledger_path = get_tmp_ledger_path_auto_delete!(); { let blockstore = Arc::new( @@ -1838,6 +1853,7 @@ mod tests { None, Arc::new(ConnectionCache::default()), bank_forks, + banking_tracer, ); drop(non_vote_sender); drop(tpu_vote_sender); @@ -1861,9 +1877,11 @@ mod tests { let bank_forks = Arc::new(RwLock::new(BankForks::new(bank))); let bank = Arc::new(bank_forks.read().unwrap().get(0).unwrap()); let start_hash = bank.last_blockhash(); - let (non_vote_sender, non_vote_receiver) = unbounded(); - let (tpu_vote_sender, tpu_vote_receiver) = unbounded(); - let (gossip_vote_sender, gossip_vote_receiver) = unbounded(); + let banking_tracer = BankingTracer::new_disabled(); + let (non_vote_sender, non_vote_receiver) = banking_tracer.create_channel_non_vote(); + let (tpu_vote_sender, tpu_vote_receiver) = banking_tracer.create_channel_tpu_vote(); + let (gossip_vote_sender, gossip_vote_receiver) = + banking_tracer.create_channel_gossip_vote(); let ledger_path = get_tmp_ledger_path_auto_delete!(); { let blockstore = Arc::new( @@ -1891,6 +1909,7 @@ mod tests { None, Arc::new(ConnectionCache::default()), bank_forks, + banking_tracer, ); trace!("sending bank"); drop(non_vote_sender); @@ -1937,9 +1956,11 @@ mod tests { let bank_forks = Arc::new(RwLock::new(BankForks::new(bank))); let bank = Arc::new(bank_forks.read().unwrap().get(0).unwrap()); let start_hash = bank.last_blockhash(); - let (non_vote_sender, non_vote_receiver) = unbounded(); - let (tpu_vote_sender, tpu_vote_receiver) = unbounded(); - let (gossip_vote_sender, gossip_vote_receiver) = unbounded(); + let banking_tracer = BankingTracer::new_disabled(); + let (non_vote_sender, non_vote_receiver) = banking_tracer.create_channel_non_vote(); + let (tpu_vote_sender, tpu_vote_receiver) = banking_tracer.create_channel_tpu_vote(); + let (gossip_vote_sender, gossip_vote_receiver) = + banking_tracer.create_channel_gossip_vote(); let ledger_path = get_tmp_ledger_path_auto_delete!(); { let blockstore = Arc::new( @@ -1969,6 +1990,7 @@ mod tests { None, Arc::new(ConnectionCache::default()), bank_forks, + banking_tracer, ); // fund another account so we can send 2 good transactions in a single batch. @@ -2002,7 +2024,7 @@ mod tests { .collect(); let packet_batches = convert_from_old_verified(packet_batches); non_vote_sender // no_ver, anf, tx - .send((packet_batches, None)) + .send(BankingPacketBatch::new((packet_batches, None))) .unwrap(); drop(non_vote_sender); @@ -2061,7 +2083,8 @@ mod tests { mint_keypair, .. } = create_slow_genesis_config(2); - let (non_vote_sender, non_vote_receiver) = unbounded(); + let banking_tracer = BankingTracer::new_disabled(); + let (non_vote_sender, non_vote_receiver) = banking_tracer.create_channel_non_vote(); // Process a batch that includes a transaction that receives two lamports. let alice = Keypair::new(); @@ -2074,7 +2097,9 @@ mod tests { .map(|batch| (batch, vec![1u8])) .collect(); let packet_batches = convert_from_old_verified(packet_batches); - non_vote_sender.send((packet_batches, None)).unwrap(); + non_vote_sender + .send(BankingPacketBatch::new((packet_batches, None))) + .unwrap(); // Process a second batch that uses the same from account, so conflicts with above TX let tx = @@ -2085,10 +2110,13 @@ mod tests { .map(|batch| (batch, vec![1u8])) .collect(); let packet_batches = convert_from_old_verified(packet_batches); - non_vote_sender.send((packet_batches, None)).unwrap(); + non_vote_sender + .send(BankingPacketBatch::new((packet_batches, None))) + .unwrap(); - let (tpu_vote_sender, tpu_vote_receiver) = unbounded(); - let (gossip_vote_sender, gossip_vote_receiver) = unbounded(); + let (tpu_vote_sender, tpu_vote_receiver) = banking_tracer.create_channel_tpu_vote(); + let (gossip_vote_sender, gossip_vote_receiver) = + banking_tracer.create_channel_gossip_vote(); let ledger_path = get_tmp_ledger_path_auto_delete!(); { let (replay_vote_sender, _replay_vote_receiver) = unbounded(); @@ -2124,6 +2152,7 @@ mod tests { None, Arc::new(ConnectionCache::default()), bank_forks, + banking_tracer, ); // wait for banking_stage to eat the packets @@ -2651,7 +2680,7 @@ mod tests { } fn simulate_poh( - record_receiver: CrossbeamReceiver, + record_receiver: Receiver, poh_recorder: &Arc>, ) -> JoinHandle<()> { let poh_recorder = poh_recorder.clone(); @@ -3392,7 +3421,7 @@ mod tests { &BankingStageStats::default(), &recorder, &QosService::new(1), - &mut LeaderSlotMetricsTracker::new(0), + &mut LeaderSlotMetricsTracker::new_for_test(), None, ); assert!(buffered_packet_batches.is_empty()); @@ -3450,7 +3479,7 @@ mod tests { &BankingStageStats::default(), &recorder, &QosService::new(1), - &mut LeaderSlotMetricsTracker::new(0), + &mut LeaderSlotMetricsTracker::new_for_test(), None, ); assert!(buffered_packet_batches.is_empty()); @@ -3512,7 +3541,7 @@ mod tests { &BankingStageStats::default(), &recorder, &QosService::new(1), - &mut LeaderSlotMetricsTracker::new(0), + &mut LeaderSlotMetricsTracker::new_for_test(), None, ); @@ -3605,7 +3634,7 @@ mod tests { &socket, true, &data_budget, - &mut LeaderSlotMetricsTracker::new(0), + &mut LeaderSlotMetricsTracker::new_for_test(), &stats, &connection_cache, &mut TracerPacketStats::new(0), @@ -3702,7 +3731,7 @@ mod tests { &socket, hold, &DataBudget::default(), - &mut LeaderSlotMetricsTracker::new(0), + &mut LeaderSlotMetricsTracker::new_for_test(), &stats, &connection_cache, &mut TracerPacketStats::new(0), @@ -3780,9 +3809,11 @@ mod tests { let bank_forks = Arc::new(RwLock::new(BankForks::new(bank))); let bank = Arc::new(bank_forks.read().unwrap().get(0).unwrap()); let start_hash = bank.last_blockhash(); - let (non_vote_sender, non_vote_receiver) = unbounded(); - let (tpu_vote_sender, tpu_vote_receiver) = unbounded(); - let (gossip_vote_sender, gossip_vote_receiver) = unbounded(); + let banking_tracer = BankingTracer::new_disabled(); + let (non_vote_sender, non_vote_receiver) = banking_tracer.create_channel_non_vote(); + let (tpu_vote_sender, tpu_vote_receiver) = banking_tracer.create_channel_tpu_vote(); + let (gossip_vote_sender, gossip_vote_receiver) = + banking_tracer.create_channel_gossip_vote(); let ledger_path = get_tmp_ledger_path_auto_delete!(); { let blockstore = Arc::new( @@ -3812,6 +3843,7 @@ mod tests { None, Arc::new(ConnectionCache::default()), bank_forks, + banking_tracer, ); let keypairs = (0..100).map(|_| Keypair::new()).collect_vec(); @@ -3885,7 +3917,11 @@ mod tests { .into_iter() .map(|(packet_batches, sender)| { Builder::new() - .spawn(move || sender.send((packet_batches, None)).unwrap()) + .spawn(move || { + sender + .send(BankingPacketBatch::new((packet_batches, None))) + .unwrap() + }) .unwrap() }) .for_each(|handle| handle.join().unwrap()); diff --git a/core/src/banking_trace.rs b/core/src/banking_trace.rs new file mode 100644 index 00000000000..ba9083214fa --- /dev/null +++ b/core/src/banking_trace.rs @@ -0,0 +1,497 @@ +use { + crate::sigverify::SigverifyTracerPacketStats, + bincode::serialize_into, + chrono::{DateTime, Local}, + crossbeam_channel::{unbounded, Receiver, SendError, Sender, TryRecvError}, + rolling_file::{RollingCondition, RollingConditionBasic, RollingFileAppender}, + solana_perf::{ + packet::{to_packet_batches, PacketBatch}, + test_tx::test_tx, + }, + solana_sdk::slot_history::Slot, + std::{ + fs::{create_dir_all, remove_dir_all}, + io::{self, Write}, + path::PathBuf, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, + thread::{self, sleep, JoinHandle}, + time::{Duration, SystemTime}, + }, + tempfile::TempDir, + thiserror::Error, +}; + +pub type BankingPacketBatch = Arc<(Vec, Option)>; +pub type BankingPacketSender = TracedSender; +pub type BankingPacketReceiver = Receiver; +pub type TracerThreadResult = Result<(), TraceError>; + +#[derive(Error, Debug)] +pub enum TraceError { + #[error("IO Error: {0}")] + IoError(#[from] std::io::Error), + + #[error("Serialization Error: {0}")] + SerializeError(#[from] bincode::Error), + + #[error("Integer Cast Error: {0}")] + IntegerCastError(#[from] std::num::TryFromIntError), + + #[error("Trace size is too small (must be larger than {1}): {0}")] + TooSmallTraceSize(u64, u64), +} + +const BASENAME: &str = "events"; +const TRACE_FILE_ROTATE_COUNT: u64 = 14; // target 2 weeks retention under normal load +const TRACE_FILE_WRITE_INTERVAL_MS: u64 = 100; +const BUF_WRITER_CAPACITY: usize = 10 * 1024 * 1024; +pub const TRACE_FILE_DEFAULT_ROTATE_BYTE_THRESHOLD: u64 = 1024 * 1024 * 1024; +pub const EMPTY_BANKING_TRACE_SIZE: u64 = 0; +pub const DEFAULT_BANKING_TRACE_SIZE: u64 = + TRACE_FILE_DEFAULT_ROTATE_BYTE_THRESHOLD * TRACE_FILE_ROTATE_COUNT; + +#[allow(clippy::type_complexity)] +#[derive(Debug)] +pub struct BankingTracer { + enabled_tracer: Option<( + Sender, + Option>, + Arc, + )>, +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct TimedTracedEvent(std::time::SystemTime, TracedEvent); + +#[derive(Serialize, Deserialize, Debug)] +enum TracedEvent { + Bank(Slot, u32, BankStatus, usize), + PacketBatch(ChannelLabel, BankingPacketBatch), +} + +#[derive(Serialize, Deserialize, Debug)] +enum BankStatus { + Started, + Ended, +} + +#[derive(Serialize, Deserialize, Debug, Clone, Copy)] +pub enum ChannelLabel { + NonVote, + TpuVote, + GossipVote, + Dummy, +} + +struct RollingConditionGrouped { + basic: RollingConditionBasic, + is_checked: bool, +} + +impl RollingConditionGrouped { + fn new(basic: RollingConditionBasic) -> Self { + Self { + basic, + is_checked: bool::default(), + } + } + + fn reset(&mut self) { + self.is_checked = false; + } +} + +struct GroupedWriter<'a> { + now: DateTime, + underlying: &'a mut RollingFileAppender, +} + +impl<'a> GroupedWriter<'a> { + fn new(underlying: &'a mut RollingFileAppender) -> Self { + Self { + now: Local::now(), + underlying, + } + } +} + +impl RollingCondition for RollingConditionGrouped { + fn should_rollover(&mut self, now: &DateTime, current_filesize: u64) -> bool { + if !self.is_checked { + self.is_checked = true; + self.basic.should_rollover(now, current_filesize) + } else { + false + } + } +} + +impl<'a> Write for GroupedWriter<'a> { + fn write(&mut self, buf: &[u8]) -> std::result::Result { + self.underlying.write_with_datetime(buf, &self.now) + } + fn flush(&mut self) -> std::result::Result<(), io::Error> { + self.underlying.flush() + } +} + +pub fn sender_overhead_minimized_receiver_loop( + exit: Arc, + receiver: Receiver, + mut on_recv: impl FnMut(T) -> Result<(), E>, +) -> Result<(), E> { + 'outer: while !exit.load(Ordering::Relaxed) { + 'inner: loop { + // avoid futex-based blocking here, otherwise a sender would have to + // wake me up at a syscall cost... + match receiver.try_recv() { + Ok(message) => on_recv(message)?, + Err(TryRecvError::Empty) => break 'inner, + Err(TryRecvError::Disconnected) => { + assert_eq!(receiver.len(), 0); + break 'outer; + } + }; + if exit.load(Ordering::Relaxed) { + break 'outer; + } + } + sleep(Duration::from_millis(SLEEP_MS)); + } + + Ok(()) +} + +impl BankingTracer { + pub fn new(maybe_config: Option<(PathBuf, Arc, u64)>) -> Result { + let enabled_tracer = maybe_config + .map(|(path, exit, total_size)| -> Result<_, TraceError> { + let rotate_threshold_size = total_size / TRACE_FILE_ROTATE_COUNT; + if rotate_threshold_size == 0 { + return Err(TraceError::TooSmallTraceSize( + total_size, + TRACE_FILE_ROTATE_COUNT, + )); + } + + let (trace_sender, trace_receiver) = unbounded(); + + Self::ensure_prepare_path(&path)?; + let file_appender = Self::create_file_appender(path, rotate_threshold_size)?; + + let tracing_thread = + Self::spawn_background_thread(trace_receiver, file_appender, exit.clone())?; + + Ok((trace_sender, Some(tracing_thread), exit)) + }) + .transpose()?; + + Ok(Self { enabled_tracer }) + } + + pub fn new_disabled() -> Self { + Self { + enabled_tracer: None, + } + } + + fn create_channel(&self, label: ChannelLabel) -> (BankingPacketSender, BankingPacketReceiver) { + Self::channel( + label, + self.enabled_tracer + .as_ref() + .map(|(sender, _, exit)| (sender.clone(), exit.clone())), + ) + } + + pub fn create_channel_non_vote(&self) -> (BankingPacketSender, BankingPacketReceiver) { + self.create_channel(ChannelLabel::NonVote) + } + + pub fn create_channel_tpu_vote(&self) -> (BankingPacketSender, BankingPacketReceiver) { + self.create_channel(ChannelLabel::TpuVote) + } + + pub fn create_channel_gossip_vote(&self) -> (BankingPacketSender, BankingPacketReceiver) { + self.create_channel(ChannelLabel::GossipVote) + } + + pub fn finalize_under_arc(mut self) -> (Option>, Arc) { + ( + self.enabled_tracer + .as_mut() + .and_then(|(_, tracer_thread, _)| tracer_thread.take()), + Arc::new(self), + ) + } + + fn bank_event(&self, slot: Slot, id: u32, status: BankStatus, unreceived_batch_count: usize) { + if let Some((sender, _, exit)) = &self.enabled_tracer { + if !exit.load(Ordering::Relaxed) { + sender + .send(TimedTracedEvent( + SystemTime::now(), + TracedEvent::Bank(slot, id, status, unreceived_batch_count), + )) + .expect("active tracer thread unless exited"); + } + } + } + + pub fn bank_start(&self, slot: Slot, id: u32, unreceived_batch_count: usize) { + self.bank_event(slot, id, BankStatus::Started, unreceived_batch_count); + } + + pub fn bank_end(&self, slot: Slot, id: u32, unreceived_batch_count: usize) { + self.bank_event(slot, id, BankStatus::Ended, unreceived_batch_count); + } + + pub fn channel_for_test() -> (TracedSender, Receiver) { + Self::channel(ChannelLabel::Dummy, None) + } + + pub fn channel( + label: ChannelLabel, + trace_sender: Option<(Sender, Arc)>, + ) -> (TracedSender, Receiver) { + let (sender, receiver) = unbounded(); + (TracedSender::new(label, sender, trace_sender), receiver) + } + + fn ensure_prepare_path(path: &PathBuf) -> Result<(), io::Error> { + create_dir_all(path) + } + + pub fn ensure_cleanup_path(path: &PathBuf) -> Result<(), io::Error> { + remove_dir_all(path).or_else(|err| { + if err.kind() == io::ErrorKind::NotFound { + Ok(()) + } else { + Err(err) + } + }) + } + + fn create_file_appender( + path: PathBuf, + rotate_threshold_size: u64, + ) -> Result, TraceError> { + let grouped = RollingConditionGrouped::new( + RollingConditionBasic::new() + .daily() + .max_size(rotate_threshold_size), + ); + let appender = RollingFileAppender::new_with_buffer_capacity( + path.join(BASENAME), + grouped, + (TRACE_FILE_ROTATE_COUNT - 1).try_into()?, + BUF_WRITER_CAPACITY, + )?; + Ok(appender) + } + + fn spawn_background_thread( + trace_receiver: Receiver, + mut file_appender: RollingFileAppender, + exit: Arc, + ) -> Result, TraceError> { + let thread = thread::Builder::new().name("solBanknTracer".into()).spawn( + move || -> TracerThreadResult { + sender_overhead_minimized_receiver_loop::< + _, + TraceError, + TRACE_FILE_WRITE_INTERVAL_MS, + >(exit, trace_receiver, |event| -> Result<(), TraceError> { + file_appender.condition_mut().reset(); + serialize_into(&mut GroupedWriter::new(&mut file_appender), &event)?; + Ok(()) + })?; + file_appender.flush()?; + Ok(()) + }, + )?; + + Ok(thread) + } +} + +pub struct TracedSender { + label: ChannelLabel, + sender: Sender, + trace_sender: Option<(Sender, Arc)>, +} + +impl TracedSender { + fn new( + label: ChannelLabel, + sender: Sender, + trace_sender: Option<(Sender, Arc)>, + ) -> Self { + Self { + label, + sender, + trace_sender, + } + } + + pub fn send(&self, batch: BankingPacketBatch) -> Result<(), SendError> { + if let Some((trace_sender, exit)) = &self.trace_sender { + if !exit.load(Ordering::Relaxed) { + trace_sender + .send(TimedTracedEvent( + SystemTime::now(), + TracedEvent::PacketBatch(self.label, Arc::clone(&batch)), + )) + .expect("active tracer thread unless exited"); + } + } + self.sender.send(batch) + } +} + +pub fn sample_packet_batch() -> BankingPacketBatch { + BankingPacketBatch::new((to_packet_batches(&vec![test_tx(); 4], 10), None)) +} + +pub fn drop_and_clean_temp_dir_unless_suppressed(temp_dir: TempDir) { + std::env::var("BANKING_TRACE_LEAVE_FILES_FROM_LAST_ITERATION") + .is_ok() + .then(|| { + eprintln!("prevented to remove {:?}", temp_dir.path()); + drop(temp_dir.into_path()); + }); +} + +pub fn terminate_tracer( + tracer: BankingTracer, + main_thread: JoinHandle, + sender: TracedSender, + exit: Option>, +) { + if let Some(exit) = exit { + exit.store(true, Ordering::Relaxed); + } + let (tracer_thread, tracer) = tracer.finalize_under_arc(); + drop((sender, tracer)); + main_thread.join().unwrap().unwrap(); + if let Some(tracer_thread) = tracer_thread { + tracer_thread.join().unwrap().unwrap(); + } +} + +#[cfg(test)] +mod tests { + use { + super::*, + std::{fs::File, io::BufReader}, + }; + + #[test] + fn test_new_disabled() { + let exit = Arc::::default(); + + let tracer = BankingTracer::new_disabled(); + let (non_vote_sender, non_vote_receiver) = tracer.create_channel_non_vote(); + + let dummy_main_thread = thread::spawn(move || { + sender_overhead_minimized_receiver_loop::<_, TraceError, 0>( + exit, + non_vote_receiver, + |_packet_batch| Ok(()), + ) + }); + + non_vote_sender + .send(BankingPacketBatch::new((vec![], None))) + .unwrap(); + terminate_tracer(tracer, dummy_main_thread, non_vote_sender, None); + } + + #[test] + fn test_send_after_exited() { + let temp_dir = TempDir::new().unwrap(); + let path = temp_dir.path().join("banking-trace"); + let exit = Arc::::default(); + let tracer = BankingTracer::new(Some((path, exit.clone(), u64::max_value()))).unwrap(); + let (tracer_thread, tracer) = tracer.finalize_under_arc(); + let (non_vote_sender, non_vote_receiver) = tracer.create_channel_non_vote(); + + let exit_for_dummy_thread = Arc::::default(); + let exit_for_dummy_thread2 = exit_for_dummy_thread.clone(); + let dummy_main_thread = thread::spawn(move || { + sender_overhead_minimized_receiver_loop::<_, TraceError, 0>( + exit_for_dummy_thread, + non_vote_receiver, + |_packet_batch| Ok(()), + ) + }); + + // kill and join the tracer thread + exit.store(true, Ordering::Relaxed); + tracer_thread.unwrap().join().unwrap().unwrap(); + + // shouldn't panic + tracer.bank_end(1, 2, 3); + + drop(tracer); + + // shouldn't panic + non_vote_sender.send(sample_packet_batch()).unwrap(); + + // finally terminate and join the main thread + exit_for_dummy_thread2.store(true, Ordering::Relaxed); + dummy_main_thread.join().unwrap().unwrap(); + } + + #[test] + fn test_record_and_restore() { + let temp_dir = TempDir::new().unwrap(); + let path = temp_dir.path().join("banking-trace"); + let exit = Arc::::default(); + let tracer = + BankingTracer::new(Some((path.clone(), exit.clone(), u64::max_value()))).unwrap(); + let (non_vote_sender, non_vote_receiver) = tracer.create_channel_non_vote(); + + let dummy_main_thread = thread::spawn(move || { + sender_overhead_minimized_receiver_loop::<_, TraceError, 0>( + exit, + non_vote_receiver, + |_packet_batch| Ok(()), + ) + }); + + non_vote_sender.send(sample_packet_batch()).unwrap(); + tracer.bank_start(1, 2, 3); + + terminate_tracer(tracer, dummy_main_thread, non_vote_sender, None); + + let mut stream = BufReader::new(File::open(path.join(BASENAME)).unwrap()); + let results = (0..3) + .map(|_| bincode::deserialize_from::<_, TimedTracedEvent>(&mut stream)) + .collect::>(); + + assert_matches!( + results[0], + Ok(TimedTracedEvent( + _, + TracedEvent::PacketBatch(ChannelLabel::NonVote, _) + )) + ); + assert_matches!( + results[1], + Ok(TimedTracedEvent( + _, + TracedEvent::Bank(1, 2, BankStatus::Started, 3) + )) + ); + assert_matches!( + results[2], + Err(_) // in this way, rustfmt formats this line like the above + ); + + drop_and_clean_temp_dir_unless_suppressed(temp_dir); + } +} diff --git a/core/src/cluster_info_vote_listener.rs b/core/src/cluster_info_vote_listener.rs index 8e56a210edd..e250e59c49f 100644 --- a/core/src/cluster_info_vote_listener.rs +++ b/core/src/cluster_info_vote_listener.rs @@ -1,6 +1,6 @@ use { crate::{ - banking_stage::BankingPacketSender, + banking_trace::{BankingPacketBatch, BankingPacketSender}, optimistic_confirmation_verifier::OptimisticConfirmationVerifier, replay_stage::DUPLICATE_THRESHOLD, result::{Error, Result}, @@ -472,7 +472,8 @@ impl ClusterInfoVoteListener { for single_validator_votes in gossip_votes_iterator { bank_send_votes_stats.num_votes_sent += single_validator_votes.len(); bank_send_votes_stats.num_batches_sent += 1; - verified_packets_sender.send((single_validator_votes, None))?; + verified_packets_sender + .send(BankingPacketBatch::new((single_validator_votes, None)))?; } filter_gossip_votes_timing.stop(); bank_send_votes_stats.total_elapsed += filter_gossip_votes_timing.as_us(); @@ -871,6 +872,7 @@ impl ClusterInfoVoteListener { mod tests { use { super::*, + crate::banking_trace::BankingTracer, solana_perf::packet, solana_rpc::optimistically_confirmed_bank_tracker::OptimisticallyConfirmedBank, solana_runtime::{ @@ -1685,7 +1687,8 @@ mod tests { let current_leader_bank = Arc::new(Bank::new_for_tests(&genesis_config)); let mut bank_vote_sender_state_option: Option = None; let verified_vote_packets = VerifiedVotePackets::default(); - let (verified_packets_sender, _verified_packets_receiver) = unbounded(); + let (verified_packets_sender, _verified_packets_receiver) = + BankingTracer::channel_for_test(); // 1) If we hand over a `current_leader_bank`, vote sender state should be updated ClusterInfoVoteListener::check_for_leader_bank_and_send_votes( diff --git a/core/src/leader_slot_banking_stage_metrics.rs b/core/src/leader_slot_banking_stage_metrics.rs index 6e9898c33ff..b57e4d035f4 100644 --- a/core/src/leader_slot_banking_stage_metrics.rs +++ b/core/src/leader_slot_banking_stage_metrics.rs @@ -1,12 +1,12 @@ use { crate::{ - leader_slot_banking_stage_timing_metrics::*, + banking_trace::BankingTracer, leader_slot_banking_stage_timing_metrics::*, unprocessed_transaction_storage::InsertPacketBatchSummary, }, solana_poh::poh_recorder::BankStart, solana_runtime::transaction_error_metrics::*, solana_sdk::{clock::Slot, saturating_add_assign}, - std::time::Instant, + std::{sync::Arc, time::Instant}, }; /// A summary of what happened to transactions passed to the execution pipeline. @@ -358,16 +358,45 @@ pub struct LeaderSlotMetricsTracker { // otherwise `None` leader_slot_metrics: Option, id: u32, + banking_tracer: Arc, + unreceived_batch_count: usize, } impl LeaderSlotMetricsTracker { - pub fn new(id: u32) -> Self { + pub fn new(id: u32, banking_tracer: Arc) -> Self { Self { leader_slot_metrics: None, id, + banking_tracer, + unreceived_batch_count: 0, } } + pub fn refresh_unreceived_batch_count(&mut self, batch_count: usize) { + self.unreceived_batch_count = batch_count; + } + + pub fn new_for_test() -> Self { + Self::new(0, Arc::new(BankingTracer::new_disabled())) + } + + fn create_new_slot_metrics(&self, bank_start: &BankStart) -> Option { + let metrics = LeaderSlotMetrics::new( + self.id, + bank_start.working_bank.slot(), + &bank_start.bank_creation_time, + ); + self.banking_tracer + .bank_start(metrics.slot, self.id, self.unreceived_batch_count); + + Some(metrics) + } + + pub fn trace_bank_end(&self, metrics_slot: Slot) { + self.banking_tracer + .bank_end(metrics_slot, self.id, self.unreceived_batch_count); + } + // Check leader slot, return MetricsTrackerAction to be applied by apply_action() pub(crate) fn check_leader_slot_boundary( &mut self, @@ -377,28 +406,27 @@ impl LeaderSlotMetricsTracker { (None, None) => MetricsTrackerAction::Noop, (Some(leader_slot_metrics), None) => { + let slot = leader_slot_metrics.slot; leader_slot_metrics.mark_slot_end_detected(); + self.trace_bank_end(slot); + MetricsTrackerAction::ReportAndResetTracker } // Our leader slot has begain, time to create a new slot tracker (None, Some(bank_start)) => { - MetricsTrackerAction::NewTracker(Some(LeaderSlotMetrics::new( - self.id, - bank_start.working_bank.slot(), - &bank_start.bank_creation_time, - ))) + MetricsTrackerAction::NewTracker(self.create_new_slot_metrics(bank_start)) } (Some(leader_slot_metrics), Some(bank_start)) => { if leader_slot_metrics.slot != bank_start.working_bank.slot() { // Last slot has ended, new slot has began + let slot = leader_slot_metrics.slot; leader_slot_metrics.mark_slot_end_detected(); - MetricsTrackerAction::ReportAndNewTracker(Some(LeaderSlotMetrics::new( - self.id, - bank_start.working_bank.slot(), - &bank_start.bank_creation_time, - ))) + self.trace_bank_end(slot); + MetricsTrackerAction::ReportAndNewTracker( + self.create_new_slot_metrics(bank_start), + ) } else { MetricsTrackerAction::Noop } @@ -877,8 +905,7 @@ mod tests { bank_creation_time: Arc::new(Instant::now()), }; - let banking_stage_thread_id = 0; - let leader_slot_metrics_tracker = LeaderSlotMetricsTracker::new(banking_stage_thread_id); + let leader_slot_metrics_tracker = LeaderSlotMetricsTracker::new_for_test(); TestSlotBoundaryComponents { first_bank, diff --git a/core/src/lib.rs b/core/src/lib.rs index ff0b259246c..b74164b52e8 100644 --- a/core/src/lib.rs +++ b/core/src/lib.rs @@ -11,6 +11,7 @@ pub mod accounts_hash_verifier; pub mod ancestor_hashes_service; pub mod banking_stage; +pub mod banking_trace; pub mod broadcast_stage; pub mod cache_block_meta_service; pub mod cluster_info_vote_listener; diff --git a/core/src/packet_deserializer.rs b/core/src/packet_deserializer.rs index 8ae302ee3c4..06cec393991 100644 --- a/core/src/packet_deserializer.rs +++ b/core/src/packet_deserializer.rs @@ -2,17 +2,15 @@ use { crate::{ + banking_trace::{BankingPacketBatch, BankingPacketReceiver}, immutable_deserialized_packet::ImmutableDeserializedPacket, sigverify::SigverifyTracerPacketStats, }, - crossbeam_channel::{Receiver as CrossbeamReceiver, RecvTimeoutError}, + crossbeam_channel::RecvTimeoutError, solana_perf::packet::PacketBatch, std::time::{Duration, Instant}, }; -pub type BankingPacketBatch = (Vec, Option); -pub type BankingPacketReceiver = CrossbeamReceiver; - /// Results from deserializing packet batches. pub struct ReceivePacketResults { /// Deserialized packets from all received packet batches @@ -37,37 +35,53 @@ impl PacketDeserializer { } } + pub fn unreceived_batch_count(&self) -> usize { + self.packet_batch_receiver.len() + } + /// Handles receiving packet batches from sigverify and returns a vector of deserialized packets pub fn handle_received_packets( &self, recv_timeout: Duration, capacity: usize, ) -> Result { - let (packet_batches, sigverify_tracer_stats_option) = + let (packet_count, packet_batches, sigverify_tracer_stats_option) = self.receive_until(recv_timeout, capacity)?; Ok(Self::deserialize_and_collect_packets( + packet_count, &packet_batches, sigverify_tracer_stats_option, )) } + fn for_each_packet_batch( + banking_batches: &[BankingPacketBatch], + mut for_each: impl FnMut(&PacketBatch), + ) { + for banking_batch in banking_batches { + for batch in &banking_batch.0 { + for_each(batch) + } + } + } + /// Deserialize packet batches and collect them into ReceivePacketResults fn deserialize_and_collect_packets( - packet_batches: &[PacketBatch], + packet_count: usize, + packet_batches: &[BankingPacketBatch], sigverify_tracer_stats_option: Option, ) -> ReceivePacketResults { - let packet_count: usize = packet_batches.iter().map(|x| x.len()).sum(); let mut passed_sigverify_count: usize = 0; let mut failed_sigverify_count: usize = 0; let mut deserialized_packets = Vec::with_capacity(packet_count); - for packet_batch in packet_batches { + Self::for_each_packet_batch(packet_batches, |packet_batch| { let packet_indexes = Self::generate_packet_indexes(packet_batch); passed_sigverify_count += packet_indexes.len(); failed_sigverify_count += packet_batch.len().saturating_sub(packet_indexes.len()); deserialized_packets.extend(Self::deserialize_packets(packet_batch, &packet_indexes)); - } + }); ReceivePacketResults { deserialized_packets, @@ -82,25 +96,32 @@ impl PacketDeserializer { &self, recv_timeout: Duration, packet_count_upperbound: usize, - ) -> Result<(Vec, Option), RecvTimeoutError> { + ) -> Result< + ( + usize, + Vec, + Option, + ), + RecvTimeoutError, + > { let start = Instant::now(); - let (mut packet_batches_received_so_far, mut aggregated_tracer_packet_stats_option) = - self.packet_batch_receiver.recv_timeout(recv_timeout)?; - let mut num_packets_received = packet_batches_received_so_far + let message = self.packet_batch_receiver.recv_timeout(recv_timeout)?; + let (packet_batches, mut aggregated_tracer_packet_stats_option) = + (&message.0, message.1.clone()); + let mut num_packets_received = packet_batches .iter() .map(|batch| batch.len()) .sum::(); + let mut messages = vec![message]; - while let Ok((packet_batches, tracer_packet_stats_option)) = - self.packet_batch_receiver.try_recv() - { + while let Ok(message) = self.packet_batch_receiver.try_recv() { + let (packet_batches, tracer_packet_stats_option) = (&message.0, message.1.clone()); trace!("got more packet batches in packet deserializer"); num_packets_received += packet_batches .iter() .map(|batch| batch.len()) .sum::(); - packet_batches_received_so_far.extend(packet_batches); if let Some(tracer_packet_stats) = &tracer_packet_stats_option { if let Some(aggregated_tracer_packet_stats) = @@ -111,6 +132,7 @@ impl PacketDeserializer { aggregated_tracer_packet_stats_option = tracer_packet_stats_option; } } + messages.push(message); if start.elapsed() >= recv_timeout || num_packets_received >= packet_count_upperbound { break; @@ -118,7 +140,8 @@ impl PacketDeserializer { } Ok(( - packet_batches_received_so_far, + num_packets_received, + messages, aggregated_tracer_packet_stats_option, )) } @@ -159,7 +182,7 @@ mod tests { #[test] fn test_deserialize_and_collect_packets_empty() { - let results = PacketDeserializer::deserialize_and_collect_packets(&[], None); + let results = PacketDeserializer::deserialize_and_collect_packets(0, &[], None); assert_eq!(results.deserialized_packets.len(), 0); assert!(results.new_tracer_stats_option.is_none()); assert_eq!(results.passed_sigverify_count, 0); @@ -172,7 +195,12 @@ mod tests { let packet_batches = to_packet_batches(&transactions, 1); assert_eq!(packet_batches.len(), 2); - let results = PacketDeserializer::deserialize_and_collect_packets(&packet_batches, None); + let packet_count: usize = packet_batches.iter().map(|x| x.len()).sum(); + let results = PacketDeserializer::deserialize_and_collect_packets( + packet_count, + &[BankingPacketBatch::new((packet_batches, None))], + None, + ); assert_eq!(results.deserialized_packets.len(), 2); assert!(results.new_tracer_stats_option.is_none()); assert_eq!(results.passed_sigverify_count, 2); @@ -186,7 +214,12 @@ mod tests { assert_eq!(packet_batches.len(), 2); packet_batches[0][0].meta_mut().set_discard(true); - let results = PacketDeserializer::deserialize_and_collect_packets(&packet_batches, None); + let packet_count: usize = packet_batches.iter().map(|x| x.len()).sum(); + let results = PacketDeserializer::deserialize_and_collect_packets( + packet_count, + &[BankingPacketBatch::new((packet_batches, None))], + None, + ); assert_eq!(results.deserialized_packets.len(), 1); assert!(results.new_tracer_stats_option.is_none()); assert_eq!(results.passed_sigverify_count, 1); diff --git a/core/src/sigverify.rs b/core/src/sigverify.rs index 573da5eef14..8140efac7ec 100644 --- a/core/src/sigverify.rs +++ b/core/src/sigverify.rs @@ -9,15 +9,14 @@ pub use solana_perf::sigverify::{ }; use { crate::{ - banking_stage::BankingPacketBatch, + banking_trace::{BankingPacketBatch, BankingPacketSender}, sigverify_stage::{SigVerifier, SigVerifyServiceError}, }, - crossbeam_channel::Sender, solana_perf::{cuda_runtime::PinnedVec, packet::PacketBatch, recycler::Recycler, sigverify}, solana_sdk::{packet::Packet, saturating_add_assign}, }; -#[derive(Debug, Default, Clone, PartialEq, Eq)] +#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct SigverifyTracerPacketStats { pub total_removed_before_sigverify_stage: usize, pub total_tracer_packets_received_in_sigverify_stage: usize, @@ -55,9 +54,8 @@ impl SigverifyTracerPacketStats { } } -#[derive(Clone)] pub struct TransactionSigVerifier { - packet_sender: Sender<::SendType>, + packet_sender: BankingPacketSender, tracer_packet_stats: SigverifyTracerPacketStats, recycler: Recycler, recycler_out: Recycler>, @@ -65,13 +63,13 @@ pub struct TransactionSigVerifier { } impl TransactionSigVerifier { - pub fn new_reject_non_vote(packet_sender: Sender<::SendType>) -> Self { + pub fn new_reject_non_vote(packet_sender: BankingPacketSender) -> Self { let mut new_self = Self::new(packet_sender); new_self.reject_non_vote = true; new_self } - pub fn new(packet_sender: Sender<::SendType>) -> Self { + pub fn new(packet_sender: BankingPacketSender) -> Self { init(); Self { packet_sender, @@ -128,8 +126,10 @@ impl SigVerifier for TransactionSigVerifier { packet_batches: Vec, ) -> Result<(), SigVerifyServiceError> { let tracer_packet_stats_to_send = std::mem::take(&mut self.tracer_packet_stats); - self.packet_sender - .send((packet_batches, Some(tracer_packet_stats_to_send)))?; + self.packet_sender.send(BankingPacketBatch::new(( + packet_batches, + Some(tracer_packet_stats_to_send), + )))?; Ok(()) } diff --git a/core/src/sigverify_stage.rs b/core/src/sigverify_stage.rs index 8edadebe2fb..09fd0fa525c 100644 --- a/core/src/sigverify_stage.rs +++ b/core/src/sigverify_stage.rs @@ -234,7 +234,7 @@ impl SigVerifier for DisabledSigVerifier { impl SigVerifyStage { #[allow(clippy::new_ret_no_self)] - pub fn new( + pub fn new( packet_receiver: FindPacketSenderStakeReceiver, verifier: T, name: &'static str, @@ -402,7 +402,7 @@ impl SigVerifyStage { Ok(()) } - fn verifier_service( + fn verifier_service( packet_receiver: FindPacketSenderStakeReceiver, mut verifier: T, name: &'static str, @@ -443,7 +443,7 @@ impl SigVerifyStage { .unwrap() } - fn verifier_services( + fn verifier_services( packet_receiver: FindPacketSenderStakeReceiver, verifier: T, name: &'static str, @@ -460,7 +460,10 @@ impl SigVerifyStage { mod tests { use { super::*, - crate::{sigverify::TransactionSigVerifier, sigverify_stage::timing::duration_as_ms}, + crate::{ + banking_trace::BankingTracer, sigverify::TransactionSigVerifier, + sigverify_stage::timing::duration_as_ms, + }, crossbeam_channel::unbounded, solana_perf::{ packet::{to_packet_batches, Packet}, @@ -532,7 +535,7 @@ mod tests { solana_logger::setup(); trace!("start"); let (packet_s, packet_r) = unbounded(); - let (verified_s, verified_r) = unbounded(); + let (verified_s, verified_r) = BankingTracer::channel_for_test(); let verifier = TransactionSigVerifier::new(verified_s); let stage = SigVerifyStage::new(packet_r, verifier, "test"); @@ -565,7 +568,8 @@ mod tests { let mut total_tracer_packets_received_in_sigverify_stage = 0; trace!("sent: {}", sent_len); loop { - if let Ok((mut verifieds, tracer_packet_stats_option)) = verified_r.recv() { + if let Ok(message) = verified_r.recv() { + let (verifieds, tracer_packet_stats_option) = (&message.0, message.1.clone()); let tracer_packet_stats = tracer_packet_stats_option.unwrap(); total_tracer_packets_received_in_sigverify_stage += tracer_packet_stats.total_tracer_packets_received_in_sigverify_stage; @@ -603,9 +607,9 @@ mod tests { ); } assert_eq!(tracer_packet_stats.total_excess_tracer_packets, 0); - while let Some(v) = verifieds.pop() { + for v in verifieds.iter().rev() { received += v.len(); - batches.push(v); + batches.push(v.clone()); } } diff --git a/core/src/tpu.rs b/core/src/tpu.rs index fd8030703ff..a34306f8de9 100644 --- a/core/src/tpu.rs +++ b/core/src/tpu.rs @@ -4,6 +4,7 @@ use { crate::{ banking_stage::BankingStage, + banking_trace::BankingTracer, broadcast_stage::{BroadcastStage, BroadcastStageType, RetransmitSlotsReceiver}, cluster_info_vote_listener::{ ClusterInfoVoteListener, GossipDuplicateConfirmedSlotsSender, @@ -99,6 +100,7 @@ impl Tpu { staked_nodes: &Arc>, shared_staked_nodes_overrides: Arc>>, tpu_enable_udp: bool, + banking_tracer: BankingTracer, ) -> Self { let TpuSockets { transactions: transactions_sockets, @@ -154,7 +156,7 @@ impl Tpu { "Vote", ); - let (non_vote_sender, non_vote_receiver) = unbounded(); + let (non_vote_sender, non_vote_receiver) = banking_tracer.create_channel_non_vote(); let stats = Arc::new(StreamStats::default()); let (_, tpu_quic_t) = spawn_server( @@ -192,7 +194,7 @@ impl Tpu { SigVerifyStage::new(find_packet_sender_stake_receiver, verifier, "tpu-verifier") }; - let (tpu_vote_sender, tpu_vote_receiver) = unbounded(); + let (tpu_vote_sender, tpu_vote_receiver) = banking_tracer.create_channel_tpu_vote(); let vote_sigverify_stage = { let verifier = TransactionSigVerifier::new_reject_non_vote(tpu_vote_sender); @@ -203,7 +205,8 @@ impl Tpu { ) }; - let (gossip_vote_sender, gossip_vote_receiver) = unbounded(); + let (gossip_vote_sender, gossip_vote_receiver) = + banking_tracer.create_channel_gossip_vote(); let cluster_info_vote_listener = ClusterInfoVoteListener::new( exit.clone(), cluster_info.clone(), @@ -231,6 +234,7 @@ impl Tpu { log_messages_bytes_limit, connection_cache.clone(), bank_forks.clone(), + banking_tracer, ); let broadcast_stage = broadcast_type.new_broadcast_stage( diff --git a/core/src/validator.rs b/core/src/validator.rs index 7c73f4c3e03..c0b11737d44 100644 --- a/core/src/validator.rs +++ b/core/src/validator.rs @@ -4,6 +4,7 @@ pub use solana_perf::report_target_features; use { crate::{ accounts_hash_verifier::AccountsHashVerifier, + banking_trace::BankingTracer, broadcast_stage::BroadcastStageType, cache_block_meta_service::{CacheBlockMetaSender, CacheBlockMetaService}, cluster_info_vote_listener::VoteTracker, @@ -176,6 +177,7 @@ pub struct ValidatorConfig { pub ledger_column_options: LedgerColumnOptions, pub runtime_config: RuntimeConfig, pub replay_slots_concurrently: bool, + pub banking_trace_size: u64, } impl Default for ValidatorConfig { @@ -238,6 +240,7 @@ impl Default for ValidatorConfig { ledger_column_options: LedgerColumnOptions::default(), runtime_config: RuntimeConfig::default(), replay_slots_concurrently: false, + banking_trace_size: 0, } } } @@ -983,6 +986,13 @@ impl Validator { &prioritization_fee_cache, )?; + let banking_tracer = BankingTracer::new((config.banking_trace_size > 0).then_some(( + blockstore.banking_tracer_path(), + exit.clone(), + config.banking_trace_size, + ))) + .map_err(|err| format!("{} [{:?}]", &err, &err))?; + let tpu = Tpu::new( &cluster_info, &poh_recorder, @@ -1017,6 +1027,7 @@ impl Validator { &staked_nodes, config.staked_nodes_overrides.clone(), tpu_enable_udp, + banking_tracer, ); datapoint_info!( diff --git a/ledger/src/blockstore.rs b/ledger/src/blockstore.rs index c0d59c8e011..cbaf9ebde20 100644 --- a/ledger/src/blockstore.rs +++ b/ledger/src/blockstore.rs @@ -221,11 +221,14 @@ impl Blockstore { self.db } - /// The path to the ledger store pub fn ledger_path(&self) -> &PathBuf { &self.ledger_path } + pub fn banking_tracer_path(&self) -> PathBuf { + self.ledger_path.join("banking_trace") + } + /// Opens a Ledger in directory, provides "infinite" window of shreds pub fn open(ledger_path: &Path) -> Result { Self::do_open(ledger_path, BlockstoreOptions::default()) diff --git a/local-cluster/src/validator_configs.rs b/local-cluster/src/validator_configs.rs index a2b32fec362..d536667b1f1 100644 --- a/local-cluster/src/validator_configs.rs +++ b/local-cluster/src/validator_configs.rs @@ -64,6 +64,7 @@ pub fn safe_clone_config(config: &ValidatorConfig) -> ValidatorConfig { ledger_column_options: config.ledger_column_options.clone(), runtime_config: config.runtime_config.clone(), replay_slots_concurrently: config.replay_slots_concurrently, + banking_trace_size: config.banking_trace_size, } } diff --git a/perf/src/cuda_runtime.rs b/perf/src/cuda_runtime.rs index 6c802caa9e4..efc7545c6e8 100644 --- a/perf/src/cuda_runtime.rs +++ b/perf/src/cuda_runtime.rs @@ -12,6 +12,7 @@ use { }, rand::{seq::SliceRandom, Rng}, rayon::prelude::*, + serde::{Deserialize, Serialize}, std::{ ops::{Index, IndexMut}, os::raw::c_int, @@ -53,11 +54,12 @@ fn unpin(_mem: *mut T) { // A vector wrapper where the underlying memory can be // page-pinned. Controlled by flags in case user only wants // to pin in certain circumstances. -#[derive(Debug, Default)] +#[derive(Debug, Default, Serialize, Deserialize)] pub struct PinnedVec { x: Vec, pinned: bool, pinnable: bool, + #[serde(skip)] recycler: Weak>>, } diff --git a/perf/src/packet.rs b/perf/src/packet.rs index 4e6b64fe156..2cb07b2b6f1 100644 --- a/perf/src/packet.rs +++ b/perf/src/packet.rs @@ -4,7 +4,7 @@ use { crate::{cuda_runtime::PinnedVec, recycler::Recycler}, bincode::config::Options, rayon::prelude::{IntoParallelIterator, IntoParallelRefIterator, IntoParallelRefMutIterator}, - serde::{de::DeserializeOwned, Serialize}, + serde::{de::DeserializeOwned, Deserialize, Serialize}, std::{ io::Read, net::SocketAddr, @@ -18,7 +18,7 @@ pub const NUM_PACKETS: usize = 1024 * 8; pub const PACKETS_PER_BATCH: usize = 64; pub const NUM_RCVMMSGS: usize = 64; -#[derive(Debug, Default, Clone)] +#[derive(Debug, Default, Clone, Serialize, Deserialize)] pub struct PacketBatch { packets: PinnedVec, } diff --git a/sdk/src/packet.rs b/sdk/src/packet.rs index 3301b7d42e2..af5104aa959 100644 --- a/sdk/src/packet.rs +++ b/sdk/src/packet.rs @@ -3,7 +3,7 @@ use { bincode::{Options, Result}, bitflags::bitflags, - serde::Serialize, + serde::{Deserialize, Serialize}, std::{ fmt, io, net::{IpAddr, Ipv4Addr, SocketAddr}, @@ -21,6 +21,7 @@ pub const PACKET_DATA_SIZE: usize = 1280 - 40 - 8; bitflags! { #[repr(C)] + #[derive(Serialize, Deserialize)] pub struct PacketFlags: u8 { const DISCARD = 0b0000_0001; const FORWARDED = 0b0000_0010; @@ -30,7 +31,7 @@ bitflags! { } } -#[derive(Clone, Debug, PartialEq, Eq)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] #[repr(C)] pub struct Meta { pub size: usize, @@ -40,11 +41,39 @@ pub struct Meta { pub sender_stake: u64, } -#[derive(Clone, Eq)] +mod serde_bytes_array { + use { + core::convert::TryInto, + serde::{de::Error, Deserializer, Serializer}, + }; + + pub(crate) fn serialize(bytes: &[u8], serializer: S) -> Result + where + S: Serializer, + { + serde_bytes::serialize(bytes, serializer) + } + + pub(crate) fn deserialize<'de, D, const N: usize>(deserializer: D) -> Result<[u8; N], D::Error> + where + D: Deserializer<'de>, + { + let vec: Vec = serde_bytes::deserialize(deserializer)?; + let vec_len = vec.len(); + let array: [u8; N] = vec.try_into().map_err(|_| { + let expected = format!("[u8; {}]", N); + D::Error::invalid_length(vec_len, &expected.as_str()) + })?; + Ok(array) + } +} + +#[derive(Clone, Eq, Serialize, Deserialize)] #[repr(C)] pub struct Packet { // Bytes past Packet.meta.size are not valid to read from. // Use Packet.data(index) to read from the buffer. + #[serde(with = "serde_bytes_array")] buffer: [u8; PACKET_DATA_SIZE], meta: Meta, } diff --git a/validator/src/cli.rs b/validator/src/cli.rs index 10380993770..bd046125a6f 100644 --- a/validator/src/cli.rs +++ b/validator/src/cli.rs @@ -10,6 +10,7 @@ use { }, keypair::SKIP_SEED_PHRASE_VALIDATION_ARG, }, + solana_core::banking_trace::DEFAULT_BANKING_TRACE_SIZE, solana_faucet::faucet::{self, FAUCET_PORT}, solana_net_utils::{MINIMUM_VALIDATOR_PORT_RANGE_WIDTH, VALIDATOR_PORT_RANGE}, solana_rpc::{rpc::MAX_REQUEST_BODY_SIZE, rpc_pubsub_service::PubSubConfig}, @@ -1302,6 +1303,15 @@ pub fn app<'a>(version: &'a str, default_args: &'a DefaultArgs) -> App<'a, 'a> { .long("replay-slots-concurrently") .help("Allow concurrent replay of slots on different forks") ) + .arg( + Arg::with_name("banking_trace_size") + .long("enable-banking-trace") + .value_name("MAX_BYTES") + .validator(is_parsable::) + .takes_value(true) + .default_value(&default_args.banking_trace_size) + .help("Write trace files for simulate-leader-blocks, retaining up to the maximum total bytes") + ) .args(&get_deprecated_arguments()) .after_help("The default subcommand is run") .subcommand( @@ -1683,6 +1693,8 @@ pub struct DefaultArgs { // Wait subcommand pub wait_for_restart_window_min_idle_time: String, pub wait_for_restart_window_max_delinquent_stake: String, + + pub banking_trace_size: String, } impl DefaultArgs { @@ -1761,6 +1773,7 @@ impl DefaultArgs { exit_max_delinquent_stake: "5".to_string(), wait_for_restart_window_min_idle_time: "10".to_string(), wait_for_restart_window_max_delinquent_stake: "5".to_string(), + banking_trace_size: DEFAULT_BANKING_TRACE_SIZE.to_string(), } } } diff --git a/validator/src/main.rs b/validator/src/main.rs index 67f39206a87..83fcc49be51 100644 --- a/validator/src/main.rs +++ b/validator/src/main.rs @@ -8,6 +8,7 @@ use { rand::{seq::SliceRandom, thread_rng}, solana_clap_utils::input_parsers::{keypair_of, keypairs_of, pubkey_of, value_of}, solana_core::{ + banking_trace::EMPTY_BANKING_TRACE_SIZE, ledger_cleanup_service::{DEFAULT_MAX_LEDGER_SHREDS, DEFAULT_MIN_MAX_LEDGER_SHREDS}, system_monitor_service::SystemMonitorService, tower_storage, @@ -1427,6 +1428,15 @@ pub fn main() { validator_config.max_ledger_shreds = Some(limit_ledger_size); } + validator_config.banking_trace_size = if matches.occurrences_of("banking_trace_size") == 0 { + // disable with no explicit flag; then, this effectively becomes `opt-in` even if we're + // specifying a default value in clap configuration. + EMPTY_BANKING_TRACE_SIZE + } else { + // DEFAULT_BANKING_TRACE_SIZE or user-supplied override value + value_t_or_exit!(matches, "banking_trace_size", u64) + }; + validator_config.ledger_column_options = LedgerColumnOptions { compression_type: match matches.value_of("rocksdb_ledger_compression") { None => BlockstoreCompressionType::default(), From fc4f778a7c9832b774fcb63dcab36e440ac9c7db Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Sat, 10 Dec 2022 22:31:50 +0900 Subject: [PATCH 02/43] Don't use eprintln!()... --- core/src/banking_trace.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/banking_trace.rs b/core/src/banking_trace.rs index ba9083214fa..ba441a01af1 100644 --- a/core/src/banking_trace.rs +++ b/core/src/banking_trace.rs @@ -360,7 +360,7 @@ pub fn drop_and_clean_temp_dir_unless_suppressed(temp_dir: TempDir) { std::env::var("BANKING_TRACE_LEAVE_FILES_FROM_LAST_ITERATION") .is_ok() .then(|| { - eprintln!("prevented to remove {:?}", temp_dir.path()); + warn!("prevented to remove {:?}", temp_dir.path()); drop(temp_dir.into_path()); }); } From ec83707727167411092a41e8ae1537433c72a514 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Sat, 10 Dec 2022 22:37:35 +0900 Subject: [PATCH 03/43] Update programs/sbf/Cargo.lock... --- programs/sbf/Cargo.lock | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/programs/sbf/Cargo.lock b/programs/sbf/Cargo.lock index a28285f7e87..5313d36c92e 100644 --- a/programs/sbf/Cargo.lock +++ b/programs/sbf/Cargo.lock @@ -3694,6 +3694,14 @@ dependencies = [ "librocksdb-sys", ] +[[package]] +name = "rolling-file" +version = "0.1.0" +source = "git+https://github.com/ryoqun/rolling-file-rs?rev=664504e77fad7292dedf5a119d950b80aef67934#664504e77fad7292dedf5a119d950b80aef67934" +dependencies = [ + "chrono", +] + [[package]] name = "rpassword" version = "7.0.0" @@ -4473,6 +4481,7 @@ dependencies = [ "rand 0.7.3", "rand_chacha 0.2.2", "rayon", + "rolling-file", "rustc_version 0.4.0", "serde", "serde_derive", From 2257bd0ea929948fae16400187f22bd7506099e6 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Sun, 11 Dec 2022 08:38:38 +0900 Subject: [PATCH 04/43] Remove meaningless assert_eq --- core/src/banking_trace.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/core/src/banking_trace.rs b/core/src/banking_trace.rs index ba441a01af1..d49bf2e86d8 100644 --- a/core/src/banking_trace.rs +++ b/core/src/banking_trace.rs @@ -151,7 +151,6 @@ pub fn sender_overhead_minimized_receiver_loop( Ok(message) => on_recv(message)?, Err(TryRecvError::Empty) => break 'inner, Err(TryRecvError::Disconnected) => { - assert_eq!(receiver.len(), 0); break 'outer; } }; From b5b495eaca0fb7fc6502c2f37e5cdc86bea98595 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Sun, 11 Dec 2022 08:40:29 +0900 Subject: [PATCH 05/43] Group test-only code under aptly named mod --- core/benches/banking_trace.rs | 8 +++-- core/src/banking_trace.rs | 66 ++++++++++++++++++++--------------- 2 files changed, 42 insertions(+), 32 deletions(-) diff --git a/core/benches/banking_trace.rs b/core/benches/banking_trace.rs index 54224bf6f6a..1013fa6cd8a 100644 --- a/core/benches/banking_trace.rs +++ b/core/benches/banking_trace.rs @@ -4,9 +4,11 @@ extern crate test; use { solana_core::banking_trace::{ - drop_and_clean_temp_dir_unless_suppressed, sample_packet_batch, - sender_overhead_minimized_receiver_loop, terminate_tracer, BankingPacketBatch, - BankingTracer, TraceError, TracerThreadResult, DEFAULT_BANKING_TRACE_SIZE, + for_test::{ + drop_and_clean_temp_dir_unless_suppressed, sample_packet_batch, terminate_tracer, + }, + sender_overhead_minimized_receiver_loop, BankingPacketBatch, BankingTracer, TraceError, + TracerThreadResult, DEFAULT_BANKING_TRACE_SIZE, }, std::{ path::PathBuf, diff --git a/core/src/banking_trace.rs b/core/src/banking_trace.rs index d49bf2e86d8..1ede3cbcbbe 100644 --- a/core/src/banking_trace.rs +++ b/core/src/banking_trace.rs @@ -351,33 +351,37 @@ impl TracedSender { } } -pub fn sample_packet_batch() -> BankingPacketBatch { - BankingPacketBatch::new((to_packet_batches(&vec![test_tx(); 4], 10), None)) -} +pub mod for_test { + use super::*; -pub fn drop_and_clean_temp_dir_unless_suppressed(temp_dir: TempDir) { - std::env::var("BANKING_TRACE_LEAVE_FILES_FROM_LAST_ITERATION") - .is_ok() - .then(|| { - warn!("prevented to remove {:?}", temp_dir.path()); - drop(temp_dir.into_path()); - }); -} + pub fn sample_packet_batch() -> BankingPacketBatch { + BankingPacketBatch::new((to_packet_batches(&vec![test_tx(); 4], 10), None)) + } -pub fn terminate_tracer( - tracer: BankingTracer, - main_thread: JoinHandle, - sender: TracedSender, - exit: Option>, -) { - if let Some(exit) = exit { - exit.store(true, Ordering::Relaxed); + pub fn drop_and_clean_temp_dir_unless_suppressed(temp_dir: TempDir) { + std::env::var("BANKING_TRACE_LEAVE_FILES_FROM_LAST_ITERATION") + .is_ok() + .then(|| { + warn!("prevented to remove {:?}", temp_dir.path()); + drop(temp_dir.into_path()); + }); } - let (tracer_thread, tracer) = tracer.finalize_under_arc(); - drop((sender, tracer)); - main_thread.join().unwrap().unwrap(); - if let Some(tracer_thread) = tracer_thread { - tracer_thread.join().unwrap().unwrap(); + + pub fn terminate_tracer( + tracer: BankingTracer, + main_thread: JoinHandle, + sender: TracedSender, + exit: Option>, + ) { + if let Some(exit) = exit { + exit.store(true, Ordering::Relaxed); + } + let (tracer_thread, tracer) = tracer.finalize_under_arc(); + drop((sender, tracer)); + main_thread.join().unwrap().unwrap(); + if let Some(tracer_thread) = tracer_thread { + tracer_thread.join().unwrap().unwrap(); + } } } @@ -406,7 +410,7 @@ mod tests { non_vote_sender .send(BankingPacketBatch::new((vec![], None))) .unwrap(); - terminate_tracer(tracer, dummy_main_thread, non_vote_sender, None); + for_test::terminate_tracer(tracer, dummy_main_thread, non_vote_sender, None); } #[test] @@ -438,7 +442,9 @@ mod tests { drop(tracer); // shouldn't panic - non_vote_sender.send(sample_packet_batch()).unwrap(); + non_vote_sender + .send(for_test::sample_packet_batch()) + .unwrap(); // finally terminate and join the main thread exit_for_dummy_thread2.store(true, Ordering::Relaxed); @@ -462,10 +468,12 @@ mod tests { ) }); - non_vote_sender.send(sample_packet_batch()).unwrap(); + non_vote_sender + .send(for_test::sample_packet_batch()) + .unwrap(); tracer.bank_start(1, 2, 3); - terminate_tracer(tracer, dummy_main_thread, non_vote_sender, None); + for_test::terminate_tracer(tracer, dummy_main_thread, non_vote_sender, None); let mut stream = BufReader::new(File::open(path.join(BASENAME)).unwrap()); let results = (0..3) @@ -491,6 +499,6 @@ mod tests { Err(_) // in this way, rustfmt formats this line like the above ); - drop_and_clean_temp_dir_unless_suppressed(temp_dir); + for_test::drop_and_clean_temp_dir_unless_suppressed(temp_dir); } } From 2aa1872ec1cc7e1b93037efcdc22422b826baed2 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Sun, 11 Dec 2022 09:11:11 +0900 Subject: [PATCH 06/43] Remove needless overflow handling in receive_until --- core/src/packet_deserializer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/packet_deserializer.rs b/core/src/packet_deserializer.rs index 06cec393991..c308656b867 100644 --- a/core/src/packet_deserializer.rs +++ b/core/src/packet_deserializer.rs @@ -122,6 +122,7 @@ impl PacketDeserializer { .iter() .map(|batch| batch.len()) .sum::(); + messages.push(message); if let Some(tracer_packet_stats) = &tracer_packet_stats_option { if let Some(aggregated_tracer_packet_stats) = @@ -132,7 +133,6 @@ impl PacketDeserializer { aggregated_tracer_packet_stats_option = tracer_packet_stats_option; } } - messages.push(message); if start.elapsed() >= recv_timeout || num_packets_received >= packet_count_upperbound { break; From f046ee55ab75d948859dd31a7d9366d2a4a34aa5 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Sun, 11 Dec 2022 09:51:43 +0900 Subject: [PATCH 07/43] Delay stat aggregation as it's possible now --- core/src/packet_deserializer.rs | 81 ++++++++++++++++----------------- 1 file changed, 40 insertions(+), 41 deletions(-) diff --git a/core/src/packet_deserializer.rs b/core/src/packet_deserializer.rs index c308656b867..f8783da5526 100644 --- a/core/src/packet_deserializer.rs +++ b/core/src/packet_deserializer.rs @@ -28,6 +28,11 @@ pub struct PacketDeserializer { packet_batch_receiver: BankingPacketReceiver, } +enum ForEach<'a> { + Batch(&'a PacketBatch), + Stats(&'a SigverifyTracerPacketStats), +} + impl PacketDeserializer { pub fn new(packet_batch_receiver: BankingPacketReceiver) -> Self { Self { @@ -45,22 +50,24 @@ impl PacketDeserializer { recv_timeout: Duration, capacity: usize, ) -> Result { - let (packet_count, packet_batches, sigverify_tracer_stats_option) = - self.receive_until(recv_timeout, capacity)?; + let (packet_count, packet_batches) = self.receive_until(recv_timeout, capacity)?; Ok(Self::deserialize_and_collect_packets( packet_count, &packet_batches, - sigverify_tracer_stats_option, )) } fn for_each_packet_batch( banking_batches: &[BankingPacketBatch], - mut for_each: impl FnMut(&PacketBatch), + mut for_each: impl FnMut(ForEach), ) { for banking_batch in banking_batches { for batch in &banking_batch.0 { - for_each(batch) + for_each(ForEach::Batch(batch)) + } + + if let Some(stats) = &banking_batch.1 { + for_each(ForEach::Stats(stats)) } } } @@ -69,23 +76,39 @@ impl PacketDeserializer { fn deserialize_and_collect_packets( packet_count: usize, packet_batches: &[BankingPacketBatch], - sigverify_tracer_stats_option: Option, ) -> ReceivePacketResults { let mut passed_sigverify_count: usize = 0; let mut failed_sigverify_count: usize = 0; let mut deserialized_packets = Vec::with_capacity(packet_count); - Self::for_each_packet_batch(packet_batches, |packet_batch| { - let packet_indexes = Self::generate_packet_indexes(packet_batch); + let mut aggregated_tracer_packet_stats_option = None::; + + Self::for_each_packet_batch(packet_batches, |packet_batch_or_stat| { + match packet_batch_or_stat { + ForEach::Batch(packet_batch) => { + let packet_indexes = Self::generate_packet_indexes(packet_batch); - passed_sigverify_count += packet_indexes.len(); - failed_sigverify_count += packet_batch.len().saturating_sub(packet_indexes.len()); + passed_sigverify_count += packet_indexes.len(); + failed_sigverify_count += + packet_batch.len().saturating_sub(packet_indexes.len()); - deserialized_packets.extend(Self::deserialize_packets(packet_batch, &packet_indexes)); + deserialized_packets + .extend(Self::deserialize_packets(packet_batch, &packet_indexes)); + } + ForEach::Stats(tracer_packet_stats) => { + if let Some(aggregated_tracer_packet_stats) = + &mut aggregated_tracer_packet_stats_option + { + aggregated_tracer_packet_stats.aggregate(tracer_packet_stats); + } else { + aggregated_tracer_packet_stats_option = Some(tracer_packet_stats.clone()); + } + } + } }); ReceivePacketResults { deserialized_packets, - new_tracer_stats_option: sigverify_tracer_stats_option, + new_tracer_stats_option: aggregated_tracer_packet_stats_option, passed_sigverify_count: passed_sigverify_count as u64, failed_sigverify_count: failed_sigverify_count as u64, } @@ -96,19 +119,11 @@ impl PacketDeserializer { &self, recv_timeout: Duration, packet_count_upperbound: usize, - ) -> Result< - ( - usize, - Vec, - Option, - ), - RecvTimeoutError, - > { + ) -> Result<(usize, Vec), RecvTimeoutError> { let start = Instant::now(); let message = self.packet_batch_receiver.recv_timeout(recv_timeout)?; - let (packet_batches, mut aggregated_tracer_packet_stats_option) = - (&message.0, message.1.clone()); + let packet_batches = &message.0; let mut num_packets_received = packet_batches .iter() .map(|batch| batch.len()) @@ -116,7 +131,7 @@ impl PacketDeserializer { let mut messages = vec![message]; while let Ok(message) = self.packet_batch_receiver.try_recv() { - let (packet_batches, tracer_packet_stats_option) = (&message.0, message.1.clone()); + let packet_batches = &message.0; trace!("got more packet batches in packet deserializer"); num_packets_received += packet_batches .iter() @@ -124,26 +139,12 @@ impl PacketDeserializer { .sum::(); messages.push(message); - if let Some(tracer_packet_stats) = &tracer_packet_stats_option { - if let Some(aggregated_tracer_packet_stats) = - &mut aggregated_tracer_packet_stats_option - { - aggregated_tracer_packet_stats.aggregate(tracer_packet_stats); - } else { - aggregated_tracer_packet_stats_option = tracer_packet_stats_option; - } - } - if start.elapsed() >= recv_timeout || num_packets_received >= packet_count_upperbound { break; } } - Ok(( - num_packets_received, - messages, - aggregated_tracer_packet_stats_option, - )) + Ok((num_packets_received, messages)) } fn generate_packet_indexes(packet_batch: &PacketBatch) -> Vec { @@ -182,7 +183,7 @@ mod tests { #[test] fn test_deserialize_and_collect_packets_empty() { - let results = PacketDeserializer::deserialize_and_collect_packets(0, &[], None); + let results = PacketDeserializer::deserialize_and_collect_packets(0, &[]); assert_eq!(results.deserialized_packets.len(), 0); assert!(results.new_tracer_stats_option.is_none()); assert_eq!(results.passed_sigverify_count, 0); @@ -199,7 +200,6 @@ mod tests { let results = PacketDeserializer::deserialize_and_collect_packets( packet_count, &[BankingPacketBatch::new((packet_batches, None))], - None, ); assert_eq!(results.deserialized_packets.len(), 2); assert!(results.new_tracer_stats_option.is_none()); @@ -218,7 +218,6 @@ mod tests { let results = PacketDeserializer::deserialize_and_collect_packets( packet_count, &[BankingPacketBatch::new((packet_batches, None))], - None, ); assert_eq!(results.deserialized_packets.len(), 1); assert!(results.new_tracer_stats_option.is_none()); From b8edf77a9ce947edf5af52925226a3b752135df1 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Sun, 11 Dec 2022 15:01:34 +0900 Subject: [PATCH 08/43] Use Cow to avoid needless heap allocs --- sdk/src/packet.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sdk/src/packet.rs b/sdk/src/packet.rs index af5104aa959..088fdfd699f 100644 --- a/sdk/src/packet.rs +++ b/sdk/src/packet.rs @@ -45,6 +45,7 @@ mod serde_bytes_array { use { core::convert::TryInto, serde::{de::Error, Deserializer, Serializer}, + std::borrow::Cow, }; pub(crate) fn serialize(bytes: &[u8], serializer: S) -> Result @@ -58,11 +59,11 @@ mod serde_bytes_array { where D: Deserializer<'de>, { - let vec: Vec = serde_bytes::deserialize(deserializer)?; - let vec_len = vec.len(); - let array: [u8; N] = vec.try_into().map_err(|_| { + let slice: Cow<'de, [u8]> = serde_bytes::deserialize(deserializer)?; + + let array: [u8; N] = (&*slice).try_into().map_err(|_| { let expected = format!("[u8; {}]", N); - D::Error::invalid_length(vec_len, &expected.as_str()) + D::Error::invalid_length(slice.len(), &expected.as_str()) })?; Ok(array) } From 0ff201a985f9103ebfa6d4393a3aeece2cae096b Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Mon, 12 Dec 2022 10:53:28 +0900 Subject: [PATCH 09/43] Properly consume metrics action as soon as hold --- core/src/banking_stage.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/core/src/banking_stage.rs b/core/src/banking_stage.rs index 38d0745b929..0a4d8e1aa36 100644 --- a/core/src/banking_stage.rs +++ b/core/src/banking_stage.rs @@ -851,7 +851,9 @@ impl BankingStage { // Take metrics action after forwarding packets slot_metrics_tracker.apply_action(metrics_action); } - _ => (), + BufferedPacketsDecision::Hold => { + slot_metrics_tracker.apply_action(metrics_action); + } } } From 02a0a4cacab27ee1b671096f23944c0da42743f7 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Mon, 12 Dec 2022 15:12:24 +0900 Subject: [PATCH 10/43] Trace UnprocessedTransactionStorage::len() instead --- core/src/banking_stage.rs | 2 -- core/src/banking_trace.rs | 18 ++++++++++++------ core/src/leader_slot_banking_stage_metrics.rs | 12 ++++++------ core/src/packet_deserializer.rs | 4 ---- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/core/src/banking_stage.rs b/core/src/banking_stage.rs index 0a4d8e1aa36..41e8de3720d 100644 --- a/core/src/banking_stage.rs +++ b/core/src/banking_stage.rs @@ -770,7 +770,6 @@ impl BankingStage { connection_cache: &ConnectionCache, tracer_packet_stats: &mut TracerPacketStats, bank_forks: &Arc>, - unreceived_batch_count: usize, ) { if unprocessed_transaction_storage.should_not_process() { return; @@ -1002,7 +1001,6 @@ impl BankingStage { &connection_cache, &mut tracer_packet_stats, bank_forks, - packet_deserializer.unreceived_batch_count(), ), "process_buffered_packets", ); diff --git a/core/src/banking_trace.rs b/core/src/banking_trace.rs index 1ede3cbcbbe..deed6e5af84 100644 --- a/core/src/banking_trace.rs +++ b/core/src/banking_trace.rs @@ -227,25 +227,31 @@ impl BankingTracer { ) } - fn bank_event(&self, slot: Slot, id: u32, status: BankStatus, unreceived_batch_count: usize) { + fn bank_event( + &self, + slot: Slot, + id: u32, + status: BankStatus, + unprocessed_transaction_count: usize, + ) { if let Some((sender, _, exit)) = &self.enabled_tracer { if !exit.load(Ordering::Relaxed) { sender .send(TimedTracedEvent( SystemTime::now(), - TracedEvent::Bank(slot, id, status, unreceived_batch_count), + TracedEvent::Bank(slot, id, status, unprocessed_transaction_count), )) .expect("active tracer thread unless exited"); } } } - pub fn bank_start(&self, slot: Slot, id: u32, unreceived_batch_count: usize) { - self.bank_event(slot, id, BankStatus::Started, unreceived_batch_count); + pub fn bank_start(&self, slot: Slot, id: u32, unprocessed_transaction_count: usize) { + self.bank_event(slot, id, BankStatus::Started, unprocessed_transaction_count); } - pub fn bank_end(&self, slot: Slot, id: u32, unreceived_batch_count: usize) { - self.bank_event(slot, id, BankStatus::Ended, unreceived_batch_count); + pub fn bank_end(&self, slot: Slot, id: u32, unprocessed_transaction_count: usize) { + self.bank_event(slot, id, BankStatus::Ended, unprocessed_transaction_count); } pub fn channel_for_test() -> (TracedSender, Receiver) { diff --git a/core/src/leader_slot_banking_stage_metrics.rs b/core/src/leader_slot_banking_stage_metrics.rs index b57e4d035f4..9f7477ba0b5 100644 --- a/core/src/leader_slot_banking_stage_metrics.rs +++ b/core/src/leader_slot_banking_stage_metrics.rs @@ -359,7 +359,7 @@ pub struct LeaderSlotMetricsTracker { leader_slot_metrics: Option, id: u32, banking_tracer: Arc, - unreceived_batch_count: usize, + unprocessed_transaction_count: usize, } impl LeaderSlotMetricsTracker { @@ -368,12 +368,12 @@ impl LeaderSlotMetricsTracker { leader_slot_metrics: None, id, banking_tracer, - unreceived_batch_count: 0, + unprocessed_transaction_count: 0, } } - pub fn refresh_unreceived_batch_count(&mut self, batch_count: usize) { - self.unreceived_batch_count = batch_count; + pub fn refresh_unprocessed_transaction_count(&mut self, batch_count: usize) { + self.unprocessed_transaction_count = batch_count; } pub fn new_for_test() -> Self { @@ -387,14 +387,14 @@ impl LeaderSlotMetricsTracker { &bank_start.bank_creation_time, ); self.banking_tracer - .bank_start(metrics.slot, self.id, self.unreceived_batch_count); + .bank_start(metrics.slot, self.id, self.unprocessed_transaction_count); Some(metrics) } pub fn trace_bank_end(&self, metrics_slot: Slot) { self.banking_tracer - .bank_end(metrics_slot, self.id, self.unreceived_batch_count); + .bank_end(metrics_slot, self.id, self.unprocessed_transaction_count); } // Check leader slot, return MetricsTrackerAction to be applied by apply_action() diff --git a/core/src/packet_deserializer.rs b/core/src/packet_deserializer.rs index f8783da5526..6e57dfb52a9 100644 --- a/core/src/packet_deserializer.rs +++ b/core/src/packet_deserializer.rs @@ -40,10 +40,6 @@ impl PacketDeserializer { } } - pub fn unreceived_batch_count(&self) -> usize { - self.packet_batch_receiver.len() - } - /// Handles receiving packet batches from sigverify and returns a vector of deserialized packets pub fn handle_received_packets( &self, From 52b09d7740e373fd3e076b2d0941aa6ce56877e0 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Fri, 16 Dec 2022 22:52:51 +0900 Subject: [PATCH 11/43] Loosen joining api over type safety for replaystage --- core/src/banking_stage.rs | 12 +++--- core/src/banking_trace.rs | 42 ++++++++++--------- core/src/leader_slot_banking_stage_metrics.rs | 2 +- core/src/replay_stage.rs | 4 ++ core/src/tpu.rs | 2 +- core/src/tvu.rs | 4 ++ core/src/validator.rs | 17 ++++---- 7 files changed, 48 insertions(+), 35 deletions(-) diff --git a/core/src/banking_stage.rs b/core/src/banking_stage.rs index 41e8de3720d..8584da96e7b 100644 --- a/core/src/banking_stage.rs +++ b/core/src/banking_stage.rs @@ -8,7 +8,7 @@ use { packet_receiver::PacketReceiver, }, crate::{ - banking_trace::{BankingPacketReceiver, BankingTracer, TracerThreadResult}, + banking_trace::{BankingPacketReceiver, BankingTracer, TracerThread}, forward_packet_batches_by_accounts::ForwardPacketBatchesByAccounts, immutable_deserialized_packet::ImmutableDeserializedPacket, latest_unprocessed_votes::{LatestUnprocessedVotes, VoteSource}, @@ -349,7 +349,7 @@ pub struct BatchedTransactionErrorDetails { /// Stores the stage's thread handle and output receiver. pub struct BankingStage { bank_thread_hdls: Vec>, - tracer_thread_hdl: Option>, + tracer_thread_hdl: TracerThread, } #[derive(Debug, Clone)] @@ -382,7 +382,7 @@ impl BankingStage { log_messages_bytes_limit: Option, connection_cache: Arc, bank_forks: Arc>, - banking_tracer: BankingTracer, + banking_tracer: Arc, ) -> Self { Self::new_num_threads( cluster_info, @@ -413,7 +413,7 @@ impl BankingStage { log_messages_bytes_limit: Option, connection_cache: Arc, bank_forks: Arc>, - banking_tracer: BankingTracer, + banking_tracer: Arc, ) -> Self { assert!(num_threads >= MIN_TOTAL_THREADS); // Single thread to generate entries from many banks. @@ -432,7 +432,6 @@ impl BankingStage { .is_active(&allow_votes_to_directly_update_vote_state::id()) }) .unwrap_or(false); - let (tracer_thread_hdl, banking_tracer) = banking_tracer.finalize_under_arc(); // Many banks that process transactions in parallel. let bank_thread_hdls: Vec> = (0..num_threads) .map(|i| { @@ -507,9 +506,10 @@ impl BankingStage { .unwrap() }) .collect(); + Self { bank_thread_hdls, - tracer_thread_hdl, + tracer_thread_hdl: banking_tracer.take_tracer_thread_join_handle(), } } diff --git a/core/src/banking_trace.rs b/core/src/banking_trace.rs index deed6e5af84..f27eb8760d9 100644 --- a/core/src/banking_trace.rs +++ b/core/src/banking_trace.rs @@ -8,14 +8,14 @@ use { packet::{to_packet_batches, PacketBatch}, test_tx::test_tx, }, - solana_sdk::slot_history::Slot, + solana_sdk::{hash::Hash, slot_history::Slot}, std::{ fs::{create_dir_all, remove_dir_all}, io::{self, Write}, path::PathBuf, sync::{ atomic::{AtomicBool, Ordering}, - Arc, + Arc, Mutex, }, thread::{self, sleep, JoinHandle}, time::{Duration, SystemTime}, @@ -28,6 +28,7 @@ pub type BankingPacketBatch = Arc<(Vec, Option; pub type TracerThreadResult = Result<(), TraceError>; +pub type TracerThread = Option>; #[derive(Error, Debug)] pub enum TraceError { @@ -58,7 +59,7 @@ pub const DEFAULT_BANKING_TRACE_SIZE: u64 = pub struct BankingTracer { enabled_tracer: Option<( Sender, - Option>, + Mutex>>, Arc, )>, } @@ -165,7 +166,9 @@ pub fn sender_overhead_minimized_receiver_loop( } impl BankingTracer { - pub fn new(maybe_config: Option<(PathBuf, Arc, u64)>) -> Result { + pub fn new( + maybe_config: Option<(PathBuf, Arc, u64)>, + ) -> Result, TraceError> { let enabled_tracer = maybe_config .map(|(path, exit, total_size)| -> Result<_, TraceError> { let rotate_threshold_size = total_size / TRACE_FILE_ROTATE_COUNT; @@ -184,17 +187,17 @@ impl BankingTracer { let tracing_thread = Self::spawn_background_thread(trace_receiver, file_appender, exit.clone())?; - Ok((trace_sender, Some(tracing_thread), exit)) + Ok((trace_sender, Mutex::new(Some(tracing_thread)), exit)) }) .transpose()?; - Ok(Self { enabled_tracer }) + Ok(Arc::new(Self { enabled_tracer })) } - pub fn new_disabled() -> Self { - Self { + pub fn new_disabled() -> Arc { + Arc::new(Self { enabled_tracer: None, - } + }) } fn create_channel(&self, label: ChannelLabel) -> (BankingPacketSender, BankingPacketReceiver) { @@ -218,13 +221,14 @@ impl BankingTracer { self.create_channel(ChannelLabel::GossipVote) } - pub fn finalize_under_arc(mut self) -> (Option>, Arc) { - ( - self.enabled_tracer - .as_mut() - .and_then(|(_, tracer_thread, _)| tracer_thread.take()), - Arc::new(self), - ) + pub fn take_tracer_thread_join_handle(&self) -> TracerThread { + self.enabled_tracer.as_ref().map(|(_, tracer_thread, _)| { + tracer_thread + .lock() + .unwrap() + .take() + .expect("no double take; BankingStage should only do once!") + }) } fn bank_event( @@ -374,7 +378,7 @@ pub mod for_test { } pub fn terminate_tracer( - tracer: BankingTracer, + tracer: Arc, main_thread: JoinHandle, sender: TracedSender, exit: Option>, @@ -382,7 +386,7 @@ pub mod for_test { if let Some(exit) = exit { exit.store(true, Ordering::Relaxed); } - let (tracer_thread, tracer) = tracer.finalize_under_arc(); + let tracer_thread = tracer.take_tracer_thread_join_handle(); drop((sender, tracer)); main_thread.join().unwrap().unwrap(); if let Some(tracer_thread) = tracer_thread { @@ -425,7 +429,7 @@ mod tests { let path = temp_dir.path().join("banking-trace"); let exit = Arc::::default(); let tracer = BankingTracer::new(Some((path, exit.clone(), u64::max_value()))).unwrap(); - let (tracer_thread, tracer) = tracer.finalize_under_arc(); + let tracer_thread = tracer.take_tracer_thread_join_handle(); let (non_vote_sender, non_vote_receiver) = tracer.create_channel_non_vote(); let exit_for_dummy_thread = Arc::::default(); diff --git a/core/src/leader_slot_banking_stage_metrics.rs b/core/src/leader_slot_banking_stage_metrics.rs index 9f7477ba0b5..86406bf103e 100644 --- a/core/src/leader_slot_banking_stage_metrics.rs +++ b/core/src/leader_slot_banking_stage_metrics.rs @@ -377,7 +377,7 @@ impl LeaderSlotMetricsTracker { } pub fn new_for_test() -> Self { - Self::new(0, Arc::new(BankingTracer::new_disabled())) + Self::new(0, BankingTracer::new_disabled()) } fn create_new_slot_metrics(&self, bank_start: &BankStart) -> Option { diff --git a/core/src/replay_stage.rs b/core/src/replay_stage.rs index 67be6836324..ba77f9689aa 100644 --- a/core/src/replay_stage.rs +++ b/core/src/replay_stage.rs @@ -3,6 +3,7 @@ use { crate::{ ancestor_hashes_service::AncestorHashesReplayUpdateSender, + banking_trace::BankingTracer, broadcast_stage::RetransmitSlotsSender, cache_block_meta_service::CacheBlockMetaSender, cluster_info_vote_listener::{ @@ -402,6 +403,7 @@ impl ReplayStage { log_messages_bytes_limit: Option, prioritization_fee_cache: Arc, dumped_slots_sender: DumpedSlotsSender, + banking_tracer: Arc, ) -> Result { let mut tower = if let Some(process_blockstore) = maybe_process_blockstore { let tower = process_blockstore.process_to_create_tower()?; @@ -943,6 +945,7 @@ impl ReplayStage { &mut progress, &retransmit_slots_sender, &mut skipped_slots_info, + &banking_tracer, has_new_vote_been_rooted, transaction_status_sender.is_some(), ); @@ -1654,6 +1657,7 @@ impl ReplayStage { progress_map: &mut ProgressMap, retransmit_slots_sender: &RetransmitSlotsSender, skipped_slots_info: &mut SkippedSlotsInfo, + banking_tracer: &Arc, has_new_vote_been_rooted: bool, track_transaction_indexes: bool, ) { diff --git a/core/src/tpu.rs b/core/src/tpu.rs index a34306f8de9..2ef09d84870 100644 --- a/core/src/tpu.rs +++ b/core/src/tpu.rs @@ -99,8 +99,8 @@ impl Tpu { log_messages_bytes_limit: Option, staked_nodes: &Arc>, shared_staked_nodes_overrides: Arc>>, + banking_tracer: Arc, tpu_enable_udp: bool, - banking_tracer: BankingTracer, ) -> Self { let TpuSockets { transactions: transactions_sockets, diff --git a/core/src/tvu.rs b/core/src/tvu.rs index 5d447116044..d6dac79f8d6 100644 --- a/core/src/tvu.rs +++ b/core/src/tvu.rs @@ -3,6 +3,7 @@ use { crate::{ + banking_trace::BankingTracer, broadcast_stage::RetransmitSlotsSender, cache_block_meta_service::CacheBlockMetaSender, cluster_info_vote_listener::{ @@ -135,6 +136,7 @@ impl Tvu { log_messages_bytes_limit: Option, connection_cache: &Arc, prioritization_fee_cache: &Arc, + banking_tracer: Arc, ) -> Result { let TvuSockets { repair: repair_socket, @@ -299,6 +301,7 @@ impl Tvu { log_messages_bytes_limit, prioritization_fee_cache.clone(), dumped_slots_sender, + banking_tracer, )?; let ledger_cleanup_service = tvu_config.max_ledger_shreds.map(|max_ledger_shreds| { @@ -468,6 +471,7 @@ pub mod tests { None, &Arc::new(ConnectionCache::default()), &_ignored_prioritization_fee_cache, + BankingTracer::new_disabled(), ) .expect("assume success"); exit.store(true, Ordering::Relaxed); diff --git a/core/src/validator.rs b/core/src/validator.rs index c0b11737d44..51c30ad613e 100644 --- a/core/src/validator.rs +++ b/core/src/validator.rs @@ -935,6 +935,13 @@ impl Validator { exit.clone(), ); + let banking_tracer = BankingTracer::new((config.banking_trace_size > 0).then_some(( + blockstore.banking_tracer_path(), + exit.clone(), + config.banking_trace_size, + ))) + .map_err(|err| format!("{} [{:?}]", &err, &err))?; + let (replay_vote_sender, replay_vote_receiver) = unbounded(); let tvu = Tvu::new( vote_account, @@ -984,15 +991,9 @@ impl Validator { config.runtime_config.log_messages_bytes_limit, &connection_cache, &prioritization_fee_cache, + banking_tracer.clone(), )?; - let banking_tracer = BankingTracer::new((config.banking_trace_size > 0).then_some(( - blockstore.banking_tracer_path(), - exit.clone(), - config.banking_trace_size, - ))) - .map_err(|err| format!("{} [{:?}]", &err, &err))?; - let tpu = Tpu::new( &cluster_info, &poh_recorder, @@ -1026,8 +1027,8 @@ impl Validator { config.runtime_config.log_messages_bytes_limit, &staked_nodes, config.staked_nodes_overrides.clone(), - tpu_enable_udp, banking_tracer, + tpu_enable_udp, ); datapoint_info!( From e8d49be9b010a0b7ac9d9195547acc0681d4b468 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Fri, 16 Dec 2022 22:53:41 +0900 Subject: [PATCH 12/43] Introce hash event to override these when simulating --- core/src/banking_trace.rs | 56 +++++++++++++++++++++++++++++++-------- core/src/replay_stage.rs | 3 +++ 2 files changed, 48 insertions(+), 11 deletions(-) diff --git a/core/src/banking_trace.rs b/core/src/banking_trace.rs index f27eb8760d9..f12bfd2afdb 100644 --- a/core/src/banking_trace.rs +++ b/core/src/banking_trace.rs @@ -69,8 +69,9 @@ pub struct TimedTracedEvent(std::time::SystemTime, TracedEvent); #[derive(Serialize, Deserialize, Debug)] enum TracedEvent { - Bank(Slot, u32, BankStatus, usize), PacketBatch(ChannelLabel, BankingPacketBatch), + Bank(Slot, u32, BankStatus, usize), + BlockAndBankHash(Slot, Hash, Hash), } #[derive(Serialize, Deserialize, Debug)] @@ -238,13 +239,24 @@ impl BankingTracer { status: BankStatus, unprocessed_transaction_count: usize, ) { + self.trace_event(TimedTracedEvent( + SystemTime::now(), + TracedEvent::Bank(slot, id, status, unprocessed_transaction_count), + )) + } + + pub fn hash_event(&self, slot: Slot, blockhash: Hash, bank_hash: Hash) { + self.trace_event(TimedTracedEvent( + SystemTime::now(), + TracedEvent::BlockAndBankHash(slot, blockhash, bank_hash), + )) + } + + fn trace_event(&self, event: TimedTracedEvent) { if let Some((sender, _, exit)) = &self.enabled_tracer { if !exit.load(Ordering::Relaxed) { sender - .send(TimedTracedEvent( - SystemTime::now(), - TracedEvent::Bank(slot, id, status, unprocessed_transaction_count), - )) + .send(event) .expect("active tracer thread unless exited"); } } @@ -399,7 +411,12 @@ pub mod for_test { mod tests { use { super::*, - std::{fs::File, io::BufReader}, + bincode::ErrorKind::Io as BincodeIoError, + std::{ + fs::File, + io::{BufReader, ErrorKind::UnexpectedEof}, + str::FromStr, + }, }; #[test] @@ -482,31 +499,48 @@ mod tests { .send(for_test::sample_packet_batch()) .unwrap(); tracer.bank_start(1, 2, 3); + let blockhash = Hash::from_str("B1ockhash1111111111111111111111111111111111").unwrap(); + let bank_hash = Hash::from_str("BankHash11111111111111111111111111111111111").unwrap(); + tracer.hash_event(4, blockhash, bank_hash); for_test::terminate_tracer(tracer, dummy_main_thread, non_vote_sender, None); let mut stream = BufReader::new(File::open(path.join(BASENAME)).unwrap()); - let results = (0..3) + let results = (0..=3) .map(|_| bincode::deserialize_from::<_, TimedTracedEvent>(&mut stream)) .collect::>(); + let mut i = 0; assert_matches!( - results[0], + results[i], Ok(TimedTracedEvent( _, TracedEvent::PacketBatch(ChannelLabel::NonVote, _) )) ); + i += 1; assert_matches!( - results[1], + results[i], Ok(TimedTracedEvent( _, TracedEvent::Bank(1, 2, BankStatus::Started, 3) )) ); + i += 1; assert_matches!( - results[2], - Err(_) // in this way, rustfmt formats this line like the above + results[i], + Ok(TimedTracedEvent( + _, + TracedEvent::BlockAndBankHash(4, actual_blockhash, actual_bank_hash) + )) if actual_blockhash == blockhash && actual_bank_hash == bank_hash + ); + i += 1; + assert_matches!( + results[i], + Err(ref err) if matches!( + **err, + BincodeIoError(ref error) if error.kind() == UnexpectedEof + ) ); for_test::drop_and_clean_temp_dir_unless_suppressed(temp_dir); diff --git a/core/src/replay_stage.rs b/core/src/replay_stage.rs index ba77f9689aa..eb1a5701e77 100644 --- a/core/src/replay_stage.rs +++ b/core/src/replay_stage.rs @@ -1778,6 +1778,9 @@ impl ReplayStage { rpc_subscriptions, NewBankOptions { vote_only_bank }, ); + // make sure parent is frozen for finalized hashes via the above + // new()-ing of its child bank + banking_tracer.hash_event(parent.slot(), parent.last_blockhash(), parent.hash()); let tpu_bank = bank_forks.write().unwrap().insert(tpu_bank); poh_recorder From ebf408a583dab37c9db5356d045264e61dd08638 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Mon, 19 Dec 2022 15:17:13 +0900 Subject: [PATCH 13/43] Use serde_with/serde_as instead of hacky workaround --- Cargo.lock | 64 +++++++++++++++++++++++++++++++++++++++++++++++ sdk/Cargo.toml | 1 + sdk/src/packet.rs | 32 +++--------------------- 3 files changed, 68 insertions(+), 29 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a7da1106b1c..e2eabad1eb7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1198,6 +1198,41 @@ dependencies = [ "zeroize", ] +[[package]] +name = "darling" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0dd3cd20dc6b5a876612a6e5accfe7f3dd883db6d07acfbf14c128f61550dfa" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a784d2ccaf7c98501746bf0be29b2022ba41fd62a2e622af997a03e9f972859f" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2 1.0.41", + "quote 1.0.18", + "strsim 0.10.0", + "syn 1.0.98", +] + +[[package]] +name = "darling_macro" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7618812407e9402654622dd402b0a89dff9ba93badd6540781526117b92aab7e" +dependencies = [ + "darling_core", + "quote 1.0.18", + "syn 1.0.98", +] + [[package]] name = "dashmap" version = "4.0.2" @@ -2229,6 +2264,12 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + [[package]] name = "idna" version = "0.1.5" @@ -4398,6 +4439,28 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_with" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25bf4a5a814902cd1014dbccfa4d4560fb8432c779471e96e035602519f82eef" +dependencies = [ + "serde", + "serde_with_macros", +] + +[[package]] +name = "serde_with_macros" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3452b4c0f6c1e357f73fdb87cd1efabaa12acf328c7a528e252893baeb3f4aa" +dependencies = [ + "darling", + "proc-macro2 1.0.41", + "quote 1.0.18", + "syn 1.0.98", +] + [[package]] name = "serde_yaml" version = "0.8.26" @@ -6461,6 +6524,7 @@ dependencies = [ "serde_bytes", "serde_derive", "serde_json", + "serde_with", "sha2 0.10.5", "sha3 0.10.4", "solana-frozen-abi 1.15.0", diff --git a/sdk/Cargo.toml b/sdk/Cargo.toml index 290632634a3..339e1c6d959 100644 --- a/sdk/Cargo.toml +++ b/sdk/Cargo.toml @@ -70,6 +70,7 @@ serde = "1.0.144" serde_bytes = "0.11" serde_derive = "1.0.103" serde_json = { version = "1.0.83", optional = true } +serde_with = { version = "2.1.0", default-features = false, features = ["macros"] } sha2 = "0.10.5" sha3 = { version = "0.10.4", optional = true } solana-frozen-abi = { path = "../frozen-abi", version = "=1.15.0" } diff --git a/sdk/src/packet.rs b/sdk/src/packet.rs index 088fdfd699f..a706ba2a121 100644 --- a/sdk/src/packet.rs +++ b/sdk/src/packet.rs @@ -4,6 +4,7 @@ use { bincode::{Options, Result}, bitflags::bitflags, serde::{Deserialize, Serialize}, + serde_with::{serde_as, Bytes}, std::{ fmt, io, net::{IpAddr, Ipv4Addr, SocketAddr}, @@ -41,40 +42,13 @@ pub struct Meta { pub sender_stake: u64, } -mod serde_bytes_array { - use { - core::convert::TryInto, - serde::{de::Error, Deserializer, Serializer}, - std::borrow::Cow, - }; - - pub(crate) fn serialize(bytes: &[u8], serializer: S) -> Result - where - S: Serializer, - { - serde_bytes::serialize(bytes, serializer) - } - - pub(crate) fn deserialize<'de, D, const N: usize>(deserializer: D) -> Result<[u8; N], D::Error> - where - D: Deserializer<'de>, - { - let slice: Cow<'de, [u8]> = serde_bytes::deserialize(deserializer)?; - - let array: [u8; N] = (&*slice).try_into().map_err(|_| { - let expected = format!("[u8; {}]", N); - D::Error::invalid_length(slice.len(), &expected.as_str()) - })?; - Ok(array) - } -} - +#[serde_as] #[derive(Clone, Eq, Serialize, Deserialize)] #[repr(C)] pub struct Packet { // Bytes past Packet.meta.size are not valid to read from. // Use Packet.data(index) to read from the buffer. - #[serde(with = "serde_bytes_array")] + #[serde_as(as = "Bytes")] buffer: [u8; PACKET_DATA_SIZE], meta: Meta, } From 9dd971b25ce0ef86b02659c3cf3c2e89bf0a4475 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Mon, 19 Dec 2022 16:03:58 +0900 Subject: [PATCH 14/43] Update another Cargo.lock... --- programs/sbf/Cargo.lock | 64 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/programs/sbf/Cargo.lock b/programs/sbf/Cargo.lock index 5313d36c92e..da4c108e1ff 100644 --- a/programs/sbf/Cargo.lock +++ b/programs/sbf/Cargo.lock @@ -1034,6 +1034,41 @@ dependencies = [ "zeroize", ] +[[package]] +name = "darling" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0dd3cd20dc6b5a876612a6e5accfe7f3dd883db6d07acfbf14c128f61550dfa" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a784d2ccaf7c98501746bf0be29b2022ba41fd62a2e622af997a03e9f972859f" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2 1.0.41", + "quote 1.0.18", + "strsim 0.10.0", + "syn 1.0.98", +] + +[[package]] +name = "darling_macro" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7618812407e9402654622dd402b0a89dff9ba93badd6540781526117b92aab7e" +dependencies = [ + "darling_core", + "quote 1.0.18", + "syn 1.0.98", +] + [[package]] name = "dashmap" version = "4.0.2" @@ -1968,6 +2003,12 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + [[package]] name = "idna" version = "0.1.5" @@ -3996,6 +4037,28 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_with" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25bf4a5a814902cd1014dbccfa4d4560fb8432c779471e96e035602519f82eef" +dependencies = [ + "serde", + "serde_with_macros", +] + +[[package]] +name = "serde_with_macros" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3452b4c0f6c1e357f73fdb87cd1efabaa12acf328c7a528e252893baeb3f4aa" +dependencies = [ + "darling", + "proc-macro2 1.0.41", + "quote 1.0.18", + "syn 1.0.98", +] + [[package]] name = "serde_yaml" version = "0.9.13" @@ -5784,6 +5847,7 @@ dependencies = [ "serde_bytes", "serde_derive", "serde_json", + "serde_with", "sha2 0.10.5", "sha3 0.10.4", "solana-frozen-abi 1.15.0", From 2a78d198c27edb8266d75cf1d81b9902cc4a8457 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Mon, 19 Dec 2022 21:56:04 +0900 Subject: [PATCH 15/43] Add detailed comment for Packet::buffer serialize --- sdk/src/packet.rs | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/sdk/src/packet.rs b/sdk/src/packet.rs index a706ba2a121..df0c6bd4a0a 100644 --- a/sdk/src/packet.rs +++ b/sdk/src/packet.rs @@ -42,6 +42,31 @@ pub struct Meta { pub sender_stake: u64, } +// serde_as is used as a work around because array isn't supported by serde +// (and serde_bytes). +// +// the root cause is of a historical special handling for [T; 0] in rust's +// `Default` and supposedly mirrored serde's `Serialize` (macro) impls, +// pre-dating stabilized const generics, meaning it'll take long time...: +// https://github.com/rust-lang/rust/issues/61415 +// https://github.com/rust-lang/rust/issues/88744#issuecomment-1138678928 +// +// Due to the nature of the root cause, the current situation is complicated. +// All in all, the serde_as solution is chosen for good perf and low maintenance +// need at the cost of another crate dependency.. +// +// For details, please refer to the below various links... +// +// relevant merged/published pr for this serde_as functionality used here: +// https://github.com/jonasbb/serde_with/pull/277 +// open pr at serde_bytes: +// https://github.com/serde-rs/bytes/pull/28 +// open issue at serde: +// https://github.com/serde-rs/serde/issues/1937 +// closed pr at serde (due to the above mentioned [N; 0] issue): +// https://github.com/serde-rs/serde/pull/1860 +// ryoqun's dirty experiments: +// https://github.com/ryoqun/serde-array-comparisons #[serde_as] #[derive(Clone, Eq, Serialize, Deserialize)] #[repr(C)] From 24ffc4894a98c7a11406b9cc46e192607a7d9fb7 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Mon, 19 Dec 2022 22:23:44 +0900 Subject: [PATCH 16/43] Rename sender_overhead_minimized_receiver_loop() --- core/benches/banking_trace.rs | 12 ++++++------ core/src/banking_trace.rs | 10 +++++----- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/core/benches/banking_trace.rs b/core/benches/banking_trace.rs index 1013fa6cd8a..b0ec4334196 100644 --- a/core/benches/banking_trace.rs +++ b/core/benches/banking_trace.rs @@ -7,8 +7,8 @@ use { for_test::{ drop_and_clean_temp_dir_unless_suppressed, sample_packet_batch, terminate_tracer, }, - sender_overhead_minimized_receiver_loop, BankingPacketBatch, BankingTracer, TraceError, - TracerThreadResult, DEFAULT_BANKING_TRACE_SIZE, + receiving_loop_with_minimized_sender_overhead, BankingPacketBatch, BankingTracer, + TraceError, TracerThreadResult, DEFAULT_BANKING_TRACE_SIZE, }, std::{ path::PathBuf, @@ -39,7 +39,7 @@ fn bench_banking_tracer_main_thread_overhead_noop_baseline(bencher: &mut Bencher let exit_for_dummy_thread = exit.clone(); let dummy_main_thread = thread::spawn(move || { - sender_overhead_minimized_receiver_loop::<_, TraceError, 0>( + receiving_loop_with_minimized_sender_overhead::<_, TraceError, 0>( exit_for_dummy_thread, non_vote_receiver, black_box_packet_batch, @@ -68,7 +68,7 @@ fn bench_banking_tracer_main_thread_overhead_under_peak_write(bencher: &mut Benc let exit_for_dummy_thread = exit.clone(); let dummy_main_thread = thread::spawn(move || { - sender_overhead_minimized_receiver_loop::<_, TraceError, 0>( + receiving_loop_with_minimized_sender_overhead::<_, TraceError, 0>( exit_for_dummy_thread, non_vote_receiver, black_box_packet_batch, @@ -99,7 +99,7 @@ fn bench_banking_tracer_main_thread_overhead_under_sustained_write(bencher: &mut let exit_for_dummy_thread = exit.clone(); let dummy_main_thread = thread::spawn(move || { - sender_overhead_minimized_receiver_loop::<_, TraceError, 0>( + receiving_loop_with_minimized_sender_overhead::<_, TraceError, 0>( exit_for_dummy_thread, non_vote_receiver, black_box_packet_batch, @@ -132,7 +132,7 @@ fn bench_banking_tracer_background_thread_throughput(bencher: &mut Bencher) { let (non_vote_sender, non_vote_receiver) = tracer.create_channel_non_vote(); let dummy_main_thread = thread::spawn(move || { - sender_overhead_minimized_receiver_loop::<_, TraceError, 0>( + receiving_loop_with_minimized_sender_overhead::<_, TraceError, 0>( exit.clone(), non_vote_receiver, black_box_packet_batch, diff --git a/core/src/banking_trace.rs b/core/src/banking_trace.rs index f12bfd2afdb..f3e10f6c0f1 100644 --- a/core/src/banking_trace.rs +++ b/core/src/banking_trace.rs @@ -140,7 +140,7 @@ impl<'a> Write for GroupedWriter<'a> { } } -pub fn sender_overhead_minimized_receiver_loop( +pub fn receiving_loop_with_minimized_sender_overhead( exit: Arc, receiver: Receiver, mut on_recv: impl FnMut(T) -> Result<(), E>, @@ -321,7 +321,7 @@ impl BankingTracer { ) -> Result, TraceError> { let thread = thread::Builder::new().name("solBanknTracer".into()).spawn( move || -> TracerThreadResult { - sender_overhead_minimized_receiver_loop::< + receiving_loop_with_minimized_sender_overhead::< _, TraceError, TRACE_FILE_WRITE_INTERVAL_MS, @@ -427,7 +427,7 @@ mod tests { let (non_vote_sender, non_vote_receiver) = tracer.create_channel_non_vote(); let dummy_main_thread = thread::spawn(move || { - sender_overhead_minimized_receiver_loop::<_, TraceError, 0>( + receiving_loop_with_minimized_sender_overhead::<_, TraceError, 0>( exit, non_vote_receiver, |_packet_batch| Ok(()), @@ -452,7 +452,7 @@ mod tests { let exit_for_dummy_thread = Arc::::default(); let exit_for_dummy_thread2 = exit_for_dummy_thread.clone(); let dummy_main_thread = thread::spawn(move || { - sender_overhead_minimized_receiver_loop::<_, TraceError, 0>( + receiving_loop_with_minimized_sender_overhead::<_, TraceError, 0>( exit_for_dummy_thread, non_vote_receiver, |_packet_batch| Ok(()), @@ -488,7 +488,7 @@ mod tests { let (non_vote_sender, non_vote_receiver) = tracer.create_channel_non_vote(); let dummy_main_thread = thread::spawn(move || { - sender_overhead_minimized_receiver_loop::<_, TraceError, 0>( + receiving_loop_with_minimized_sender_overhead::<_, TraceError, 0>( exit, non_vote_receiver, |_packet_batch| Ok(()), From ffbcb926d0bcc39cc553353296b13ab5c97d4dc8 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Mon, 19 Dec 2022 22:40:50 +0900 Subject: [PATCH 17/43] Use type interference for TraceError --- core/src/banking_trace.rs | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/core/src/banking_trace.rs b/core/src/banking_trace.rs index f3e10f6c0f1..fe45130dd39 100644 --- a/core/src/banking_trace.rs +++ b/core/src/banking_trace.rs @@ -171,7 +171,7 @@ impl BankingTracer { maybe_config: Option<(PathBuf, Arc, u64)>, ) -> Result, TraceError> { let enabled_tracer = maybe_config - .map(|(path, exit, total_size)| -> Result<_, TraceError> { + .map(|(path, exit, total_size)| { let rotate_threshold_size = total_size / TRACE_FILE_ROTATE_COUNT; if rotate_threshold_size == 0 { return Err(TraceError::TooSmallTraceSize( @@ -321,15 +321,15 @@ impl BankingTracer { ) -> Result, TraceError> { let thread = thread::Builder::new().name("solBanknTracer".into()).spawn( move || -> TracerThreadResult { - receiving_loop_with_minimized_sender_overhead::< - _, - TraceError, - TRACE_FILE_WRITE_INTERVAL_MS, - >(exit, trace_receiver, |event| -> Result<(), TraceError> { - file_appender.condition_mut().reset(); - serialize_into(&mut GroupedWriter::new(&mut file_appender), &event)?; - Ok(()) - })?; + receiving_loop_with_minimized_sender_overhead::<_, _, TRACE_FILE_WRITE_INTERVAL_MS>( + exit, + trace_receiver, + |event| -> Result<(), TraceError> { + file_appender.condition_mut().reset(); + serialize_into(&mut GroupedWriter::new(&mut file_appender), &event)?; + Ok(()) + }, + )?; file_appender.flush()?; Ok(()) }, From 28a30edbc284262936e56a02616e4d211222be5f Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Mon, 19 Dec 2022 22:47:24 +0900 Subject: [PATCH 18/43] Another minor rename --- banking-bench/src/main.rs | 2 +- core/src/validator.rs | 2 +- ledger/src/blockstore.rs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/banking-bench/src/main.rs b/banking-bench/src/main.rs index 3eb411209fd..98ad28e23e7 100644 --- a/banking-bench/src/main.rs +++ b/banking-bench/src/main.rs @@ -417,7 +417,7 @@ fn main() { let (exit, poh_recorder, poh_service, signal_receiver) = create_test_recorder(&bank, &blockstore, None, Some(leader_schedule_cache)); let banking_tracer = BankingTracer::new(matches.is_present("trace_banking").then_some(( - blockstore.banking_tracer_path(), + blockstore.banking_trace_path(), exit.clone(), DEFAULT_BANKING_TRACE_SIZE, ))) diff --git a/core/src/validator.rs b/core/src/validator.rs index 51c30ad613e..d4b5a50584a 100644 --- a/core/src/validator.rs +++ b/core/src/validator.rs @@ -936,7 +936,7 @@ impl Validator { ); let banking_tracer = BankingTracer::new((config.banking_trace_size > 0).then_some(( - blockstore.banking_tracer_path(), + blockstore.banking_trace_path(), exit.clone(), config.banking_trace_size, ))) diff --git a/ledger/src/blockstore.rs b/ledger/src/blockstore.rs index cbaf9ebde20..3d08feeaf4c 100644 --- a/ledger/src/blockstore.rs +++ b/ledger/src/blockstore.rs @@ -225,7 +225,7 @@ impl Blockstore { &self.ledger_path } - pub fn banking_tracer_path(&self) -> PathBuf { + pub fn banking_trace_path(&self) -> PathBuf { self.ledger_path.join("banking_trace") } From bd0caa7ec1ddee04c71814377e5915df2cdb4ee5 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Tue, 20 Dec 2022 15:13:58 +0900 Subject: [PATCH 19/43] Retire now useless ForEach to simplify code --- core/src/packet_deserializer.rs | 65 ++++++++++++--------------------- 1 file changed, 23 insertions(+), 42 deletions(-) diff --git a/core/src/packet_deserializer.rs b/core/src/packet_deserializer.rs index 6e57dfb52a9..094228dd077 100644 --- a/core/src/packet_deserializer.rs +++ b/core/src/packet_deserializer.rs @@ -28,11 +28,6 @@ pub struct PacketDeserializer { packet_batch_receiver: BankingPacketReceiver, } -enum ForEach<'a> { - Batch(&'a PacketBatch), - Stats(&'a SigverifyTracerPacketStats), -} - impl PacketDeserializer { pub fn new(packet_batch_receiver: BankingPacketReceiver) -> Self { Self { @@ -53,54 +48,40 @@ impl PacketDeserializer { )) } - fn for_each_packet_batch( - banking_batches: &[BankingPacketBatch], - mut for_each: impl FnMut(ForEach), - ) { - for banking_batch in banking_batches { - for batch in &banking_batch.0 { - for_each(ForEach::Batch(batch)) - } - - if let Some(stats) = &banking_batch.1 { - for_each(ForEach::Stats(stats)) - } - } - } - - /// Deserialize packet batches and collect them into ReceivePacketResults + /// Deserialize packet batches, aggregates tracer packet stats, and collect + /// them into ReceivePacketResults fn deserialize_and_collect_packets( packet_count: usize, - packet_batches: &[BankingPacketBatch], + banking_batches: &[BankingPacketBatch], ) -> ReceivePacketResults { let mut passed_sigverify_count: usize = 0; let mut failed_sigverify_count: usize = 0; let mut deserialized_packets = Vec::with_capacity(packet_count); let mut aggregated_tracer_packet_stats_option = None::; - Self::for_each_packet_batch(packet_batches, |packet_batch_or_stat| { - match packet_batch_or_stat { - ForEach::Batch(packet_batch) => { - let packet_indexes = Self::generate_packet_indexes(packet_batch); + for banking_batch in banking_batches { + for packet_batch in &banking_batch.0 { + let packet_indexes = Self::generate_packet_indexes(packet_batch); + + passed_sigverify_count += packet_indexes.len(); + failed_sigverify_count += packet_batch.len().saturating_sub(packet_indexes.len()); - passed_sigverify_count += packet_indexes.len(); - failed_sigverify_count += - packet_batch.len().saturating_sub(packet_indexes.len()); + deserialized_packets + .extend(Self::deserialize_packets(packet_batch, &packet_indexes)); + } - deserialized_packets - .extend(Self::deserialize_packets(packet_batch, &packet_indexes)); - } - ForEach::Stats(tracer_packet_stats) => { - if let Some(aggregated_tracer_packet_stats) = - &mut aggregated_tracer_packet_stats_option - { - aggregated_tracer_packet_stats.aggregate(tracer_packet_stats); - } else { - aggregated_tracer_packet_stats_option = Some(tracer_packet_stats.clone()); - } + if let Some(tracer_packet_stats) = &banking_batch.1 { + if let Some(aggregated_tracer_packet_stats) = + &mut aggregated_tracer_packet_stats_option + { + aggregated_tracer_packet_stats.aggregate(tracer_packet_stats); + } else { + // BankingPacketBatch is owned by Arc; so we have to clone its internal field + // (SigverifyTracerPacketStats). + aggregated_tracer_packet_stats_option = Some(tracer_packet_stats.clone()); } } - }); + } ReceivePacketResults { deserialized_packets, @@ -110,7 +91,7 @@ impl PacketDeserializer { } } - /// Receives packet batches from sigverify stage with a timeout, and aggregates tracer packet stats + /// Receives packet batches from sigverify stage with a timeout fn receive_until( &self, recv_timeout: Duration, From 5118cd8fa6e04e2bb570f53f3b75ca0895f9c198 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Tue, 20 Dec 2022 21:25:57 +0900 Subject: [PATCH 20/43] Use type alias as much as possible --- core/src/banking_trace.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/banking_trace.rs b/core/src/banking_trace.rs index fe45130dd39..d62da6aa311 100644 --- a/core/src/banking_trace.rs +++ b/core/src/banking_trace.rs @@ -364,7 +364,7 @@ impl TracedSender { trace_sender .send(TimedTracedEvent( SystemTime::now(), - TracedEvent::PacketBatch(self.label, Arc::clone(&batch)), + TracedEvent::PacketBatch(self.label, BankingPacketBatch::clone(&batch)), )) .expect("active tracer thread unless exited"); } From ee044d44ecfae14ff15ed46c78b83c131846e7de Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Tue, 20 Dec 2022 21:31:58 +0900 Subject: [PATCH 21/43] Properly translate and propagate tracing errors --- core/src/banking_trace.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/core/src/banking_trace.rs b/core/src/banking_trace.rs index d62da6aa311..2c12d8d0af1 100644 --- a/core/src/banking_trace.rs +++ b/core/src/banking_trace.rs @@ -366,7 +366,13 @@ impl TracedSender { SystemTime::now(), TracedEvent::PacketBatch(self.label, BankingPacketBatch::clone(&batch)), )) - .expect("active tracer thread unless exited"); + .map_err(|err| { + error!( + "unexpected error when tracing a banking event...: {:?}", + err + ); + SendError(BankingPacketBatch::clone(&batch)) + })?; } } self.sender.send(batch) From 8d1d522508821acdeca22ea2c32d1083ffbc1d67 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Tue, 20 Dec 2022 22:03:13 +0900 Subject: [PATCH 22/43] Clarify --enable-banking-trace with better naming --- banking-bench/src/main.rs | 4 ++-- core/benches/banking_trace.rs | 4 ++-- core/src/banking_trace.rs | 30 +++++++++++++++----------- core/src/validator.rs | 19 ++++++++-------- local-cluster/src/validator_configs.rs | 2 +- validator/src/cli.rs | 25 ++++++++++++++------- validator/src/main.rs | 26 ++++++++++++++-------- 7 files changed, 67 insertions(+), 43 deletions(-) diff --git a/banking-bench/src/main.rs b/banking-bench/src/main.rs index 98ad28e23e7..5f8c17f34ee 100644 --- a/banking-bench/src/main.rs +++ b/banking-bench/src/main.rs @@ -8,7 +8,7 @@ use { solana_client::connection_cache::ConnectionCache, solana_core::{ banking_stage::BankingStage, - banking_trace::{BankingPacketBatch, BankingTracer, DEFAULT_BANKING_TRACE_SIZE}, + banking_trace::{BankingPacketBatch, BankingTracer, BANKING_TRACE_DIR_DEFAULT_BYTE_LIMIT}, }, solana_gossip::cluster_info::{ClusterInfo, Node}, solana_ledger::{ @@ -419,7 +419,7 @@ fn main() { let banking_tracer = BankingTracer::new(matches.is_present("trace_banking").then_some(( blockstore.banking_trace_path(), exit.clone(), - DEFAULT_BANKING_TRACE_SIZE, + BANKING_TRACE_DIR_DEFAULT_BYTE_LIMIT, ))) .unwrap(); let (non_vote_sender, non_vote_receiver) = banking_tracer.create_channel_non_vote(); diff --git a/core/benches/banking_trace.rs b/core/benches/banking_trace.rs index b0ec4334196..7ade2c81891 100644 --- a/core/benches/banking_trace.rs +++ b/core/benches/banking_trace.rs @@ -8,7 +8,7 @@ use { drop_and_clean_temp_dir_unless_suppressed, sample_packet_batch, terminate_tracer, }, receiving_loop_with_minimized_sender_overhead, BankingPacketBatch, BankingTracer, - TraceError, TracerThreadResult, DEFAULT_BANKING_TRACE_SIZE, + TraceError, TracerThreadResult, BANKING_TRACE_DIR_DEFAULT_BYTE_LIMIT, }, std::{ path::PathBuf, @@ -61,7 +61,7 @@ fn bench_banking_tracer_main_thread_overhead_under_peak_write(bencher: &mut Benc let tracer = BankingTracer::new(Some(( temp_dir.path().join("banking-trace"), exit.clone(), - DEFAULT_BANKING_TRACE_SIZE, + BANKING_TRACE_DIR_DEFAULT_BYTE_LIMIT, ))) .unwrap(); let (non_vote_sender, non_vote_receiver) = tracer.create_channel_non_vote(); diff --git a/core/src/banking_trace.rs b/core/src/banking_trace.rs index 2c12d8d0af1..f780167f42b 100644 --- a/core/src/banking_trace.rs +++ b/core/src/banking_trace.rs @@ -29,6 +29,7 @@ pub type BankingPacketSender = TracedSender; pub type BankingPacketReceiver = Receiver; pub type TracerThreadResult = Result<(), TraceError>; pub type TracerThread = Option>; +pub type DirByteLimit = u64; #[derive(Error, Debug)] pub enum TraceError { @@ -41,8 +42,8 @@ pub enum TraceError { #[error("Integer Cast Error: {0}")] IntegerCastError(#[from] std::num::TryFromIntError), - #[error("Trace size is too small (must be larger than {1}): {0}")] - TooSmallTraceSize(u64, u64), + #[error("dir byte limit is too small (must be larger than {1}): {0}")] + TooSmallDirByteLimit(DirByteLimit, DirByteLimit), } const BASENAME: &str = "events"; @@ -50,8 +51,8 @@ const TRACE_FILE_ROTATE_COUNT: u64 = 14; // target 2 weeks retention under norma const TRACE_FILE_WRITE_INTERVAL_MS: u64 = 100; const BUF_WRITER_CAPACITY: usize = 10 * 1024 * 1024; pub const TRACE_FILE_DEFAULT_ROTATE_BYTE_THRESHOLD: u64 = 1024 * 1024 * 1024; -pub const EMPTY_BANKING_TRACE_SIZE: u64 = 0; -pub const DEFAULT_BANKING_TRACE_SIZE: u64 = +pub const DISABLED_BAKING_TRACE_DIR: DirByteLimit = 0; +pub const BANKING_TRACE_DIR_DEFAULT_BYTE_LIMIT: DirByteLimit = TRACE_FILE_DEFAULT_ROTATE_BYTE_THRESHOLD * TRACE_FILE_ROTATE_COUNT; #[allow(clippy::type_complexity)] @@ -168,14 +169,14 @@ pub fn receiving_loop_with_minimized_sender_overhead( impl BankingTracer { pub fn new( - maybe_config: Option<(PathBuf, Arc, u64)>, + maybe_config: Option<(PathBuf, Arc, DirByteLimit)>, ) -> Result, TraceError> { let enabled_tracer = maybe_config - .map(|(path, exit, total_size)| { - let rotate_threshold_size = total_size / TRACE_FILE_ROTATE_COUNT; + .map(|(path, exit, dir_byte_limit)| { + let rotate_threshold_size = dir_byte_limit / TRACE_FILE_ROTATE_COUNT; if rotate_threshold_size == 0 { - return Err(TraceError::TooSmallTraceSize( - total_size, + return Err(TraceError::TooSmallDirByteLimit( + dir_byte_limit, TRACE_FILE_ROTATE_COUNT, )); } @@ -451,7 +452,8 @@ mod tests { let temp_dir = TempDir::new().unwrap(); let path = temp_dir.path().join("banking-trace"); let exit = Arc::::default(); - let tracer = BankingTracer::new(Some((path, exit.clone(), u64::max_value()))).unwrap(); + let tracer = + BankingTracer::new(Some((path, exit.clone(), DirByteLimit::max_value()))).unwrap(); let tracer_thread = tracer.take_tracer_thread_join_handle(); let (non_vote_sender, non_vote_receiver) = tracer.create_channel_non_vote(); @@ -489,8 +491,12 @@ mod tests { let temp_dir = TempDir::new().unwrap(); let path = temp_dir.path().join("banking-trace"); let exit = Arc::::default(); - let tracer = - BankingTracer::new(Some((path.clone(), exit.clone(), u64::max_value()))).unwrap(); + let tracer = BankingTracer::new(Some(( + path.clone(), + exit.clone(), + DirByteLimit::max_value(), + ))) + .unwrap(); let (non_vote_sender, non_vote_receiver) = tracer.create_channel_non_vote(); let dummy_main_thread = thread::spawn(move || { diff --git a/core/src/validator.rs b/core/src/validator.rs index d4b5a50584a..c56afd9ecb9 100644 --- a/core/src/validator.rs +++ b/core/src/validator.rs @@ -4,7 +4,7 @@ pub use solana_perf::report_target_features; use { crate::{ accounts_hash_verifier::AccountsHashVerifier, - banking_trace::BankingTracer, + banking_trace::{self, BankingTracer}, broadcast_stage::BroadcastStageType, cache_block_meta_service::{CacheBlockMetaSender, CacheBlockMetaService}, cluster_info_vote_listener::VoteTracker, @@ -177,7 +177,7 @@ pub struct ValidatorConfig { pub ledger_column_options: LedgerColumnOptions, pub runtime_config: RuntimeConfig, pub replay_slots_concurrently: bool, - pub banking_trace_size: u64, + pub banking_trace_dir_byte_limit: banking_trace::DirByteLimit, } impl Default for ValidatorConfig { @@ -240,7 +240,7 @@ impl Default for ValidatorConfig { ledger_column_options: LedgerColumnOptions::default(), runtime_config: RuntimeConfig::default(), replay_slots_concurrently: false, - banking_trace_size: 0, + banking_trace_dir_byte_limit: 0, } } } @@ -935,12 +935,13 @@ impl Validator { exit.clone(), ); - let banking_tracer = BankingTracer::new((config.banking_trace_size > 0).then_some(( - blockstore.banking_trace_path(), - exit.clone(), - config.banking_trace_size, - ))) - .map_err(|err| format!("{} [{:?}]", &err, &err))?; + let banking_tracer = + BankingTracer::new((config.banking_trace_dir_byte_limit > 0).then_some(( + blockstore.banking_trace_path(), + exit.clone(), + config.banking_trace_dir_byte_limit, + ))) + .map_err(|err| format!("{} [{:?}]", &err, &err))?; let (replay_vote_sender, replay_vote_receiver) = unbounded(); let tvu = Tvu::new( diff --git a/local-cluster/src/validator_configs.rs b/local-cluster/src/validator_configs.rs index d536667b1f1..0790d4f2a6c 100644 --- a/local-cluster/src/validator_configs.rs +++ b/local-cluster/src/validator_configs.rs @@ -64,7 +64,7 @@ pub fn safe_clone_config(config: &ValidatorConfig) -> ValidatorConfig { ledger_column_options: config.ledger_column_options.clone(), runtime_config: config.runtime_config.clone(), replay_slots_concurrently: config.replay_slots_concurrently, - banking_trace_size: config.banking_trace_size, + banking_trace_dir_byte_limit: config.banking_trace_dir_byte_limit, } } diff --git a/validator/src/cli.rs b/validator/src/cli.rs index bd046125a6f..438812b87d4 100644 --- a/validator/src/cli.rs +++ b/validator/src/cli.rs @@ -10,7 +10,7 @@ use { }, keypair::SKIP_SEED_PHRASE_VALIDATION_ARG, }, - solana_core::banking_trace::DEFAULT_BANKING_TRACE_SIZE, + solana_core::banking_trace::{DirByteLimit, BANKING_TRACE_DIR_DEFAULT_BYTE_LIMIT}, solana_faucet::faucet::{self, FAUCET_PORT}, solana_net_utils::{MINIMUM_VALIDATOR_PORT_RANGE_WIDTH, VALIDATOR_PORT_RANGE}, solana_rpc::{rpc::MAX_REQUEST_BODY_SIZE, rpc_pubsub_service::PubSubConfig}, @@ -1304,13 +1304,22 @@ pub fn app<'a>(version: &'a str, default_args: &'a DefaultArgs) -> App<'a, 'a> { .help("Allow concurrent replay of slots on different forks") ) .arg( - Arg::with_name("banking_trace_size") + Arg::with_name("banking_trace_dir_byte_limit") + // expose friendly alternative name to cli than internal + // implementation-oriented one .long("enable-banking-trace") .value_name("MAX_BYTES") - .validator(is_parsable::) - .takes_value(true) - .default_value(&default_args.banking_trace_size) - .help("Write trace files for simulate-leader-blocks, retaining up to the maximum total bytes") + .validator(is_parsable::) + .takes_value(true) + // Firstly, zero limit value causes tracer to be disabled + // altogether, intuitively. On the other hand, this non-zero + // default doesn't enable banking tracer unless this flag is + // explicitly given, similar to --limit-ledger-size. + // see configure_banking_trace_dir_byte_limit() for this. + .default_value(&default_args.banking_trace_dir_byte_limit) + .help("Write trace files for simulate-leader-blocks, retaining \ + up to the default or specified total bytes in the \ + ledger") ) .args(&get_deprecated_arguments()) .after_help("The default subcommand is run") @@ -1694,7 +1703,7 @@ pub struct DefaultArgs { pub wait_for_restart_window_min_idle_time: String, pub wait_for_restart_window_max_delinquent_stake: String, - pub banking_trace_size: String, + pub banking_trace_dir_byte_limit: String, } impl DefaultArgs { @@ -1773,7 +1782,7 @@ impl DefaultArgs { exit_max_delinquent_stake: "5".to_string(), wait_for_restart_window_min_idle_time: "10".to_string(), wait_for_restart_window_max_delinquent_stake: "5".to_string(), - banking_trace_size: DEFAULT_BANKING_TRACE_SIZE.to_string(), + banking_trace_dir_byte_limit: BANKING_TRACE_DIR_DEFAULT_BYTE_LIMIT.to_string(), } } } diff --git a/validator/src/main.rs b/validator/src/main.rs index 83fcc49be51..f031caa6976 100644 --- a/validator/src/main.rs +++ b/validator/src/main.rs @@ -8,7 +8,7 @@ use { rand::{seq::SliceRandom, thread_rng}, solana_clap_utils::input_parsers::{keypair_of, keypairs_of, pubkey_of, value_of}, solana_core::{ - banking_trace::EMPTY_BANKING_TRACE_SIZE, + banking_trace::DISABLED_BAKING_TRACE_DIR, ledger_cleanup_service::{DEFAULT_MAX_LEDGER_SHREDS, DEFAULT_MIN_MAX_LEDGER_SHREDS}, system_monitor_service::SystemMonitorService, tower_storage, @@ -428,6 +428,21 @@ fn get_cluster_shred_version(entrypoints: &[SocketAddr]) -> Option { None } +fn configure_banking_trace_dir_byte_limit( + validator_config: &mut ValidatorConfig, + matches: &ArgMatches, +) { + validator_config.banking_trace_dir_byte_limit = + if matches.occurrences_of("banking_trace_dir_byte_limit") == 0 { + // disable with no explicit flag; then, this effectively becomes `opt-in` even if we're + // specifying a default value in clap configuration. + DISABLED_BAKING_TRACE_DIR + } else { + // BANKING_TRACE_DIR_DEFAULT_BYTE_LIMIT or user-supplied override value + value_t_or_exit!(matches, "banking_trace_dir_byte_limit", u64) + }; +} + pub fn main() { let default_args = DefaultArgs::new(); let solana_version = solana_version::version!(); @@ -1428,14 +1443,7 @@ pub fn main() { validator_config.max_ledger_shreds = Some(limit_ledger_size); } - validator_config.banking_trace_size = if matches.occurrences_of("banking_trace_size") == 0 { - // disable with no explicit flag; then, this effectively becomes `opt-in` even if we're - // specifying a default value in clap configuration. - EMPTY_BANKING_TRACE_SIZE - } else { - // DEFAULT_BANKING_TRACE_SIZE or user-supplied override value - value_t_or_exit!(matches, "banking_trace_size", u64) - }; + configure_banking_trace_dir_byte_limit(&mut validator_config, &matches); validator_config.ledger_column_options = LedgerColumnOptions { compression_type: match matches.value_of("rocksdb_ledger_compression") { From fb608ffd688d142b2b1abccffd0a7152bc9f936c Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Wed, 21 Dec 2022 21:28:20 +0900 Subject: [PATCH 23/43] Consider unclean (signal-based) node restarts.. --- core/src/banking_trace.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/core/src/banking_trace.rs b/core/src/banking_trace.rs index f780167f42b..1cacbab59ee 100644 --- a/core/src/banking_trace.rs +++ b/core/src/banking_trace.rs @@ -91,6 +91,7 @@ pub enum ChannelLabel { struct RollingConditionGrouped { basic: RollingConditionBasic, + tried_rollover_after_opened: bool, is_checked: bool, } @@ -98,6 +99,7 @@ impl RollingConditionGrouped { fn new(basic: RollingConditionBasic) -> Self { Self { basic, + tried_rollover_after_opened: bool::default(), is_checked: bool::default(), } } @@ -123,6 +125,18 @@ impl<'a> GroupedWriter<'a> { impl RollingCondition for RollingConditionGrouped { fn should_rollover(&mut self, now: &DateTime, current_filesize: u64) -> bool { + if !self.tried_rollover_after_opened { + self.tried_rollover_after_opened = true; + + // rollover normally if empty to reuse it if possible + if current_filesize > 0 { + // forcibly rollover anew, so that we always avoid to append + // to a possibly-damaged tracing file even after unclean + // restarts + return true; + } + } + if !self.is_checked { self.is_checked = true; self.basic.should_rollover(now, current_filesize) From 874820e2b65edd4357b1094a7373ae3726bb4cac Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Wed, 21 Dec 2022 21:37:49 +0900 Subject: [PATCH 24/43] Tweak logging and cli --- core/src/banking_trace.rs | 4 ++++ core/src/validator.rs | 8 ++++++++ validator/src/cli.rs | 2 +- 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/core/src/banking_trace.rs b/core/src/banking_trace.rs index 1cacbab59ee..25f230c567a 100644 --- a/core/src/banking_trace.rs +++ b/core/src/banking_trace.rs @@ -216,6 +216,10 @@ impl BankingTracer { }) } + pub fn is_enabled(&self) -> bool { + self.enabled_tracer.is_some() + } + fn create_channel(&self, label: ChannelLabel) -> (BankingPacketSender, BankingPacketReceiver) { Self::channel( label, diff --git a/core/src/validator.rs b/core/src/validator.rs index c56afd9ecb9..50c57135690 100644 --- a/core/src/validator.rs +++ b/core/src/validator.rs @@ -942,6 +942,14 @@ impl Validator { config.banking_trace_dir_byte_limit, ))) .map_err(|err| format!("{} [{:?}]", &err, &err))?; + if banking_tracer.is_enabled() { + info!( + "Enabled banking tracer (dir_byte_limit: {})", + config.banking_trace_dir_byte_limit + ); + } else { + info!("Disabled banking tracer"); + } let (replay_vote_sender, replay_vote_receiver) = unbounded(); let tvu = Tvu::new( diff --git a/validator/src/cli.rs b/validator/src/cli.rs index 438812b87d4..2f5be98a5de 100644 --- a/validator/src/cli.rs +++ b/validator/src/cli.rs @@ -1308,7 +1308,7 @@ pub fn app<'a>(version: &'a str, default_args: &'a DefaultArgs) -> App<'a, 'a> { // expose friendly alternative name to cli than internal // implementation-oriented one .long("enable-banking-trace") - .value_name("MAX_BYTES") + .value_name("BYTES") .validator(is_parsable::) .takes_value(true) // Firstly, zero limit value causes tracer to be disabled From 33f6fb8f02a847286b79b6572683c7d67b147d98 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Fri, 13 Jan 2023 22:38:36 +0900 Subject: [PATCH 25/43] Remove Bank events as it's not needed anymore --- core/benches/banking_stage.rs | 2 +- core/src/banking_stage.rs | 19 +++---- core/src/banking_trace.rs | 35 +----------- core/src/leader_slot_banking_stage_metrics.rs | 57 +++++-------------- 4 files changed, 26 insertions(+), 87 deletions(-) diff --git a/core/benches/banking_stage.rs b/core/benches/banking_stage.rs index f1a31e135bc..cf0b11054f1 100644 --- a/core/benches/banking_stage.rs +++ b/core/benches/banking_stage.rs @@ -103,7 +103,7 @@ fn bench_consume_buffered(bencher: &mut Bencher) { &BankingStageStats::default(), &recorder, &QosService::new(1), - &mut LeaderSlotMetricsTracker::new_for_test(), + &mut LeaderSlotMetricsTracker::new(0), None, ); }); diff --git a/core/src/banking_stage.rs b/core/src/banking_stage.rs index 8584da96e7b..08de15f3a19 100644 --- a/core/src/banking_stage.rs +++ b/core/src/banking_stage.rs @@ -483,7 +483,6 @@ impl BankingStage { let data_budget = data_budget.clone(); let connection_cache = connection_cache.clone(); let bank_forks = bank_forks.clone(); - let banking_tracer = banking_tracer.clone(); Builder::new() .name(format!("solBanknStgTx{i:02}")) .spawn(move || { @@ -500,7 +499,6 @@ impl BankingStage { connection_cache, &bank_forks, unprocessed_transaction_storage, - banking_tracer, ); }) .unwrap() @@ -850,9 +848,7 @@ impl BankingStage { // Take metrics action after forwarding packets slot_metrics_tracker.apply_action(metrics_action); } - BufferedPacketsDecision::Hold => { - slot_metrics_tracker.apply_action(metrics_action); - } + _ => {} } } @@ -967,7 +963,6 @@ impl BankingStage { connection_cache: Arc, bank_forks: &Arc>, mut unprocessed_transaction_storage: UnprocessedTransactionStorage, - banking_tracer: Arc, ) { let recorder = poh_recorder.read().unwrap().recorder(); let socket = UdpSocket::bind("0.0.0.0:0").unwrap(); @@ -975,7 +970,7 @@ impl BankingStage { let mut tracer_packet_stats = TracerPacketStats::new(id); let qos_service = QosService::new(id); - let mut slot_metrics_tracker = LeaderSlotMetricsTracker::new(id, banking_tracer); + let mut slot_metrics_tracker = LeaderSlotMetricsTracker::new(id); let mut last_metrics_update = Instant::now(); loop { @@ -3421,7 +3416,7 @@ mod tests { &BankingStageStats::default(), &recorder, &QosService::new(1), - &mut LeaderSlotMetricsTracker::new_for_test(), + &mut LeaderSlotMetricsTracker::new(0), None, ); assert!(buffered_packet_batches.is_empty()); @@ -3479,7 +3474,7 @@ mod tests { &BankingStageStats::default(), &recorder, &QosService::new(1), - &mut LeaderSlotMetricsTracker::new_for_test(), + &mut LeaderSlotMetricsTracker::new(0), None, ); assert!(buffered_packet_batches.is_empty()); @@ -3541,7 +3536,7 @@ mod tests { &BankingStageStats::default(), &recorder, &QosService::new(1), - &mut LeaderSlotMetricsTracker::new_for_test(), + &mut LeaderSlotMetricsTracker::new(0), None, ); @@ -3634,7 +3629,7 @@ mod tests { &socket, true, &data_budget, - &mut LeaderSlotMetricsTracker::new_for_test(), + &mut LeaderSlotMetricsTracker::new(0), &stats, &connection_cache, &mut TracerPacketStats::new(0), @@ -3731,7 +3726,7 @@ mod tests { &socket, hold, &DataBudget::default(), - &mut LeaderSlotMetricsTracker::new_for_test(), + &mut LeaderSlotMetricsTracker::new(0), &stats, &connection_cache, &mut TracerPacketStats::new(0), diff --git a/core/src/banking_trace.rs b/core/src/banking_trace.rs index 25f230c567a..0d44b9411a3 100644 --- a/core/src/banking_trace.rs +++ b/core/src/banking_trace.rs @@ -71,7 +71,6 @@ pub struct TimedTracedEvent(std::time::SystemTime, TracedEvent); #[derive(Serialize, Deserialize, Debug)] enum TracedEvent { PacketBatch(ChannelLabel, BankingPacketBatch), - Bank(Slot, u32, BankStatus, usize), BlockAndBankHash(Slot, Hash, Hash), } @@ -251,19 +250,6 @@ impl BankingTracer { }) } - fn bank_event( - &self, - slot: Slot, - id: u32, - status: BankStatus, - unprocessed_transaction_count: usize, - ) { - self.trace_event(TimedTracedEvent( - SystemTime::now(), - TracedEvent::Bank(slot, id, status, unprocessed_transaction_count), - )) - } - pub fn hash_event(&self, slot: Slot, blockhash: Hash, bank_hash: Hash) { self.trace_event(TimedTracedEvent( SystemTime::now(), @@ -281,14 +267,6 @@ impl BankingTracer { } } - pub fn bank_start(&self, slot: Slot, id: u32, unprocessed_transaction_count: usize) { - self.bank_event(slot, id, BankStatus::Started, unprocessed_transaction_count); - } - - pub fn bank_end(&self, slot: Slot, id: u32, unprocessed_transaction_count: usize) { - self.bank_event(slot, id, BankStatus::Ended, unprocessed_transaction_count); - } - pub fn channel_for_test() -> (TracedSender, Receiver) { Self::channel(ChannelLabel::Dummy, None) } @@ -490,7 +468,9 @@ mod tests { tracer_thread.unwrap().join().unwrap().unwrap(); // shouldn't panic - tracer.bank_end(1, 2, 3); + let blockhash = Hash::from_str("B1ockhash1111111111111111111111111111111111").unwrap(); + let bank_hash = Hash::from_str("BankHash11111111111111111111111111111111111").unwrap(); + tracer.hash_event(4, blockhash, bank_hash); drop(tracer); @@ -528,7 +508,6 @@ mod tests { non_vote_sender .send(for_test::sample_packet_batch()) .unwrap(); - tracer.bank_start(1, 2, 3); let blockhash = Hash::from_str("B1ockhash1111111111111111111111111111111111").unwrap(); let bank_hash = Hash::from_str("BankHash11111111111111111111111111111111111").unwrap(); tracer.hash_event(4, blockhash, bank_hash); @@ -549,14 +528,6 @@ mod tests { )) ); i += 1; - assert_matches!( - results[i], - Ok(TimedTracedEvent( - _, - TracedEvent::Bank(1, 2, BankStatus::Started, 3) - )) - ); - i += 1; assert_matches!( results[i], Ok(TimedTracedEvent( diff --git a/core/src/leader_slot_banking_stage_metrics.rs b/core/src/leader_slot_banking_stage_metrics.rs index 86406bf103e..6e9898c33ff 100644 --- a/core/src/leader_slot_banking_stage_metrics.rs +++ b/core/src/leader_slot_banking_stage_metrics.rs @@ -1,12 +1,12 @@ use { crate::{ - banking_trace::BankingTracer, leader_slot_banking_stage_timing_metrics::*, + leader_slot_banking_stage_timing_metrics::*, unprocessed_transaction_storage::InsertPacketBatchSummary, }, solana_poh::poh_recorder::BankStart, solana_runtime::transaction_error_metrics::*, solana_sdk::{clock::Slot, saturating_add_assign}, - std::{sync::Arc, time::Instant}, + std::time::Instant, }; /// A summary of what happened to transactions passed to the execution pipeline. @@ -358,45 +358,16 @@ pub struct LeaderSlotMetricsTracker { // otherwise `None` leader_slot_metrics: Option, id: u32, - banking_tracer: Arc, - unprocessed_transaction_count: usize, } impl LeaderSlotMetricsTracker { - pub fn new(id: u32, banking_tracer: Arc) -> Self { + pub fn new(id: u32) -> Self { Self { leader_slot_metrics: None, id, - banking_tracer, - unprocessed_transaction_count: 0, } } - pub fn refresh_unprocessed_transaction_count(&mut self, batch_count: usize) { - self.unprocessed_transaction_count = batch_count; - } - - pub fn new_for_test() -> Self { - Self::new(0, BankingTracer::new_disabled()) - } - - fn create_new_slot_metrics(&self, bank_start: &BankStart) -> Option { - let metrics = LeaderSlotMetrics::new( - self.id, - bank_start.working_bank.slot(), - &bank_start.bank_creation_time, - ); - self.banking_tracer - .bank_start(metrics.slot, self.id, self.unprocessed_transaction_count); - - Some(metrics) - } - - pub fn trace_bank_end(&self, metrics_slot: Slot) { - self.banking_tracer - .bank_end(metrics_slot, self.id, self.unprocessed_transaction_count); - } - // Check leader slot, return MetricsTrackerAction to be applied by apply_action() pub(crate) fn check_leader_slot_boundary( &mut self, @@ -406,27 +377,28 @@ impl LeaderSlotMetricsTracker { (None, None) => MetricsTrackerAction::Noop, (Some(leader_slot_metrics), None) => { - let slot = leader_slot_metrics.slot; leader_slot_metrics.mark_slot_end_detected(); - self.trace_bank_end(slot); - MetricsTrackerAction::ReportAndResetTracker } // Our leader slot has begain, time to create a new slot tracker (None, Some(bank_start)) => { - MetricsTrackerAction::NewTracker(self.create_new_slot_metrics(bank_start)) + MetricsTrackerAction::NewTracker(Some(LeaderSlotMetrics::new( + self.id, + bank_start.working_bank.slot(), + &bank_start.bank_creation_time, + ))) } (Some(leader_slot_metrics), Some(bank_start)) => { if leader_slot_metrics.slot != bank_start.working_bank.slot() { // Last slot has ended, new slot has began - let slot = leader_slot_metrics.slot; leader_slot_metrics.mark_slot_end_detected(); - self.trace_bank_end(slot); - MetricsTrackerAction::ReportAndNewTracker( - self.create_new_slot_metrics(bank_start), - ) + MetricsTrackerAction::ReportAndNewTracker(Some(LeaderSlotMetrics::new( + self.id, + bank_start.working_bank.slot(), + &bank_start.bank_creation_time, + ))) } else { MetricsTrackerAction::Noop } @@ -905,7 +877,8 @@ mod tests { bank_creation_time: Arc::new(Instant::now()), }; - let leader_slot_metrics_tracker = LeaderSlotMetricsTracker::new_for_test(); + let banking_stage_thread_id = 0; + let leader_slot_metrics_tracker = LeaderSlotMetricsTracker::new(banking_stage_thread_id); TestSlotBoundaryComponents { first_bank, From 3e83eacfc9aa3cb89c270dbd34e60e24901b8268 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Sat, 14 Jan 2023 22:29:22 +0900 Subject: [PATCH 26/43] Make tpu own banking tracer thread --- banking-bench/src/main.rs | 1 - core/benches/banking_stage.rs | 1 - core/src/banking_stage.rs | 26 ++++++-------------------- core/src/tpu.rs | 13 +++++++++++-- 4 files changed, 17 insertions(+), 24 deletions(-) diff --git a/banking-bench/src/main.rs b/banking-bench/src/main.rs index 5f8c17f34ee..7b309b75d47 100644 --- a/banking-bench/src/main.rs +++ b/banking-bench/src/main.rs @@ -449,7 +449,6 @@ fn main() { None, Arc::new(connection_cache), bank_forks.clone(), - banking_tracer, ); poh_recorder.write().unwrap().set_bank(&bank, false); diff --git a/core/benches/banking_stage.rs b/core/benches/banking_stage.rs index cf0b11054f1..8f8b2d0a258 100644 --- a/core/benches/banking_stage.rs +++ b/core/benches/banking_stage.rs @@ -290,7 +290,6 @@ fn bench_banking(bencher: &mut Bencher, tx_type: TransactionType) { None, Arc::new(ConnectionCache::default()), bank_forks, - banking_tracer, ); poh_recorder.write().unwrap().set_bank(&bank, false); diff --git a/core/src/banking_stage.rs b/core/src/banking_stage.rs index 08de15f3a19..63eff76e9a9 100644 --- a/core/src/banking_stage.rs +++ b/core/src/banking_stage.rs @@ -8,7 +8,7 @@ use { packet_receiver::PacketReceiver, }, crate::{ - banking_trace::{BankingPacketReceiver, BankingTracer, TracerThread}, + banking_trace::BankingPacketReceiver, forward_packet_batches_by_accounts::ForwardPacketBatchesByAccounts, immutable_deserialized_packet::ImmutableDeserializedPacket, latest_unprocessed_votes::{LatestUnprocessedVotes, VoteSource}, @@ -349,7 +349,6 @@ pub struct BatchedTransactionErrorDetails { /// Stores the stage's thread handle and output receiver. pub struct BankingStage { bank_thread_hdls: Vec>, - tracer_thread_hdl: TracerThread, } #[derive(Debug, Clone)] @@ -382,7 +381,6 @@ impl BankingStage { log_messages_bytes_limit: Option, connection_cache: Arc, bank_forks: Arc>, - banking_tracer: Arc, ) -> Self { Self::new_num_threads( cluster_info, @@ -396,7 +394,6 @@ impl BankingStage { log_messages_bytes_limit, connection_cache, bank_forks, - banking_tracer, ) } @@ -413,7 +410,6 @@ impl BankingStage { log_messages_bytes_limit: Option, connection_cache: Arc, bank_forks: Arc>, - banking_tracer: Arc, ) -> Self { assert!(num_threads >= MIN_TOTAL_THREADS); // Single thread to generate entries from many banks. @@ -505,10 +501,7 @@ impl BankingStage { }) .collect(); - Self { - bank_thread_hdls, - tracer_thread_hdl: banking_tracer.take_tracer_thread_join_handle(), - } + Self { bank_thread_hdls } } /// Forwards all valid, unprocessed packets in the buffer, up to a rate limit. Returns @@ -1743,11 +1736,6 @@ impl BankingStage { for bank_thread_hdl in self.bank_thread_hdls { bank_thread_hdl.join()?; } - if let Some(tracer_thread_hdl) = self.tracer_thread_hdl { - if let Err(tracer_result) = tracer_thread_hdl.join()? { - error!("tracer thread error: {:?}", tracer_result); - } - } Ok(()) } } @@ -1756,7 +1744,10 @@ impl BankingStage { mod tests { use { super::*, - crate::{banking_trace::BankingPacketBatch, unprocessed_packet_batches}, + crate::{ + banking_trace::{BankingPacketBatch, BankingTracer}, + unprocessed_packet_batches, + }, crossbeam_channel::{unbounded, Receiver}, solana_address_lookup_table_program::state::{AddressLookupTable, LookupTableMeta}, solana_entry::entry::{next_entry, next_versioned_entry, Entry, EntrySlice}, @@ -1848,7 +1839,6 @@ mod tests { None, Arc::new(ConnectionCache::default()), bank_forks, - banking_tracer, ); drop(non_vote_sender); drop(tpu_vote_sender); @@ -1904,7 +1894,6 @@ mod tests { None, Arc::new(ConnectionCache::default()), bank_forks, - banking_tracer, ); trace!("sending bank"); drop(non_vote_sender); @@ -1985,7 +1974,6 @@ mod tests { None, Arc::new(ConnectionCache::default()), bank_forks, - banking_tracer, ); // fund another account so we can send 2 good transactions in a single batch. @@ -2147,7 +2135,6 @@ mod tests { None, Arc::new(ConnectionCache::default()), bank_forks, - banking_tracer, ); // wait for banking_stage to eat the packets @@ -3838,7 +3825,6 @@ mod tests { None, Arc::new(ConnectionCache::default()), bank_forks, - banking_tracer, ); let keypairs = (0..100).map(|_| Keypair::new()).collect_vec(); diff --git a/core/src/tpu.rs b/core/src/tpu.rs index 2ef09d84870..c0f9bbcae66 100644 --- a/core/src/tpu.rs +++ b/core/src/tpu.rs @@ -4,7 +4,7 @@ use { crate::{ banking_stage::BankingStage, - banking_trace::BankingTracer, + banking_trace::{BankingTracer, TracerThread}, broadcast_stage::{BroadcastStage, BroadcastStageType, RetransmitSlotsReceiver}, cluster_info_vote_listener::{ ClusterInfoVoteListener, GossipDuplicateConfirmedSlotsSender, @@ -69,6 +69,7 @@ pub struct Tpu { find_packet_sender_stake_stage: FindPacketSenderStakeStage, vote_find_packet_sender_stake_stage: FindPacketSenderStakeStage, staked_nodes_updater_service: StakedNodesUpdaterService, + tracer_thread_hdl: TracerThread, } impl Tpu { @@ -234,7 +235,6 @@ impl Tpu { log_messages_bytes_limit, connection_cache.clone(), bank_forks.clone(), - banking_tracer, ); let broadcast_stage = broadcast_type.new_broadcast_stage( @@ -260,6 +260,7 @@ impl Tpu { find_packet_sender_stake_stage, vote_find_packet_sender_stake_stage, staked_nodes_updater_service, + tracer_thread_hdl: banking_tracer.take_tracer_thread_join_handle(), } } @@ -281,6 +282,14 @@ impl Tpu { result?; } let _ = broadcast_result?; + if let Some(tracer_thread_hdl) = self.tracer_thread_hdl { + if let Err(tracer_result) = tracer_thread_hdl.join()? { + error!( + "banking tracer thread returned error after successful thread join: {:?}", + tracer_result + ); + } + } Ok(()) } } From c4b90bbbc4f37d782da874c20e7eb661e5014e8a Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Sat, 14 Jan 2023 22:37:47 +0900 Subject: [PATCH 27/43] Reduce diff a bit.. --- core/src/banking_stage.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/core/src/banking_stage.rs b/core/src/banking_stage.rs index 63eff76e9a9..5f828d86cbe 100644 --- a/core/src/banking_stage.rs +++ b/core/src/banking_stage.rs @@ -500,7 +500,6 @@ impl BankingStage { .unwrap() }) .collect(); - Self { bank_thread_hdls } } @@ -841,7 +840,7 @@ impl BankingStage { // Take metrics action after forwarding packets slot_metrics_tracker.apply_action(metrics_action); } - _ => {} + _ => (), } } From 6c5dd574923fb8cf41722038705150ec44d79ad1 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Sun, 15 Jan 2023 21:34:20 +0900 Subject: [PATCH 28/43] Use latest serde_with --- Cargo.lock | 8 ++++---- programs/sbf/Cargo.lock | 8 ++++---- sdk/Cargo.toml | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e2eabad1eb7..50517defdc9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4441,9 +4441,9 @@ dependencies = [ [[package]] name = "serde_with" -version = "2.1.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25bf4a5a814902cd1014dbccfa4d4560fb8432c779471e96e035602519f82eef" +checksum = "30d904179146de381af4c93d3af6ca4984b3152db687dacb9c3c35e86f39809c" dependencies = [ "serde", "serde_with_macros", @@ -4451,9 +4451,9 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "2.1.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3452b4c0f6c1e357f73fdb87cd1efabaa12acf328c7a528e252893baeb3f4aa" +checksum = "a1966009f3c05f095697c537312f5415d1e3ed31ce0a56942bac4c771c5c335e" dependencies = [ "darling", "proc-macro2 1.0.41", diff --git a/programs/sbf/Cargo.lock b/programs/sbf/Cargo.lock index da4c108e1ff..01a849bdc5c 100644 --- a/programs/sbf/Cargo.lock +++ b/programs/sbf/Cargo.lock @@ -4039,9 +4039,9 @@ dependencies = [ [[package]] name = "serde_with" -version = "2.1.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25bf4a5a814902cd1014dbccfa4d4560fb8432c779471e96e035602519f82eef" +checksum = "30d904179146de381af4c93d3af6ca4984b3152db687dacb9c3c35e86f39809c" dependencies = [ "serde", "serde_with_macros", @@ -4049,9 +4049,9 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "2.1.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3452b4c0f6c1e357f73fdb87cd1efabaa12acf328c7a528e252893baeb3f4aa" +checksum = "a1966009f3c05f095697c537312f5415d1e3ed31ce0a56942bac4c771c5c335e" dependencies = [ "darling", "proc-macro2 1.0.41", diff --git a/sdk/Cargo.toml b/sdk/Cargo.toml index 339e1c6d959..61ae764300b 100644 --- a/sdk/Cargo.toml +++ b/sdk/Cargo.toml @@ -70,7 +70,7 @@ serde = "1.0.144" serde_bytes = "0.11" serde_derive = "1.0.103" serde_json = { version = "1.0.83", optional = true } -serde_with = { version = "2.1.0", default-features = false, features = ["macros"] } +serde_with = { version = "2.2.0", default-features = false, features = ["macros"] } sha2 = "0.10.5" sha3 = { version = "0.10.4", optional = true } solana-frozen-abi = { path = "../frozen-abi", version = "=1.15.0" } From 88b2b507472e38d09ea7f63194dfe1d006a080a4 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Wed, 18 Jan 2023 15:07:22 +0900 Subject: [PATCH 29/43] Finally use the published rolling-file crate --- Cargo.lock | 5 +++-- core/Cargo.toml | 2 +- programs/sbf/Cargo.lock | 9 +++++---- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 50517defdc9..e84841a2ae9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4124,8 +4124,9 @@ dependencies = [ [[package]] name = "rolling-file" -version = "0.1.0" -source = "git+https://github.com/ryoqun/rolling-file-rs?rev=664504e77fad7292dedf5a119d950b80aef67934#664504e77fad7292dedf5a119d950b80aef67934" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8395b4f860856b740f20a296ea2cd4d823e81a2658cf05ef61be22916026a906" dependencies = [ "chrono", ] diff --git a/core/Cargo.toml b/core/Cargo.toml index 363235ee26a..e6d0e00656f 100644 --- a/core/Cargo.toml +++ b/core/Cargo.toml @@ -34,7 +34,7 @@ num_enum = "0.5.7" rand = "0.7.0" rand_chacha = "0.2.2" rayon = "1.5.3" -rolling-file = { git = "https://github.com/ryoqun/rolling-file-rs", rev = "664504e77fad7292dedf5a119d950b80aef67934" } +rolling-file = "0.2.0" serde = "1.0.144" serde_derive = "1.0.103" solana-address-lookup-table-program = { path = "../programs/address-lookup-table", version = "=1.15.0" } diff --git a/programs/sbf/Cargo.lock b/programs/sbf/Cargo.lock index 01a849bdc5c..251e366c0dd 100644 --- a/programs/sbf/Cargo.lock +++ b/programs/sbf/Cargo.lock @@ -745,9 +745,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.22" +version = "0.4.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfd4d1b31faaa3a89d7934dbded3111da0d2ef28e3ebccdb4f0179f5929d1ef1" +checksum = "16b0a3d9ed01224b22057780a37bb8c5dbfe1be8ba48678e7bf57ec4b385411f" dependencies = [ "iana-time-zone", "js-sys", @@ -3737,8 +3737,9 @@ dependencies = [ [[package]] name = "rolling-file" -version = "0.1.0" -source = "git+https://github.com/ryoqun/rolling-file-rs?rev=664504e77fad7292dedf5a119d950b80aef67934#664504e77fad7292dedf5a119d950b80aef67934" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8395b4f860856b740f20a296ea2cd4d823e81a2658cf05ef61be22916026a906" dependencies = [ "chrono", ] From 907d59fd29064e45215328ce1854028299eff9b6 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Tue, 24 Jan 2023 15:45:49 +0900 Subject: [PATCH 30/43] Make test code change more consistent --- core/benches/sigverify_stage.rs | 20 ++++++++++++-------- core/src/sigverify_stage.rs | 17 +++++++++++------ 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/core/benches/sigverify_stage.rs b/core/benches/sigverify_stage.rs index 2eb03c52395..e856e787c26 100644 --- a/core/benches/sigverify_stage.rs +++ b/core/benches/sigverify_stage.rs @@ -144,7 +144,16 @@ fn gen_batches(use_same_tx: bool) -> Vec { } #[bench] -fn bench_sigverify_stage(bencher: &mut Bencher) { +fn bench_sigverify_stage_with_same_tx(bencher: &mut Bencher) { + bench_sigverify_stage(bencher, true) +} + +#[bench] +fn bench_sigverify_stage_without_same_tx(bencher: &mut Bencher) { + bench_sigverify_stage(bencher, false) +} + +fn bench_sigverify_stage(bencher: &mut Bencher, use_same_tx: bool) { solana_logger::setup(); trace!("start"); let (packet_s, packet_r) = unbounded(); @@ -152,7 +161,6 @@ fn bench_sigverify_stage(bencher: &mut Bencher) { let verifier = TransactionSigVerifier::new(verified_s); let stage = SigVerifyStage::new(packet_r, verifier, "bench"); - let use_same_tx = true; bencher.iter(move || { let now = Instant::now(); let mut batches = gen_batches(use_same_tx); @@ -171,21 +179,17 @@ fn bench_sigverify_stage(bencher: &mut Bencher) { } let mut received = 0; trace!("sent: {}", sent_len); - let mut messages = vec![]; loop { if let Ok(message) = verified_r.recv_timeout(Duration::from_millis(10)) { let (verifieds, _) = &*message; - for v in verifieds.iter().rev() { - received += v.len(); - } - messages.push(message); + received += verifieds.iter().map(|batch| batch.len()).sum::(); + test::black_box(message); if use_same_tx || received >= sent_len { break; } } } trace!("received: {}", received); - test::black_box(messages); }); stage.join().unwrap(); } diff --git a/core/src/sigverify_stage.rs b/core/src/sigverify_stage.rs index 09fd0fa525c..9dfe0fd5730 100644 --- a/core/src/sigverify_stage.rs +++ b/core/src/sigverify_stage.rs @@ -531,7 +531,16 @@ mod tests { } #[test] - fn test_sigverify_stage() { + fn test_sigverify_stage_with_same_tx() { + test_sigverify_stage(true) + } + + #[test] + fn test_sigverify_stage_without_same_tx() { + test_sigverify_stage(false) + } + + fn test_sigverify_stage(use_same_tx: bool) { solana_logger::setup(); trace!("start"); let (packet_s, packet_r) = unbounded(); @@ -539,7 +548,6 @@ mod tests { let verifier = TransactionSigVerifier::new(verified_s); let stage = SigVerifyStage::new(packet_r, verifier, "test"); - let use_same_tx = true; let now = Instant::now(); let packets_per_batch = 128; let total_packets = 1920; @@ -607,10 +615,7 @@ mod tests { ); } assert_eq!(tracer_packet_stats.total_excess_tracer_packets, 0); - for v in verifieds.iter().rev() { - received += v.len(); - batches.push(v.clone()); - } + received += verifieds.iter().map(|batch| batch.len()).sum::(); } if total_tracer_packets_received_in_sigverify_stage >= sent_len { From 8755444293e6365dfcfbb10ded698e0c4ec7e1a2 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Tue, 24 Jan 2023 12:33:41 +0000 Subject: [PATCH 31/43] Revive dead and non-terminating test code path... --- core/benches/sigverify_stage.rs | 13 ++++++++++--- core/src/sigverify_stage.rs | 3 +-- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/core/benches/sigverify_stage.rs b/core/benches/sigverify_stage.rs index e856e787c26..43237197138 100644 --- a/core/benches/sigverify_stage.rs +++ b/core/benches/sigverify_stage.rs @@ -23,6 +23,7 @@ use { }, solana_sdk::{ hash::Hash, + packet::PacketFlags, signature::{Keypair, Signer}, system_transaction, timing::duration_as_ms, @@ -172,19 +173,25 @@ fn bench_sigverify_stage(bencher: &mut Bencher, use_same_tx: bool) { let mut sent_len = 0; for _ in 0..batches.len() { - if let Some(batch) = batches.pop() { + if let Some(mut batch) = batches.pop() { sent_len += batch.len(); + batch + .iter_mut() + .for_each(|packet| packet.meta_mut().flags |= PacketFlags::TRACER_PACKET); packet_s.send(vec![batch]).unwrap(); } } let mut received = 0; + let mut total_tracer_packets_received_in_sigverify_stage = 0; trace!("sent: {}", sent_len); loop { if let Ok(message) = verified_r.recv_timeout(Duration::from_millis(10)) { - let (verifieds, _) = &*message; + let (verifieds, tracer_packet_stats) = (&message.0, message.1.as_ref().unwrap()); received += verifieds.iter().map(|batch| batch.len()).sum::(); + total_tracer_packets_received_in_sigverify_stage += + tracer_packet_stats.total_tracer_packets_received_in_sigverify_stage; test::black_box(message); - if use_same_tx || received >= sent_len { + if total_tracer_packets_received_in_sigverify_stage >= sent_len { break; } } diff --git a/core/src/sigverify_stage.rs b/core/src/sigverify_stage.rs index 9dfe0fd5730..0f1e36c3c96 100644 --- a/core/src/sigverify_stage.rs +++ b/core/src/sigverify_stage.rs @@ -577,8 +577,7 @@ mod tests { trace!("sent: {}", sent_len); loop { if let Ok(message) = verified_r.recv() { - let (verifieds, tracer_packet_stats_option) = (&message.0, message.1.clone()); - let tracer_packet_stats = tracer_packet_stats_option.unwrap(); + let (verifieds, tracer_packet_stats) = (&message.0, message.1.as_ref().unwrap()); total_tracer_packets_received_in_sigverify_stage += tracer_packet_stats.total_tracer_packets_received_in_sigverify_stage; assert_eq!( From 8461c9501f85fa9aa1cfc3e7f1e86401a42af1aa Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Tue, 24 Jan 2023 13:27:51 +0000 Subject: [PATCH 32/43] Dispose batches early now that possible --- core/benches/sigverify_stage.rs | 16 +++++++--------- core/src/sigverify_stage.rs | 18 ++++++++---------- 2 files changed, 15 insertions(+), 19 deletions(-) diff --git a/core/benches/sigverify_stage.rs b/core/benches/sigverify_stage.rs index 43237197138..c5b65414881 100644 --- a/core/benches/sigverify_stage.rs +++ b/core/benches/sigverify_stage.rs @@ -164,7 +164,7 @@ fn bench_sigverify_stage(bencher: &mut Bencher, use_same_tx: bool) { bencher.iter(move || { let now = Instant::now(); - let mut batches = gen_batches(use_same_tx); + let batches = gen_batches(use_same_tx); trace!( "starting... generation took: {} ms batches: {}", duration_as_ms(&now.elapsed()), @@ -172,14 +172,12 @@ fn bench_sigverify_stage(bencher: &mut Bencher, use_same_tx: bool) { ); let mut sent_len = 0; - for _ in 0..batches.len() { - if let Some(mut batch) = batches.pop() { - sent_len += batch.len(); - batch - .iter_mut() - .for_each(|packet| packet.meta_mut().flags |= PacketFlags::TRACER_PACKET); - packet_s.send(vec![batch]).unwrap(); - } + for mut batch in batches.into_iter() { + sent_len += batch.len(); + batch + .iter_mut() + .for_each(|packet| packet.meta_mut().flags |= PacketFlags::TRACER_PACKET); + packet_s.send(vec![batch]).unwrap(); } let mut received = 0; let mut total_tracer_packets_received_in_sigverify_stage = 0; diff --git a/core/src/sigverify_stage.rs b/core/src/sigverify_stage.rs index 0f1e36c3c96..b636a89dcb4 100644 --- a/core/src/sigverify_stage.rs +++ b/core/src/sigverify_stage.rs @@ -554,7 +554,7 @@ mod tests { // This is important so that we don't discard any packets and fail asserts below about // `total_excess_tracer_packets` assert!(total_packets < MAX_SIGVERIFY_BATCH); - let mut batches = gen_batches(use_same_tx, packets_per_batch, total_packets); + let batches = gen_batches(use_same_tx, packets_per_batch, total_packets); trace!( "starting... generation took: {} ms batches: {}", duration_as_ms(&now.elapsed()), @@ -562,15 +562,13 @@ mod tests { ); let mut sent_len = 0; - for _ in 0..batches.len() { - if let Some(mut batch) = batches.pop() { - sent_len += batch.len(); - batch - .iter_mut() - .for_each(|packet| packet.meta_mut().flags |= PacketFlags::TRACER_PACKET); - assert_eq!(batch.len(), packets_per_batch); - packet_s.send(vec![batch]).unwrap(); - } + for mut batch in batches.into_iter() { + sent_len += batch.len(); + batch + .iter_mut() + .for_each(|packet| packet.meta_mut().flags |= PacketFlags::TRACER_PACKET); + assert_eq!(batch.len(), packets_per_batch); + packet_s.send(vec![batch]).unwrap(); } let mut received = 0; let mut total_tracer_packets_received_in_sigverify_stage = 0; From 9c2d4c789c7ab94541fd3ce42b48dc48dd6afdc9 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Tue, 24 Jan 2023 23:30:38 +0900 Subject: [PATCH 33/43] Split off thread handle very early at ::new() --- banking-bench/src/main.rs | 14 ++++---- core/benches/banking_trace.rs | 33 +++++++++++++++---- core/src/banking_trace.rs | 61 ++++++++++++++++------------------- core/src/tpu.rs | 3 +- core/src/validator.rs | 3 +- 5 files changed, 66 insertions(+), 48 deletions(-) diff --git a/banking-bench/src/main.rs b/banking-bench/src/main.rs index 7b309b75d47..e8ae68728c4 100644 --- a/banking-bench/src/main.rs +++ b/banking-bench/src/main.rs @@ -416,12 +416,13 @@ fn main() { let leader_schedule_cache = Arc::new(LeaderScheduleCache::new_from_bank(&bank)); let (exit, poh_recorder, poh_service, signal_receiver) = create_test_recorder(&bank, &blockstore, None, Some(leader_schedule_cache)); - let banking_tracer = BankingTracer::new(matches.is_present("trace_banking").then_some(( - blockstore.banking_trace_path(), - exit.clone(), - BANKING_TRACE_DIR_DEFAULT_BYTE_LIMIT, - ))) - .unwrap(); + let (banking_tracer, tracer_thread) = + BankingTracer::new(matches.is_present("trace_banking").then_some(( + blockstore.banking_trace_path(), + exit.clone(), + BANKING_TRACE_DIR_DEFAULT_BYTE_LIMIT, + ))) + .unwrap(); let (non_vote_sender, non_vote_receiver) = banking_tracer.create_channel_non_vote(); let (tpu_vote_sender, tpu_vote_receiver) = banking_tracer.create_channel_tpu_vote(); let (gossip_vote_sender, gossip_vote_receiver) = @@ -599,6 +600,7 @@ fn main() { poh_service.join().unwrap(); sleep(Duration::from_secs(1)); debug!("waited for poh_service"); + tracer_thread.unwrap().join().unwrap().unwrap(); } let _unused = Blockstore::destroy(&ledger_path); } diff --git a/core/benches/banking_trace.rs b/core/benches/banking_trace.rs index 7ade2c81891..b592d35c6eb 100644 --- a/core/benches/banking_trace.rs +++ b/core/benches/banking_trace.rs @@ -50,7 +50,7 @@ fn bench_banking_tracer_main_thread_overhead_noop_baseline(bencher: &mut Bencher bencher.iter(|| { non_vote_sender.send(packet_batch.clone()).unwrap(); }); - terminate_tracer(tracer, dummy_main_thread, non_vote_sender, Some(exit)); + terminate_tracer(tracer, None, dummy_main_thread, non_vote_sender, Some(exit)); } #[bench] @@ -58,7 +58,7 @@ fn bench_banking_tracer_main_thread_overhead_under_peak_write(bencher: &mut Benc let temp_dir = TempDir::new().unwrap(); let exit = Arc::::default(); - let tracer = BankingTracer::new(Some(( + let (tracer, tracer_thread) = BankingTracer::new(Some(( temp_dir.path().join("banking-trace"), exit.clone(), BANKING_TRACE_DIR_DEFAULT_BYTE_LIMIT, @@ -80,7 +80,13 @@ fn bench_banking_tracer_main_thread_overhead_under_peak_write(bencher: &mut Benc non_vote_sender.send(packet_batch.clone()).unwrap(); }); - terminate_tracer(tracer, dummy_main_thread, non_vote_sender, Some(exit)); + terminate_tracer( + tracer, + tracer_thread, + dummy_main_thread, + non_vote_sender, + Some(exit), + ); drop_and_clean_temp_dir_unless_suppressed(temp_dir); } @@ -89,7 +95,7 @@ fn bench_banking_tracer_main_thread_overhead_under_sustained_write(bencher: &mut let temp_dir = TempDir::new().unwrap(); let exit = Arc::::default(); - let tracer = BankingTracer::new(Some(( + let (tracer, tracer_thread) = BankingTracer::new(Some(( temp_dir.path().join("banking-trace"), exit.clone(), 1024 * 1024, // cause more frequent trace file rotation @@ -111,7 +117,13 @@ fn bench_banking_tracer_main_thread_overhead_under_sustained_write(bencher: &mut non_vote_sender.send(packet_batch.clone()).unwrap(); }); - terminate_tracer(tracer, dummy_main_thread, non_vote_sender, Some(exit)); + terminate_tracer( + tracer, + tracer_thread, + dummy_main_thread, + non_vote_sender, + Some(exit), + ); drop_and_clean_temp_dir_unless_suppressed(temp_dir); } @@ -128,7 +140,8 @@ fn bench_banking_tracer_background_thread_throughput(bencher: &mut Bencher) { let exit = Arc::::default(); - let tracer = BankingTracer::new(Some((path, exit.clone(), 50 * 1024 * 1024))).unwrap(); + let (tracer, tracer_thread) = + BankingTracer::new(Some((path, exit.clone(), 50 * 1024 * 1024))).unwrap(); let (non_vote_sender, non_vote_receiver) = tracer.create_channel_non_vote(); let dummy_main_thread = thread::spawn(move || { @@ -143,7 +156,13 @@ fn bench_banking_tracer_background_thread_throughput(bencher: &mut Bencher) { non_vote_sender.send(packet_batch.clone()).unwrap(); } - terminate_tracer(tracer, dummy_main_thread, non_vote_sender, None); + terminate_tracer( + tracer, + tracer_thread, + dummy_main_thread, + non_vote_sender, + None, + ); }); drop_and_clean_temp_dir_unless_suppressed(temp_dir); diff --git a/core/src/banking_trace.rs b/core/src/banking_trace.rs index 0d44b9411a3..ccc9dd3cdde 100644 --- a/core/src/banking_trace.rs +++ b/core/src/banking_trace.rs @@ -15,7 +15,7 @@ use { path::PathBuf, sync::{ atomic::{AtomicBool, Ordering}, - Arc, Mutex, + Arc, }, thread::{self, sleep, JoinHandle}, time::{Duration, SystemTime}, @@ -58,11 +58,7 @@ pub const BANKING_TRACE_DIR_DEFAULT_BYTE_LIMIT: DirByteLimit = #[allow(clippy::type_complexity)] #[derive(Debug)] pub struct BankingTracer { - enabled_tracer: Option<( - Sender, - Mutex>>, - Arc, - )>, + enabled_tracer: Option<(Sender, Arc)>, } #[derive(Serialize, Deserialize, Debug)] @@ -183,9 +179,10 @@ pub fn receiving_loop_with_minimized_sender_overhead( impl BankingTracer { pub fn new( maybe_config: Option<(PathBuf, Arc, DirByteLimit)>, - ) -> Result, TraceError> { - let enabled_tracer = maybe_config - .map(|(path, exit, dir_byte_limit)| { + ) -> Result<(Arc, TracerThread), TraceError> { + match maybe_config { + None => Ok((Self::new_disabled(), None)), + Some((path, exit, dir_byte_limit)) => { let rotate_threshold_size = dir_byte_limit / TRACE_FILE_ROTATE_COUNT; if rotate_threshold_size == 0 { return Err(TraceError::TooSmallDirByteLimit( @@ -199,14 +196,17 @@ impl BankingTracer { Self::ensure_prepare_path(&path)?; let file_appender = Self::create_file_appender(path, rotate_threshold_size)?; - let tracing_thread = + let tracer_thread = Self::spawn_background_thread(trace_receiver, file_appender, exit.clone())?; - Ok((trace_sender, Mutex::new(Some(tracing_thread)), exit)) - }) - .transpose()?; - - Ok(Arc::new(Self { enabled_tracer })) + Ok(( + Arc::new(Self { + enabled_tracer: Some((trace_sender, exit)), + }), + Some(tracer_thread), + )) + } + } } pub fn new_disabled() -> Arc { @@ -224,7 +224,7 @@ impl BankingTracer { label, self.enabled_tracer .as_ref() - .map(|(sender, _, exit)| (sender.clone(), exit.clone())), + .map(|(sender, exit)| (sender.clone(), exit.clone())), ) } @@ -240,16 +240,6 @@ impl BankingTracer { self.create_channel(ChannelLabel::GossipVote) } - pub fn take_tracer_thread_join_handle(&self) -> TracerThread { - self.enabled_tracer.as_ref().map(|(_, tracer_thread, _)| { - tracer_thread - .lock() - .unwrap() - .take() - .expect("no double take; BankingStage should only do once!") - }) - } - pub fn hash_event(&self, slot: Slot, blockhash: Hash, bank_hash: Hash) { self.trace_event(TimedTracedEvent( SystemTime::now(), @@ -258,7 +248,7 @@ impl BankingTracer { } fn trace_event(&self, event: TimedTracedEvent) { - if let Some((sender, _, exit)) = &self.enabled_tracer { + if let Some((sender, exit)) = &self.enabled_tracer { if !exit.load(Ordering::Relaxed) { sender .send(event) @@ -394,6 +384,7 @@ pub mod for_test { pub fn terminate_tracer( tracer: Arc, + tracer_thread: TracerThread, main_thread: JoinHandle, sender: TracedSender, exit: Option>, @@ -401,7 +392,6 @@ pub mod for_test { if let Some(exit) = exit { exit.store(true, Ordering::Relaxed); } - let tracer_thread = tracer.take_tracer_thread_join_handle(); drop((sender, tracer)); main_thread.join().unwrap().unwrap(); if let Some(tracer_thread) = tracer_thread { @@ -440,7 +430,7 @@ mod tests { non_vote_sender .send(BankingPacketBatch::new((vec![], None))) .unwrap(); - for_test::terminate_tracer(tracer, dummy_main_thread, non_vote_sender, None); + for_test::terminate_tracer(tracer, None, dummy_main_thread, non_vote_sender, None); } #[test] @@ -448,9 +438,8 @@ mod tests { let temp_dir = TempDir::new().unwrap(); let path = temp_dir.path().join("banking-trace"); let exit = Arc::::default(); - let tracer = + let (tracer, tracer_thread) = BankingTracer::new(Some((path, exit.clone(), DirByteLimit::max_value()))).unwrap(); - let tracer_thread = tracer.take_tracer_thread_join_handle(); let (non_vote_sender, non_vote_receiver) = tracer.create_channel_non_vote(); let exit_for_dummy_thread = Arc::::default(); @@ -489,7 +478,7 @@ mod tests { let temp_dir = TempDir::new().unwrap(); let path = temp_dir.path().join("banking-trace"); let exit = Arc::::default(); - let tracer = BankingTracer::new(Some(( + let (tracer, tracer_thread) = BankingTracer::new(Some(( path.clone(), exit.clone(), DirByteLimit::max_value(), @@ -512,7 +501,13 @@ mod tests { let bank_hash = Hash::from_str("BankHash11111111111111111111111111111111111").unwrap(); tracer.hash_event(4, blockhash, bank_hash); - for_test::terminate_tracer(tracer, dummy_main_thread, non_vote_sender, None); + for_test::terminate_tracer( + tracer, + tracer_thread, + dummy_main_thread, + non_vote_sender, + None, + ); let mut stream = BufReader::new(File::open(path.join(BASENAME)).unwrap()); let results = (0..=3) diff --git a/core/src/tpu.rs b/core/src/tpu.rs index c0f9bbcae66..fca209c7d8e 100644 --- a/core/src/tpu.rs +++ b/core/src/tpu.rs @@ -101,6 +101,7 @@ impl Tpu { staked_nodes: &Arc>, shared_staked_nodes_overrides: Arc>>, banking_tracer: Arc, + tracer_thread_hdl: TracerThread, tpu_enable_udp: bool, ) -> Self { let TpuSockets { @@ -260,7 +261,7 @@ impl Tpu { find_packet_sender_stake_stage, vote_find_packet_sender_stake_stage, staked_nodes_updater_service, - tracer_thread_hdl: banking_tracer.take_tracer_thread_join_handle(), + tracer_thread_hdl, } } diff --git a/core/src/validator.rs b/core/src/validator.rs index 50c57135690..48ff061d3bc 100644 --- a/core/src/validator.rs +++ b/core/src/validator.rs @@ -935,7 +935,7 @@ impl Validator { exit.clone(), ); - let banking_tracer = + let (banking_tracer, tracer_thread) = BankingTracer::new((config.banking_trace_dir_byte_limit > 0).then_some(( blockstore.banking_trace_path(), exit.clone(), @@ -1037,6 +1037,7 @@ impl Validator { &staked_nodes, config.staked_nodes_overrides.clone(), banking_tracer, + tracer_thread, tpu_enable_udp, ); From b4ea1322f0c9ee1d90304aabdc967b8d03d050bd Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Wed, 25 Jan 2023 09:31:17 +0900 Subject: [PATCH 34/43] Tweak message for TooSmallDirByteLimitl --- core/src/banking_trace.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/banking_trace.rs b/core/src/banking_trace.rs index ccc9dd3cdde..ddd698eefdc 100644 --- a/core/src/banking_trace.rs +++ b/core/src/banking_trace.rs @@ -42,7 +42,7 @@ pub enum TraceError { #[error("Integer Cast Error: {0}")] IntegerCastError(#[from] std::num::TryFromIntError), - #[error("dir byte limit is too small (must be larger than {1}): {0}")] + #[error("Trace directory's byte limit is too small (must be larger than {1}): {0}")] TooSmallDirByteLimit(DirByteLimit, DirByteLimit), } From c754b4ea570271ca8192440428a596a8d651c5d5 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Wed, 25 Jan 2023 09:36:04 +0900 Subject: [PATCH 35/43] Remove too much of indirection --- core/src/banking_trace.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/core/src/banking_trace.rs b/core/src/banking_trace.rs index ddd698eefdc..dafc6ff6979 100644 --- a/core/src/banking_trace.rs +++ b/core/src/banking_trace.rs @@ -193,7 +193,7 @@ impl BankingTracer { let (trace_sender, trace_receiver) = unbounded(); - Self::ensure_prepare_path(&path)?; + create_dir_all(&path)?; let file_appender = Self::create_file_appender(path, rotate_threshold_size)?; let tracer_thread = @@ -269,10 +269,6 @@ impl BankingTracer { (TracedSender::new(label, sender, trace_sender), receiver) } - fn ensure_prepare_path(path: &PathBuf) -> Result<(), io::Error> { - create_dir_all(path) - } - pub fn ensure_cleanup_path(path: &PathBuf) -> Result<(), io::Error> { remove_dir_all(path).or_else(|err| { if err.kind() == io::ErrorKind::NotFound { From 10cf905ecd0fae5f5056e5b6886e08d997d0a6a4 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Wed, 25 Jan 2023 09:38:31 +0900 Subject: [PATCH 36/43] Remove needless pub from ::channel() --- core/src/banking_trace.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/banking_trace.rs b/core/src/banking_trace.rs index dafc6ff6979..e4b64a72aee 100644 --- a/core/src/banking_trace.rs +++ b/core/src/banking_trace.rs @@ -261,7 +261,7 @@ impl BankingTracer { Self::channel(ChannelLabel::Dummy, None) } - pub fn channel( + fn channel( label: ChannelLabel, trace_sender: Option<(Sender, Arc)>, ) -> (TracedSender, Receiver) { From 22ff340c6a083a5533a8294d6a2d6cb9b63d2185 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Wed, 25 Jan 2023 09:47:18 +0900 Subject: [PATCH 37/43] Clarify test comments --- core/src/banking_trace.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/core/src/banking_trace.rs b/core/src/banking_trace.rs index e4b64a72aee..107838acfe5 100644 --- a/core/src/banking_trace.rs +++ b/core/src/banking_trace.rs @@ -452,14 +452,15 @@ mod tests { exit.store(true, Ordering::Relaxed); tracer_thread.unwrap().join().unwrap().unwrap(); - // shouldn't panic + // .hash_event() must succeed even after exit is already set to true let blockhash = Hash::from_str("B1ockhash1111111111111111111111111111111111").unwrap(); let bank_hash = Hash::from_str("BankHash11111111111111111111111111111111111").unwrap(); tracer.hash_event(4, blockhash, bank_hash); drop(tracer); - // shouldn't panic + // .send() must succeed even after exit is already set to true and further tracer is + // already dropped non_vote_sender .send(for_test::sample_packet_batch()) .unwrap(); From 8ca2529226d551f6b4e31422fba7b3031b4e73f6 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Wed, 25 Jan 2023 09:55:20 +0900 Subject: [PATCH 38/43] Avoid needless event creation if tracer is disabled --- core/src/banking_trace.rs | 20 +++++++++++--------- core/src/replay_stage.rs | 2 +- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/core/src/banking_trace.rs b/core/src/banking_trace.rs index 107838acfe5..7fb71238108 100644 --- a/core/src/banking_trace.rs +++ b/core/src/banking_trace.rs @@ -240,18 +240,20 @@ impl BankingTracer { self.create_channel(ChannelLabel::GossipVote) } - pub fn hash_event(&self, slot: Slot, blockhash: Hash, bank_hash: Hash) { - self.trace_event(TimedTracedEvent( - SystemTime::now(), - TracedEvent::BlockAndBankHash(slot, blockhash, bank_hash), - )) + pub fn hash_event(&self, slot: Slot, blockhash: &Hash, bank_hash: &Hash) { + self.trace_event(|| { + TimedTracedEvent( + SystemTime::now(), + TracedEvent::BlockAndBankHash(slot, *blockhash, *bank_hash), + ) + }) } - fn trace_event(&self, event: TimedTracedEvent) { + fn trace_event(&self, on_trace: impl Fn() -> TimedTracedEvent) { if let Some((sender, exit)) = &self.enabled_tracer { if !exit.load(Ordering::Relaxed) { sender - .send(event) + .send(on_trace()) .expect("active tracer thread unless exited"); } } @@ -455,7 +457,7 @@ mod tests { // .hash_event() must succeed even after exit is already set to true let blockhash = Hash::from_str("B1ockhash1111111111111111111111111111111111").unwrap(); let bank_hash = Hash::from_str("BankHash11111111111111111111111111111111111").unwrap(); - tracer.hash_event(4, blockhash, bank_hash); + tracer.hash_event(4, &blockhash, &bank_hash); drop(tracer); @@ -496,7 +498,7 @@ mod tests { .unwrap(); let blockhash = Hash::from_str("B1ockhash1111111111111111111111111111111111").unwrap(); let bank_hash = Hash::from_str("BankHash11111111111111111111111111111111111").unwrap(); - tracer.hash_event(4, blockhash, bank_hash); + tracer.hash_event(4, &blockhash, &bank_hash); for_test::terminate_tracer( tracer, diff --git a/core/src/replay_stage.rs b/core/src/replay_stage.rs index eb1a5701e77..879ac05d4ba 100644 --- a/core/src/replay_stage.rs +++ b/core/src/replay_stage.rs @@ -1780,7 +1780,7 @@ impl ReplayStage { ); // make sure parent is frozen for finalized hashes via the above // new()-ing of its child bank - banking_tracer.hash_event(parent.slot(), parent.last_blockhash(), parent.hash()); + banking_tracer.hash_event(parent.slot(), &parent.last_blockhash(), &parent.hash()); let tpu_bank = bank_forks.write().unwrap().insert(tpu_bank); poh_recorder From 30971c3a0b7679a3f72186a8f2fd5c5706e14fbb Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Wed, 25 Jan 2023 10:03:24 +0900 Subject: [PATCH 39/43] Write tests around file rotation and spill-over --- core/src/banking_trace.rs | 95 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 88 insertions(+), 7 deletions(-) diff --git a/core/src/banking_trace.rs b/core/src/banking_trace.rs index 7fb71238108..faa120931ce 100644 --- a/core/src/banking_trace.rs +++ b/core/src/banking_trace.rs @@ -193,7 +193,6 @@ impl BankingTracer { let (trace_sender, trace_receiver) = unbounded(); - create_dir_all(&path)?; let file_appender = Self::create_file_appender(path, rotate_threshold_size)?; let tracer_thread = @@ -285,6 +284,7 @@ impl BankingTracer { path: PathBuf, rotate_threshold_size: u64, ) -> Result, TraceError> { + create_dir_all(&path)?; let grouped = RollingConditionGrouped::new( RollingConditionBasic::new() .daily() @@ -372,12 +372,10 @@ pub mod for_test { } pub fn drop_and_clean_temp_dir_unless_suppressed(temp_dir: TempDir) { - std::env::var("BANKING_TRACE_LEAVE_FILES_FROM_LAST_ITERATION") - .is_ok() - .then(|| { - warn!("prevented to remove {:?}", temp_dir.path()); - drop(temp_dir.into_path()); - }); + std::env::var("BANKING_TRACE_LEAVE_FILES").is_ok().then(|| { + warn!("prevented to remove {:?}", temp_dir.path()); + drop(temp_dir.into_path()); + }); } pub fn terminate_tracer( @@ -540,4 +538,87 @@ mod tests { for_test::drop_and_clean_temp_dir_unless_suppressed(temp_dir); } + + #[test] + fn test_spill_over_at_rotation() { + let temp_dir = TempDir::new().unwrap(); + let path = temp_dir.path().join("banking-trace"); + const REALLY_SMALL_ROTATION_THRESHOLD: u64 = 1; + + let mut file_appender = + BankingTracer::create_file_appender(path.clone(), REALLY_SMALL_ROTATION_THRESHOLD) + .unwrap(); + file_appender.write_all(b"foo").unwrap(); + file_appender.condition_mut().reset(); + file_appender.write_all(b"bar").unwrap(); + file_appender.condition_mut().reset(); + file_appender.flush().unwrap(); + + assert_eq!( + [ + std::fs::read_to_string(path.join("events")).ok(), + std::fs::read_to_string(path.join("events.1")).ok(), + std::fs::read_to_string(path.join("events.2")).ok(), + ], + [Some("bar".into()), Some("foo".into()), None] + ); + + for_test::drop_and_clean_temp_dir_unless_suppressed(temp_dir); + } + + #[test] + fn test_reopen_with_blank_file() { + let temp_dir = TempDir::new().unwrap(); + + let path = temp_dir.path().join("banking-trace"); + + let mut file_appender = BankingTracer::create_file_appender( + path.clone(), + TRACE_FILE_DEFAULT_ROTATE_BYTE_THRESHOLD, + ) + .unwrap(); + // assume this is unclean write + file_appender.write_all(b"f").unwrap(); + file_appender.flush().unwrap(); + + // reopen while shadow-dropping the old tracer + let mut file_appender = BankingTracer::create_file_appender( + path.clone(), + TRACE_FILE_DEFAULT_ROTATE_BYTE_THRESHOLD, + ) + .unwrap(); + // new file won't be created as appender is lazy + assert_eq!( + [ + std::fs::read_to_string(path.join("events")).ok(), + std::fs::read_to_string(path.join("events.1")).ok(), + std::fs::read_to_string(path.join("events.2")).ok(), + ], + [Some("f".into()), None, None] + ); + + // initial write actually creates the new blank file + file_appender.write_all(b"bar").unwrap(); + assert_eq!( + [ + std::fs::read_to_string(path.join("events")).ok(), + std::fs::read_to_string(path.join("events.1")).ok(), + std::fs::read_to_string(path.join("events.2")).ok(), + ], + [Some("".into()), Some("f".into()), None] + ); + + // flush actually write the actual data + file_appender.flush().unwrap(); + assert_eq!( + [ + std::fs::read_to_string(path.join("events")).ok(), + std::fs::read_to_string(path.join("events.1")).ok(), + std::fs::read_to_string(path.join("events.2")).ok(), + ], + [Some("bar".into()), Some("f".into()), None] + ); + + for_test::drop_and_clean_temp_dir_unless_suppressed(temp_dir); + } } From 420d95efcf91e10c93e9bc02db576771ac70162b Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Wed, 25 Jan 2023 11:30:03 +0900 Subject: [PATCH 40/43] Remove unneeded PathBuf::clone()s... --- banking-bench/src/main.rs | 2 +- core/benches/banking_trace.rs | 6 +++--- core/src/banking_trace.rs | 35 +++++++++++++---------------------- core/src/validator.rs | 2 +- 4 files changed, 18 insertions(+), 27 deletions(-) diff --git a/banking-bench/src/main.rs b/banking-bench/src/main.rs index e8ae68728c4..f557649a348 100644 --- a/banking-bench/src/main.rs +++ b/banking-bench/src/main.rs @@ -418,7 +418,7 @@ fn main() { create_test_recorder(&bank, &blockstore, None, Some(leader_schedule_cache)); let (banking_tracer, tracer_thread) = BankingTracer::new(matches.is_present("trace_banking").then_some(( - blockstore.banking_trace_path(), + &blockstore.banking_trace_path(), exit.clone(), BANKING_TRACE_DIR_DEFAULT_BYTE_LIMIT, ))) diff --git a/core/benches/banking_trace.rs b/core/benches/banking_trace.rs index b592d35c6eb..fb93deebc17 100644 --- a/core/benches/banking_trace.rs +++ b/core/benches/banking_trace.rs @@ -59,7 +59,7 @@ fn bench_banking_tracer_main_thread_overhead_under_peak_write(bencher: &mut Benc let exit = Arc::::default(); let (tracer, tracer_thread) = BankingTracer::new(Some(( - temp_dir.path().join("banking-trace"), + &temp_dir.path().join("banking-trace"), exit.clone(), BANKING_TRACE_DIR_DEFAULT_BYTE_LIMIT, ))) @@ -96,7 +96,7 @@ fn bench_banking_tracer_main_thread_overhead_under_sustained_write(bencher: &mut let exit = Arc::::default(); let (tracer, tracer_thread) = BankingTracer::new(Some(( - temp_dir.path().join("banking-trace"), + &temp_dir.path().join("banking-trace"), exit.clone(), 1024 * 1024, // cause more frequent trace file rotation ))) @@ -141,7 +141,7 @@ fn bench_banking_tracer_background_thread_throughput(bencher: &mut Bencher) { let exit = Arc::::default(); let (tracer, tracer_thread) = - BankingTracer::new(Some((path, exit.clone(), 50 * 1024 * 1024))).unwrap(); + BankingTracer::new(Some((&path, exit.clone(), 50 * 1024 * 1024))).unwrap(); let (non_vote_sender, non_vote_receiver) = tracer.create_channel_non_vote(); let dummy_main_thread = thread::spawn(move || { diff --git a/core/src/banking_trace.rs b/core/src/banking_trace.rs index faa120931ce..931c46d6b69 100644 --- a/core/src/banking_trace.rs +++ b/core/src/banking_trace.rs @@ -178,7 +178,7 @@ pub fn receiving_loop_with_minimized_sender_overhead( impl BankingTracer { pub fn new( - maybe_config: Option<(PathBuf, Arc, DirByteLimit)>, + maybe_config: Option<(&PathBuf, Arc, DirByteLimit)>, ) -> Result<(Arc, TracerThread), TraceError> { match maybe_config { None => Ok((Self::new_disabled(), None)), @@ -281,10 +281,10 @@ impl BankingTracer { } fn create_file_appender( - path: PathBuf, + path: &PathBuf, rotate_threshold_size: u64, ) -> Result, TraceError> { - create_dir_all(&path)?; + create_dir_all(path)?; let grouped = RollingConditionGrouped::new( RollingConditionBasic::new() .daily() @@ -435,7 +435,7 @@ mod tests { let path = temp_dir.path().join("banking-trace"); let exit = Arc::::default(); let (tracer, tracer_thread) = - BankingTracer::new(Some((path, exit.clone(), DirByteLimit::max_value()))).unwrap(); + BankingTracer::new(Some((&path, exit.clone(), DirByteLimit::max_value()))).unwrap(); let (non_vote_sender, non_vote_receiver) = tracer.create_channel_non_vote(); let exit_for_dummy_thread = Arc::::default(); @@ -475,12 +475,8 @@ mod tests { let temp_dir = TempDir::new().unwrap(); let path = temp_dir.path().join("banking-trace"); let exit = Arc::::default(); - let (tracer, tracer_thread) = BankingTracer::new(Some(( - path.clone(), - exit.clone(), - DirByteLimit::max_value(), - ))) - .unwrap(); + let (tracer, tracer_thread) = + BankingTracer::new(Some((&path, exit.clone(), DirByteLimit::max_value()))).unwrap(); let (non_vote_sender, non_vote_receiver) = tracer.create_channel_non_vote(); let dummy_main_thread = thread::spawn(move || { @@ -546,8 +542,7 @@ mod tests { const REALLY_SMALL_ROTATION_THRESHOLD: u64 = 1; let mut file_appender = - BankingTracer::create_file_appender(path.clone(), REALLY_SMALL_ROTATION_THRESHOLD) - .unwrap(); + BankingTracer::create_file_appender(&path, REALLY_SMALL_ROTATION_THRESHOLD).unwrap(); file_appender.write_all(b"foo").unwrap(); file_appender.condition_mut().reset(); file_appender.write_all(b"bar").unwrap(); @@ -572,21 +567,17 @@ mod tests { let path = temp_dir.path().join("banking-trace"); - let mut file_appender = BankingTracer::create_file_appender( - path.clone(), - TRACE_FILE_DEFAULT_ROTATE_BYTE_THRESHOLD, - ) - .unwrap(); + let mut file_appender = + BankingTracer::create_file_appender(&path, TRACE_FILE_DEFAULT_ROTATE_BYTE_THRESHOLD) + .unwrap(); // assume this is unclean write file_appender.write_all(b"f").unwrap(); file_appender.flush().unwrap(); // reopen while shadow-dropping the old tracer - let mut file_appender = BankingTracer::create_file_appender( - path.clone(), - TRACE_FILE_DEFAULT_ROTATE_BYTE_THRESHOLD, - ) - .unwrap(); + let mut file_appender = + BankingTracer::create_file_appender(&path, TRACE_FILE_DEFAULT_ROTATE_BYTE_THRESHOLD) + .unwrap(); // new file won't be created as appender is lazy assert_eq!( [ diff --git a/core/src/validator.rs b/core/src/validator.rs index 48ff061d3bc..f52dd56df02 100644 --- a/core/src/validator.rs +++ b/core/src/validator.rs @@ -937,7 +937,7 @@ impl Validator { let (banking_tracer, tracer_thread) = BankingTracer::new((config.banking_trace_dir_byte_limit > 0).then_some(( - blockstore.banking_trace_path(), + &blockstore.banking_trace_path(), exit.clone(), config.banking_trace_dir_byte_limit, ))) From 54042b258d231bebca4e211943dd22d33acf3387 Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Wed, 25 Jan 2023 11:47:00 +0900 Subject: [PATCH 41/43] Introduce inner struct instead of tuple... --- core/src/banking_trace.rs | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/core/src/banking_trace.rs b/core/src/banking_trace.rs index 931c46d6b69..e1071ca65e1 100644 --- a/core/src/banking_trace.rs +++ b/core/src/banking_trace.rs @@ -55,10 +55,15 @@ pub const DISABLED_BAKING_TRACE_DIR: DirByteLimit = 0; pub const BANKING_TRACE_DIR_DEFAULT_BYTE_LIMIT: DirByteLimit = TRACE_FILE_DEFAULT_ROTATE_BYTE_THRESHOLD * TRACE_FILE_ROTATE_COUNT; -#[allow(clippy::type_complexity)] +#[derive(Clone, Debug)] +struct ActiveTracer { + trace_sender: Sender, + exit: Arc, +} + #[derive(Debug)] pub struct BankingTracer { - enabled_tracer: Option<(Sender, Arc)>, + active_tracer: Option, } #[derive(Serialize, Deserialize, Debug)] @@ -200,7 +205,7 @@ impl BankingTracer { Ok(( Arc::new(Self { - enabled_tracer: Some((trace_sender, exit)), + active_tracer: Some(ActiveTracer { trace_sender, exit }), }), Some(tracer_thread), )) @@ -210,21 +215,16 @@ impl BankingTracer { pub fn new_disabled() -> Arc { Arc::new(Self { - enabled_tracer: None, + active_tracer: None, }) } pub fn is_enabled(&self) -> bool { - self.enabled_tracer.is_some() + self.active_tracer.is_some() } fn create_channel(&self, label: ChannelLabel) -> (BankingPacketSender, BankingPacketReceiver) { - Self::channel( - label, - self.enabled_tracer - .as_ref() - .map(|(sender, exit)| (sender.clone(), exit.clone())), - ) + Self::channel(label, self.active_tracer.as_ref().cloned()) } pub fn create_channel_non_vote(&self) -> (BankingPacketSender, BankingPacketReceiver) { @@ -249,9 +249,9 @@ impl BankingTracer { } fn trace_event(&self, on_trace: impl Fn() -> TimedTracedEvent) { - if let Some((sender, exit)) = &self.enabled_tracer { + if let Some(ActiveTracer { trace_sender, exit }) = &self.active_tracer { if !exit.load(Ordering::Relaxed) { - sender + trace_sender .send(on_trace()) .expect("active tracer thread unless exited"); } @@ -264,10 +264,10 @@ impl BankingTracer { fn channel( label: ChannelLabel, - trace_sender: Option<(Sender, Arc)>, + active_tracer: Option, ) -> (TracedSender, Receiver) { let (sender, receiver) = unbounded(); - (TracedSender::new(label, sender, trace_sender), receiver) + (TracedSender::new(label, sender, active_tracer), receiver) } pub fn ensure_cleanup_path(path: &PathBuf) -> Result<(), io::Error> { @@ -327,24 +327,24 @@ impl BankingTracer { pub struct TracedSender { label: ChannelLabel, sender: Sender, - trace_sender: Option<(Sender, Arc)>, + active_tracer: Option, } impl TracedSender { fn new( label: ChannelLabel, sender: Sender, - trace_sender: Option<(Sender, Arc)>, + active_tracer: Option, ) -> Self { Self { label, sender, - trace_sender, + active_tracer, } } pub fn send(&self, batch: BankingPacketBatch) -> Result<(), SendError> { - if let Some((trace_sender, exit)) = &self.trace_sender { + if let Some(ActiveTracer { trace_sender, exit }) = &self.active_tracer { if !exit.load(Ordering::Relaxed) { trace_sender .send(TimedTracedEvent( From b7ac53b81e9c3022e05c9359c897fd80fe6284fc Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Wed, 25 Jan 2023 16:32:59 +0900 Subject: [PATCH 42/43] Remove unused enum BankStatus... --- core/src/banking_trace.rs | 6 ------ 1 file changed, 6 deletions(-) diff --git a/core/src/banking_trace.rs b/core/src/banking_trace.rs index e1071ca65e1..b245e1fb592 100644 --- a/core/src/banking_trace.rs +++ b/core/src/banking_trace.rs @@ -75,12 +75,6 @@ enum TracedEvent { BlockAndBankHash(Slot, Hash, Hash), } -#[derive(Serialize, Deserialize, Debug)] -enum BankStatus { - Started, - Ended, -} - #[derive(Serialize, Deserialize, Debug, Clone, Copy)] pub enum ChannelLabel { NonVote, From 84e95204ad3f39f6785dcd5a871a3703c7bb5f3f Mon Sep 17 00:00:00 2001 From: Ryo Onodera Date: Wed, 25 Jan 2023 16:33:41 +0900 Subject: [PATCH 43/43] Avoid .unwrap() for the case of disabled tracer... --- banking-bench/src/main.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/banking-bench/src/main.rs b/banking-bench/src/main.rs index f557649a348..4fbe37400b8 100644 --- a/banking-bench/src/main.rs +++ b/banking-bench/src/main.rs @@ -600,7 +600,9 @@ fn main() { poh_service.join().unwrap(); sleep(Duration::from_secs(1)); debug!("waited for poh_service"); - tracer_thread.unwrap().join().unwrap().unwrap(); + if let Some(tracer_thread) = tracer_thread { + tracer_thread.join().unwrap().unwrap(); + } } let _unused = Blockstore::destroy(&ledger_path); }