diff --git a/beacon_node/beacon_processor/src/lib.rs b/beacon_node/beacon_processor/src/lib.rs index ce3851ea54e..d8f63ec5619 100644 --- a/beacon_node/beacon_processor/src/lib.rs +++ b/beacon_node/beacon_processor/src/lib.rs @@ -41,8 +41,8 @@ pub use crate::scheduler::BeaconProcessorQueueLengths; use crate::scheduler::work_queue::WorkQueues; use crate::work_reprocessing_queue::{ - QueuedBackfillBatch, QueuedColumnReconstruction, QueuedGossipBlock, QueuedGossipEnvelope, - ReprocessQueueMessage, + QueuedBackfillBatch, QueuedColumnReconstruction, QueuedGossipBlock, QueuedGossipDataColumn, + QueuedGossipEnvelope, ReprocessQueueMessage, }; use futures::stream::{Stream, StreamExt}; use futures::task::Poll; @@ -304,6 +304,10 @@ impl From for WorkEvent { work: Work::ColumnReconstruction(process_fn), } } + ReadyWork::DataColumn(QueuedGossipDataColumn { process_fn, .. }) => Self { + drop_during_sync: true, + work: Work::UnknownBlockDataColumn { process_fn }, + }, } } } @@ -369,6 +373,9 @@ pub enum Work { UnknownBlockAttestation { process_fn: BlockingFn, }, + UnknownBlockDataColumn { + process_fn: BlockingFn, + }, GossipAttestationBatch { attestations: GossipAttestationBatch, process_batch: Box, @@ -465,6 +472,7 @@ pub enum WorkType { GossipAttestation, GossipAttestationToConvert, UnknownBlockAttestation, + UnknownBlockDataColumn, GossipAttestationBatch, GossipAggregate, UnknownBlockAggregate, @@ -572,6 +580,7 @@ impl Work { Work::LightClientFinalityUpdateRequest(_) => WorkType::LightClientFinalityUpdateRequest, Work::LightClientUpdatesByRangeRequest(_) => WorkType::LightClientUpdatesByRangeRequest, Work::UnknownBlockAttestation { .. } => WorkType::UnknownBlockAttestation, + Work::UnknownBlockDataColumn { .. } => WorkType::UnknownBlockDataColumn, Work::UnknownBlockAggregate { .. } => WorkType::UnknownBlockAggregate, Work::UnknownLightClientOptimisticUpdate { .. } => { WorkType::UnknownLightClientOptimisticUpdate @@ -986,6 +995,9 @@ impl BeaconProcessor { } else if let Some(item) = work_queues.unknown_block_aggregate_queue.pop() { Some(item) } else if let Some(item) = work_queues.unknown_block_attestation_queue.pop() + { + Some(item) + } else if let Some(item) = work_queues.unknown_block_data_column_queue.pop() { Some(item) // Check execution payload bids. Most proposers will request bids directly from builders @@ -1246,6 +1258,9 @@ impl BeaconProcessor { Work::UnknownBlockAttestation { .. } => { work_queues.unknown_block_attestation_queue.push(work) } + Work::UnknownBlockDataColumn { .. } => work_queues + .unknown_block_data_column_queue + .push(work, work_id), Work::UnknownBlockAggregate { .. } => { work_queues.unknown_block_aggregate_queue.push(work) } @@ -1296,6 +1311,9 @@ impl BeaconProcessor { WorkType::UnknownBlockAttestation => { work_queues.unknown_block_attestation_queue.len() } + WorkType::UnknownBlockDataColumn => { + work_queues.unknown_block_data_column_queue.len() + } WorkType::GossipAttestationBatch => 0, // No queue WorkType::GossipAggregate => work_queues.aggregate_queue.len(), WorkType::UnknownBlockAggregate => { @@ -1513,6 +1531,7 @@ impl BeaconProcessor { }), Work::UnknownBlockAttestation { process_fn } | Work::UnknownBlockAggregate { process_fn } + | Work::UnknownBlockDataColumn { process_fn } | Work::UnknownLightClientOptimisticUpdate { process_fn, .. } => { task_spawner.spawn_blocking(process_fn) } diff --git a/beacon_node/beacon_processor/src/scheduler/work_queue.rs b/beacon_node/beacon_processor/src/scheduler/work_queue.rs index 2fdc15182cd..41d25a7c218 100644 --- a/beacon_node/beacon_processor/src/scheduler/work_queue.rs +++ b/beacon_node/beacon_processor/src/scheduler/work_queue.rs @@ -111,6 +111,7 @@ pub struct BeaconProcessorQueueLengths { attestation_queue: usize, unknown_block_aggregate_queue: usize, unknown_block_attestation_queue: usize, + unknown_block_data_column_queue: usize, sync_message_queue: usize, sync_contribution_queue: usize, gossip_voluntary_exit_queue: usize, @@ -175,6 +176,8 @@ impl BeaconProcessorQueueLengths { Ok(Self { aggregate_queue: 4096, unknown_block_aggregate_queue: 1024, + // Capacity for two slot's worth of data columns for a supernode. + unknown_block_data_column_queue: 256, // Capacity for a full slot's worth of attestations if subscribed to all subnets attestation_queue: std::cmp::max( active_validator_count / slots_per_epoch, @@ -247,6 +250,7 @@ pub struct WorkQueues { pub attestation_debounce: TimeLatch, pub unknown_block_aggregate_queue: LifoQueue>, pub unknown_block_attestation_queue: LifoQueue>, + pub unknown_block_data_column_queue: FifoQueue>, pub sync_message_queue: LifoQueue>, pub sync_contribution_queue: LifoQueue>, pub gossip_voluntary_exit_queue: FifoQueue>, @@ -305,6 +309,8 @@ impl WorkQueues { LifoQueue::new(queue_lengths.unknown_block_aggregate_queue); let unknown_block_attestation_queue = LifoQueue::new(queue_lengths.unknown_block_attestation_queue); + let unknown_block_data_column_queue = + FifoQueue::new(queue_lengths.unknown_block_data_column_queue); let sync_message_queue = LifoQueue::new(queue_lengths.sync_message_queue); let sync_contribution_queue = LifoQueue::new(queue_lengths.sync_contribution_queue); @@ -387,6 +393,7 @@ impl WorkQueues { attestation_debounce, unknown_block_aggregate_queue, unknown_block_attestation_queue, + unknown_block_data_column_queue, sync_message_queue, sync_contribution_queue, gossip_voluntary_exit_queue, diff --git a/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs b/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs index b1fa56af018..62ed86fbad0 100644 --- a/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs +++ b/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs @@ -52,6 +52,10 @@ pub const QUEUED_ATTESTATION_DELAY: Duration = Duration::from_secs(12); /// For how long to queue light client updates for re-processing. pub const QUEUED_LIGHT_CLIENT_UPDATE_DELAY: Duration = Duration::from_secs(12); +/// Data column timeout as a multiplier of slot duration. Columns waiting for their block will be +/// sent for processing after this many slots worth of time, even if the block hasn't arrived. +const QUEUED_DATA_COLUMN_DELAY_SLOTS: u32 = 1; + /// Envelope timeout as a multiplier of slot duration. Envelopes waiting for their block will be /// sent for processing after this many slots worth of time, even if the block hasn't arrived. const QUEUED_ENVELOPE_DELAY_SLOTS: u32 = 1; @@ -76,6 +80,9 @@ const MAXIMUM_QUEUED_ENVELOPES: usize = 16; /// How many attestations we keep before new ones get dropped. const MAXIMUM_QUEUED_ATTESTATIONS: usize = 16_384; +/// How many columns we keep before new ones get dropped. +const MAXIMUM_QUEUED_DATA_COLUMNS: usize = 256; + /// How many light client updates we keep before new ones get dropped. const MAXIMUM_QUEUED_LIGHT_CLIENT_UPDATES: usize = 128; @@ -123,6 +130,8 @@ pub enum ReprocessQueueMessage { UnknownLightClientOptimisticUpdate(QueuedLightClientUpdate), /// A new backfill batch that needs to be scheduled for processing. BackfillSync(QueuedBackfillBatch), + /// A gossip data column that references an unknown block. + UnknownBlockDataColumn(QueuedGossipDataColumn), /// A delayed column reconstruction that needs checking DelayColumnReconstruction(QueuedColumnReconstruction), } @@ -138,6 +147,7 @@ pub enum ReadyWork { LightClientUpdate(QueuedLightClientUpdate), BackfillSync(QueuedBackfillBatch), ColumnReconstruction(QueuedColumnReconstruction), + DataColumn(QueuedGossipDataColumn), } /// An Attestation for which the corresponding block was not seen while processing, queued for @@ -200,6 +210,12 @@ pub struct QueuedColumnReconstruction { pub process_fn: AsyncFn, } +/// A gossip data column that references an unknown block, queued for later reprocessing. +pub struct QueuedGossipDataColumn { + pub beacon_block_root: Hash256, + pub process_fn: BlockingFn, +} + impl TryFrom> for QueuedBackfillBatch { type Error = WorkEvent; @@ -240,6 +256,8 @@ enum InboundEvent { ReadyBackfillSync(QueuedBackfillBatch), /// A column reconstruction that was queued is ready for processing. ReadyColumnReconstruction(QueuedColumnReconstruction), + /// A gossip data column that is ready for re-processing. + ReadyDataColumn(Hash256), /// A message sent to the `ReprocessQueue` Msg(ReprocessQueueMessage), } @@ -264,6 +282,8 @@ struct ReprocessQueue { lc_updates_delay_queue: DelayQueue, /// Queue to manage scheduled column reconstructions. column_reconstructions_delay_queue: DelayQueue, + /// Queue to manage gossip data column timeouts. + data_columns_delay_queue: DelayQueue, /* Queued items */ /// Queued blocks. @@ -284,6 +304,10 @@ struct ReprocessQueue { queued_column_reconstructions: HashMap>, /// Queued backfill batches queued_backfill_batches: Vec, + /// Queued gossip data columns awaiting their block, keyed by block root. + awaiting_data_columns_per_root: HashMap, DelayKey)>, + /// Total number of queued gossip data columns across all roots. + queued_data_columns_count: usize, /* Aux */ /// Next attestation id, used for both aggregated and unaggregated attestations @@ -294,6 +318,7 @@ struct ReprocessQueue { rpc_block_debounce: TimeLatch, attestation_delay_debounce: TimeLatch, lc_update_delay_debounce: TimeLatch, + data_column_delay_debounce: TimeLatch, next_backfill_batch_event: Option>>, slot_clock: Arc, } @@ -387,6 +412,13 @@ impl Stream for ReprocessQueue { Poll::Ready(None) | Poll::Pending => (), } + match self.data_columns_delay_queue.poll_expired(cx) { + Poll::Ready(Some(block_root)) => { + return Poll::Ready(Some(InboundEvent::ReadyDataColumn(block_root.into_inner()))); + } + Poll::Ready(None) | Poll::Pending => (), + } + if let Some(next_backfill_batch_event) = self.next_backfill_batch_event.as_mut() { match next_backfill_batch_event.as_mut().poll(cx) { Poll::Ready(_) => { @@ -455,6 +487,7 @@ impl ReprocessQueue { attestations_delay_queue: DelayQueue::new(), lc_updates_delay_queue: DelayQueue::new(), column_reconstructions_delay_queue: DelayQueue::new(), + data_columns_delay_queue: DelayQueue::new(), queued_gossip_block_roots: HashSet::new(), awaiting_envelopes_per_root: HashMap::new(), queued_lc_updates: FnvHashMap::default(), @@ -464,6 +497,8 @@ impl ReprocessQueue { awaiting_lc_updates_per_parent_root: HashMap::new(), queued_backfill_batches: Vec::new(), queued_column_reconstructions: HashMap::new(), + awaiting_data_columns_per_root: HashMap::new(), + queued_data_columns_count: 0, next_attestation: 0, next_lc_update: 0, early_block_debounce: TimeLatch::default(), @@ -471,6 +506,7 @@ impl ReprocessQueue { rpc_block_debounce: TimeLatch::default(), attestation_delay_debounce: TimeLatch::default(), lc_update_delay_debounce: TimeLatch::default(), + data_column_delay_debounce: TimeLatch::default(), next_backfill_batch_event: None, slot_clock, } @@ -551,22 +587,16 @@ impl ReprocessQueue { return; } - // When the queue is full, evict the oldest entry to make room for newer envelopes. + // When the queue is full, drop the new envelope. if self.awaiting_envelopes_per_root.len() >= MAXIMUM_QUEUED_ENVELOPES { if self.envelope_delay_debounce.elapsed() { warn!( queue_size = MAXIMUM_QUEUED_ENVELOPES, msg = "system resources may be saturated", - "Envelope delay queue is full, evicting oldest entry" + "Envelope delay queue is full, dropping envelope" ); } - if let Some(oldest_root) = - self.awaiting_envelopes_per_root.keys().next().copied() - && let Some((_envelope, delay_key)) = - self.awaiting_envelopes_per_root.remove(&oldest_root) - { - self.envelope_delay_queue.remove(&delay_key); - } + return; } // Register the timeout. @@ -688,6 +718,37 @@ impl ReprocessQueue { self.next_attestation += 1; } + InboundEvent::Msg(UnknownBlockDataColumn(queued_data_column)) => { + let block_root = queued_data_column.beacon_block_root; + + if self.queued_data_columns_count >= MAXIMUM_QUEUED_DATA_COLUMNS { + if self.data_column_delay_debounce.elapsed() { + warn!( + queue_size = MAXIMUM_QUEUED_DATA_COLUMNS, + msg = "system resources may be saturated", + "Data column delay queue is full, dropping column" + ); + } + return; + } + + if let Some((columns, _delay_key)) = + self.awaiting_data_columns_per_root.get_mut(&block_root) + { + // Append to existing entry; the timer for this root is already running. + columns.push(queued_data_column); + } else { + let delay_key = self.data_columns_delay_queue.insert( + block_root, + self.slot_clock.slot_duration() * QUEUED_DATA_COLUMN_DELAY_SLOTS, + ); + + self.awaiting_data_columns_per_root + .insert(block_root, (vec![queued_data_column], delay_key)); + } + + self.queued_data_columns_count += 1; + } InboundEvent::Msg(UnknownLightClientOptimisticUpdate( queued_light_client_optimistic_update, )) => { @@ -800,6 +861,25 @@ impl ReprocessQueue { ); } } + + // Unqueue the data columns we have for this root, if any. + if let Some((data_columns, delay_key)) = + self.awaiting_data_columns_per_root.remove(&block_root) + { + self.data_columns_delay_queue.remove(&delay_key); + self.queued_data_columns_count = self + .queued_data_columns_count + .saturating_sub(data_columns.len()); + for data_column in data_columns { + if self + .ready_work_tx + .try_send(ReadyWork::DataColumn(data_column)) + .is_err() + { + error!(?block_root, "Failed to send data column for reprocessing"); + } + } + } } InboundEvent::Msg(NewLightClientOptimisticUpdate { parent_root }) => { // Unqueue the light client optimistic updates we have for this root, if any. @@ -1053,6 +1133,27 @@ impl ReprocessQueue { ); } } + InboundEvent::ReadyDataColumn(block_root) => { + if let Some((data_columns, _)) = + self.awaiting_data_columns_per_root.remove(&block_root) + { + self.queued_data_columns_count = self + .queued_data_columns_count + .saturating_sub(data_columns.len()); + for data_column in data_columns { + if self + .ready_work_tx + .try_send(ReadyWork::DataColumn(data_column)) + .is_err() + { + error!( + hint = "system may be overloaded", + "Ignored expired gossip data column" + ); + } + } + } + } } metrics::set_gauge_vec( @@ -1581,48 +1682,87 @@ mod tests { assert_eq!(queue.envelope_delay_queue.len(), 1); } + /// Tests that a queued gossip data column is released when its block is imported. #[tokio::test] - async fn envelope_capacity_evicts_oldest() { + async fn data_column_released_on_block_imported() { create_test_tracing_subscriber(); - let mut queue = test_queue(); + let config = BeaconProcessorConfig::default(); + let (ready_work_tx, mut ready_work_rx) = + mpsc::channel::(config.max_scheduled_work_queue_len); + let (_, reprocess_work_rx) = + mpsc::channel::(config.max_scheduled_work_queue_len); + let slot_clock = Arc::new(testing_slot_clock(12)); + let mut queue = ReprocessQueue::new(ready_work_tx, reprocess_work_rx, slot_clock); - // Pause time so it only advances manually tokio::time::pause(); - // Fill the queue to capacity. - for i in 0..MAXIMUM_QUEUED_ENVELOPES { - let block_root = Hash256::repeat_byte(i as u8); - let msg = ReprocessQueueMessage::UnknownBlockForEnvelope(QueuedGossipEnvelope { - beacon_block_slot: Slot::new(1), - beacon_block_root: block_root, - process_fn: Box::pin(async {}), - }); - queue.handle_message(InboundEvent::Msg(msg)); - } - assert_eq!( - queue.awaiting_envelopes_per_root.len(), - MAXIMUM_QUEUED_ENVELOPES - ); + let beacon_block_root = Hash256::repeat_byte(0xbb); - // One more should evict the oldest and insert the new one. - let overflow_root = Hash256::repeat_byte(0xff); - let msg = ReprocessQueueMessage::UnknownBlockForEnvelope(QueuedGossipEnvelope { - beacon_block_slot: Slot::new(1), - beacon_block_root: overflow_root, - process_fn: Box::pin(async {}), + let msg = ReprocessQueueMessage::UnknownBlockDataColumn(QueuedGossipDataColumn { + beacon_block_root, + process_fn: Box::new(|| {}), }); queue.handle_message(InboundEvent::Msg(msg)); - // Queue should still be at capacity, with the new root present. - assert_eq!( - queue.awaiting_envelopes_per_root.len(), - MAXIMUM_QUEUED_ENVELOPES + assert_eq!(queue.awaiting_data_columns_per_root.len(), 1); + assert!( + queue + .awaiting_data_columns_per_root + .contains_key(&beacon_block_root) ); + assert_eq!(queue.data_columns_delay_queue.len(), 1); + + // Simulate block import. + queue.handle_message(InboundEvent::Msg(ReprocessQueueMessage::BlockImported { + block_root: beacon_block_root, + parent_root: Hash256::repeat_byte(0x00), + })); + + // Internal state should be cleaned up. + assert!(queue.awaiting_data_columns_per_root.is_empty()); + assert_eq!(queue.data_columns_delay_queue.len(), 0); + + // The column should have been sent to the ready_work channel. + let ready = ready_work_rx.try_recv().expect("column should be ready"); + assert!(matches!(ready, ReadyWork::DataColumn(_))); + } + + /// Tests that an expired gossip data column is pruned cleanly from all internal state. + #[tokio::test] + async fn prune_awaiting_data_columns_per_root() { + create_test_tracing_subscriber(); + + let mut queue = test_queue(); + + tokio::time::pause(); + + let beacon_block_root = Hash256::repeat_byte(0xcd); + + let msg = ReprocessQueueMessage::UnknownBlockDataColumn(QueuedGossipDataColumn { + beacon_block_root, + process_fn: Box::new(|| {}), + }); + queue.handle_message(InboundEvent::Msg(msg)); + + assert_eq!(queue.awaiting_data_columns_per_root.len(), 1); assert!( queue - .awaiting_envelopes_per_root - .contains_key(&overflow_root) + .awaiting_data_columns_per_root + .contains_key(&beacon_block_root) ); + + // Advance time past the delay so the entry expires. + advance_time( + &queue.slot_clock, + 2 * queue.slot_clock.slot_duration() * QUEUED_DATA_COLUMN_DELAY_SLOTS, + ) + .await; + let ready_msg = queue.next().await.unwrap(); + assert!(matches!(ready_msg, InboundEvent::ReadyDataColumn(_))); + queue.handle_message(ready_msg); + + // All internal state should be cleaned up. + assert!(queue.awaiting_data_columns_per_root.is_empty()); } } diff --git a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs index 3e8845f0175..a2a5010eee7 100644 --- a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs @@ -64,8 +64,8 @@ use beacon_processor::work_reprocessing_queue::QueuedColumnReconstruction; use beacon_processor::{ DuplicateCache, GossipAggregatePackage, GossipAttestationBatch, work_reprocessing_queue::{ - QueuedAggregate, QueuedGossipBlock, QueuedGossipEnvelope, QueuedLightClientUpdate, - QueuedUnaggregate, ReprocessQueueMessage, + QueuedAggregate, QueuedGossipBlock, QueuedGossipDataColumn, QueuedGossipEnvelope, + QueuedLightClientUpdate, QueuedUnaggregate, ReprocessQueueMessage, }, }; @@ -649,6 +649,7 @@ impl NetworkBeaconProcessor { subnet_id: DataColumnSubnetId, column_sidecar: Arc>, seen_duration: Duration, + allow_reprocess: bool, ) { let slot = column_sidecar.slot(); let block_root = column_sidecar.block_root(); @@ -728,19 +729,48 @@ impl NetworkBeaconProcessor { .. } => { debug!( - action = "ignoring", + action = "queuing for reprocessing", %unknown_block_root, "Unknown block root for column" ); - // TODO(gloas): wire this into proper lookup sync. Sending - // `UnknownBlockHashFromAttestation` here is a Fulu-shaped fallback that - // mixes column processing with the attestation lookup path and is not - // the right primitive for Gloas column lookups. self.propagate_validation_result( - message_id, + message_id.clone(), peer_id, MessageAcceptance::Ignore, ); + + if allow_reprocess { + // Queue the column for reprocessing when the block arrives. + let processor = self.clone(); + let reprocess_msg = ReprocessQueueMessage::UnknownBlockDataColumn( + QueuedGossipDataColumn { + beacon_block_root: unknown_block_root, + process_fn: Box::new(move || { + let _ = processor.send_gossip_data_column_sidecar( + message_id, + peer_id, + subnet_id, + column_sidecar, + seen_duration, + false, // Do not reprocess this message again. + ); + }), + }, + ); + if self + .beacon_processor_send + .try_send(WorkEvent { + drop_during_sync: false, + work: Work::Reprocess(reprocess_msg), + }) + .is_err() + { + debug!( + %unknown_block_root, + "Failed to queue data column for reprocessing" + ); + } + } } GossipDataColumnError::InvalidVariant | GossipDataColumnError::PubkeyCacheTimeout diff --git a/beacon_node/network/src/network_beacon_processor/mod.rs b/beacon_node/network/src/network_beacon_processor/mod.rs index 434f7ecc8b3..b3c7d1d134b 100644 --- a/beacon_node/network/src/network_beacon_processor/mod.rs +++ b/beacon_node/network/src/network_beacon_processor/mod.rs @@ -231,6 +231,7 @@ impl NetworkBeaconProcessor { subnet_id: DataColumnSubnetId, column_sidecar: Arc>, seen_timestamp: Duration, + allow_reprocess: bool, ) -> Result<(), Error> { let processor = self.clone(); let process_fn = async move { @@ -241,6 +242,7 @@ impl NetworkBeaconProcessor { subnet_id, column_sidecar, seen_timestamp, + allow_reprocess, ) .await }; diff --git a/beacon_node/network/src/network_beacon_processor/tests.rs b/beacon_node/network/src/network_beacon_processor/tests.rs index 18d34b40b3b..7e9380e4333 100644 --- a/beacon_node/network/src/network_beacon_processor/tests.rs +++ b/beacon_node/network/src/network_beacon_processor/tests.rs @@ -435,6 +435,7 @@ impl TestRig { DataColumnSubnetId::from_column_index(*data_column.index(), &self.chain.spec), data_column.clone(), Duration::from_secs(0), + true, ) .unwrap(); } diff --git a/beacon_node/network/src/router.rs b/beacon_node/network/src/router.rs index 35939c6f396..ed719cc1f78 100644 --- a/beacon_node/network/src/router.rs +++ b/beacon_node/network/src/router.rs @@ -435,6 +435,7 @@ impl Router { subnet_id, column_sidecar, seen_timestamp, + true, ), ) }