diff --git a/Cargo.lock b/Cargo.lock index 078f699f3c8..58ee7d03ac9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5459,6 +5459,7 @@ dependencies = [ "rand 0.9.2", "regex", "serde", + "serde_json", "sha2", "smallvec", "snap", diff --git a/beacon_node/http_api/src/lib.rs b/beacon_node/http_api/src/lib.rs index f31817c5ba7..15d7feaa15e 100644 --- a/beacon_node/http_api/src/lib.rs +++ b/beacon_node/http_api/src/lib.rs @@ -109,6 +109,38 @@ use warp_utils::{query::multi_key_query, uor::UnifyingOrFilter}; const API_PREFIX: &str = "eth"; +/// Translates Lighthouse's internal disconnect reason tag (the `reason` field +/// on `LastDisconnect`, which is a `'static` variant name of `GoodbyeReason`) +/// into the simplified `PeerDisconnectReason` vocabulary defined by the +/// beacon-API peer-scoring proposal. +fn map_disconnect_reason(reason: &str) -> Option<&'static str> { + let mapped = match reason { + "BadScore" | "Banned" | "BannedIP" => "bad_score", + "ClientShutdown" => "client_shutdown", + "IrrelevantNetwork" => "irrelevant_network", + "UnableToVerifyNetwork" => "unviable_fork", + "TooManyPeers" => "too_many_peers", + "Fault" => "io_error", + _ => "unknown", + }; + Some(mapped) +} + +/// Translates Lighthouse's internal downscore reason tag (the `reason` field +/// on `LastAction`, e.g. tags emitted by `rpc_error_msg` or report-peer call +/// sites) into the simplified `PeerScoreReason` vocabulary defined by the +/// beacon-API peer-scoring proposal. +fn map_downscore_reason(reason: &str) -> &'static str { + match reason { + "rpc_invalid_request" => "rpc_invalid_request", + "rpc_rate_limited" => "rpc_rate_limited", + "rpc_io_error" => "rpc_io_error", + "rpc_stream_timeout" | "rpc_negotiation_timeout" => "rpc_timeout", + "rpc_invalid_data" | "rpc_ssz_decode_error" => "rpc_invalid_response", + _ => "unknown", + } +} + /// A custom type which allows for both unsecured and TLS-enabled HTTP servers. type HttpServer = (SocketAddr, Pin + Send>>); @@ -2402,12 +2434,36 @@ pub fn serve( // the eth2 API spec implies only peers we have been connected to at some point should be included. if let Some(&dir) = peer_info.connection_direction() { + let agent_version = peer_info.client().agent_string.clone(); + let score = Some(peer_info.score().score()); + let state: api_types::PeerState = + peer_info.connection_status().clone().into(); + // Per beacon-API spec, `disconnect_reason` MUST only be populated + // when `state` is `disconnected` or `disconnecting`. + let disconnect_reason = if matches!( + state, + api_types::PeerState::Disconnected + | api_types::PeerState::Disconnecting + ) { + peer_info.last_disconnect().and_then(|d| { + map_disconnect_reason(d.reason).map(|s| s.to_string()) + }) + } else { + None + }; + let downscore_reasons = peer_info + .last_action() + .map(|a| vec![map_downscore_reason(a.reason).to_string()]); return Ok(api_types::GenericResponse::from(api_types::PeerData { peer_id: peer_id.to_string(), enr: peer_info.enr().map(|enr| enr.to_base64()), last_seen_p2p_address: address, direction: dir.into(), - state: peer_info.connection_status().clone().into(), + state, + agent_version, + score, + disconnect_reason, + downscore_reasons, })); } } @@ -2463,12 +2519,34 @@ pub fn serve( .is_none_or(|directions| directions.contains(&direction)); if state_matches && direction_matches { + let agent_version = peer_info.client().agent_string.clone(); + let score = Some(peer_info.score().score()); + // Per beacon-API spec, `disconnect_reason` MUST only be + // populated when `state` is `disconnected` or `disconnecting`. + let disconnect_reason = if matches!( + state, + api_types::PeerState::Disconnected + | api_types::PeerState::Disconnecting + ) { + peer_info.last_disconnect().and_then(|d| { + map_disconnect_reason(d.reason).map(|s| s.to_string()) + }) + } else { + None + }; + let downscore_reasons = peer_info + .last_action() + .map(|a| vec![map_downscore_reason(a.reason).to_string()]); peers.push(api_types::PeerData { peer_id: peer_id.to_string(), enr: peer_info.enr().map(|enr| enr.to_base64()), last_seen_p2p_address: address, direction, state, + agent_version, + score, + disconnect_reason, + downscore_reasons, }); } } diff --git a/beacon_node/http_api/tests/tests.rs b/beacon_node/http_api/tests/tests.rs index a7fe34593a7..3616cdf180c 100644 --- a/beacon_node/http_api/tests/tests.rs +++ b/beacon_node/http_api/tests/tests.rs @@ -3220,6 +3220,10 @@ impl ApiTester { last_seen_p2p_address: EXTERNAL_ADDR.to_string(), state: PeerState::Connected, direction: PeerDirection::Inbound, + agent_version: result.agent_version.clone(), + score: result.score, + disconnect_reason: result.disconnect_reason.clone(), + downscore_reasons: result.downscore_reasons.clone(), }; assert_eq!(result, expected); @@ -3246,12 +3250,27 @@ impl ApiTester { for states in peer_states { for dirs in peer_dirs.clone() { let result = self.client.get_node_peers(states, dirs).await.unwrap(); + let (agent_version, score, disconnect_reason, downscore_reasons) = + if let Some(peer) = result.data.first() { + ( + peer.agent_version.clone(), + peer.score, + peer.disconnect_reason.clone(), + peer.downscore_reasons.clone(), + ) + } else { + (None, None, None, None) + }; let expected_peer = PeerData { peer_id: self.external_peer_id.to_string(), enr: None, last_seen_p2p_address: EXTERNAL_ADDR.to_string(), state: PeerState::Connected, direction: PeerDirection::Inbound, + agent_version, + score, + disconnect_reason, + downscore_reasons, }; let state_match = diff --git a/beacon_node/lighthouse_network/Cargo.toml b/beacon_node/lighthouse_network/Cargo.toml index 44af8d70064..4c58c9e111a 100644 --- a/beacon_node/lighthouse_network/Cargo.toml +++ b/beacon_node/lighthouse_network/Cargo.toml @@ -58,6 +58,7 @@ unsigned-varint = { version = "0.8", features = ["codec"] } async-channel = { workspace = true } logging = { workspace = true } proptest = { workspace = true } +serde_json = { workspace = true } tempfile = { workspace = true } [[test]] diff --git a/beacon_node/lighthouse_network/src/peer_manager/mod.rs b/beacon_node/lighthouse_network/src/peer_manager/mod.rs index 6b5144fa6fd..1230b1c036b 100644 --- a/beacon_node/lighthouse_network/src/peer_manager/mod.rs +++ b/beacon_node/lighthouse_network/src/peer_manager/mod.rs @@ -28,7 +28,7 @@ use libp2p::multiaddr; use network_utils::discovery_metrics; use network_utils::enr_ext::{EnrExt, peer_id_to_node_id}; pub use peerdb::peer_info::{ConnectionDirection, PeerConnectionStatus, PeerInfo}; -use peerdb::score::{PeerAction, ReportSource}; +use peerdb::score::{DisconnectDirection, LastDisconnect, PeerAction, ReportSource}; pub use peerdb::sync_status::{SyncInfo, SyncStatus}; use std::collections::{HashMap, HashSet, hash_map::Entry}; use std::net::IpAddr; @@ -151,6 +151,49 @@ pub enum PeerManagerEvent { DiscoverSubnetPeers(Vec), } +/// Maps an [`RPCError`] (and its inner status code, where applicable) to a +/// stable, granular `'static` tag used as the `msg` for `report_peer` so +/// external consumers can distinguish e.g. a rate limit from a ssz decode +/// failure or an unsupported protocol. +fn rpc_error_msg(err: &RPCError) -> &'static str { + match err { + RPCError::IncompleteStream => "rpc_incomplete_stream", + RPCError::HandlerRejected => "rpc_handler_rejected", + RPCError::InvalidData(_) => "rpc_invalid_data", + RPCError::SSZDecodeError(_) => "rpc_ssz_decode_error", + RPCError::IoError(_) => "rpc_io_error", + RPCError::NegotiationTimeout => "rpc_negotiation_timeout", + RPCError::StreamTimeout => "rpc_stream_timeout", + RPCError::UnsupportedProtocol => "rpc_unsupported_protocol", + RPCError::Disconnected => "rpc_disconnected", + RPCError::InternalError(_) => "rpc_internal_error", + RPCError::ErrorResponse(code, _) => match code { + RpcErrorResponse::Unknown => "rpc_unknown_status", + RpcErrorResponse::ResourceUnavailable => "rpc_resource_unavailable", + RpcErrorResponse::ServerError => "rpc_server_error", + RpcErrorResponse::InvalidRequest => "rpc_invalid_request", + RpcErrorResponse::RateLimited => "rpc_rate_limited", + RpcErrorResponse::BlobsNotFoundForBlock => "rpc_blobs_not_found", + }, + } +} + +/// Returns a stable `'static` variant name for a [`GoodbyeReason`] suitable +/// for JSON serialization on the `/lighthouse/peers` HTTP endpoint. +pub(crate) fn goodbye_reason_name(reason: &GoodbyeReason) -> &'static str { + match reason { + GoodbyeReason::ClientShutdown => "ClientShutdown", + GoodbyeReason::IrrelevantNetwork => "IrrelevantNetwork", + GoodbyeReason::Fault => "Fault", + GoodbyeReason::UnableToVerifyNetwork => "UnableToVerifyNetwork", + GoodbyeReason::TooManyPeers => "TooManyPeers", + GoodbyeReason::BadScore => "BadScore", + GoodbyeReason::Banned => "Banned", + GoodbyeReason::BannedIP => "BannedIP", + GoodbyeReason::Unknown => "Unknown", + } +} + impl PeerManager { // NOTE: Must be run inside a tokio executor. pub fn new( @@ -214,12 +257,24 @@ impl PeerManager { /// /// This will send a goodbye and disconnect the peer if it is connected or dialing. pub fn goodbye_peer(&mut self, peer_id: &PeerId, reason: GoodbyeReason, source: ReportSource) { + // Capture the goodbye details for the `/lighthouse/peers` API before + // `reason` is consumed by `report_peer` below. + let reason_name = goodbye_reason_name(&reason); + let reason_code: u64 = reason.clone().into(); + let last_disconnect = LastDisconnect { + reason: reason_name, + code: reason_code, + direction: DisconnectDirection::Sent, + at: Instant::now(), + }; + // Update the sync status if required if let Some(info) = self.network_globals.peers.write().peer_info_mut(peer_id) { debug!(%peer_id, %reason, score = %info.score(), "Sending goodbye to peer"); if matches!(reason, GoodbyeReason::IrrelevantNetwork) { info.update_sync_status(SyncStatus::IrrelevantPeer); } + info.set_last_disconnect(last_disconnect); } self.report_peer( @@ -664,13 +719,8 @@ impl PeerManager { RPCError::Disconnected => return, // No penalty for a graceful disconnection }; - self.report_peer( - peer_id, - peer_action, - ReportSource::RPC, - None, - "handle_rpc_error", - ); + let msg = rpc_error_msg(err); + self.report_peer(peer_id, peer_action, ReportSource::RPC, None, msg); } /// A ping request has been received. diff --git a/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs b/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs index 11ce7853507..88bfbf7c9a6 100644 --- a/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs +++ b/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs @@ -5,7 +5,7 @@ use itertools::Itertools; use logging::crit; use network_utils::enr_ext::{EnrExt, peer_id_to_node_id}; use peer_info::{ConnectionDirection, PeerConnectionStatus, PeerInfo}; -use score::{PeerAction, ReportSource, Score, ScoreState}; +use score::{LastAction, PeerAction, ReportSource, Score, ScoreState}; use std::net::IpAddr; use std::time::Instant; use std::{cmp::Ordering, fmt::Display}; @@ -122,6 +122,21 @@ impl PeerDB { self.peers.get_mut(peer_id) } + /// Records the most recent disconnect event for a peer. + /// + /// Exposed to the wider crate (e.g. the RPC service layer when receiving + /// a `Goodbye` request) so the `/lighthouse/peers` HTTP endpoint can + /// show *why* a peer disconnected. No-op if the peer is unknown. + pub(crate) fn record_last_disconnect( + &mut self, + peer_id: &PeerId, + last_disconnect: score::LastDisconnect, + ) { + if let Some(info) = self.peers.get_mut(peer_id) { + info.set_last_disconnect(last_disconnect); + } + } + /// Returns if the peer is already connected. pub fn is_connected(&self, peer_id: &PeerId) -> bool { matches!( @@ -643,7 +658,21 @@ impl PeerDB { match self.peers.get_mut(peer_id) { Some(info) => { let previous_state = info.score_state(); + let pre_score = info.score().score(); info.apply_peer_action_to_score(action); + let post_score = info.score().score(); + // Record the most recent score-affecting event so the HTTP + // API can expose *why* a peer's score moved. Trusted peers + // are skipped because their score is never mutated. + if !info.is_trusted { + info.set_last_action(LastAction { + reason: msg, + source, + action, + delta: post_score - pre_score, + at: Instant::now(), + }); + } metrics::inc_counter_vec( &metrics::PEER_ACTION_EVENTS_PER_CLIENT, &[info.client().kind.as_ref(), action.as_ref(), source.into()], diff --git a/beacon_node/lighthouse_network/src/peer_manager/peerdb/peer_info.rs b/beacon_node/lighthouse_network/src/peer_manager/peerdb/peer_info.rs index c289cb9a69c..ee3e6098995 100644 --- a/beacon_node/lighthouse_network/src/peer_manager/peerdb/peer_info.rs +++ b/beacon_node/lighthouse_network/src/peer_manager/peerdb/peer_info.rs @@ -1,5 +1,5 @@ use super::client::Client; -use super::score::{PeerAction, Score, ScoreState}; +use super::score::{LastAction, LastDisconnect, PeerAction, Score, ScoreState}; use super::sync_status::SyncStatus; use crate::discovery::Eth2Enr; use crate::{rpc::MetaData, types::Subnet}; @@ -57,6 +57,19 @@ pub struct PeerInfo { connection_direction: Option, /// The enr of the peer, if known. enr: Option, + /// The most recent score-affecting event for this peer (if any). + /// + /// Exposed via the `/lighthouse/peers` HTTP endpoint so external tooling + /// can correlate score changes with their cause. Skipped from JSON when + /// the peer has never been scored. + #[serde(skip_serializing_if = "Option::is_none")] + last_action: Option, + /// The most recent disconnect event for this peer (if any). + /// + /// Records the goodbye reason and direction (sent/received). Skipped from + /// JSON when the peer has never disconnected. + #[serde(skip_serializing_if = "Option::is_none")] + last_disconnect: Option, } impl Default for PeerInfo { @@ -75,6 +88,8 @@ impl Default for PeerInfo { is_trusted: false, connection_direction: None, enr: None, + last_action: None, + last_disconnect: None, } } } @@ -457,6 +472,29 @@ impl PeerInfo { } } + /// Returns the most recent score-affecting event recorded for this peer. + pub fn last_action(&self) -> Option<&LastAction> { + self.last_action.as_ref() + } + + /// Returns the most recent disconnect event recorded for this peer. + pub fn last_disconnect(&self) -> Option<&LastDisconnect> { + self.last_disconnect.as_ref() + } + + /// Records the most recent score-affecting event for this peer. + // VISIBILITY: The peer manager populates this from `report_peer`. + pub(in crate::peer_manager) fn set_last_action(&mut self, last_action: LastAction) { + self.last_action = Some(last_action); + } + + /// Records the most recent disconnect event for this peer. + // VISIBILITY: Populated from goodbye send/receive paths in both the peer + // manager and the service layer. + pub(crate) fn set_last_disconnect(&mut self, last_disconnect: LastDisconnect) { + self.last_disconnect = Some(last_disconnect); + } + /// Updates the gossipsub score with a new score. Optionally ignore the gossipsub score. pub(super) fn update_gossipsub_score(&mut self, new_score: f64, ignore: bool) { self.score.update_gossipsub_score(new_score, ignore); diff --git a/beacon_node/lighthouse_network/src/peer_manager/peerdb/score.rs b/beacon_node/lighthouse_network/src/peer_manager/peerdb/score.rs index e57e7907db7..34190704e1a 100644 --- a/beacon_node/lighthouse_network/src/peer_manager/peerdb/score.rs +++ b/beacon_node/lighthouse_network/src/peer_manager/peerdb/score.rs @@ -6,7 +6,7 @@ //! //! The scoring algorithms are currently experimental. use crate::service::gossipsub_scoring_parameters::GREYLIST_THRESHOLD as GOSSIPSUB_GREYLIST_THRESHOLD; -use serde::Serialize; +use serde::{Serialize, Serializer}; use std::cmp::Ordering; use std::sync::LazyLock; use std::time::Instant; @@ -43,7 +43,7 @@ const GOSSIPSUB_POSITIVE_SCORE_WEIGHT: f64 = GOSSIPSUB_NEGATIVE_SCORE_WEIGHT; /// Each variant has an associated score change. // To easily assess the behaviour of scores changes the number of variants should stay low, and // somewhat generic. -#[derive(Debug, Clone, Copy, AsRefStr)] +#[derive(Debug, Clone, Copy, AsRefStr, Serialize)] #[strum(serialize_all = "snake_case")] pub enum PeerAction { /// We should not communicate more with this peer. @@ -66,7 +66,7 @@ pub enum PeerAction { } /// Service reporting a `PeerAction` for a peer. -#[derive(Debug)] +#[derive(Debug, Clone, Copy, Serialize)] pub enum ReportSource { Gossipsub, RPC, @@ -87,6 +87,65 @@ impl From for &'static str { } } +/// Custom serializer for [`Instant`] fields: emits the elapsed time in seconds. +/// +/// `Instant` is opaque and has no meaningful wire representation; we serialize +/// it as "how many seconds ago this happened" so consumers can render it +/// without needing wall-clock context. +pub(crate) fn serialize_instant_seconds_ago( + instant: &Instant, + serializer: S, +) -> Result +where + S: Serializer, +{ + serializer.serialize_u64(instant.elapsed().as_secs()) +} + +/// The most recent score-affecting event applied to a peer. +/// +/// Exposed on the `/lighthouse/peers` HTTP endpoint so external tools (e.g. +/// dora) can show *why* a peer was scored, not just the resulting number. +#[derive(Debug, Clone, Serialize)] +pub struct LastAction { + /// Static reason tag for the event, e.g. `bad_gossip_block_ssz` or + /// `lookup_block_processing_failure`. + pub reason: &'static str, + /// Which subsystem reported the action. + pub source: ReportSource, + /// The action category (Fatal/Low/Mid/High tolerance). + pub action: PeerAction, + /// Signed score change caused by this event (`post_score - pre_score`). + pub delta: f64, + /// When the event happened — serialized as `seconds_ago`. + #[serde(serialize_with = "serialize_instant_seconds_ago", rename = "seconds_ago")] + pub at: Instant, +} + +/// The direction of a disconnect event. +#[derive(Debug, Clone, Copy, Serialize)] +#[serde(rename_all = "lowercase")] +pub enum DisconnectDirection { + /// We sent the goodbye / initiated the disconnect. + Sent, + /// The peer sent us a goodbye. + Received, +} + +/// The most recent disconnect event for a peer. +#[derive(Debug, Clone, Serialize)] +pub struct LastDisconnect { + /// Human-readable variant name of the goodbye reason (e.g. `BadScore`). + pub reason: &'static str, + /// Numeric goodbye code on the wire (mirrors libp2p `GoodbyeReason as u64`). + pub code: u64, + /// Whether we sent or received the goodbye. + pub direction: DisconnectDirection, + /// When the disconnect happened — serialized as `seconds_ago`. + #[serde(serialize_with = "serialize_instant_seconds_ago", rename = "seconds_ago")] + pub at: Instant, +} + impl std::fmt::Display for PeerAction { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { @@ -418,4 +477,38 @@ mod tests { assert!(!score.is_good_gossipsub_peer()); assert_eq!(score.score(), 0.0); } + + #[test] + fn test_last_action_serialization_shape() { + let last_action = LastAction { + reason: "bad_gossip_block_ssz", + source: ReportSource::Gossipsub, + action: PeerAction::Fatal, + delta: -100.0, + at: Instant::now(), + }; + let value: serde_json::Value = + serde_json::to_value(&last_action).expect("serialization should succeed"); + assert_eq!(value["reason"], "bad_gossip_block_ssz"); + assert_eq!(value["source"], "Gossipsub"); + assert_eq!(value["action"], "Fatal"); + assert_eq!(value["delta"], -100.0); + assert!(value["seconds_ago"].is_u64()); + } + + #[test] + fn test_last_disconnect_serialization_shape() { + let last_disconnect = LastDisconnect { + reason: "BadScore", + code: 250, + direction: DisconnectDirection::Sent, + at: Instant::now(), + }; + let value: serde_json::Value = + serde_json::to_value(&last_disconnect).expect("serialization should succeed"); + assert_eq!(value["reason"], "BadScore"); + assert_eq!(value["code"], 250); + assert_eq!(value["direction"], "sent"); + assert!(value["seconds_ago"].is_u64()); + } } diff --git a/beacon_node/lighthouse_network/src/service/mod.rs b/beacon_node/lighthouse_network/src/service/mod.rs index 41d937e3245..9338d154ad1 100644 --- a/beacon_node/lighthouse_network/src/service/mod.rs +++ b/beacon_node/lighthouse_network/src/service/mod.rs @@ -6,7 +6,8 @@ use crate::discovery::{ }; use crate::peer_manager::{ ConnectionDirection, PeerManager, PeerManagerEvent, config::Config as PeerManagerCfg, - peerdb::score::PeerAction, peerdb::score::ReportSource, + goodbye_reason_name, + peerdb::score::{DisconnectDirection, LastDisconnect, PeerAction, ReportSource}, }; use crate::peer_manager::{MIN_OUTBOUND_ONLY_FACTOR, PEER_EXCESS_FACTOR, PRIORITY_PEER_EXCESS}; use crate::rpc::methods::MetadataRequest; @@ -40,7 +41,7 @@ use std::num::{NonZeroU8, NonZeroUsize}; use std::path::PathBuf; use std::pin::Pin; use std::sync::Arc; -use std::time::Duration; +use std::time::{Duration, Instant}; use tracing::{debug, error, info, trace, warn}; use types::{ ChainSpec, DataColumnSubnetId, EnrForkId, EthSpec, ForkContext, ForkName, PartialDataColumn, @@ -1642,6 +1643,18 @@ impl Network { client = %self.network_globals.client(&peer_id), "Peer sent Goodbye" ); + // Record the received goodbye for the `/lighthouse/peers` API. + let reason_name = goodbye_reason_name(&reason); + let reason_code: u64 = reason.clone().into(); + self.network_globals.peers.write().record_last_disconnect( + &peer_id, + LastDisconnect { + reason: reason_name, + code: reason_code, + direction: DisconnectDirection::Received, + at: Instant::now(), + }, + ); // NOTE: We currently do not inform the application that we are // disconnecting here. The RPC handler will automatically // disconnect for us. diff --git a/common/eth2/src/types.rs b/common/eth2/src/types.rs index 449ea886856..cdf7eebc34f 100644 --- a/common/eth2/src/types.rs +++ b/common/eth2/src/types.rs @@ -882,6 +882,14 @@ pub struct PeerData { pub last_seen_p2p_address: String, pub state: PeerState, pub direction: PeerDirection, + #[serde(skip_serializing_if = "Option::is_none")] + pub agent_version: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub score: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub disconnect_reason: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub downscore_reasons: Option>, } #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]