From a0625939c5326207e83fd970fa84fc876865e9c6 Mon Sep 17 00:00:00 2001 From: Siddharth Sharma Date: Sun, 26 Apr 2026 14:42:34 -0700 Subject: [PATCH 1/2] [kyoto] tighten supervisor restart threshold to 15 failures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lower KYOTO_MAX_CONSECUTIVE_FAILURES from 30 to 15. In bip157 >= 0.5.0, TrustedPeer::from_hostname is consumed on use — popped from the whitelist when first tried, never reinstated. Once the resolved-IP set for a hostname is exhausted, only a fresh Builder::build() will run lookup_host again, and our connectivity supervisor's rebuild path is that fresh build. At the previous threshold of 30 consecutive failures, a Kubernetes pod rotation that swaps the IPs of every whitelisted peer left us unavailable for several minutes while we burned through the cached resolution set 3x before rebuilding. Dropping to 15 halves the worst-case recovery time without churning during a normal single-peer flap. We can re-tune later from the kyoto_consecutive_failures and kyoto_restarts metrics. --- crates/hashi/src/btc_monitor/monitor.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/crates/hashi/src/btc_monitor/monitor.rs b/crates/hashi/src/btc_monitor/monitor.rs index f4e33ea3b..9f9f5eea2 100644 --- a/crates/hashi/src/btc_monitor/monitor.rs +++ b/crates/hashi/src/btc_monitor/monitor.rs @@ -24,7 +24,16 @@ use crate::metrics::Metrics; const FALLBACK_FEE_RATE_SAT_PER_KWU: u64 = 250; /// Number of consecutive connection failures before restarting Kyoto. -const KYOTO_MAX_CONSECUTIVE_FAILURES: u32 = 30; +/// +/// The rebuild path is also our only mechanism to re-resolve hostname peers: +/// in `bip157` >= 0.5.0, `TrustedPeer::from_hostname` is consumed on use +/// (popped from the whitelist when first tried, never reinstated), so once +/// the resolved-IP set is exhausted only a fresh `Builder::build()` will +/// run `lookup_host` again. 
We want this low enough that a Kubernetes pod +/// rotation that swaps every IP recovers in tens of seconds rather than +/// minutes, but high enough that a single peer flap does not churn the +/// whole node. +const KYOTO_MAX_CONSECUTIVE_FAILURES: u32 = 15; /// Delay before restarting Kyoto after connectivity loss. const KYOTO_RESTART_DELAY: Duration = Duration::from_secs(5); From 3155c86bb276ae50b49250914ccc6fc42299324d Mon Sep 17 00:00:00 2001 From: Siddharth Sharma Date: Mon, 27 Apr 2026 15:04:58 -0700 Subject: [PATCH 2/2] [kyoto] drop tuning rationale comment from KYOTO_MAX_CONSECUTIVE_FAILURES --- crates/hashi/src/btc_monitor/monitor.rs | 9 --------- 1 file changed, 9 deletions(-) diff --git a/crates/hashi/src/btc_monitor/monitor.rs b/crates/hashi/src/btc_monitor/monitor.rs index 9f9f5eea2..58fd51474 100644 --- a/crates/hashi/src/btc_monitor/monitor.rs +++ b/crates/hashi/src/btc_monitor/monitor.rs @@ -24,15 +24,6 @@ use crate::metrics::Metrics; const FALLBACK_FEE_RATE_SAT_PER_KWU: u64 = 250; /// Number of consecutive connection failures before restarting Kyoto. -/// -/// The rebuild path is also our only mechanism to re-resolve hostname peers: -/// in `bip157` >= 0.5.0, `TrustedPeer::from_hostname` is consumed on use -/// (popped from the whitelist when first tried, never reinstated), so once -/// the resolved-IP set is exhausted only a fresh `Builder::build()` will -/// run `lookup_host` again. We want this low enough that a Kubernetes pod -/// rotation that swaps every IP recovers in tens of seconds rather than -/// minutes, but high enough that a single peer flap does not churn the -/// whole node. const KYOTO_MAX_CONSECUTIVE_FAILURES: u32 = 15; /// Delay before restarting Kyoto after connectivity loss.