From 229cd1ef6f21efd7009ae4a6173fa668d29f6dc0 Mon Sep 17 00:00:00 2001 From: Siddharth Sharma Date: Fri, 17 Apr 2026 06:05:05 -0700 Subject: [PATCH] [guardian] wid-keyed idempotency cache for standard_withdrawal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Retries of the same withdrawal (leader restart, leader rotation, lost response, seq-mismatch retry on the client) previously debited the bucket once per retry — even though wid is deterministic and the "real" withdrawal only happens once on-chain. Add an LRU cache of signed responses keyed by wid on EnclaveState (cap 1024). standard_withdrawal consults the cache before touching the committee sig, limiter, or BTC key; a hit returns the previously signed response untouched. Entries are inserted only after a successful S3 log commit, so the cache and bucket always agree. Tests cover: - same wid twice returns the same response and debits the bucket once - a failed withdrawal is NOT cached; the next attempt runs the live path again --- Cargo.lock | 1 + crates/hashi-guardian/Cargo.toml | 1 + crates/hashi-guardian/src/enclave.rs | 43 +++++++++++++ crates/hashi-guardian/src/withdraw.rs | 92 ++++++++++++++++++++++++++- 4 files changed, 136 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 6034d0e7c..ef0c15499 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2893,6 +2893,7 @@ dependencies = [ "hashi-types", "hpke", "k256", + "lru", "rand 0.8.5", "serde", "serde_bytes", diff --git a/crates/hashi-guardian/Cargo.toml b/crates/hashi-guardian/Cargo.toml index 48e0cc61b..1c7fbffdb 100644 --- a/crates/hashi-guardian/Cargo.toml +++ b/crates/hashi-guardian/Cargo.toml @@ -13,6 +13,7 @@ tracing.workspace = true tracing-subscriber.workspace = true tonic.workspace = true tonic-health.workspace = true +lru = "0.16" hashi-types = { path = "../hashi-types" } # Crypto dependencies diff --git a/crates/hashi-guardian/src/enclave.rs b/crates/hashi-guardian/src/enclave.rs index 25b264c00..05c874f71 100644 --- a/crates/hashi-guardian/src/enclave.rs +++ b/crates/hashi-guardian/src/enclave.rs @@ -61,6 +61,13 @@ pub struct EnclaveState { /// Rate limiter. Set once during provisioner_init. /// Uses `Arc` so the guard can be held across `.await`. rate_limiter: OnceLock>>, + /// LRU cache of signed withdrawal responses keyed by `wid`. Lets the + /// guardian return the same response for retried requests (leader + /// restart, leader rotation, lost response) without re-consuming from + /// the limiter or re-signing. Inserted only after a successful S3 log + /// commit, so the bucket and the cache never disagree. + recent_responses: + std::sync::Mutex>>, } /// Scratchpad used only during initialization. @@ -327,6 +334,33 @@ impl EnclaveState { let limiter = self.rate_limiter.get()?; Some(*limiter.lock().await.state()) } + + // ======================================================================== + // Recent-response cache (wid-keyed idempotency) + // ======================================================================== + + /// Look up a previously signed response by wid. Marks the entry as + /// most-recently-used on hit. + pub fn get_cached_response( + &self, + wid: u64, + ) -> Option> { + self.recent_responses + .lock() + .expect("recent_responses mutex poisoned") + .get(&wid) + .cloned() + } + + /// Insert a signed response into the cache. Called only after the + /// withdrawal has been logged to S3 and the limiter committed, so the + /// cache and bucket are always consistent. + pub fn cache_response(&self, wid: u64, response: GuardianSigned) { + self.recent_responses + .lock() + .expect("recent_responses mutex poisoned") + .put(wid, response); + } } impl Enclave { @@ -340,11 +374,20 @@ impl Enclave { state: EnclaveState { committee: RwLock::new(None), rate_limiter: OnceLock::new(), + recent_responses: std::sync::Mutex::new(lru::LruCache::new( + Self::RECENT_RESPONSES_CAPACITY, + )), }, scratchpad: Scratchpad::default(), } } + /// Capacity of the wid-keyed response cache. Each entry is small + /// (Ed25519 sig + Schnorr sigs for each input), so 1024 is ample for + /// any realistic withdrawal throughput while bounding memory. + const RECENT_RESPONSES_CAPACITY: std::num::NonZeroUsize = + std::num::NonZeroUsize::new(1024).expect("1024 > 0"); + pub fn is_provisioner_init_complete(&self) -> bool { self.config.is_provisioner_init_complete() && self.state.is_provisioner_init_complete() diff --git a/crates/hashi-guardian/src/withdraw.rs b/crates/hashi-guardian/src/withdraw.rs index 3bbcad79a..68c07c7c3 100644 --- a/crates/hashi-guardian/src/withdraw.rs +++ b/crates/hashi-guardian/src/withdraw.rs @@ -32,6 +32,14 @@ pub async fn standard_withdrawal( let request_signature = signed_request.committee_signature().clone(); // for logging let wid = unsigned_request.wid; + // Idempotency: retries (leader rotation, pod restart, lost response) + // re-submit the same `wid`. Replay the cached signed response so the + // limiter is debited at most once per unique withdrawal. + if let Some(cached) = enclave.state.get_cached_response(wid) { + info!("Withdrawal {} served from idempotency cache.", wid); + return Ok(cached); + } + match normal_withdrawal_inner(enclave.clone(), signed_request).await { Ok((txid, response, limiter_guard)) => { info!("Withdrawal {} processed successfully. Logging to S3.", wid); @@ -42,7 +50,9 @@ pub async fn standard_withdrawal( response: response.clone(), }; log_withdrawal_success(enclave.as_ref(), wid, msg, limiter_guard).await?; - Ok(enclave.sign(response)) + let signed_response = enclave.sign(response); + enclave.state.cache_response(wid, signed_response.clone()); + Ok(signed_response) } Err(withdraw_err) => { error!("Withdrawal {} failed: {:?}", wid, withdraw_err); @@ -322,4 +332,84 @@ mod tests { GuardianError::RateLimitExceeded )); } + + /// Retrying the same `wid` (e.g. after leader rotation or a lost + /// response) returns the cached signed response and does NOT debit the + /// bucket a second time. Bucket capacity is sized to exactly one + /// withdrawal so a second debit would tip it over the rate limit. + #[tokio::test] + async fn test_standard_withdrawal_wid_cache_is_idempotent() { + let wid = 42; + let (req1, committee) = StandardWithdrawalRequest::mock_signed_and_committee_with_seq( + Network::Regtest, + wid, + 100, + 0, + ); + let amount_sats = req1.message().utxos().external_out_amount().to_sat(); + let enclave = + setup_fully_initialized_enclave(Network::Regtest, committee.clone(), amount_sats).await; + + let first = standard_withdrawal(enclave.clone(), req1) + .await + .expect("first withdrawal succeeds"); + + // Same wid, fresh timestamp + seq. Without the cache this would + // fail with a seq mismatch (or debit the bucket a second time); the + // cache short-circuits before any of that. + let (req2, _) = StandardWithdrawalRequest::mock_signed_and_committee_with_seq( + Network::Regtest, + wid, + 200, + 1, + ); + let second = standard_withdrawal(enclave.clone(), req2) + .await + .expect("retry serves cached response"); + assert_eq!(first, second, "cache must return identical signed response"); + + // Bucket should still reflect exactly one debit. + let limiter_state = enclave.state.limiter_state().await.unwrap(); + assert_eq!(limiter_state.next_seq, 1); + assert_eq!(limiter_state.num_tokens_available, 0); + } + + /// A failed withdrawal must NOT be cached — otherwise retries would + /// permanently receive the same error even if the underlying cause + /// (e.g. a corrupted one-off request) is gone on the next attempt. + #[tokio::test] + async fn test_standard_withdrawal_failures_not_cached() { + // Bucket capacity = 0 so the first request fails with RateLimitExceeded. + let (req1, committee) = StandardWithdrawalRequest::mock_signed_and_committee_with_seq( + Network::Regtest, + 42, + 100, + 0, + ); + let enclave = setup_fully_initialized_enclave(Network::Regtest, committee, 0).await; + + let first = standard_withdrawal(enclave.clone(), req1).await; + assert!(matches!( + first.unwrap_err(), + GuardianError::RateLimitExceeded + )); + + // Retry with the same wid should NOT hit the cache — it gets + // another attempt. It will still fail here (bucket still 0) but + // via the live path, not a cached replay. + let (req2, _) = StandardWithdrawalRequest::mock_signed_and_committee_with_seq( + Network::Regtest, + 42, + 200, + 0, + ); + let second = standard_withdrawal(enclave.clone(), req2).await; + assert!(matches!( + second.unwrap_err(), + GuardianError::RateLimitExceeded + )); + // Nothing was committed in either attempt. + let limiter_state = enclave.state.limiter_state().await.unwrap(); + assert_eq!(limiter_state.next_seq, 0); + } }