From 229cd1ef6f21efd7009ae4a6173fa668d29f6dc0 Mon Sep 17 00:00:00 2001
From: Siddharth Sharma <sid@siddharthks.com>
Date: Fri, 17 Apr 2026 06:05:05 -0700
Subject: [PATCH] [guardian] wid-keyed idempotency cache for
 standard_withdrawal
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Retries of the same withdrawal (leader restart, leader rotation, lost
response, seq-mismatch retry on the client) previously debited the
bucket once per retry — even though wid is deterministic and the
"real" withdrawal only happens once on-chain.

Add an LRU cache of signed responses keyed by wid on EnclaveState
(cap 1024). standard_withdrawal consults the cache before touching
the committee sig, limiter, or BTC key; a hit returns the previously
signed response untouched. Entries are inserted only after a
successful S3 log commit, so the cache and bucket always agree.

Tests cover:
- same wid twice returns the same response and debits the bucket once
- a failed withdrawal is NOT cached; the next attempt runs the live
  path again
---
 Cargo.lock                            |  1 +
 crates/hashi-guardian/Cargo.toml      |  1 +
 crates/hashi-guardian/src/enclave.rs  | 43 +++++++++++++
 crates/hashi-guardian/src/withdraw.rs | 92 ++++++++++++++++++++++++++-
 4 files changed, 136 insertions(+), 1 deletion(-)
diff --git a/Cargo.lock b/Cargo.lock
index 6034d0e7c..ef0c15499 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2893,6 +2893,7 @@ dependencies = [
  "hashi-types",
  "hpke",
  "k256",
+ "lru",
  "rand 0.8.5",
  "serde",
  "serde_bytes",
diff --git a/crates/hashi-guardian/Cargo.toml b/crates/hashi-guardian/Cargo.toml
index 48e0cc61b..1c7fbffdb 100644
--- a/crates/hashi-guardian/Cargo.toml
+++ b/crates/hashi-guardian/Cargo.toml
@@ -13,6 +13,7 @@ tracing.workspace = true
 tracing-subscriber.workspace = true
 tonic.workspace = true
 tonic-health.workspace = true
+lru = "0.16"
 hashi-types = { path = "../hashi-types" }
 
 # Crypto dependencies
diff --git a/crates/hashi-guardian/src/enclave.rs b/crates/hashi-guardian/src/enclave.rs
index 25b264c00..05c874f71 100644
--- a/crates/hashi-guardian/src/enclave.rs
+++ b/crates/hashi-guardian/src/enclave.rs
@@ -61,6 +61,13 @@ pub struct EnclaveState {
     /// Rate limiter. Set once during provisioner_init.
     /// Uses `Arc<tokio::Mutex>` so the guard can be held across `.await`.
     rate_limiter: OnceLock<Arc<tokio::sync::Mutex<RateLimiter>>>,
+    /// LRU cache of signed withdrawal responses keyed by `wid`. Lets the
+    /// guardian return the same response for retried requests (leader
+    /// restart, leader rotation, lost response) without re-consuming from
+    /// the limiter or re-signing. Inserted only after a successful S3 log
+    /// commit, so the bucket and the cache never disagree.
+    recent_responses:
+        std::sync::Mutex<lru::LruCache<u64, GuardianSigned<StandardWithdrawalResponse>>>,
 }
 
 /// Scratchpad used only during initialization.
@@ -327,6 +334,33 @@ impl EnclaveState {
         let limiter = self.rate_limiter.get()?;
         Some(*limiter.lock().await.state())
     }
+
+    // ========================================================================
+    // Recent-response cache (wid-keyed idempotency)
+    // ========================================================================
+
+    /// Look up a previously signed response by wid. Marks the entry as
+    /// most-recently-used on hit.
+    pub fn get_cached_response(
+        &self,
+        wid: u64,
+    ) -> Option<GuardianSigned<StandardWithdrawalResponse>> {
+        self.recent_responses
+            .lock()
+            .expect("recent_responses mutex poisoned")
+            .get(&wid)
+            .cloned()
+    }
+
+    /// Insert a signed response into the cache. Called only after the
+    /// withdrawal has been logged to S3 and the limiter committed, so the
+    /// cache and bucket are always consistent.
+    pub fn cache_response(&self, wid: u64, response: GuardianSigned<StandardWithdrawalResponse>) {
+        self.recent_responses
+            .lock()
+            .expect("recent_responses mutex poisoned")
+            .put(wid, response);
+    }
 }
 
 impl Enclave {
@@ -340,11 +374,20 @@ impl Enclave {
             state: EnclaveState {
                 committee: RwLock::new(None),
                 rate_limiter: OnceLock::new(),
+                recent_responses: std::sync::Mutex::new(lru::LruCache::new(
+                    Self::RECENT_RESPONSES_CAPACITY,
+                )),
             },
             scratchpad: Scratchpad::default(),
         }
     }
 
+    /// Capacity of the wid-keyed response cache. Each entry is small
+    /// (Ed25519 sig + Schnorr sigs for each input), so 1024 is ample for
+    /// any realistic withdrawal throughput while bounding memory.
+    const RECENT_RESPONSES_CAPACITY: std::num::NonZeroUsize =
+        std::num::NonZeroUsize::new(1024).expect("1024 > 0");
+
     pub fn is_provisioner_init_complete(&self) -> bool {
         self.config.is_provisioner_init_complete()
             && self.state.is_provisioner_init_complete()
diff --git a/crates/hashi-guardian/src/withdraw.rs b/crates/hashi-guardian/src/withdraw.rs
index 3bbcad79a..68c07c7c3 100644
--- a/crates/hashi-guardian/src/withdraw.rs
+++ b/crates/hashi-guardian/src/withdraw.rs
@@ -32,6 +32,14 @@ pub async fn standard_withdrawal(
     let request_signature = signed_request.committee_signature().clone(); // for logging
     let wid = unsigned_request.wid;
 
+    // Idempotency: retries (leader rotation, pod restart, lost response)
+    // re-submit the same `wid`. Replay the cached signed response so the
+    // limiter is debited at most once per unique withdrawal.
+    if let Some(cached) = enclave.state.get_cached_response(wid) {
+        info!("Withdrawal {} served from idempotency cache.", wid);
+        return Ok(cached);
+    }
+
     match normal_withdrawal_inner(enclave.clone(), signed_request).await {
         Ok((txid, response, limiter_guard)) => {
             info!("Withdrawal {} processed successfully. Logging to S3.", wid);
@@ -42,7 +50,9 @@ pub async fn standard_withdrawal(
                 response: response.clone(),
             };
             log_withdrawal_success(enclave.as_ref(), wid, msg, limiter_guard).await?;
-            Ok(enclave.sign(response))
+            let signed_response = enclave.sign(response);
+            enclave.state.cache_response(wid, signed_response.clone());
+            Ok(signed_response)
         }
         Err(withdraw_err) => {
             error!("Withdrawal {} failed: {:?}", wid, withdraw_err);
@@ -322,4 +332,84 @@ mod tests {
             GuardianError::RateLimitExceeded
         ));
     }
+
+    /// Retrying the same `wid` (e.g. after leader rotation or a lost
+    /// response) returns the cached signed response and does NOT debit the
+    /// bucket a second time. Bucket capacity is sized to exactly one
+    /// withdrawal so a second debit would tip it over the rate limit.
+    #[tokio::test]
+    async fn test_standard_withdrawal_wid_cache_is_idempotent() {
+        let wid = 42;
+        let (req1, committee) = StandardWithdrawalRequest::mock_signed_and_committee_with_seq(
+            Network::Regtest,
+            wid,
+            100,
+            0,
+        );
+        let amount_sats = req1.message().utxos().external_out_amount().to_sat();
+        let enclave =
+            setup_fully_initialized_enclave(Network::Regtest, committee.clone(), amount_sats).await;
+
+        let first = standard_withdrawal(enclave.clone(), req1)
+            .await
+            .expect("first withdrawal succeeds");
+
+        // Same wid, fresh timestamp + seq. Without the cache this would
+        // fail with a seq mismatch (or debit the bucket a second time); the
+        // cache short-circuits before any of that.
+        let (req2, _) = StandardWithdrawalRequest::mock_signed_and_committee_with_seq(
+            Network::Regtest,
+            wid,
+            200,
+            1,
+        );
+        let second = standard_withdrawal(enclave.clone(), req2)
+            .await
+            .expect("retry serves cached response");
+        assert_eq!(first, second, "cache must return identical signed response");
+
+        // Bucket should still reflect exactly one debit.
+        let limiter_state = enclave.state.limiter_state().await.unwrap();
+        assert_eq!(limiter_state.next_seq, 1);
+        assert_eq!(limiter_state.num_tokens_available, 0);
+    }
+
+    /// A failed withdrawal must NOT be cached — otherwise retries would
+    /// permanently receive the same error even if the underlying cause
+    /// (e.g. a corrupted one-off request) is gone on the next attempt.
+    #[tokio::test]
+    async fn test_standard_withdrawal_failures_not_cached() {
+        // Bucket capacity = 0 so the first request fails with RateLimitExceeded.
+        let (req1, committee) = StandardWithdrawalRequest::mock_signed_and_committee_with_seq(
+            Network::Regtest,
+            42,
+            100,
+            0,
+        );
+        let enclave = setup_fully_initialized_enclave(Network::Regtest, committee, 0).await;
+
+        let first = standard_withdrawal(enclave.clone(), req1).await;
+        assert!(matches!(
+            first.unwrap_err(),
+            GuardianError::RateLimitExceeded
+        ));
+
+        // Retry with the same wid should NOT hit the cache — it gets
+        // another attempt. It will still fail here (bucket still 0) but
+        // via the live path, not a cached replay.
+        let (req2, _) = StandardWithdrawalRequest::mock_signed_and_committee_with_seq(
+            Network::Regtest,
+            42,
+            200,
+            0,
+        );
+        let second = standard_withdrawal(enclave.clone(), req2).await;
+        assert!(matches!(
+            second.unwrap_err(),
+            GuardianError::RateLimitExceeded
+        ));
+        // Nothing was committed in either attempt.
+        let limiter_state = enclave.state.limiter_state().await.unwrap();
+        assert_eq!(limiter_state.next_seq, 0);
+    }
 }