Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions crates/hashi-guardian/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ tracing.workspace = true
tracing-subscriber.workspace = true
tonic.workspace = true
tonic-health.workspace = true
lru = "0.16"
hashi-types = { path = "../hashi-types" }

# Crypto dependencies
Expand Down
43 changes: 43 additions & 0 deletions crates/hashi-guardian/src/enclave.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,13 @@ pub struct EnclaveState {
/// Rate limiter. Set once during provisioner_init.
/// Uses `Arc<tokio::Mutex>` so the guard can be held across `.await`.
rate_limiter: OnceLock<Arc<tokio::sync::Mutex<RateLimiter>>>,
/// LRU cache of signed withdrawal responses keyed by `wid`. Lets the
/// guardian return the same response for retried requests (leader
/// restart, leader rotation, lost response) without re-consuming from
/// the limiter or re-signing. Inserted only after a successful S3 log
/// commit, so the bucket and the cache never disagree.
recent_responses:
std::sync::Mutex<lru::LruCache<u64, GuardianSigned<StandardWithdrawalResponse>>>,
}

/// Scratchpad used only during initialization.
Expand Down Expand Up @@ -327,6 +334,33 @@ impl EnclaveState {
let limiter = self.rate_limiter.get()?;
Some(*limiter.lock().await.state())
}

// ========================================================================
// Recent-response cache (wid-keyed idempotency)
// ========================================================================

/// Return the previously signed response for `wid`, if one exists.
/// A hit promotes the entry to most-recently-used in the LRU.
pub fn get_cached_response(
    &self,
    wid: u64,
) -> Option<GuardianSigned<StandardWithdrawalResponse>> {
    // `LruCache::get` needs `&mut self` (it reorders the recency list),
    // hence the mutex even for a read.
    let mut cache = self
        .recent_responses
        .lock()
        .expect("recent_responses mutex poisoned");
    cache.get(&wid).cloned()
}

/// Record `response` as the canonical signed answer for `wid`.
/// Invoked only once the withdrawal is logged to S3 and the limiter
/// has committed, so the cache and the bucket stay consistent.
pub fn cache_response(&self, wid: u64, response: GuardianSigned<StandardWithdrawalResponse>) {
    let mut cache = self
        .recent_responses
        .lock()
        .expect("recent_responses mutex poisoned");
    cache.put(wid, response);
}
}

impl Enclave {
Expand All @@ -340,11 +374,20 @@ impl Enclave {
state: EnclaveState {
committee: RwLock::new(None),
rate_limiter: OnceLock::new(),
recent_responses: std::sync::Mutex::new(lru::LruCache::new(
Self::RECENT_RESPONSES_CAPACITY,
)),
},
scratchpad: Scratchpad::default(),
}
}

/// Capacity of the wid-keyed response cache. Each entry is small
/// (Ed25519 sig + Schnorr sigs for each input), so 1024 is ample for
/// any realistic withdrawal throughput while bounding memory.
///
/// NOTE(review): `Option::expect` in a `const` initializer is only
/// const-stable since Rust 1.83 — confirm the workspace MSRV permits it.
const RECENT_RESPONSES_CAPACITY: std::num::NonZeroUsize =
    std::num::NonZeroUsize::new(1024).expect("1024 > 0");

pub fn is_provisioner_init_complete(&self) -> bool {
self.config.is_provisioner_init_complete()
&& self.state.is_provisioner_init_complete()
Expand Down
92 changes: 91 additions & 1 deletion crates/hashi-guardian/src/withdraw.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,14 @@ pub async fn standard_withdrawal(
let request_signature = signed_request.committee_signature().clone(); // for logging
let wid = unsigned_request.wid;

// Idempotency: retries (leader rotation, pod restart, lost response)
// re-submit the same `wid`. Replay the cached signed response so the
// limiter is debited at most once per unique withdrawal.
if let Some(cached) = enclave.state.get_cached_response(wid) {
info!("Withdrawal {} served from idempotency cache.", wid);
return Ok(cached);
}

match normal_withdrawal_inner(enclave.clone(), signed_request).await {
Ok((txid, response, limiter_guard)) => {
info!("Withdrawal {} processed successfully. Logging to S3.", wid);
Expand All @@ -42,7 +50,9 @@ pub async fn standard_withdrawal(
response: response.clone(),
};
log_withdrawal_success(enclave.as_ref(), wid, msg, limiter_guard).await?;
Ok(enclave.sign(response))
let signed_response = enclave.sign(response);
enclave.state.cache_response(wid, signed_response.clone());
Ok(signed_response)
}
Err(withdraw_err) => {
error!("Withdrawal {} failed: {:?}", wid, withdraw_err);
Expand Down Expand Up @@ -322,4 +332,84 @@ mod tests {
GuardianError::RateLimitExceeded
));
}

/// A retried `wid` (leader rotation, pod restart, dropped response)
/// must be answered from the cache without debiting the bucket again.
/// The bucket is sized to exactly one withdrawal, so a second debit
/// would trip the rate limit.
#[tokio::test]
async fn test_standard_withdrawal_wid_cache_is_idempotent() {
    let wid = 42;
    let (first_req, committee) = StandardWithdrawalRequest::mock_signed_and_committee_with_seq(
        Network::Regtest,
        wid,
        100,
        0,
    );
    let sats = first_req.message().utxos().external_out_amount().to_sat();
    let enclave =
        setup_fully_initialized_enclave(Network::Regtest, committee.clone(), sats).await;

    let original = standard_withdrawal(enclave.clone(), first_req)
        .await
        .expect("first withdrawal succeeds");

    // Retry the same wid with a fresh timestamp and seq. Without the
    // cache this would hit a seq mismatch (or double-debit the bucket);
    // the cache short-circuits before either can happen.
    let (retry_req, _) = StandardWithdrawalRequest::mock_signed_and_committee_with_seq(
        Network::Regtest,
        wid,
        200,
        1,
    );
    let replayed = standard_withdrawal(enclave.clone(), retry_req)
        .await
        .expect("retry serves cached response");
    assert_eq!(original, replayed, "cache must return identical signed response");

    // The limiter must reflect exactly one debit.
    let state = enclave.state.limiter_state().await.unwrap();
    assert_eq!(state.next_seq, 1);
    assert_eq!(state.num_tokens_available, 0);
}

/// Errors must never be cached — a retry of a failed `wid` has to go
/// through the live path again, so a transient failure (e.g. one
/// corrupted request) is not frozen in forever.
#[tokio::test]
async fn test_standard_withdrawal_failures_not_cached() {
    // A zero-capacity bucket guarantees the first attempt is rejected
    // with RateLimitExceeded.
    let (attempt1, committee) = StandardWithdrawalRequest::mock_signed_and_committee_with_seq(
        Network::Regtest,
        42,
        100,
        0,
    );
    let enclave = setup_fully_initialized_enclave(Network::Regtest, committee, 0).await;

    let outcome1 = standard_withdrawal(enclave.clone(), attempt1).await;
    assert!(matches!(
        outcome1.unwrap_err(),
        GuardianError::RateLimitExceeded
    ));

    // Same wid again: must bypass the cache and re-evaluate. It still
    // fails here (the bucket is still empty) but via the live path,
    // not a cached replay.
    let (attempt2, _) = StandardWithdrawalRequest::mock_signed_and_committee_with_seq(
        Network::Regtest,
        42,
        200,
        0,
    );
    let outcome2 = standard_withdrawal(enclave.clone(), attempt2).await;
    assert!(matches!(
        outcome2.unwrap_err(),
        GuardianError::RateLimitExceeded
    ));
    // Neither attempt committed anything to the limiter.
    let limiter_state = enclave.state.limiter_state().await.unwrap();
    assert_eq!(limiter_state.next_seq, 0);
}
}
Loading