diff --git a/AGENTS.md b/AGENTS.md index 4c1017c08..96ece6bc2 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -217,5 +217,10 @@ Scouter accepts: Pandas DataFrames, NumPy 2D arrays, Polars DataFrames, Pydantic | `REDIS_ADDR` | — | Redis URL | | `SCOUTER_STORAGE_URI` | `./scouter_storage` | Object storage (S3, GCS, Azure, local) | | `SCOUTER_TRACE_REFRESH_INTERVAL_SECS` | `10` | How often each pod refreshes its Delta table snapshot from shared storage. Set lower (e.g. `5`) for faster cross-pod visibility; set higher to reduce object-store LIST calls. Only relevant in multi-pod deployments. | +| `SCOUTER_TRACE_VISIBILITY_BUFFER_SECS` | refresh + 2 | Minimum delay before trace-backed evals are polled. Startup panics if this is lower than `SCOUTER_TRACE_REFRESH_INTERVAL_SECS + 2`, because a smaller buffer can make the poller fetch before a pod sees the committed anchor span. | +| `SCOUTER_INBOX_RECONCILE_AFTER_SECS` | `15` | Age floor before reconciliation scans `awaiting_trace` rows for missing anchor queue events. | +| `SCOUTER_INBOX_RECONCILE_LOOKBACK_SECS` | `86400` | Maximum supported anchor span start lookback used when reconciliation queries Delta for anchor spans. Increase for evals attached to spans that can run longer than 24 hours. | +| `SCOUTER_INBOX_RECONCILE_INTERVAL_SECS` | `60` | How often reconciliation recovers dropped live inbox notifications from Delta. | +| `SCOUTER_INBOX_RECONCILE_BATCH` | `200` | Maximum `awaiting_trace` rows scanned per reconciliation tick. 
| | `SCOUTER_ENCRYPT_SECRET` | — | HMAC-SHA256 key (32 bytes) | | `SCOUTER_BOOTSTRAP_KEY` | — | Initial admin bootstrap key | diff --git a/Cargo.lock b/Cargo.lock index 515992973..b7b336727 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6639,6 +6639,7 @@ dependencies = [ "deltalake", "futures", "hex", + "metrics", "mini-moka", "object_store", "potato-head", @@ -6685,6 +6686,7 @@ dependencies = [ "futures", "indicatif", "itertools 0.13.0", + "metrics", "ndarray", "ndarray-rand", "num-traits", @@ -7562,6 +7564,7 @@ dependencies = [ "tokio-stream", "tracing", "url", + "uuid", ] [[package]] @@ -7642,6 +7645,7 @@ dependencies = [ "stringprep", "thiserror 2.0.18", "tracing", + "uuid", "whoami", ] @@ -7680,6 +7684,7 @@ dependencies = [ "stringprep", "thiserror 2.0.18", "tracing", + "uuid", "whoami", ] @@ -7706,6 +7711,7 @@ dependencies = [ "thiserror 2.0.18", "tracing", "url", + "uuid", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 493c49450..23f6e43eb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -106,7 +106,7 @@ serde_json = "1.*" serde_yaml = "0.*" serde_qs = "0.*" simsimd = "6.5.*" -sqlx = { version = "0.*", features = [ "runtime-tokio", "tls-native-tls", "postgres", "chrono", "json"] } +sqlx = { version = "0.*", features = [ "runtime-tokio", "tls-native-tls", "postgres", "chrono", "json", "uuid"] } strum = "0.*" strum_macros = "0.*" tabled = { version = "0.*", features = ["ansi"] } @@ -122,7 +122,7 @@ utoipa-swagger-ui = { version = "9", features = ["axum"] } tokio-util = "0.*" tracing = "0.*" tracing-subscriber = {version = "0.*", features = ["json", "time", "env-filter"] } -uuid = { version = "1.*", features = ["v7"] } +uuid = { version = "1.*", features = ["v4", "v7"] } url = "2.*" statrs = "0.18.0" diff --git a/crates/scouter_dataframe/Cargo.toml b/crates/scouter_dataframe/Cargo.toml index 0866be788..e2c645051 100644 --- a/crates/scouter_dataframe/Cargo.toml +++ b/crates/scouter_dataframe/Cargo.toml @@ -29,6 +29,7 @@ datafusion = { workspace = true } 
futures = { workspace = true } object_store = { workspace = true } mini-moka = { workspace = true } +metrics = { workspace = true } regex = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } diff --git a/crates/scouter_dataframe/src/parquet/tracing/engine.rs b/crates/scouter_dataframe/src/parquet/tracing/engine.rs index a46d1c147..6a92fda18 100644 --- a/crates/scouter_dataframe/src/parquet/tracing/engine.rs +++ b/crates/scouter_dataframe/src/parquet/tracing/engine.rs @@ -19,12 +19,12 @@ use deltalake::datafusion::parquet::schema::types::ColumnPath; use deltalake::operations::optimize::OptimizeType; use deltalake::{DeltaTable, DeltaTableBuilder, TableProperty}; use scouter_settings::ObjectStorageSettings; -use scouter_types::SpanId; -use scouter_types::TraceId; -use scouter_types::TraceSpanRecord; use scouter_types::{Attribute, SpanEvent, SpanLink}; +use scouter_types::{ + SCOUTER_EVAL_PROFILE_UID, SCOUTER_EVAL_RECORD_UID, SpanId, TraceCommitAnchor, TraceId, + TraceSpanRecord, +}; use serde_json::Value; -use std::collections::HashSet; use std::sync::Arc; use tokio::sync::oneshot; use tokio::sync::{RwLock as AsyncRwLock, mpsc}; @@ -221,7 +221,7 @@ pub struct TraceSpanDBEngine { /// respective `TableProvider`s without a deregister/register gap. 
pub catalog: Arc, control: ControlTableEngine, - commit_tx: Option>>, + commit_tx: Option>>, } impl TraceSchemaExt for TraceSpanDBEngine {} @@ -229,7 +229,7 @@ impl TraceSchemaExt for TraceSpanDBEngine {} impl TraceSpanDBEngine { pub async fn new( storage_settings: &ObjectStorageSettings, - commit_tx: Option>>, + commit_tx: Option>>, ) -> Result { let object_store = ObjectStore::new(storage_settings)?; let schema = Arc::new(Self::create_schema()); @@ -629,12 +629,49 @@ impl TraceSpanDBEngine { Some(cmd) = rx.recv() => { match cmd { TableCommand::Write { spans, respond_to } => { - let trace_ids: Vec = if self.commit_tx.is_some() { - let mut seen = HashSet::with_capacity(spans.len()); + let anchors: Vec = if self.commit_tx.is_some() { + let mut out = Vec::new(); for span in &spans { - seen.insert(span.trace_id); + let mut record_uid: Option = None; + let mut profile_uid: Option = None; + for attr in &span.attributes { + if attr.key == SCOUTER_EVAL_RECORD_UID { + if let Some(value) = attr.value.as_str() { + record_uid = Some(value.to_string()); + } + } else if attr.key == SCOUTER_EVAL_PROFILE_UID + && let Some(value) = attr.value.as_str() + { + profile_uid = Some(value.to_string()); + } + + if record_uid.is_some() && profile_uid.is_some() { + break; + } + } + if let (Some(record_uid), Some(profile_uid)) = + (record_uid, profile_uid) + { + if let Some(anchor) = TraceCommitAnchor::new( + span.trace_id, + span.span_id.clone(), + record_uid, + profile_uid, + ) { + out.push(anchor); + } else { + metrics::counter!( + "scouter_trace_commit_event_invalid_total" + ) + .increment(1); + tracing::warn!( + span_id = %span.span_id, + "dropping invalid trace eval anchor attributes" + ); + } + } } - seen.into_iter().collect() + out } else { Vec::new() }; @@ -642,15 +679,40 @@ impl TraceSpanDBEngine { match self.write_spans(spans).await { Ok(_) => { if let Some(tx) = &self.commit_tx - && !trace_ids.is_empty() + && !anchors.is_empty() { - let trace_count = trace_ids.len(); - if let 
Err(e) = tx.send(trace_ids).await { - tracing::warn!( - trace_count, - "trace-arrival commit_tx closed after Delta commit ({:?}); timeout sweep will recover affected eval rows", - e - ); + let anchor_count = anchors.len(); + match tx.try_send(anchors) { + Ok(()) => { + metrics::counter!( + "scouter_trace_commit_event_channel_sent_total" + ) + .increment(anchor_count as u64); + } + Err(tokio::sync::mpsc::error::TrySendError::Full( + _, + )) => { + metrics::counter!( + "scouter_trace_commit_event_channel_drop_total" + ) + .increment(anchor_count as u64); + tracing::warn!( + anchor_count, + "trace-anchor channel full; dropping live notification, reconciliation sweep will recover" + ); + } + Err(tokio::sync::mpsc::error::TrySendError::Closed( + _, + )) => { + metrics::counter!( + "scouter_trace_commit_event_channel_drop_total" + ) + .increment(anchor_count as u64); + tracing::error!( + anchor_count, + "trace-anchor channel closed; inbox consumer is dead, reconciliation sweep is now the sole recovery path" + ); + } } } let _ = respond_to.send(Ok(())); diff --git a/crates/scouter_dataframe/src/parquet/tracing/queries.rs b/crates/scouter_dataframe/src/parquet/tracing/queries.rs index b28916326..b00222c36 100644 --- a/crates/scouter_dataframe/src/parquet/tracing/queries.rs +++ b/crates/scouter_dataframe/src/parquet/tracing/queries.rs @@ -15,7 +15,10 @@ use datafusion::prelude::*; use datafusion::scalar::ScalarValue; use mini_moka::sync::Cache; use scouter_types::sql::{TraceFilters, TraceMetricBucket, TraceSpan}; -use scouter_types::{Attribute, SpanEvent, SpanId, SpanLink, TraceId}; +use scouter_types::{ + Attribute, AwaitingTraceCommit, SCOUTER_EVAL_PROFILE_UID, SCOUTER_EVAL_RECORD_UID, SpanEvent, + SpanId, SpanLink, TraceCommitAnchor, TraceId, +}; use std::collections::HashMap; use std::hash::{Hash, Hasher}; use std::sync::Arc; @@ -767,6 +770,7 @@ fn flat_to_trace_span( /// Time predicates are always applied FIRST to enable Delta Lake partition pruning. 
/// `span_cache` provides sub-millisecond repeat reads for trace detail clicks. /// `metrics_cache` provides sub-millisecond repeat reads for dashboard metric charts. +#[derive(Clone)] pub struct TraceQueries { ctx: Arc, /// LRU cache keyed by 16-byte trace ID. TTL=5 min — archived span data is immutable. @@ -941,6 +945,150 @@ impl TraceQueries { Ok(build_span_tree(flat_spans)) } + /// Find committed anchor spans for awaiting eval rows. + /// + /// Reconciliation bounds partition_date by the supported anchor span duration and end_time by + /// the eval arrival window before ID filters. Anchor spans can start long before attach_eval, + /// but they only commit after ending. + pub async fn find_anchor_spans_for_records( + &self, + records: &[AwaitingTraceCommit], + lookback: chrono::Duration, + trace_arrival_timeout: chrono::Duration, + ) -> Result, TraceEngineError> { + if records.is_empty() { + return Ok(Vec::new()); + } + + let mut by_trace: HashMap> = HashMap::new(); + for record in records { + by_trace.entry(record.trace_id).or_default().push(record); + } + + let mut anchors = Vec::new(); + for (trace_id, trace_records) in by_trace { + let Some(window_start) = trace_records + .iter() + .map(|record| record.created_at - lookback) + .min() + else { + continue; + }; + let Some(window_end) = trace_records + .iter() + .map(|record| record.created_at + trace_arrival_timeout) + .max() + else { + continue; + }; + + let expected: HashMap = trace_records + .iter() + .map(|record| (record.span_id.clone(), *record)) + .collect(); + let span_literals: Vec = expected + .keys() + .map(|span_id| lit(ScalarValue::Binary(Some(span_id.as_bytes().to_vec())))) + .collect(); + + let mut builder = + TraceQueryBuilder::set_table(self.ctx.clone(), SPAN_TABLE_NAME).await?; + builder = builder.add_filter(col(PARTITION_DATE_COL).gt_eq(date_lit(&window_start)))?; + builder = builder.add_filter(col(PARTITION_DATE_COL).lt_eq(date_lit(&window_end)))?; + builder = 
builder.add_filter(col(END_TIME_COL).gt_eq(ts_lit(&window_start)))?; + builder = builder.add_filter(col(END_TIME_COL).lt(ts_lit(&window_end)))?; + builder = builder.add_filter( + col(TRACE_ID_COL).eq(lit(ScalarValue::Binary(Some(trace_id.as_bytes().to_vec())))), + )?; + builder = builder.add_filter(col(SPAN_ID_COL).in_list(span_literals, false))?; + builder = builder.select_columns(&[TRACE_ID_COL, SPAN_ID_COL, ATTRIBUTES_COL])?; + + for batch in builder.execute().await? { + let schema = batch.schema(); + let trace_idx = schema.index_of(TRACE_ID_COL).map_err(|_| { + TraceEngineError::BatchConversion("Missing column: trace_id".into()) + })?; + let span_idx = schema.index_of(SPAN_ID_COL).map_err(|_| { + TraceEngineError::BatchConversion("Missing column: span_id".into()) + })?; + let attrs_idx = schema.index_of(ATTRIBUTES_COL).map_err(|_| { + TraceEngineError::BatchConversion("Missing column: attributes".into()) + })?; + + let trace_arr = + cast(batch.column(trace_idx).as_ref(), &DataType::Binary).map_err(|e| { + TraceEngineError::BatchConversion(format!("trace_id cast: {e}")) + })?; + let trace_col = trace_arr + .as_any() + .downcast_ref::() + .ok_or_else(|| { + TraceEngineError::BatchConversion("trace_id not BinaryArray".into()) + })?; + let span_arr = cast(batch.column(span_idx).as_ref(), &DataType::Binary) + .map_err(|e| TraceEngineError::BatchConversion(format!("span_id cast: {e}")))?; + let span_col = + span_arr + .as_any() + .downcast_ref::() + .ok_or_else(|| { + TraceEngineError::BatchConversion("span_id not BinaryArray".into()) + })?; + let attrs_col = batch + .column(attrs_idx) + .as_any() + .downcast_ref::() + .ok_or_else(|| { + TraceEngineError::BatchConversion("attributes not MapArray".into()) + })?; + + for row_idx in 0..batch.num_rows() { + let trace_bytes: [u8; 16] = + trace_col.value(row_idx).try_into().map_err(|_| { + TraceEngineError::BatchConversion( + "trace_id must be exactly 16 bytes".into(), + ) + })?; + let span_bytes: [u8; 8] = 
span_col.value(row_idx).try_into().map_err(|_| { + TraceEngineError::BatchConversion("span_id must be exactly 8 bytes".into()) + })?; + let row_trace_id = TraceId::from_bytes(trace_bytes); + let row_span_id = SpanId::from_bytes(span_bytes); + let Some(expected_record) = expected.get(&row_span_id) else { + continue; + }; + + let attrs = extract_attributes(attrs_col, row_idx); + let record_uid = attrs + .iter() + .find(|attr| attr.key == SCOUTER_EVAL_RECORD_UID) + .and_then(|attr| attr.value.as_str()); + if record_uid != Some(expected_record.record_uid.as_str()) { + continue; + } + let Some(profile_uid) = attrs + .iter() + .find(|attr| attr.key == SCOUTER_EVAL_PROFILE_UID) + .and_then(|attr| attr.value.as_str()) + else { + continue; + }; + + if let Some(anchor) = TraceCommitAnchor::new( + row_trace_id, + row_span_id, + expected_record.record_uid.clone(), + profile_uid.to_string(), + ) { + anchors.push(anchor); + } + } + } + } + + Ok(anchors) + } + /// Get trace metrics over a time range, bucketed by the given interval string. 
/// /// `bucket_interval` must be a valid DataFusion `DATE_TRUNC` precision unit: diff --git a/crates/scouter_dataframe/src/parquet/tracing/service.rs b/crates/scouter_dataframe/src/parquet/tracing/service.rs index b96d5fcf5..e632ad292 100644 --- a/crates/scouter_dataframe/src/parquet/tracing/service.rs +++ b/crates/scouter_dataframe/src/parquet/tracing/service.rs @@ -6,7 +6,7 @@ use crate::parquet::tracing::queries::TraceQueries; use crate::storage::ObjectStore; use datafusion::prelude::SessionContext; use scouter_settings::ObjectStorageSettings; -use scouter_types::{TraceId, TraceSpanRecord, extract_gen_ai_span}; +use scouter_types::{TraceCommitAnchor, TraceSpanRecord, extract_gen_ai_span}; use std::sync::Arc; use tokio::sync::mpsc; use tokio::time::{Duration, interval}; @@ -32,7 +32,7 @@ pub async fn init_trace_span_service( flush_interval_secs: Option, retention_days: Option, refresh_interval_secs: u64, - commit_tx: Option>>, + commit_tx: Option>>, ) -> Result, TraceEngineError> { // Shut down any existing service before replacing let old_service = { @@ -109,7 +109,7 @@ impl TraceSpanService { flush_interval_secs: Option, retention_days: Option, refresh_interval_secs: u64, - commit_tx: Option>>, + commit_tx: Option>>, ) -> Result { let buffer_size = storage_settings.trace_buffer_size(); let engine = TraceSpanDBEngine::new(storage_settings, commit_tx).await?; @@ -400,7 +400,10 @@ mod tests { use scouter_types::TraceMetricsRequest; use scouter_types::sql::TraceSpan; use scouter_types::trace::query::FilterClause; - use scouter_types::{Attribute, SpanId, TraceId, TraceSpanRecord}; + use scouter_types::{ + Attribute, SCOUTER_EVAL_PROFILE_UID, SCOUTER_EVAL_RECORD_UID, SpanId, TraceCommitAnchor, + TraceId, TraceSpanRecord, + }; use serde_json::Value; use tracing_subscriber; @@ -489,7 +492,7 @@ mod tests { cleanup(); let storage_settings = ObjectStorageSettings::default(); - let (commit_tx, mut commit_rx) = mpsc::channel::>(1); + let (commit_tx, mut commit_rx) = 
mpsc::channel::>(1); let service = TraceSpanService::new(&storage_settings, 24, Some(2), None, 10, Some(commit_tx)) .await?; @@ -501,7 +504,16 @@ mod tests { None, "svc", "op", - vec![], + vec![ + Attribute { + key: SCOUTER_EVAL_RECORD_UID.to_string(), + value: serde_json::json!("record-a"), + }, + Attribute { + key: SCOUTER_EVAL_PROFILE_UID.to_string(), + value: serde_json::json!("profile-a"), + }, + ], ); service.write_spans_direct(vec![span]).await?; @@ -510,7 +522,112 @@ mod tests { .await .map_err(|_| TraceEngineError::ChannelClosed)? .ok_or(TraceEngineError::ChannelClosed)?; - assert_eq!(committed, vec![trace_id]); + assert_eq!( + committed, + vec![TraceCommitAnchor { + trace_id, + span_id: SpanId::from_bytes([0x01; 8]), + record_uid: "record-a".to_string(), + profile_uid: "profile-a".to_string(), + }] + ); + + service.shutdown().await?; + cleanup(); + Ok(()) + } + + #[tokio::test] + async fn test_write_spans_direct_drops_invalid_trace_commit_anchor() + -> Result<(), TraceEngineError> { + cleanup(); + + let storage_settings = ObjectStorageSettings::default(); + let (commit_tx, mut commit_rx) = mpsc::channel::>(1); + let service = + TraceSpanService::new(&storage_settings, 24, Some(2), None, 10, Some(commit_tx)) + .await?; + + let trace_id = TraceId::from_bytes([0xAC; 16]); + let span = make_span( + &trace_id, + SpanId::from_bytes([0x01; 8]), + None, + "svc", + "op", + vec![ + Attribute { + key: SCOUTER_EVAL_RECORD_UID.to_string(), + value: serde_json::json!("record with spaces"), + }, + Attribute { + key: SCOUTER_EVAL_PROFILE_UID.to_string(), + value: serde_json::json!("profile-a"), + }, + ], + ); + + service.write_spans_direct(vec![span]).await?; + assert!( + tokio::time::timeout(std::time::Duration::from_millis(250), commit_rx.recv()) + .await + .is_err() + ); + + service.shutdown().await?; + cleanup(); + Ok(()) + } + + #[tokio::test] + async fn test_write_spans_direct_drops_when_trace_commit_channel_full() + -> Result<(), TraceEngineError> { + cleanup(); + 
+ let storage_settings = ObjectStorageSettings::default(); + let (commit_tx, mut commit_rx) = mpsc::channel::>(1); + commit_tx + .try_send(vec![TraceCommitAnchor { + trace_id: TraceId::from_bytes([0xAD; 16]), + span_id: SpanId::from_bytes([0xAD; 8]), + record_uid: "pre-filled".to_string(), + profile_uid: "profile-a".to_string(), + }]) + .unwrap(); + let service = + TraceSpanService::new(&storage_settings, 24, Some(2), None, 10, Some(commit_tx)) + .await?; + + let trace_id = TraceId::from_bytes([0xAE; 16]); + let span = make_span( + &trace_id, + SpanId::from_bytes([0x01; 8]), + None, + "svc", + "op", + vec![ + Attribute { + key: SCOUTER_EVAL_RECORD_UID.to_string(), + value: serde_json::json!("record-a"), + }, + Attribute { + key: SCOUTER_EVAL_PROFILE_UID.to_string(), + value: serde_json::json!("profile-a"), + }, + ], + ); + + service.write_spans_direct(vec![span]).await?; + let queued = commit_rx + .recv() + .await + .ok_or(TraceEngineError::ChannelClosed)?; + assert_eq!(queued[0].record_uid, "pre-filled"); + assert!( + tokio::time::timeout(std::time::Duration::from_millis(250), commit_rx.recv()) + .await + .is_err() + ); service.shutdown().await?; cleanup(); diff --git a/crates/scouter_drift/Cargo.toml b/crates/scouter_drift/Cargo.toml index 86151f6bb..a6661ffde 100644 --- a/crates/scouter_drift/Cargo.toml +++ b/crates/scouter_drift/Cargo.toml @@ -36,6 +36,7 @@ sqlx = { workspace = true, optional = true } thiserror = { workspace = true } tokio = { workspace = true } tracing = { workspace = true } +metrics = { workspace = true } [dev-dependencies] diff --git a/crates/scouter_drift/src/genai/inbox.rs b/crates/scouter_drift/src/genai/inbox.rs index a5617aea1..dd88b92c2 100644 --- a/crates/scouter_drift/src/genai/inbox.rs +++ b/crates/scouter_drift/src/genai/inbox.rs @@ -1,39 +1,95 @@ use crate::error::DriftError; +use scouter_dataframe::parquet::tracing::queries::TraceQueries; use scouter_sql::PostgresClient; use scouter_sql::sql::error::SqlError; use 
scouter_sql::sql::traits::AgentDriftSqlLogic; -use scouter_types::TraceId; +use scouter_types::TraceCommitAnchor; use sqlx::{Pool, Postgres}; -use std::time::Duration; +use std::sync::Arc; +use std::time::{Duration, Instant}; use tokio::sync::{mpsc, watch}; use tracing::{debug, error, info, warn}; const EVENT_WORKER_INTERVAL: Duration = Duration::from_secs(1); const EVENT_BATCH_LIMIT: i64 = 500; -const SWEEP_TICKS: u32 = 60; const AWAITING_TRACE_TIMEOUT: chrono::Duration = chrono::Duration::minutes(5); const TRACE_COMMIT_EVENT_RETENTION: chrono::Duration = chrono::Duration::days(1); const COMMIT_INSERT_INITIAL_BACKOFF: Duration = Duration::from_millis(100); const COMMIT_INSERT_MAX_BACKOFF: Duration = Duration::from_secs(5); +const RECONCILE_BATCH_LIMIT: i64 = 200; +const RECONCILE_LOOKBACK_SECS: i64 = 86_400; + +#[derive(Clone, Debug)] +pub struct InboxSweepConfig { + pub timeout_interval: Duration, + pub prune_interval: Duration, + pub reconcile_interval: Duration, + pub lease_recovery_interval: Duration, + pub processed_retention: chrono::Duration, + pub trace_arrival_timeout: chrono::Duration, + pub reconcile_after: chrono::Duration, + pub reconcile_lookback: chrono::Duration, + pub lease_ttl: chrono::Duration, + pub max_attempts: i32, +} + +impl Default for InboxSweepConfig { + fn default() -> Self { + Self { + timeout_interval: Duration::from_secs(60), + prune_interval: Duration::from_secs(60), + reconcile_interval: Duration::from_secs( + env_u64("SCOUTER_INBOX_RECONCILE_INTERVAL_SECS", 60).max(1), + ), + lease_recovery_interval: Duration::from_secs(60), + processed_retention: TRACE_COMMIT_EVENT_RETENTION, + trace_arrival_timeout: AWAITING_TRACE_TIMEOUT, + reconcile_after: chrono::Duration::seconds(env_i64( + "SCOUTER_INBOX_RECONCILE_AFTER_SECS", + 15, + )), + reconcile_lookback: chrono::Duration::seconds(env_i64( + "SCOUTER_INBOX_RECONCILE_LOOKBACK_SECS", + RECONCILE_LOOKBACK_SECS, + )), + lease_ttl: chrono::Duration::minutes(2), + max_attempts: 5, + } + } +} 
+ +fn env_i64(name: &str, default: i64) -> i64 { + std::env::var(name) + .ok() + .and_then(|value| value.parse::().ok()) + .unwrap_or(default) +} + +fn env_u64(name: &str, default: u64) -> u64 { + std::env::var(name) + .ok() + .and_then(|value| value.parse::().ok()) + .unwrap_or(default) +} pub async fn run_commit_consumer_loop( pool: Pool, - mut rx: mpsc::Receiver>, + mut rx: mpsc::Receiver>, mut shutdown: watch::Receiver<()>, ) { - info!("trace-commit consumer started"); + info!("trace-anchor commit consumer started"); loop { tokio::select! { _ = shutdown.changed() => { - info!("trace-commit consumer shutting down"); + info!("trace-anchor commit consumer shutting down"); break; } - maybe_ids = rx.recv() => { - let Some(trace_ids) = maybe_ids else { break }; - if trace_ids.is_empty() { + maybe_anchors = rx.recv() => { + let Some(anchors) = maybe_anchors else { break }; + if anchors.is_empty() { continue; } - insert_trace_commit_events_with_retry(&pool, &trace_ids, &mut shutdown).await; + insert_trace_commit_events_with_retry(&pool, &anchors, &mut shutdown).await; } } } @@ -41,18 +97,37 @@ pub async fn run_commit_consumer_loop( async fn insert_trace_commit_events_with_retry( pool: &Pool, - trace_ids: &[TraceId], + anchors: &[TraceCommitAnchor], shutdown: &mut watch::Receiver<()>, ) -> bool { let mut backoff = COMMIT_INSERT_INITIAL_BACKOFF; loop { - match PostgresClient::insert_trace_commit_events(pool, trace_ids).await { - Ok(_) => return true, + match PostgresClient::insert_trace_commit_events(pool, anchors).await { + Ok(result) => { + let inserted = result.rows_affected() as usize; + let duplicate = anchors.len().saturating_sub(inserted); + if duplicate > 0 { + metrics::counter!("scouter_trace_commit_event_duplicate_total") + .increment(duplicate as u64); + } + return true; + } Err(e) => { + if is_permanent_inbox_insert_error(&e) { + metrics::counter!("scouter_trace_commit_event_invalid_total") + .increment(anchors.len() as u64); + error!( + error = ?e, + 
anchor_count = anchors.len(), + "insert_trace_commit_events rejected permanent invalid anchor batch; dropping" + ); + return false; + } + error!( error = ?e, - trace_count = trace_ids.len(), + anchor_count = anchors.len(), backoff_ms = backoff.as_millis(), "insert_trace_commit_events failed; retrying" ); @@ -62,8 +137,8 @@ async fn insert_trace_commit_events_with_retry( tokio::select! { _ = shutdown.changed() => { warn!( - trace_count = trace_ids.len(), - "trace-commit consumer shutting down before commit events were inserted" + anchor_count = anchors.len(), + "trace-anchor consumer shutting down before commit events were inserted" ); return false; } @@ -74,14 +149,31 @@ async fn insert_trace_commit_events_with_retry( } } +fn is_permanent_inbox_insert_error(error: &SqlError) -> bool { + let SqlError::SqlxError(sqlx::Error::Database(db)) = error else { + return false; + }; + + matches!( + db.code().as_deref(), + Some("22001" | "23502" | "23514" | "22P02") + ) +} + pub async fn run_trace_commit_event_worker_loop( pool: Pool, trace_visibility_buffer: chrono::Duration, + trace_query: Arc, mut shutdown: watch::Receiver<()>, ) { let mut tick = tokio::time::interval(EVENT_WORKER_INTERVAL); tick.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); - let mut counter: u32 = 0; + let mut sweeps = Box::pin(run_sweeps( + pool.clone(), + trace_query, + InboxSweepConfig::default(), + shutdown.clone(), + )); info!( "trace-commit event worker started (interval = {:?})", EVENT_WORKER_INTERVAL @@ -93,14 +185,14 @@ pub async fn run_trace_commit_event_worker_loop( info!("trace-commit event worker shutting down"); break; } + _ = &mut sweeps => { + info!("trace-commit sweep manager exited"); + break; + } _ = tick.tick() => { - if let Err(e) = drain_once(&pool, trace_visibility_buffer).await { + if let Err(e) = drain_once(&pool, trace_visibility_buffer, "trace-inbox-worker", 5).await { warn!("event worker drain failed: {:?}", e); } - counter = counter.wrapping_add(1); - if 
counter.is_multiple_of(SWEEP_TICKS) { - run_sweeps(&pool).await; - } } } } @@ -109,37 +201,112 @@ pub async fn run_trace_commit_event_worker_loop( pub(crate) async fn drain_once( pool: &Pool, trace_visibility_buffer: chrono::Duration, + worker_id: &str, + max_attempts: i32, ) -> Result<(), DriftError> { loop { - let mut tx = pool.begin().await.map_err(SqlError::from)?; + let claim_started = Instant::now(); let claimed = - match PostgresClient::claim_trace_commit_events(&mut tx, EVENT_BATCH_LIMIT).await { - Ok(claimed) => claimed, - Err(e) => { - tx.rollback().await.ok(); - return Err(e.into()); - } - }; - + PostgresClient::claim_trace_commit_events(pool, EVENT_BATCH_LIMIT, worker_id).await?; + metrics::histogram!("scouter_trace_commit_event_claim_latency_ms") + .record(claim_started.elapsed().as_secs_f64() * 1000.0); if claimed.is_empty() { - tx.rollback().await.map_err(SqlError::from)?; return Ok(()); } - let event_ids: Vec = claimed.iter().map(|(id, _)| *id).collect(); - let trace_ids: Vec = claimed.iter().map(|(_, trace_id)| *trace_id).collect(); + let claim_token = claimed[0].claim_token; + let event_ids: Vec = claimed.iter().map(|claim| claim.id).collect(); - let flipped = - PostgresClient::flip_awaiting_evals(&mut tx, &trace_ids, trace_visibility_buffer) - .await?; - let _ = PostgresClient::mark_events_processed(&mut tx, &event_ids).await?; - tx.commit().await.map_err(SqlError::from)?; + let flip_result = async { + let mut tx = pool.begin().await.map_err(SqlError::from)?; + let completed = + PostgresClient::complete_trace_commit_events(&mut tx, &event_ids, claim_token) + .await?; + let flipped = if completed.is_empty() { + 0 + } else { + PostgresClient::flip_awaiting_evals(&mut tx, &completed, trace_visibility_buffer) + .await? 
+ .rows_affected() + }; + tx.commit().await.map_err(SqlError::from)?; + Ok::<(u64, usize), DriftError>((flipped, completed.len())) + } + .await; - debug!( - claimed = claimed.len(), - flipped = flipped.rows_affected(), - "drained trace-commit event batch" - ); + match flip_result { + Ok((flipped, completed)) => { + let fenced = event_ids.len().saturating_sub(completed); + if fenced > 0 { + metrics::counter!("scouter_trace_commit_event_complete_fenced_total") + .increment(fenced as u64); + warn!( + fenced, + batch = event_ids.len(), + "complete_trace_commit_events fenced by stale claim_token" + ); + } + metrics::counter!("scouter_trace_commit_event_processed_total") + .increment(completed as u64); + + if completed > 0 && flipped == 0 { + metrics::counter!("scouter_trace_commit_event_processed_noop_total") + .increment(completed as u64); + debug!( + completed, + "drained anchor batch; eval rows already terminal or missing" + ); + } else if flipped > 0 { + metrics::histogram!("scouter_trace_commit_event_flip_rows_affected") + .record(flipped as f64); + debug!(completed, flipped, "drained trace-anchor commit batch"); + } + } + Err(e) => { + let message = format!("{e:?}"); + match PostgresClient::fail_trace_commit_events( + pool, + &event_ids, + max_attempts, + &message, + claim_token, + ) + .await + { + Ok(rows) => { + let returned = rows + .iter() + .filter(|(_, status)| status == "pending") + .count(); + let dead_lettered = rows + .iter() + .filter(|(_, status)| status == "dead_lettered") + .count(); + let fenced = event_ids.len().saturating_sub(returned + dead_lettered); + metrics::counter!("scouter_trace_commit_event_lease_recovered_total") + .increment(returned as u64); + metrics::counter!("scouter_trace_commit_event_dead_lettered_total") + .increment(dead_lettered as u64); + if fenced > 0 { + metrics::counter!("scouter_trace_commit_event_fail_fenced_total") + .increment(fenced as u64); + warn!( + fenced, + batch = event_ids.len(), + "fail_trace_commit_events 
fenced by stale claim_token" + ); + } + } + Err(release_err) => { + error!( + ?release_err, + "fail_trace_commit_events also failed; lease recovery sweep will retry" + ); + } + } + return Err(e); + } + } if (claimed.len() as i64) < EVENT_BATCH_LIMIT { return Ok(()); @@ -147,18 +314,158 @@ pub(crate) async fn drain_once( } } -pub(crate) async fn run_sweeps(pool: &Pool) { - match PostgresClient::sweep_awaiting_trace_timeouts(pool, AWAITING_TRACE_TIMEOUT).await { - Ok(result) if result.rows_affected() > 0 => warn!( - timed_out = result.rows_affected(), - "TraceArrivalTimeout sweep failed rows" - ), - Ok(_) => {} - Err(e) => warn!("timeout sweep failed: {:?}", e), +pub(crate) async fn reconcile_lost_events( + pool: &Pool, + trace_query: &TraceQueries, + reconcile_after: chrono::Duration, + lookback: chrono::Duration, + trace_arrival_timeout: chrono::Duration, +) -> Result { + let batch_limit = env_i64("SCOUTER_INBOX_RECONCILE_BATCH", RECONCILE_BATCH_LIMIT); + let stuck = + PostgresClient::list_awaiting_record_uids(pool, reconcile_after, batch_limit).await?; + if stuck.is_empty() { + return Ok(0); + } + + let confirmed = trace_query + .find_anchor_spans_for_records(&stuck, lookback, trace_arrival_timeout) + .await + .map_err(|e| DriftError::RunTimeError(e.to_string()))?; + if confirmed.is_empty() { + return Ok(0); + } + + let result = PostgresClient::insert_trace_commit_events(pool, &confirmed).await?; + let inserted = result.rows_affected() as usize; + let duplicate = confirmed.len().saturating_sub(inserted); + if duplicate > 0 { + metrics::counter!("scouter_trace_commit_event_duplicate_total").increment(duplicate as u64); + } + metrics::counter!("scouter_trace_commit_event_reconciled_total").increment(inserted as u64); + info!( + recovered = inserted, + scanned = stuck.len(), + "reconciliation sweep recovered lost anchor events" + ); + Ok(inserted) +} + +pub async fn run_sweeps( + pool: Pool, + trace_query: Arc, + cfg: InboxSweepConfig, + mut shutdown: 
watch::Receiver<()>, +) { + let mut timeout_tick = tokio::time::interval(cfg.timeout_interval); + let mut prune_tick = tokio::time::interval(cfg.prune_interval); + let mut reconcile_tick = tokio::time::interval(cfg.reconcile_interval); + let mut lease_tick = tokio::time::interval(cfg.lease_recovery_interval); + + loop { + tokio::select! { + _ = shutdown.changed() => break, + _ = timeout_tick.tick() => { + match PostgresClient::sweep_awaiting_trace_timeouts(&pool, cfg.trace_arrival_timeout).await { + Ok(result) if result.rows_affected() > 0 => warn!( + timed_out = result.rows_affected(), + "TraceArrivalTimeout sweep failed rows" + ), + Ok(_) => {} + Err(e) => warn!("timeout sweep failed: {:?}", e), + } + } + _ = prune_tick.tick() => { + if let Err(e) = PostgresClient::prune_processed_events(&pool, cfg.processed_retention).await { + warn!("prune sweep failed: {:?}", e); + } + } + _ = reconcile_tick.tick() => { + if let Err(e) = reconcile_lost_events( + &pool, + &trace_query, + cfg.reconcile_after, + cfg.reconcile_lookback, + cfg.trace_arrival_timeout, + ).await { + warn!("reconciliation sweep failed: {:?}", e); + } + } + _ = lease_tick.tick() => { + refresh_queue_gauges(&pool).await; + match PostgresClient::recover_stale_processing(&pool, cfg.lease_ttl, cfg.max_attempts).await { + Ok(rows) => { + let returned = rows.iter().filter(|(_, status)| status == "pending").count(); + let dead_lettered = rows + .iter() + .filter(|(_, status)| status == "dead_lettered") + .count(); + metrics::counter!("scouter_trace_commit_event_lease_recovered_total") + .increment(returned as u64); + metrics::counter!("scouter_trace_commit_event_dead_lettered_total") + .increment(dead_lettered as u64); + } + Err(e) => warn!("recover_stale_processing failed: {:?}", e), + } + } + } + } +} + +#[cfg(any(test, feature = "test-helpers"))] +pub(crate) async fn run_basic_sweeps_once(pool: &Pool) { + if let Err(e) = + PostgresClient::sweep_awaiting_trace_timeouts(pool, AWAITING_TRACE_TIMEOUT).await + { 
+ warn!("timeout sweep failed: {:?}", e); } if let Err(e) = PostgresClient::prune_processed_events(pool, TRACE_COMMIT_EVENT_RETENTION).await { warn!("prune sweep failed: {:?}", e); } + + match PostgresClient::recover_stale_processing(pool, chrono::Duration::minutes(2), 5).await { + Ok(rows) => { + let returned = rows + .iter() + .filter(|(_, status)| status == "pending") + .count(); + let dead_lettered = rows + .iter() + .filter(|(_, status)| status == "dead_lettered") + .count(); + metrics::counter!("scouter_trace_commit_event_lease_recovered_total") + .increment(returned as u64); + metrics::counter!("scouter_trace_commit_event_dead_lettered_total") + .increment(dead_lettered as u64); + } + Err(e) => warn!("recover_stale_processing failed: {:?}", e), + } +} + +async fn refresh_queue_gauges(pool: &Pool) { + let row = sqlx::query( + r#" + SELECT count(*)::bigint AS pending_count, + COALESCE(EXTRACT(EPOCH FROM now() - min(created_at)), 0)::double precision + AS oldest_age_seconds + FROM scouter.trace_commit_event + WHERE status = 'pending' + "#, + ) + .fetch_one(pool) + .await; + + match row { + Ok(row) => { + use sqlx::Row; + let pending_count: i64 = row.try_get("pending_count").unwrap_or(0); + let oldest_age_seconds: f64 = row.try_get("oldest_age_seconds").unwrap_or(0.0); + metrics::gauge!("scouter_trace_commit_event_pending_count").set(pending_count as f64); + metrics::gauge!("scouter_trace_commit_event_oldest_pending_age_seconds") + .set(oldest_age_seconds); + } + Err(e) => warn!("trace commit queue gauge refresh failed: {:?}", e), + } } diff --git a/crates/scouter_drift/src/genai/mod.rs b/crates/scouter_drift/src/genai/mod.rs index 699f60646..2eaa8e214 100644 --- a/crates/scouter_drift/src/genai/mod.rs +++ b/crates/scouter_drift/src/genai/mod.rs @@ -16,15 +16,38 @@ pub use poller::AgentPoller; #[cfg(any(test, feature = "test-helpers"))] pub mod test_helpers { use crate::error::DriftError; + use scouter_dataframe::parquet::tracing::queries::TraceQueries; use 
sqlx::{Pool, Postgres}; use super::inbox; pub async fn drain_once(pool: &Pool) -> Result<(), DriftError> { - inbox::drain_once(pool, chrono::Duration::zero()).await + inbox::drain_once(pool, chrono::Duration::zero(), "test-worker", 5).await } pub async fn run_sweeps(pool: &Pool) { - inbox::run_sweeps(pool).await; + inbox::run_basic_sweeps_once(pool).await; + } + + pub async fn reconcile_lost_events( + pool: &Pool, + trace_query: &TraceQueries, + ) -> Result { + reconcile_lost_events_with_lookback(pool, trace_query, chrono::Duration::days(1)).await + } + + pub async fn reconcile_lost_events_with_lookback( + pool: &Pool, + trace_query: &TraceQueries, + lookback: chrono::Duration, + ) -> Result { + inbox::reconcile_lost_events( + pool, + trace_query, + chrono::Duration::zero(), + lookback, + chrono::Duration::minutes(5), + ) + .await } } diff --git a/crates/scouter_server/src/api/polling/agent_poller.rs b/crates/scouter_server/src/api/polling/agent_poller.rs index 0dcdf9037..75a7ec87a 100644 --- a/crates/scouter_server/src/api/polling/agent_poller.rs +++ b/crates/scouter_server/src/api/polling/agent_poller.rs @@ -1,10 +1,12 @@ // Module to process GenAI drift record tasks use crate::api::error::ServerError; +use scouter_dataframe::parquet::tracing::queries::TraceQueries; use scouter_drift::genai::AgentPoller; use scouter_settings::polling::AgentPollerSettings; -use scouter_types::TraceId; +use scouter_types::TraceCommitAnchor; use sqlx::{Pool, Postgres}; use std::future::Future; +use std::sync::Arc; use tokio::sync::{mpsc, watch}; use tokio::task::JoinHandle; use tracing::{Instrument, Level, debug, error, info, span}; @@ -17,7 +19,8 @@ impl BackgroundAgentDriftManager { pub async fn start_workers( db_pool: &Pool, poll_settings: &AgentPollerSettings, - commit_rx: mpsc::Receiver>, + commit_rx: mpsc::Receiver>, + trace_query: Arc, shutdown_rx: watch::Receiver<()>, ) -> Result<(), ServerError> { let num_workers = poll_settings.genai_workers; @@ -62,6 +65,7 @@ impl 
BackgroundAgentDriftManager { scouter_drift::genai::inbox::run_trace_commit_event_worker_loop( db_pool.clone(), poll_settings.trace_visibility_buffer, + trace_query, shutdown_rx.clone(), ), shutdown_rx.clone(), diff --git a/crates/scouter_server/src/api/setup.rs b/crates/scouter_server/src/api/setup.rs index 1ac5264e6..f46cdd1fd 100644 --- a/crates/scouter_server/src/api/setup.rs +++ b/crates/scouter_server/src/api/setup.rs @@ -46,14 +46,14 @@ use scouter_events::consumer::redis::RedisConsumerManager; use crate::api::task_manager::TaskManager; use scouter_events::consumer::http::consumer::MessageConsumerManager; use scouter_settings::events::HttpConsumerSettings; -use scouter_types::{ServerRecords, TagRecord, TraceId, TraceServerRecord}; +use scouter_types::{ServerRecords, TagRecord, TraceCommitAnchor, TraceServerRecord}; type TraceServices = ( Arc, Arc, Arc, Arc, - tokio::sync::mpsc::Receiver>, + tokio::sync::mpsc::Receiver>, ); pub struct ScouterSetupComponents { @@ -197,6 +197,7 @@ impl ScouterSetupComponents { &db_pool, &config.genai_polling_settings, commit_rx, + Arc::new(trace_service.query_service.clone()), tokio_shutdown_rx.clone(), ) .await?; @@ -249,7 +250,7 @@ impl ScouterSetupComponents { let refresh_secs = config.storage_settings.trace_refresh_interval_secs; let retention_days = Some(config.database_settings.trace_retention_period as u32); - let (commit_tx, commit_rx) = tokio::sync::mpsc::channel::>(1024); + let (commit_tx, commit_rx) = tokio::sync::mpsc::channel::>(1024); let trace_service = init_trace_span_service( &config.storage_settings, compaction_hours, @@ -610,11 +611,18 @@ impl ScouterSetupComponents { async fn setup_background_genai_drift_workers( db_pool: &Pool, poll_settings: &AgentPollerSettings, - commit_rx: tokio::sync::mpsc::Receiver>, + commit_rx: tokio::sync::mpsc::Receiver>, + trace_query: Arc, shutdown_rx: tokio::sync::watch::Receiver<()>, ) -> AnyhowResult<()> { - BackgroundAgentDriftManager::start_workers(db_pool, poll_settings, 
commit_rx, shutdown_rx) - .await?; + BackgroundAgentDriftManager::start_workers( + db_pool, + poll_settings, + commit_rx, + trace_query, + shutdown_rx, + ) + .await?; info!("✅ Started background genai workers"); Ok(()) diff --git a/crates/scouter_server/tests/api/agent_trace_inbox.rs b/crates/scouter_server/tests/api/agent_trace_inbox.rs index cf12ac828..09f76ef23 100644 --- a/crates/scouter_server/tests/api/agent_trace_inbox.rs +++ b/crates/scouter_server/tests/api/agent_trace_inbox.rs @@ -11,7 +11,8 @@ use scouter_types::agent::{ }; use scouter_types::{ Attribute, BoxedEvalRecord, DriftType, EvalRecord, MessageRecord, SCOUTER_EVAL_PROFILE_UID, - SCOUTER_EVAL_RECORD_UID, ServerRecord, ServerRecords, SpanId, Status, TraceId, + SCOUTER_EVAL_RECORD_UID, ServerRecord, ServerRecords, SpanId, Status, TraceCommitAnchor, + TraceId, }; use serde_json::{Value, json}; use sqlx::{Pool, Postgres}; @@ -198,10 +199,37 @@ async fn wait_for_status(pool: &Pool, uid: &str, status: &str) -> Valu } } -async fn wait_for_event(pool: &Pool, trace_id: TraceId) { +fn anchor( + trace_id: TraceId, + span_id: SpanId, + record_uid: &str, + profile_uid: &str, +) -> TraceCommitAnchor { + TraceCommitAnchor { + trace_id, + span_id, + record_uid: record_uid.to_string(), + profile_uid: profile_uid.to_string(), + } +} + +fn stamp_anchor(span: &mut scouter_types::TraceSpanRecord, record_uid: &str, profile_uid: &str) { + span.attributes.extend([ + Attribute { + key: SCOUTER_EVAL_RECORD_UID.to_string(), + value: json!(record_uid), + }, + Attribute { + key: SCOUTER_EVAL_PROFILE_UID.to_string(), + value: json!(profile_uid), + }, + ]); +} + +async fn wait_for_event(pool: &Pool, anchor: &TraceCommitAnchor) { let deadline = tokio::time::Instant::now() + Duration::from_secs(5); loop { - if PostgresClient::trace_commit_event_exists(pool, &trace_id) + if PostgresClient::trace_commit_event_exists(pool, anchor) .await .unwrap() { @@ -243,11 +271,11 @@ async fn insert_awaiting_record(pool: &Pool, uid: &str, 
trace_id: Trac .unwrap(); } -async fn processed_event_count(pool: &Pool, trace_id: TraceId) -> i64 { +async fn processed_event_count(pool: &Pool, record_uid: &str) -> i64 { sqlx::query_scalar( - "SELECT count(*) FROM scouter.trace_commit_event WHERE trace_id = $1 AND processed_at IS NOT NULL", + "SELECT count(*) FROM scouter.trace_commit_event WHERE record_uid = $1 AND status = 'processed'", ) - .bind(trace_id.as_bytes().to_vec()) + .bind(record_uid) .fetch_one(pool) .await .unwrap() @@ -264,8 +292,9 @@ async fn test_agent_trace_inbox_end_to_end_paths() { .await; // 1. Forward race: eval awaits trace, Delta commit emits inbox event, worker flips pending. - let (trace_a, spans_a, _) = generate_trace_with_spans(2, 0); + let (trace_a, mut spans_a, _) = generate_trace_with_spans(2, 0); let forward_uid = create_uuid7(); + stamp_anchor(&mut spans_a[0], &forward_uid, &trace_profile_uid); insert_message( &helper, eval_message( @@ -283,20 +312,24 @@ async fn test_agent_trace_inbox_end_to_end_paths() { .await .unwrap(); wait_for_status(&helper.pool, &forward_uid, "pending").await; - assert_eq!( - processed_event_count(&helper.pool, trace_a.trace_id).await, - 1 - ); + assert_eq!(processed_event_count(&helper.pool, &forward_uid).await, 1); // 2. Reverse race: committed trace is cached in inbox, so eval inserts as pending immediately. 
- let (trace_b, spans_b, _) = generate_trace_with_spans(2, 0); + let (trace_b, mut spans_b, _) = generate_trace_with_spans(2, 0); + let reverse_uid = create_uuid7(); + stamp_anchor(&mut spans_b[0], &reverse_uid, &trace_profile_uid); + let reverse_anchor = anchor( + trace_b.trace_id, + spans_b[0].span_id.clone(), + &reverse_uid, + &trace_profile_uid, + ); helper .trace_service .write_spans_direct(spans_b.clone()) .await .unwrap(); - wait_for_event(&helper.pool, trace_b.trace_id).await; - let reverse_uid = create_uuid7(); + wait_for_event(&helper.pool, &reverse_anchor).await; insert_message( &helper, eval_message( @@ -325,15 +358,23 @@ async fn test_agent_trace_inbox_end_to_end_paths() { // 3. Crash recovery simulation: durable event + awaiting eval drains to pending. let crash_trace = TraceId::from_bytes([0x55; 16]); let crash_uid = create_uuid7(); - PostgresClient::insert_trace_commit_events(&helper.pool, &[crash_trace]) - .await - .unwrap(); + PostgresClient::insert_trace_commit_events( + &helper.pool, + &[anchor( + crash_trace, + SpanId::from_bytes([0x11; 8]), + &crash_uid, + &trace_profile_uid, + )], + ) + .await + .unwrap(); insert_awaiting_record(&helper.pool, &crash_uid, crash_trace).await; scouter_drift::genai::test_helpers::drain_once(&helper.pool) .await .unwrap(); wait_for_status(&helper.pool, &crash_uid, "pending").await; - assert_eq!(processed_event_count(&helper.pool, crash_trace).await, 1); + assert_eq!(processed_event_count(&helper.pool, &crash_uid).await, 1); // 4. Content-only profile does not need a trace. let content_uid = create_uuid7(); @@ -369,38 +410,67 @@ async fn test_agent_trace_inbox_end_to_end_paths() { assert_eq!(context["error"], "TraceArrivalTimeout"); // 7. Processed inbox prune keeps recent and unprocessed rows. 
+ let prune_old = anchor( + TraceId::from_bytes([0x77; 16]), + SpanId::from_bytes([0x77; 8]), + "prune-old", + "profile", + ); + let prune_fresh = anchor( + TraceId::from_bytes([0x78; 16]), + SpanId::from_bytes([0x78; 8]), + "prune-fresh", + "profile", + ); + let prune_pending = anchor( + TraceId::from_bytes([0x79; 16]), + SpanId::from_bytes([0x79; 8]), + "prune-pending", + "profile", + ); sqlx::query( - "INSERT INTO scouter.trace_commit_event (trace_id, processed_at) VALUES ($1, now() - interval '25 hours')", + "INSERT INTO scouter.trace_commit_event (trace_id, span_id, record_uid, profile_uid, status, processed_at) VALUES ($1, $2, $3, $4, 'processed', now() - interval '25 hours')", ) - .bind(TraceId::from_bytes([0x77; 16]).as_bytes().to_vec()) + .bind(prune_old.trace_id.as_bytes().to_vec()) + .bind(prune_old.span_id.as_bytes().to_vec()) + .bind(&prune_old.record_uid) + .bind(&prune_old.profile_uid) .execute(&helper.pool) .await .unwrap(); sqlx::query( - "INSERT INTO scouter.trace_commit_event (trace_id, processed_at) VALUES ($1, now() - interval '1 hour')", + "INSERT INTO scouter.trace_commit_event (trace_id, span_id, record_uid, profile_uid, status, processed_at) VALUES ($1, $2, $3, $4, 'processed', now() - interval '1 hour')", ) - .bind(TraceId::from_bytes([0x78; 16]).as_bytes().to_vec()) + .bind(prune_fresh.trace_id.as_bytes().to_vec()) + .bind(prune_fresh.span_id.as_bytes().to_vec()) + .bind(&prune_fresh.record_uid) + .bind(&prune_fresh.profile_uid) .execute(&helper.pool) .await .unwrap(); - sqlx::query("INSERT INTO scouter.trace_commit_event (trace_id) VALUES ($1)") - .bind(TraceId::from_bytes([0x79; 16]).as_bytes().to_vec()) + sqlx::query( + "INSERT INTO scouter.trace_commit_event (trace_id, span_id, record_uid, profile_uid) VALUES ($1, $2, $3, $4)", + ) + .bind(prune_pending.trace_id.as_bytes().to_vec()) + .bind(prune_pending.span_id.as_bytes().to_vec()) + .bind(&prune_pending.record_uid) + .bind(&prune_pending.profile_uid) .execute(&helper.pool) .await 
.unwrap(); scouter_drift::genai::test_helpers::run_sweeps(&helper.pool).await; assert!( - !PostgresClient::trace_commit_event_exists(&helper.pool, &TraceId::from_bytes([0x77; 16])) + !PostgresClient::trace_commit_event_exists(&helper.pool, &prune_old) .await .unwrap() ); assert!( - PostgresClient::trace_commit_event_exists(&helper.pool, &TraceId::from_bytes([0x78; 16])) + PostgresClient::trace_commit_event_exists(&helper.pool, &prune_fresh) .await .unwrap() ); assert!( - PostgresClient::trace_commit_event_exists(&helper.pool, &TraceId::from_bytes([0x79; 16])) + PostgresClient::trace_commit_event_exists(&helper.pool, &prune_pending) .await .unwrap() ); @@ -461,14 +531,22 @@ async fn test_agent_trace_inbox_end_to_end_paths() { assert_eq!(context["error"], "TraceArrivalTimeout"); // 10. Multi-pod claim concurrency: two drains process one shared inbox without double-work. - let mut trace_ids = Vec::new(); + let mut anchors = Vec::new(); + let mut record_uids = Vec::new(); for offset in 0u8..100 { let byte = 0xA0u8.wrapping_add(offset); let trace_id = TraceId::from_bytes([byte; 16]); - trace_ids.push(trace_id); - insert_awaiting_record(&helper.pool, &format!("concurrent-{byte}"), trace_id).await; + let record_uid = format!("concurrent-{byte}"); + anchors.push(anchor( + trace_id, + SpanId::from_bytes([0x11; 8]), + &record_uid, + &trace_profile_uid, + )); + record_uids.push(record_uid.clone()); + insert_awaiting_record(&helper.pool, &record_uid, trace_id).await; } - PostgresClient::insert_trace_commit_events(&helper.pool, &trace_ids) + PostgresClient::insert_trace_commit_events(&helper.pool, &anchors) .await .unwrap(); let (left, right) = tokio::join!( @@ -485,16 +563,211 @@ async fn test_agent_trace_inbox_end_to_end_paths() { .unwrap(); assert_eq!(pending, 100); let processed: i64 = sqlx::query_scalar( - "SELECT count(*) FROM scouter.trace_commit_event WHERE trace_id = ANY($1) AND processed_at IS NOT NULL", - ) - .bind( - trace_ids - .iter() - .map(|trace_id| 
trace_id.as_bytes().to_vec()) - .collect::>(), + "SELECT count(*) FROM scouter.trace_commit_event WHERE record_uid = ANY($1::text[]) AND status = 'processed'", ) + .bind(record_uids) .fetch_one(&helper.pool) .await .unwrap(); assert_eq!(processed, 100); } + +#[tokio::test] +async fn test_anchor_span_arriving_late_does_not_flip_eval() { + let helper = setup_test().await; + let trace_profile_uid = helper + .register_drift_profile(trace_profile().await.create_profile_request().unwrap()) + .await; + + let (trace, mut spans, _) = generate_trace_with_spans(2, 0); + let record_uid = create_uuid7(); + let anchor_span_id = spans[1].span_id.clone(); + stamp_anchor(&mut spans[1], &record_uid, &trace_profile_uid); + + insert_message( + &helper, + eval_message( + &trace_profile_uid, + Some(trace.trace_id), + Some(anchor_span_id), + &record_uid, + ), + ) + .await; + wait_for_status(&helper.pool, &record_uid, "awaiting_trace").await; + + helper + .trace_service + .write_spans_direct(vec![spans[0].clone()]) + .await + .unwrap(); + tokio::time::sleep(Duration::from_secs(2)).await; + + let status: String = + sqlx::query_scalar("SELECT status FROM scouter.agent_eval_record WHERE uid = $1") + .bind(&record_uid) + .fetch_one(&helper.pool) + .await + .unwrap(); + assert_eq!(status, "awaiting_trace"); + + let inbox_count: i64 = + sqlx::query_scalar("SELECT count(*) FROM scouter.trace_commit_event WHERE record_uid = $1") + .bind(&record_uid) + .fetch_one(&helper.pool) + .await + .unwrap(); + assert_eq!(inbox_count, 0); + + helper + .trace_service + .write_spans_direct(vec![spans[1].clone()]) + .await + .unwrap(); + wait_for_status(&helper.pool, &record_uid, "pending").await; +} + +#[tokio::test] +async fn test_reconciliation_recovers_lost_anchor_events() { + let helper = setup_test().await; + let trace_profile_uid = helper + .register_drift_profile(trace_profile().await.create_profile_request().unwrap()) + .await; + let entity_id = PostgresClient::get_entity_id_from_uid(&helper.pool, 
&trace_profile_uid) + .await + .unwrap(); + + let (trace, mut spans, _) = generate_trace_with_spans(1, 0); + let record_uid = create_uuid7(); + let span_id = spans[0].span_id.clone(); + stamp_anchor(&mut spans[0], &record_uid, &trace_profile_uid); + let event_anchor = anchor( + trace.trace_id, + span_id.clone(), + &record_uid, + &trace_profile_uid, + ); + + helper + .trace_service + .write_spans_direct(spans) + .await + .unwrap(); + wait_for_event(&helper.pool, &event_anchor).await; + + sqlx::query("DELETE FROM scouter.trace_commit_event WHERE record_uid = $1") + .bind(&record_uid) + .execute(&helper.pool) + .await + .unwrap(); + + let record = EvalRecord { + created_at: Utc::now(), + uid: record_uid.clone(), + entity_id, + context: json!({"input": "hello"}), + trace_id: Some(trace.trace_id), + span_id: Some(span_id), + ..Default::default() + }; + PostgresClient::insert_agent_eval_record( + &helper.pool, + BoxedEvalRecord::new(record), + &entity_id, + Status::AwaitingTrace, + ) + .await + .unwrap(); + + let recovered = scouter_drift::genai::test_helpers::reconcile_lost_events( + &helper.pool, + &helper.trace_service.query_service, + ) + .await + .unwrap(); + assert_eq!(recovered, 1); + + scouter_drift::genai::test_helpers::drain_once(&helper.pool) + .await + .unwrap(); + wait_for_status(&helper.pool, &record_uid, "pending").await; +} + +#[tokio::test] +async fn test_reconciliation_window_supports_long_running_anchor_spans() { + let helper = setup_test().await; + let trace_profile_uid = helper + .register_drift_profile(trace_profile().await.create_profile_request().unwrap()) + .await; + let entity_id = PostgresClient::get_entity_id_from_uid(&helper.pool, &trace_profile_uid) + .await + .unwrap(); + + let (trace, mut spans, _) = generate_trace_with_spans(1, 0); + let record_uid = create_uuid7(); + let span_id = spans[0].span_id.clone(); + stamp_anchor(&mut spans[0], &record_uid, &trace_profile_uid); + let event_anchor = anchor( + trace.trace_id, + span_id.clone(), + 
&record_uid, + &trace_profile_uid, + ); + let now = Utc::now(); + spans[0].start_time = now - chrono::Duration::days(2); + spans[0].end_time = now; + spans[0].duration_ms = (spans[0].end_time - spans[0].start_time).num_milliseconds(); + + helper + .trace_service + .write_spans_direct(spans) + .await + .unwrap(); + wait_for_event(&helper.pool, &event_anchor).await; + sqlx::query("DELETE FROM scouter.trace_commit_event WHERE record_uid = $1") + .bind(&record_uid) + .execute(&helper.pool) + .await + .unwrap(); + + let record = EvalRecord { + created_at: now, + uid: record_uid.clone(), + entity_id, + context: json!({"input": "hello"}), + trace_id: Some(trace.trace_id), + span_id: Some(span_id), + ..Default::default() + }; + PostgresClient::insert_agent_eval_record( + &helper.pool, + BoxedEvalRecord::new(record), + &entity_id, + Status::AwaitingTrace, + ) + .await + .unwrap(); + + let narrow = scouter_drift::genai::test_helpers::reconcile_lost_events_with_lookback( + &helper.pool, + &helper.trace_service.query_service, + chrono::Duration::seconds(60), + ) + .await + .unwrap(); + assert_eq!(narrow, 0); + + let recovered = scouter_drift::genai::test_helpers::reconcile_lost_events_with_lookback( + &helper.pool, + &helper.trace_service.query_service, + chrono::Duration::days(3), + ) + .await + .unwrap(); + assert_eq!(recovered, 1); + + scouter_drift::genai::test_helpers::drain_once(&helper.pool) + .await + .unwrap(); + wait_for_status(&helper.pool, &record_uid, "pending").await; +} diff --git a/crates/scouter_server/tests/common/mod.rs b/crates/scouter_server/tests/common/mod.rs index e719c3da6..978821e7f 100644 --- a/crates/scouter_server/tests/common/mod.rs +++ b/crates/scouter_server/tests/common/mod.rs @@ -187,7 +187,8 @@ impl TestHelper { env::set_var("DATA_RETENTION_PERIOD", "5"); std::env::set_var("OPENAI_API_KEY", "test_key"); std::env::set_var("SCOUTER_DATASET_FLUSH_INTERVAL_SECS", "1"); - std::env::set_var("SCOUTER_TRACE_VISIBILITY_BUFFER_SECS", "0"); + 
std::env::set_var("SCOUTER_TRACE_REFRESH_INTERVAL_SECS", "1"); + std::env::set_var("SCOUTER_TRACE_VISIBILITY_BUFFER_SECS", "3"); } if enable_kafka { diff --git a/crates/scouter_settings/src/polling.rs b/crates/scouter_settings/src/polling.rs index 68c1e7036..470c26aac 100644 --- a/crates/scouter_settings/src/polling.rs +++ b/crates/scouter_settings/src/polling.rs @@ -2,6 +2,8 @@ use chrono::Duration; use serde::Deserialize; use serde::Serialize; +use crate::storage::trace_refresh_interval_secs_from_env; + #[derive(Debug, Clone, Serialize)] pub struct PollingSettings { pub num_workers: usize, @@ -85,18 +87,57 @@ impl Default for AgentPollerSettings { impl AgentPollerSettings { pub fn trace_visibility_buffer() -> Duration { - let default_secs = std::env::var("SCOUTER_TRACE_REFRESH_INTERVAL_SECS") + let refresh_secs = trace_refresh_interval_secs_from_env() as i64; + let min_floor = refresh_secs + 2; + + let configured = std::env::var("SCOUTER_TRACE_VISIBILITY_BUFFER_SECS") .ok() .and_then(|v| v.parse::().ok()) - .unwrap_or(10) - + 2; - - Duration::seconds( - std::env::var("SCOUTER_TRACE_VISIBILITY_BUFFER_SECS") - .ok() - .and_then(|v| v.parse::().ok()) - .unwrap_or(default_secs) - .max(0), - ) + .unwrap_or(min_floor) + .max(0); + + if configured < min_floor { + panic!( + "SCOUTER_TRACE_VISIBILITY_BUFFER_SECS={} is below the safe minimum {} \ + (SCOUTER_TRACE_REFRESH_INTERVAL_SECS + 2). 
A smaller buffer can cause \ + the eval poller to fetch spans before the local Delta snapshot sees the anchor.", + configured, min_floor + ); + } + + Duration::seconds(configured) + } +} + +#[cfg(test)] +mod tests { + use super::AgentPollerSettings; + + #[test] + fn trace_visibility_buffer_floor() { + unsafe { + std::env::set_var("SCOUTER_TRACE_REFRESH_INTERVAL_SECS", "10"); + std::env::set_var("SCOUTER_TRACE_VISIBILITY_BUFFER_SECS", "1"); + } + let result = std::panic::catch_unwind(AgentPollerSettings::trace_visibility_buffer); + unsafe { + std::env::remove_var("SCOUTER_TRACE_REFRESH_INTERVAL_SECS"); + std::env::remove_var("SCOUTER_TRACE_VISIBILITY_BUFFER_SECS"); + } + assert!(result.is_err()); + } + + #[test] + fn trace_visibility_buffer_ignores_negative_refresh_like_storage_settings() { + unsafe { + std::env::set_var("SCOUTER_TRACE_REFRESH_INTERVAL_SECS", "-10"); + std::env::set_var("SCOUTER_TRACE_VISIBILITY_BUFFER_SECS", "1"); + } + let result = std::panic::catch_unwind(AgentPollerSettings::trace_visibility_buffer); + unsafe { + std::env::remove_var("SCOUTER_TRACE_REFRESH_INTERVAL_SECS"); + std::env::remove_var("SCOUTER_TRACE_VISIBILITY_BUFFER_SECS"); + } + assert!(result.is_err()); } } diff --git a/crates/scouter_settings/src/storage.rs b/crates/scouter_settings/src/storage.rs index 91239790b..dadade0b6 100644 --- a/crates/scouter_settings/src/storage.rs +++ b/crates/scouter_settings/src/storage.rs @@ -3,6 +3,13 @@ use scouter_types::StorageType; use serde::Serialize; use std::path::PathBuf; +pub fn trace_refresh_interval_secs_from_env() -> u64 { + std::env::var("SCOUTER_TRACE_REFRESH_INTERVAL_SECS") + .ok() + .and_then(|v| v.parse::().ok()) + .unwrap_or(10) +} + #[derive(Debug, Clone, Serialize)] pub struct ObjectStorageSettings { pub storage_uri: String, @@ -42,10 +49,7 @@ impl Default for ObjectStorageSettings { .and_then(|v| v.parse().ok()) .unwrap_or(5u64); - let trace_refresh_interval_secs = std::env::var("SCOUTER_TRACE_REFRESH_INTERVAL_SECS") - .ok() - 
.and_then(|v| v.parse().ok()) - .unwrap_or(10u64); + let trace_refresh_interval_secs = trace_refresh_interval_secs_from_env(); Self { storage_uri, diff --git a/crates/scouter_sql/src/migrations/20260512000000_trace_commit_anchor_queue.sql b/crates/scouter_sql/src/migrations/20260512000000_trace_commit_anchor_queue.sql new file mode 100644 index 000000000..295cb6475 --- /dev/null +++ b/crates/scouter_sql/src/migrations/20260512000000_trace_commit_anchor_queue.sql @@ -0,0 +1,77 @@ +-- Forward-only migration: rebuild scouter.trace_commit_event from a trace-level +-- inbox into an anchor-level durable queue / audit table. +-- +-- The 20260509000000 migration shipped a trace-level inbox keyed only on trace_id. +-- That schema cannot represent the per-span anchor contract this change requires. +-- Deploy this migration during downtime with the server stopped. + +DROP TABLE IF EXISTS scouter.trace_commit_event; + +CREATE TABLE scouter.trace_commit_event ( + id BIGSERIAL PRIMARY KEY, + + trace_id BYTEA NOT NULL CHECK (octet_length(trace_id) = 16), + span_id BYTEA NOT NULL CHECK (octet_length(span_id) = 8), + record_uid TEXT NOT NULL, + profile_uid TEXT NOT NULL, + + status TEXT NOT NULL DEFAULT 'pending', + attempt_count INT NOT NULL DEFAULT 0, + claimed_at TIMESTAMPTZ, + claimed_by TEXT, + claim_token UUID, + processed_at TIMESTAMPTZ, + last_error TEXT, + + created_at TIMESTAMPTZ NOT NULL DEFAULT now(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT now(), + + CONSTRAINT trace_commit_event_record_span_unique UNIQUE (record_uid, span_id), + CONSTRAINT chk_trace_commit_event_record_uid + CHECK ( + char_length(record_uid) BETWEEN 1 AND 128 + AND record_uid ~ '^[A-Za-z0-9_.:-]+$' + ), + CONSTRAINT chk_trace_commit_event_profile_uid + CHECK ( + char_length(profile_uid) BETWEEN 1 AND 128 + AND profile_uid ~ '^[A-Za-z0-9_.:-]+$' + ), + CONSTRAINT chk_trace_commit_event_status + CHECK (status IN ('pending', 'processing', 'processed', 'dead_lettered')) +); + +DROP INDEX IF EXISTS 
scouter.idx_agent_eval_record_awaiting_trace; + +CREATE INDEX IF NOT EXISTS idx_agent_eval_record_awaiting_trace_uid + ON scouter.agent_eval_record (uid) + WHERE status = 'awaiting_trace'; + +CREATE INDEX IF NOT EXISTS idx_agent_eval_record_awaiting_trace_reconcile + ON scouter.agent_eval_record (created_at, uid) + WHERE status = 'awaiting_trace' + AND trace_id IS NOT NULL + AND span_id IS NOT NULL; + +CREATE INDEX IF NOT EXISTS idx_agent_eval_record_awaiting_trace_timeout + ON scouter.agent_eval_record (created_at) + WHERE status = 'awaiting_trace'; + +CREATE INDEX IF NOT EXISTS idx_trace_commit_event_pending + ON scouter.trace_commit_event (id) + WHERE status = 'pending'; + +CREATE INDEX IF NOT EXISTS idx_trace_commit_event_record_uid + ON scouter.trace_commit_event (record_uid); + +CREATE INDEX IF NOT EXISTS idx_trace_commit_event_processing + ON scouter.trace_commit_event (claimed_at) + WHERE status = 'processing'; + +CREATE INDEX IF NOT EXISTS idx_trace_commit_event_processed_at + ON scouter.trace_commit_event (processed_at) + WHERE status = 'processed'; + +CREATE INDEX IF NOT EXISTS idx_trace_commit_event_dead_lettered + ON scouter.trace_commit_event (created_at) + WHERE status = 'dead_lettered'; diff --git a/crates/scouter_sql/src/sql/postgres.rs b/crates/scouter_sql/src/sql/postgres.rs index 1cfcf8756..6e7ee08ce 100644 --- a/crates/scouter_sql/src/sql/postgres.rs +++ b/crates/scouter_sql/src/sql/postgres.rs @@ -11,7 +11,8 @@ use chrono::Duration as ChronoDuration; use scouter_settings::DatabaseSettings; use scouter_types::agent::profile::AgentEvalProfile; use scouter_types::{ - RecordType, ServerRecords, Status, TagRecord, ToDriftRecords, TraceServerRecord, + RecordType, ServerRecords, Status, TagRecord, ToDriftRecords, TraceCommitAnchor, + TraceServerRecord, }; use sqlx::ConnectOptions; use sqlx::{Pool, Postgres, postgres::PgConnectOptions}; @@ -192,23 +193,38 @@ impl MessageHandler { (Status::Failed, ChronoDuration::zero()) } Some(trace_id) => { - // 
Trace anchors are minted by trusted Scouter runtime - // instrumentation. If future public APIs allow arbitrary - // tenant-supplied anchors, validate ownership at ingestion. - let already_committed = - PostgresClient::trace_commit_event_exists(pool, trace_id) - .await - .unwrap_or_else(|e| { - warn!( - "trace_commit_event_exists probe failed ({:?}); falling back to AwaitingTrace", - e - ); - false - }); - if already_committed { - (Status::Pending, trace_visibility_buffer) + if let Some(span_id) = record.record.span_id.as_ref() { + if let Some(anchor) = TraceCommitAnchor::new( + *trace_id, + span_id.clone(), + record.record.uid.clone(), + record.record.entity_uid.clone(), + ) { + let already_committed = + PostgresClient::trace_commit_event_exists(pool, &anchor) + .await + .unwrap_or_else(|e| { + warn!( + "trace_commit_event_exists probe failed ({:?}); falling back to AwaitingTrace", + e + ); + false + }); + + if already_committed { + (Status::Pending, trace_visibility_buffer) + } else { + (Status::AwaitingTrace, ChronoDuration::zero()) + } + } else { + record.record.set_failed_with_error("InvalidTraceAnchor"); + (Status::Failed, ChronoDuration::zero()) + } } else { - (Status::AwaitingTrace, ChronoDuration::zero()) + record + .record + .set_failed_with_error("EvalRequiresAnchorSpan"); + (Status::Failed, ChronoDuration::zero()) } } } @@ -354,6 +370,20 @@ mod tests { const VERSION: &str = "1.0.0"; const ENTITY_ID: i32 = 9999; + fn anchor( + trace_id: TraceId, + span_id_byte: u8, + record_uid: &str, + profile_uid: &str, + ) -> TraceCommitAnchor { + TraceCommitAnchor { + trace_id, + span_id: SpanId::from_bytes([span_id_byte; 8]), + record_uid: record_uid.to_string(), + profile_uid: profile_uid.to_string(), + } + } + pub async fn cleanup(pool: &Pool) { sqlx::raw_sql( r#" @@ -1244,34 +1274,37 @@ mod tests { let pool = db_pool().await; let trace_a = TraceId::from_bytes([0xAA; 16]); let trace_b = TraceId::from_bytes([0xBB; 16]); + let anchor_a = anchor(trace_a, 0x11, 
"awaiting-record", "profile-a"); + let anchor_b = anchor(trace_b, 0x22, "other-record", "profile-a"); let empty = PostgresClient::insert_trace_commit_events(&pool, &[]) .await .unwrap(); assert_eq!(empty.rows_affected(), 0); - let inserted = PostgresClient::insert_trace_commit_events(&pool, &[trace_a, trace_b]) - .await - .unwrap(); + let inserted = + PostgresClient::insert_trace_commit_events(&pool, &[anchor_a.clone(), anchor_b]) + .await + .unwrap(); assert_eq!(inserted.rows_affected(), 2); assert!( - PostgresClient::trace_commit_event_exists(&pool, &trace_a) + PostgresClient::trace_commit_event_exists(&pool, &anchor_a) .await .unwrap() ); + let missing_anchor = anchor(trace_a, 0x11, "missing-record", "profile-a"); assert!( - !PostgresClient::trace_commit_event_exists(&pool, &TraceId::from_bytes([0xCC; 16])) + !PostgresClient::trace_commit_event_exists(&pool, &missing_anchor) .await .unwrap() ); - let mut tx = pool.begin().await.unwrap(); - let claimed = PostgresClient::claim_trace_commit_events(&mut tx, 100) + let claimed = PostgresClient::claim_trace_commit_events(&pool, 100, "test-worker") .await .unwrap(); assert_eq!(claimed.len(), 2); - let event_ids: Vec = claimed.iter().map(|(id, _)| *id).collect(); - let trace_ids: Vec = claimed.iter().map(|(_, trace_id)| *trace_id).collect(); + let claim_token = claimed[0].claim_token; + let event_ids: Vec = claimed.iter().map(|claim| claim.id).collect(); let (_uid, entity_id) = PostgresClient::create_entity( &pool, @@ -1297,14 +1330,15 @@ mod tests { .await .unwrap(); - let flipped = PostgresClient::flip_awaiting_evals(&mut tx, &trace_ids, Duration::zero()) + let mut tx = pool.begin().await.unwrap(); + let completed = + PostgresClient::complete_trace_commit_events(&mut tx, &event_ids, claim_token) + .await + .unwrap(); + let flipped = PostgresClient::flip_awaiting_evals(&mut tx, &completed, Duration::zero()) .await .unwrap(); assert_eq!(flipped.rows_affected(), 1); - let marked = 
PostgresClient::mark_events_processed(&mut tx, &event_ids) - .await - .unwrap(); - assert_eq!(marked.rows_affected(), 2); tx.commit().await.unwrap(); let status: String = @@ -1316,7 +1350,7 @@ mod tests { assert_eq!(status, "pending"); let processed_count: i64 = sqlx::query_scalar( - "SELECT count(*) FROM scouter.trace_commit_event WHERE processed_at IS NOT NULL", + "SELECT count(*) FROM scouter.trace_commit_event WHERE status = 'processed'", ) .fetch_one(&pool) .await @@ -1324,37 +1358,192 @@ mod tests { assert_eq!(processed_count, 2); } + #[tokio::test] + async fn test_trace_commit_event_flip_requires_full_anchor_tuple() { + let pool = db_pool().await; + let trace_id = TraceId::from_bytes([0xAC; 16]); + let anchor = anchor(trace_id, 0x11, "tuple-record", "profile-a"); + PostgresClient::insert_trace_commit_events(&pool, &[anchor]) + .await + .unwrap(); + + let claimed = PostgresClient::claim_trace_commit_events(&pool, 1, "tuple-worker") + .await + .unwrap(); + let claim_token = claimed[0].claim_token; + let event_ids: Vec = claimed.iter().map(|claim| claim.id).collect(); + + let (_uid, entity_id) = PostgresClient::create_entity( + &pool, + SPACE, + "tuple-fence", + VERSION, + DriftType::Agent.to_string(), + ) + .await + .unwrap(); + let record = EvalRecord { + created_at: Utc::now(), + context: serde_json::json!({"input": "hello"}), + status: Status::AwaitingTrace, + uid: "tuple-record".to_string(), + entity_id, + trace_id: Some(trace_id), + span_id: Some(SpanId::from_bytes([0x22; 8])), + ..Default::default() + }; + PostgresClient::insert_agent_eval_record( + &pool, + BoxedEvalRecord::new(record), + &entity_id, + Status::AwaitingTrace, + ) + .await + .unwrap(); + + let mut tx = pool.begin().await.unwrap(); + let completed = + PostgresClient::complete_trace_commit_events(&mut tx, &event_ids, claim_token) + .await + .unwrap(); + let flipped = PostgresClient::flip_awaiting_evals(&mut tx, &completed, Duration::zero()) + .await + .unwrap(); + 
tx.commit().await.unwrap(); + assert_eq!(flipped.rows_affected(), 0); + + let status: String = + sqlx::query_scalar("SELECT status FROM scouter.agent_eval_record WHERE uid = $1") + .bind("tuple-record") + .fetch_one(&pool) + .await + .unwrap(); + assert_eq!(status, "awaiting_trace"); + } + + #[tokio::test] + async fn test_trace_commit_event_lease_fencing_and_dead_letter() { + let pool = db_pool().await; + let trace_id = TraceId::from_bytes([0xAD; 16]); + PostgresClient::insert_trace_commit_events( + &pool, + &[anchor(trace_id, 0x11, "lease-record", "profile-a")], + ) + .await + .unwrap(); + + let worker_a = PostgresClient::claim_trace_commit_events(&pool, 1, "worker-a") + .await + .unwrap(); + let event_ids: Vec = worker_a.iter().map(|claim| claim.id).collect(); + let stale_token = worker_a[0].claim_token; + sqlx::query( + "UPDATE scouter.trace_commit_event SET claimed_at = now() - interval '10 minutes' WHERE id = $1", + ) + .bind(event_ids[0]) + .execute(&pool) + .await + .unwrap(); + + let recovered = PostgresClient::recover_stale_processing(&pool, Duration::seconds(1), 5) + .await + .unwrap(); + assert_eq!(recovered, vec![(event_ids[0], "pending".to_string())]); + + let worker_b = PostgresClient::claim_trace_commit_events(&pool, 1, "worker-b") + .await + .unwrap(); + let live_token = worker_b[0].claim_token; + assert_ne!(stale_token, live_token); + + let mut tx = pool.begin().await.unwrap(); + let stale_complete = + PostgresClient::complete_trace_commit_events(&mut tx, &event_ids, stale_token) + .await + .unwrap(); + tx.commit().await.unwrap(); + assert!(stale_complete.is_empty()); + + let stale_fail = PostgresClient::fail_trace_commit_events( + &pool, + &event_ids, + 5, + "stale failure", + stale_token, + ) + .await + .unwrap(); + assert!(stale_fail.is_empty()); + + let row: (String, uuid::Uuid) = sqlx::query_as( + "SELECT status, claim_token FROM scouter.trace_commit_event WHERE id = $1", + ) + .bind(event_ids[0]) + .fetch_one(&pool) + .await + .unwrap(); + 
assert_eq!(row.0, "processing"); + assert_eq!(row.1, live_token); + + sqlx::query( + "UPDATE scouter.trace_commit_event SET claimed_at = now() - interval '10 minutes' WHERE id = $1", + ) + .bind(event_ids[0]) + .execute(&pool) + .await + .unwrap(); + let dead = PostgresClient::recover_stale_processing(&pool, Duration::seconds(1), 2) + .await + .unwrap(); + assert_eq!(dead, vec![(event_ids[0], "dead_lettered".to_string())]); + } + #[tokio::test] async fn test_trace_commit_event_claim_uses_skip_locked() { let pool = db_pool().await; - let trace_ids = [ - TraceId::from_bytes([0xA1; 16]), - TraceId::from_bytes([0xA2; 16]), - TraceId::from_bytes([0xA3; 16]), - TraceId::from_bytes([0xA4; 16]), + let anchors = [ + anchor( + TraceId::from_bytes([0xA1; 16]), + 0xA1, + "record-a1", + "profile-a", + ), + anchor( + TraceId::from_bytes([0xA2; 16]), + 0xA2, + "record-a2", + "profile-a", + ), + anchor( + TraceId::from_bytes([0xA3; 16]), + 0xA3, + "record-a3", + "profile-a", + ), + anchor( + TraceId::from_bytes([0xA4; 16]), + 0xA4, + "record-a4", + "profile-a", + ), ]; - PostgresClient::insert_trace_commit_events(&pool, &trace_ids) + PostgresClient::insert_trace_commit_events(&pool, &anchors) .await .unwrap(); - let mut tx1 = pool.begin().await.unwrap(); - let tx1_claimed = PostgresClient::claim_trace_commit_events(&mut tx1, 2) + let tx1_claimed = PostgresClient::claim_trace_commit_events(&pool, 2, "worker-a") .await .unwrap(); assert_eq!(tx1_claimed.len(), 2); - let mut tx2 = pool.begin().await.unwrap(); - let tx2_claimed = PostgresClient::claim_trace_commit_events(&mut tx2, 4) + let tx2_claimed = PostgresClient::claim_trace_commit_events(&pool, 4, "worker-b") .await .unwrap(); assert_eq!(tx2_claimed.len(), 2); - let tx1_ids: HashSet = tx1_claimed.iter().map(|(id, _)| *id).collect(); - let tx2_ids: HashSet = tx2_claimed.iter().map(|(id, _)| *id).collect(); + let tx1_ids: HashSet = tx1_claimed.iter().map(|claim| claim.id).collect(); + let tx2_ids: HashSet = 
tx2_claimed.iter().map(|claim| claim.id).collect(); assert!(tx1_ids.is_disjoint(&tx2_ids)); - - tx2.rollback().await.unwrap(); - tx1.rollback().await.unwrap(); } #[tokio::test] @@ -1478,21 +1667,32 @@ mod tests { assert_eq!(fresh_status, "awaiting_trace"); sqlx::query( - "INSERT INTO scouter.trace_commit_event (trace_id, processed_at) VALUES ($1, now() - interval '25 hours')", + "INSERT INTO scouter.trace_commit_event (trace_id, span_id, record_uid, profile_uid, status, processed_at) VALUES ($1, $2, $3, $4, 'processed', now() - interval '25 hours')", ) .bind(TraceId::from_bytes([0x01; 16]).as_bytes().to_vec()) + .bind(SpanId::from_bytes([0x01; 8]).as_bytes().to_vec()) + .bind("old-processed") + .bind("profile") .execute(&pool) .await .unwrap(); sqlx::query( - "INSERT INTO scouter.trace_commit_event (trace_id, processed_at) VALUES ($1, now() - interval '1 hour')", + "INSERT INTO scouter.trace_commit_event (trace_id, span_id, record_uid, profile_uid, status, processed_at) VALUES ($1, $2, $3, $4, 'processed', now() - interval '1 hour')", ) .bind(TraceId::from_bytes([0x02; 16]).as_bytes().to_vec()) + .bind(SpanId::from_bytes([0x02; 8]).as_bytes().to_vec()) + .bind("fresh-processed") + .bind("profile") .execute(&pool) .await .unwrap(); - sqlx::query("INSERT INTO scouter.trace_commit_event (trace_id) VALUES ($1)") + sqlx::query( + "INSERT INTO scouter.trace_commit_event (trace_id, span_id, record_uid, profile_uid) VALUES ($1, $2, $3, $4)", + ) .bind(TraceId::from_bytes([0x03; 16]).as_bytes().to_vec()) + .bind(SpanId::from_bytes([0x03; 8]).as_bytes().to_vec()) + .bind("pending") + .bind("profile") .execute(&pool) .await .unwrap(); diff --git a/crates/scouter_sql/src/sql/query.rs b/crates/scouter_sql/src/sql/query.rs index 83f4d5420..978936301 100644 --- a/crates/scouter_sql/src/sql/query.rs +++ b/crates/scouter_sql/src/sql/query.rs @@ -61,7 +61,11 @@ const INSERT_TRACE_COMMIT_EVENTS: &str = const TRACE_COMMIT_EVENT_EXISTS: &str = 
include_str!("scripts/inbox/trace_commit_event_exists.sql"); const CLAIM_TRACE_COMMIT_EVENTS: &str = include_str!("scripts/inbox/claim_trace_commit_events.sql"); const FLIP_AWAITING_EVALS: &str = include_str!("scripts/inbox/flip_awaiting_evals.sql"); -const MARK_EVENTS_PROCESSED: &str = include_str!("scripts/inbox/mark_events_processed.sql"); +const COMPLETE_TRACE_COMMIT_EVENTS: &str = + include_str!("scripts/inbox/complete_trace_commit_events.sql"); +const FAIL_TRACE_COMMIT_EVENTS: &str = include_str!("scripts/inbox/fail_trace_commit_events.sql"); +const RECOVER_STALE_PROCESSING: &str = include_str!("scripts/inbox/recover_stale_processing.sql"); +const LIST_AWAITING_RECORD_UIDS: &str = include_str!("scripts/inbox/list_awaiting_record_uids.sql"); const SWEEP_AWAITING_TRACE_TIMEOUTS: &str = include_str!("scripts/inbox/sweep_awaiting_trace_timeouts.sql"); const PRUNE_PROCESSED_EVENTS: &str = include_str!("scripts/inbox/prune_processed_events.sql"); @@ -223,7 +227,10 @@ pub enum Queries { TraceCommitEventExists, ClaimTraceCommitEvents, FlipAwaitingEvals, - MarkEventsProcessed, + CompleteTraceCommitEvents, + FailTraceCommitEvents, + RecoverStaleProcessing, + ListAwaitingRecordUids, SweepAwaitingTraceTimeouts, PruneProcessedEvents, @@ -324,7 +331,10 @@ impl Queries { Queries::TraceCommitEventExists => TRACE_COMMIT_EVENT_EXISTS, Queries::ClaimTraceCommitEvents => CLAIM_TRACE_COMMIT_EVENTS, Queries::FlipAwaitingEvals => FLIP_AWAITING_EVALS, - Queries::MarkEventsProcessed => MARK_EVENTS_PROCESSED, + Queries::CompleteTraceCommitEvents => COMPLETE_TRACE_COMMIT_EVENTS, + Queries::FailTraceCommitEvents => FAIL_TRACE_COMMIT_EVENTS, + Queries::RecoverStaleProcessing => RECOVER_STALE_PROCESSING, + Queries::ListAwaitingRecordUids => LIST_AWAITING_RECORD_UIDS, Queries::SweepAwaitingTraceTimeouts => SWEEP_AWAITING_TRACE_TIMEOUTS, Queries::PruneProcessedEvents => PRUNE_PROCESSED_EVENTS, diff --git a/crates/scouter_sql/src/sql/scripts/inbox/claim_trace_commit_events.sql 
b/crates/scouter_sql/src/sql/scripts/inbox/claim_trace_commit_events.sql index 6205e3d55..334dfb03a 100644 --- a/crates/scouter_sql/src/sql/scripts/inbox/claim_trace_commit_events.sql +++ b/crates/scouter_sql/src/sql/scripts/inbox/claim_trace_commit_events.sql @@ -1,7 +1,22 @@ -- Bind: $1 = LIMIT (e.g. 500) -SELECT id, trace_id -FROM scouter.trace_commit_event -WHERE processed_at IS NULL -ORDER BY id ASC -LIMIT $1 -FOR UPDATE SKIP LOCKED; +-- Bind: $2 = claimed_by worker id +-- Bind: $3 = claim_token +WITH claimed AS ( + SELECT id + FROM scouter.trace_commit_event + WHERE status = 'pending' + ORDER BY id ASC + LIMIT $1 + FOR UPDATE SKIP LOCKED +) +UPDATE scouter.trace_commit_event e +SET status = 'processing', + claimed_at = now(), + claimed_by = $2, + claim_token = $3, + attempt_count = attempt_count + 1, + updated_at = now() +FROM claimed +WHERE e.id = claimed.id +RETURNING e.id, e.trace_id, e.span_id, e.record_uid, e.profile_uid, + e.attempt_count, e.claim_token; diff --git a/crates/scouter_sql/src/sql/scripts/inbox/complete_trace_commit_events.sql b/crates/scouter_sql/src/sql/scripts/inbox/complete_trace_commit_events.sql new file mode 100644 index 000000000..9aa24b76c --- /dev/null +++ b/crates/scouter_sql/src/sql/scripts/inbox/complete_trace_commit_events.sql @@ -0,0 +1,11 @@ +-- Bind: $1 = bigint[] event ids drained in this batch +-- Bind: $2 = uuid claim_token issued by the originating claim +UPDATE scouter.trace_commit_event +SET status = 'processed', + processed_at = now(), + claim_token = NULL, + updated_at = now() +WHERE id = ANY($1) + AND status = 'processing' + AND claim_token = $2 +RETURNING id, trace_id, span_id, record_uid, profile_uid; diff --git a/crates/scouter_sql/src/sql/scripts/inbox/fail_trace_commit_events.sql b/crates/scouter_sql/src/sql/scripts/inbox/fail_trace_commit_events.sql new file mode 100644 index 000000000..f632f5d17 --- /dev/null +++ b/crates/scouter_sql/src/sql/scripts/inbox/fail_trace_commit_events.sql @@ -0,0 +1,18 @@ +-- 
Bind: $1 = bigint[] event ids +-- Bind: $2 = int max_attempts +-- Bind: $3 = text last_error +-- Bind: $4 = uuid claim_token issued by the originating claim +UPDATE scouter.trace_commit_event +SET status = CASE + WHEN attempt_count >= $2 THEN 'dead_lettered' + ELSE 'pending' + END, + last_error = $3, + claimed_at = NULL, + claimed_by = NULL, + claim_token = NULL, + updated_at = now() +WHERE id = ANY($1) + AND status = 'processing' + AND claim_token = $4 +RETURNING id, status; diff --git a/crates/scouter_sql/src/sql/scripts/inbox/flip_awaiting_evals.sql b/crates/scouter_sql/src/sql/scripts/inbox/flip_awaiting_evals.sql index 24b736e55..97b85ea6a 100644 --- a/crates/scouter_sql/src/sql/scripts/inbox/flip_awaiting_evals.sql +++ b/crates/scouter_sql/src/sql/scripts/inbox/flip_awaiting_evals.sql @@ -1,8 +1,18 @@ --- Bind: $1 = bytea[] (trace_ids from the claimed batch) --- Bind: $2 = interval trace visibility buffer -UPDATE scouter.agent_eval_record +-- Bind: $1 = text[] record_uids +-- Bind: $2 = bytea[] trace_ids +-- Bind: $3 = bytea[] span_ids +-- Bind: $4 = interval trace visibility buffer +WITH completed(record_uid, trace_id, span_id) AS ( + SELECT record_uid, trace_id, span_id + FROM unnest($1::text[], $2::bytea[], $3::bytea[]) + AS t(record_uid, trace_id, span_id) +) +UPDATE scouter.agent_eval_record AS aer SET status = 'pending', - ready_at = now() + $2, + ready_at = now() + $4, updated_at = now() -WHERE status = 'awaiting_trace' - AND trace_id = ANY($1); +FROM completed +WHERE aer.status = 'awaiting_trace' + AND aer.uid = completed.record_uid + AND aer.trace_id = completed.trace_id + AND aer.span_id = completed.span_id; diff --git a/crates/scouter_sql/src/sql/scripts/inbox/insert_trace_commit_events.sql b/crates/scouter_sql/src/sql/scripts/inbox/insert_trace_commit_events.sql index 6bf676331..e60a56697 100644 --- a/crates/scouter_sql/src/sql/scripts/inbox/insert_trace_commit_events.sql +++ 
b/crates/scouter_sql/src/sql/scripts/inbox/insert_trace_commit_events.sql @@ -1,3 +1,9 @@ --- Bind: $1 = bytea[] (distinct trace_ids from the just-committed Delta batch) -INSERT INTO scouter.trace_commit_event (trace_id) -SELECT trace_id FROM unnest($1::bytea[]) AS t(trace_id); +-- Bind: $1 = bytea[] trace_ids +-- Bind: $2 = bytea[] span_ids +-- Bind: $3 = text[] record_uids +-- Bind: $4 = text[] profile_uids +INSERT INTO scouter.trace_commit_event (trace_id, span_id, record_uid, profile_uid) +SELECT trace_id, span_id, record_uid, profile_uid +FROM unnest($1::bytea[], $2::bytea[], $3::text[], $4::text[]) + AS t(trace_id, span_id, record_uid, profile_uid) +ON CONFLICT ON CONSTRAINT trace_commit_event_record_span_unique DO NOTHING; diff --git a/crates/scouter_sql/src/sql/scripts/inbox/list_awaiting_record_uids.sql b/crates/scouter_sql/src/sql/scripts/inbox/list_awaiting_record_uids.sql new file mode 100644 index 000000000..19f4d2c1a --- /dev/null +++ b/crates/scouter_sql/src/sql/scripts/inbox/list_awaiting_record_uids.sql @@ -0,0 +1,10 @@ +-- Bind: $1 = interval reconcile_after +-- Bind: $2 = bigint limit +SELECT uid, trace_id, span_id, created_at +FROM scouter.agent_eval_record +WHERE status = 'awaiting_trace' + AND span_id IS NOT NULL + AND trace_id IS NOT NULL + AND created_at < now() - $1 +ORDER BY created_at ASC +LIMIT $2; diff --git a/crates/scouter_sql/src/sql/scripts/inbox/mark_events_processed.sql b/crates/scouter_sql/src/sql/scripts/inbox/mark_events_processed.sql deleted file mode 100644 index 22b458b6f..000000000 --- a/crates/scouter_sql/src/sql/scripts/inbox/mark_events_processed.sql +++ /dev/null @@ -1,4 +0,0 @@ --- Bind: $1 = bigint[] (event ids from the claimed batch) -UPDATE scouter.trace_commit_event -SET processed_at = now() -WHERE id = ANY($1); diff --git a/crates/scouter_sql/src/sql/scripts/inbox/prune_processed_events.sql b/crates/scouter_sql/src/sql/scripts/inbox/prune_processed_events.sql index 708d4abc5..2f5d9320c 100644 --- 
a/crates/scouter_sql/src/sql/scripts/inbox/prune_processed_events.sql +++ b/crates/scouter_sql/src/sql/scripts/inbox/prune_processed_events.sql @@ -1,4 +1,4 @@ -- Bind: $1 = retention INTERVAL (e.g. '1 day'::interval) DELETE FROM scouter.trace_commit_event -WHERE processed_at IS NOT NULL +WHERE status = 'processed' AND processed_at < now() - $1; diff --git a/crates/scouter_sql/src/sql/scripts/inbox/recover_stale_processing.sql b/crates/scouter_sql/src/sql/scripts/inbox/recover_stale_processing.sql new file mode 100644 index 000000000..bb7840a6b --- /dev/null +++ b/crates/scouter_sql/src/sql/scripts/inbox/recover_stale_processing.sql @@ -0,0 +1,15 @@ +-- Bind: $1 = interval lease_ttl +-- Bind: $2 = int max_attempts +UPDATE scouter.trace_commit_event +SET status = CASE + WHEN attempt_count >= $2 THEN 'dead_lettered' + ELSE 'pending' + END, + last_error = COALESCE(last_error, 'ProcessingLeaseExpired'), + claimed_at = NULL, + claimed_by = NULL, + claim_token = NULL, + updated_at = now() +WHERE status = 'processing' + AND claimed_at < now() - $1 +RETURNING id, status; diff --git a/crates/scouter_sql/src/sql/scripts/inbox/trace_commit_event_exists.sql b/crates/scouter_sql/src/sql/scripts/inbox/trace_commit_event_exists.sql index c04e761d9..b5f0fd864 100644 --- a/crates/scouter_sql/src/sql/scripts/inbox/trace_commit_event_exists.sql +++ b/crates/scouter_sql/src/sql/scripts/inbox/trace_commit_event_exists.sql @@ -1,6 +1,12 @@ --- Bind: $1 = bytea (trace_id, length 16) +-- Bind: $1 = text record_uid +-- Bind: $2 = bytea trace_id +-- Bind: $3 = bytea span_id +-- Bind: $4 = text profile_uid SELECT EXISTS ( SELECT 1 FROM scouter.trace_commit_event - WHERE trace_id = $1 + WHERE record_uid = $1 + AND trace_id = $2 + AND span_id = $3 + AND profile_uid = $4 ) AS exists; diff --git a/crates/scouter_sql/src/sql/traits/agent.rs b/crates/scouter_sql/src/sql/traits/agent.rs index 5dbe183cd..9b951fbc8 100644 --- a/crates/scouter_sql/src/sql/traits/agent.rs +++ 
b/crates/scouter_sql/src/sql/traits/agent.rs @@ -15,8 +15,9 @@ use scouter_types::EvalTaskResult; use scouter_types::Status; use scouter_types::contracts::DriftRequest; use scouter_types::{ - BinnedMetrics, EvalRecordPaginationRequest, EvalRecordPaginationResponse, EvalRecordSource, - RecordCursor, RecordType, TraceId, + AwaitingTraceCommit, BinnedMetrics, ClaimedTraceCommitEvent, CompletedTraceCommitEvent, + EvalRecordPaginationRequest, EvalRecordPaginationResponse, EvalRecordSource, RecordCursor, + RecordType, SpanId, TraceCommitAnchor, TraceId, }; use sqlx::types::Json; use sqlx::{Pool, Postgres, Row, Transaction, postgres::PgQueryResult}; @@ -73,15 +74,29 @@ pub trait AgentDriftSqlLogic { async fn insert_trace_commit_events( pool: &Pool, - trace_ids: &[TraceId], + anchors: &[TraceCommitAnchor], ) -> Result { - if trace_ids.is_empty() { + if anchors.is_empty() { return Ok(PgQueryResult::default()); } - let bytes: Vec> = trace_ids.iter().map(|t| t.as_bytes().to_vec()).collect(); + let n = anchors.len(); + let mut trace_ids: Vec> = Vec::with_capacity(n); + let mut span_ids: Vec> = Vec::with_capacity(n); + let mut record_uids: Vec<&str> = Vec::with_capacity(n); + let mut profile_uids: Vec<&str> = Vec::with_capacity(n); + for anchor in anchors { + trace_ids.push(anchor.trace_id.as_bytes().to_vec()); + span_ids.push(anchor.span_id.as_bytes().to_vec()); + record_uids.push(anchor.record_uid.as_str()); + profile_uids.push(anchor.profile_uid.as_str()); + } + sqlx::query(Queries::InsertTraceCommitEvents.get_query()) - .bind(bytes) + .bind(trace_ids) + .bind(span_ids) + .bind(record_uids) + .bind(profile_uids) .execute(pool) .await .map_err(SqlError::SqlxError) @@ -89,10 +104,13 @@ pub trait AgentDriftSqlLogic { async fn trace_commit_event_exists( pool: &Pool, - trace_id: &TraceId, + anchor: &TraceCommitAnchor, ) -> Result { let row = sqlx::query(Queries::TraceCommitEventExists.get_query()) - .bind(trace_id.as_bytes().to_vec()) + .bind(&anchor.record_uid) + 
.bind(anchor.trace_id.as_bytes().as_slice()) + .bind(anchor.span_id.as_bytes().as_slice()) + .bind(&anchor.profile_uid) .fetch_one(pool) .await .map_err(SqlError::SqlxError)?; @@ -100,54 +118,182 @@ pub trait AgentDriftSqlLogic { } async fn claim_trace_commit_events( - tx: &mut Transaction<'_, Postgres>, + pool: &Pool, limit: i64, - ) -> Result, SqlError> { + claimed_by: &str, + ) -> Result, SqlError> { + let claim_token = uuid::Uuid::new_v4(); let rows = sqlx::query(Queries::ClaimTraceCommitEvents.get_query()) .bind(limit) - .fetch_all(&mut **tx) + .bind(claimed_by) + .bind(claim_token) + .fetch_all(pool) .await .map_err(SqlError::SqlxError)?; rows.into_iter() .map(|row| { let id: i64 = row.try_get("id")?; - let bytes: Vec = row.try_get("trace_id")?; - let trace_id = TraceId::from_slice(&bytes) + let attempt_count: i32 = row.try_get("attempt_count")?; + let claim_token: uuid::Uuid = row.try_get("claim_token")?; + let trace_bytes: Vec = row.try_get("trace_id")?; + let span_bytes: Vec = row.try_get("span_id")?; + let record_uid: String = row.try_get("record_uid")?; + let profile_uid: String = row.try_get("profile_uid")?; + let trace_id = TraceId::from_slice(&trace_bytes) .map_err(|e| SqlError::InvalidInboxData(format!("invalid trace_id: {e}")))?; - Ok((id, trace_id)) + let span_id = SpanId::from_slice(&span_bytes) + .map_err(|e| SqlError::InvalidInboxData(format!("invalid span_id: {e}")))?; + Ok(ClaimedTraceCommitEvent { + id, + attempt_count, + claim_token, + anchor: TraceCommitAnchor { + trace_id, + span_id, + record_uid, + profile_uid, + }, + }) }) .collect() } async fn flip_awaiting_evals( tx: &mut Transaction<'_, Postgres>, - trace_ids: &[TraceId], + completed: &[CompletedTraceCommitEvent], ready_delay: chrono::Duration, ) -> Result { - let bytes: Vec> = trace_ids.iter().map(|t| t.as_bytes().to_vec()).collect(); + let n = completed.len(); + let mut record_uids: Vec<&str> = Vec::with_capacity(n); + let mut trace_ids: Vec> = Vec::with_capacity(n); + let mut 
span_ids: Vec> = Vec::with_capacity(n); + for event in completed { + record_uids.push(event.anchor.record_uid.as_str()); + trace_ids.push(event.anchor.trace_id.as_bytes().to_vec()); + span_ids.push(event.anchor.span_id.as_bytes().to_vec()); + } + let pg_interval = sqlx::postgres::types::PgInterval { months: 0, days: 0, microseconds: ready_delay.num_microseconds().unwrap_or(0), }; sqlx::query(Queries::FlipAwaitingEvals.get_query()) - .bind(bytes) + .bind(record_uids) + .bind(trace_ids) + .bind(span_ids) .bind(pg_interval) .execute(&mut **tx) .await .map_err(SqlError::SqlxError) } - async fn mark_events_processed( + async fn complete_trace_commit_events( tx: &mut Transaction<'_, Postgres>, event_ids: &[i64], - ) -> Result { - sqlx::query(Queries::MarkEventsProcessed.get_query()) + claim_token: uuid::Uuid, + ) -> Result, SqlError> { + let rows = sqlx::query(Queries::CompleteTraceCommitEvents.get_query()) .bind(event_ids) - .execute(&mut **tx) + .bind(claim_token) + .fetch_all(&mut **tx) .await - .map_err(SqlError::SqlxError) + .map_err(SqlError::SqlxError)?; + rows.into_iter() + .map(|row| { + let trace_bytes: Vec = row.try_get("trace_id")?; + let span_bytes: Vec = row.try_get("span_id")?; + let trace_id = TraceId::from_slice(&trace_bytes) + .map_err(|e| SqlError::InvalidInboxData(format!("invalid trace_id: {e}")))?; + let span_id = SpanId::from_slice(&span_bytes) + .map_err(|e| SqlError::InvalidInboxData(format!("invalid span_id: {e}")))?; + Ok(CompletedTraceCommitEvent { + id: row.try_get("id")?, + anchor: TraceCommitAnchor { + trace_id, + span_id, + record_uid: row.try_get("record_uid")?, + profile_uid: row.try_get("profile_uid")?, + }, + }) + }) + .collect() + } + + async fn fail_trace_commit_events( + pool: &Pool, + event_ids: &[i64], + max_attempts: i32, + last_error: &str, + claim_token: uuid::Uuid, + ) -> Result, SqlError> { + let rows = sqlx::query(Queries::FailTraceCommitEvents.get_query()) + .bind(event_ids) + .bind(max_attempts) + .bind(last_error) + 
.bind(claim_token) + .fetch_all(pool) + .await + .map_err(SqlError::SqlxError)?; + rows.into_iter() + .map(|row| Ok((row.try_get("id")?, row.try_get("status")?))) + .collect() + } + + async fn recover_stale_processing( + pool: &Pool, + lease_ttl: chrono::Duration, + max_attempts: i32, + ) -> Result, SqlError> { + let pg_interval = sqlx::postgres::types::PgInterval { + months: 0, + days: 0, + microseconds: lease_ttl.num_microseconds().unwrap_or(0), + }; + let rows = sqlx::query(Queries::RecoverStaleProcessing.get_query()) + .bind(pg_interval) + .bind(max_attempts) + .fetch_all(pool) + .await + .map_err(SqlError::SqlxError)?; + rows.into_iter() + .map(|row| Ok((row.try_get("id")?, row.try_get("status")?))) + .collect() + } + + async fn list_awaiting_record_uids( + pool: &Pool, + reconcile_after: chrono::Duration, + limit: i64, + ) -> Result, SqlError> { + let pg_interval = sqlx::postgres::types::PgInterval { + months: 0, + days: 0, + microseconds: reconcile_after.num_microseconds().unwrap_or(0), + }; + let rows = sqlx::query(Queries::ListAwaitingRecordUids.get_query()) + .bind(pg_interval) + .bind(limit) + .fetch_all(pool) + .await + .map_err(SqlError::SqlxError)?; + rows.into_iter() + .map(|row| { + let trace_bytes: Vec = row.try_get("trace_id")?; + let span_bytes: Vec = row.try_get("span_id")?; + let trace_id = TraceId::from_slice(&trace_bytes) + .map_err(|e| SqlError::InvalidInboxData(format!("invalid trace_id: {e}")))?; + let span_id = SpanId::from_slice(&span_bytes) + .map_err(|e| SqlError::InvalidInboxData(format!("invalid span_id: {e}")))?; + Ok(AwaitingTraceCommit { + record_uid: row.try_get("uid")?, + trace_id, + span_id, + created_at: row.try_get("created_at")?, + }) + }) + .collect() } async fn sweep_awaiting_trace_timeouts( diff --git a/crates/scouter_types/src/trace/mod.rs b/crates/scouter_types/src/trace/mod.rs index 53e6a32fc..279952c18 100644 --- a/crates/scouter_types/src/trace/mod.rs +++ b/crates/scouter_types/src/trace/mod.rs @@ -69,6 +69,15 @@ pub 
const SCOUTER_EVAL_SCENARIO_ID_ATTR: &str = "scouter.eval.scenario_id"; pub const SCOUTER_EVAL_RUN_ID_ATTR: &str = "scouter.eval.run_id"; pub const SCOUTER_EVAL_RECORD_UID: &str = "scouter.eval.record_uid"; pub const SCOUTER_EVAL_PROFILE_UID: &str = "scouter.eval.profile_uid"; +pub const TRACE_COMMIT_ANCHOR_UID_MAX_LEN: usize = 128; + +pub fn is_valid_trace_commit_anchor_uid(value: &str) -> bool { + !value.is_empty() + && value.len() <= TRACE_COMMIT_ANCHOR_UID_MAX_LEN + && value + .bytes() + .all(|b| b.is_ascii_alphanumeric() || matches!(b, b'-' | b'_' | b'.' | b':')) +} // patterns for identifying baggage and tags pub const BAGGAGE_PATTERN: &str = "baggage."; @@ -293,6 +302,62 @@ impl sqlx::Encode<'_, sqlx::Postgres> for SpanId { } } +/// Anchor tuple emitted when a committed span carries eval readiness attributes. +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +pub struct TraceCommitAnchor { + pub trace_id: TraceId, + pub span_id: SpanId, + pub record_uid: String, + pub profile_uid: String, +} + +impl TraceCommitAnchor { + pub fn new( + trace_id: TraceId, + span_id: SpanId, + record_uid: String, + profile_uid: String, + ) -> Option<Self> { + if !is_valid_trace_commit_anchor_uid(&record_uid) + || !is_valid_trace_commit_anchor_uid(&profile_uid) + { + return None; + } + + Some(Self { + trace_id, + span_id, + record_uid, + profile_uid, + }) + } +} + +/// Leased trace commit event claimed from the durable inbox queue. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct ClaimedTraceCommitEvent { + pub id: i64, + pub attempt_count: i32, + pub claim_token: uuid::Uuid, + pub anchor: TraceCommitAnchor, +} + +/// Queue row this worker still owns after lease-fenced completion. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct CompletedTraceCommitEvent { + pub id: i64, + pub anchor: TraceCommitAnchor, +} + +/// Awaiting eval row used by the Delta-backed inbox reconciliation sweep. 
+#[derive(Clone, Debug, PartialEq, Eq)] +pub struct AwaitingTraceCommit { + pub record_uid: String, + pub trace_id: TraceId, + pub span_id: SpanId, + pub created_at: DateTime<Utc>, +} + #[derive(Clone, Debug, Serialize, Deserialize, Default)] #[pyclass(from_py_object)] pub struct TraceRecord { diff --git a/docs/pnpm-workspace.yaml b/docs/pnpm-workspace.yaml new file mode 100644 index 000000000..dbb26c827 --- /dev/null +++ b/docs/pnpm-workspace.yaml @@ -0,0 +1,3 @@ +allowBuilds: + esbuild: true + sharp: true diff --git a/docs/src/content/docs/agents/eval-dataset.md b/docs/src/content/docs/agents/eval-dataset.md index e0dee72b4..d8e783afe 100644 --- a/docs/src/content/docs/agents/eval-dataset.md +++ b/docs/src/content/docs/agents/eval-dataset.md @@ -10,7 +10,7 @@ Use `EvalDataset` when: - You're doing post-hoc analysis on production samples. - You want to run tasks against records that were generated separately from the eval run. -`EvalDataset` doesn't support regression comparison, multi-agent structure, or trace correlation. For those, use [`EvalOrchestrator`](/agents/offline-evaluation/). +`EvalDataset` doesn't support regression comparison, multi-agent structure, or trace correlation. For those, use [`EvalOrchestrator`](/scouter/agents/offline-evaluation/). --- @@ -186,7 +186,7 @@ category_classification (always runs) └── outdoor_expert_validation → outdoor_durability_score ``` -See [Conditional gates](/agents/gates/) for a full explanation of how gates interact with `depends_on`. +See [Conditional gates](/scouter/agents/gates/) for a full explanation of how gates interact with `depends_on`. --- diff --git a/docs/src/content/docs/agents/offline-evaluation.md b/docs/src/content/docs/agents/offline-evaluation.md index 1f3a21ab3..f48c2ade7 100644 --- a/docs/src/content/docs/agents/offline-evaluation.md +++ b/docs/src/content/docs/agents/offline-evaluation.md @@ -4,7 +4,7 @@ description: "Run agent evaluations against a dataset before you ship changes." 
--- Offline evaluation runs your agent against a fixed set of test scenarios and measures quality before anything reaches production. Use it to catch regressions between model versions, validate prompt changes, and build a quality baseline to compare future runs against. -For pre-generated records without a live agent, see [EvalDataset](/agents/eval-dataset/). +For pre-generated records without a live agent, see [EvalDataset](/scouter/agents/eval-dataset/). --- @@ -697,7 +697,7 @@ print(detail.pass_rate) print(detail.passed) ``` -For a full explanation of what each table shows, see [Reading your results](/agents/reading-results/). +For a full explanation of what each table shows, see [Reading your results](/scouter/agents/reading-results/). --- @@ -705,4 +705,4 @@ For a full explanation of what each table shows, see [Reading your results](/age If you have records from a previous run or a separate data pipeline (no live agent needed), use `EvalDataset` instead. It takes `EvalRecord` objects directly alongside evaluation tasks. -→ [EvalDataset reference](/agents/eval-dataset/) +→ [EvalDataset reference](/scouter/agents/eval-dataset/) diff --git a/docs/src/content/docs/agents/online-evaluation.md b/docs/src/content/docs/agents/online-evaluation.md index 85b374150..82f72e098 100644 --- a/docs/src/content/docs/agents/online-evaluation.md +++ b/docs/src/content/docs/agents/online-evaluation.md @@ -237,9 +237,11 @@ client.register_profile( ) ``` -## Two evaluation paths +## Inserting records with or without trace correlation -**Path A — Trace-attached:** Associate an evaluation record with an OpenTelemetry span for trace-aware evaluation. +Online evals run through the same server-side workflow whether or not a record has trace IDs. Tracing is optional correlation data until the profile includes a `TraceAssertionTask`. + +**Trace-correlated record:** Attach the eval record to the active OpenTelemetry span. 
```python with tracer.start_as_current_span("agent.callback") as span: @@ -253,9 +255,9 @@ with tracer.start_as_current_span("agent.callback") as span: ) ``` -The `trace_id` and `span_id` are set once by the Rust span constructor. The server checks if the trace has committed to Delta Lake. If yes, the record is inserted as `pending` for immediate evaluation. If not, it's inserted as `awaiting_trace` and flipped to `pending` when the trace arrives (via the inbox worker). If the trace never arrives within 5 minutes, the record fails with `TraceArrivalTimeout`. +`attach_eval()` creates the `EvalRecord`, sets `trace_id` and `span_id` from the active span, and stamps the span with the eval `record_uid` and `profile_uid`. Once the record reaches the server, its path depends on the profile. If the profile has no `TraceAssertionTask`, the record is inserted as `pending` immediately. If the profile does have trace assertions, the server waits until that specific anchor span has committed to Delta Lake before letting the poller run the workflow. Dropped live inbox notifications are recovered by a Delta-backed reconciliation sweep. -**Path B — Standalone/content-only:** Insert an evaluation record without trace correlation. +**Content-only record:** Insert an evaluation record without trace correlation. ```python queue.insert(EvalRecord( @@ -266,9 +268,9 @@ queue.insert(EvalRecord( )) ``` -No `trace_id`. Record inserted as `pending` immediately. If the profile has `TraceAssertionTask` definitions but the record has no trace, evaluation fails with `failed(EvalRequiresTrace)`. +No `trace_id` is attached. If the profile has no `TraceAssertionTask`, the record is inserted as `pending` immediately. If the profile has trace assertions, the record fails with `failed(EvalRequiresTrace)` because the workflow requires span data. -Use Path A when you need to correlate records with traces (most common for agents). 
Use Path B for stateless evaluation that doesn't depend on execution internals. +Use trace correlation when you want to inspect execution internals or run `TraceAssertionTask`. Use content-only records for evaluations that depend only on the record context. --- @@ -467,7 +469,7 @@ for user_query, model_response in production_requests: ## Best practices -**Sampling**: High-traffic services should use lower `sample_ratio` values. For statistically meaningful alerts, ensure you're collecting enough samples per evaluation window. The right number depends on your traffic volume and how tight your thresholds are. To correlate evaluations with distributed traces, pass your `ScouterQueue` to the Scouter tracer. See [Tracing overview](/tracing/overview/). +**Sampling**: High-traffic services should use lower `sample_ratio` values. For statistically meaningful alerts, ensure you're collecting enough samples per evaluation window. The right number depends on your traffic volume and how tight your thresholds are. To correlate evaluations with distributed traces, pass your `ScouterQueue` to the Scouter tracer. See [Tracing overview](/scouter/tracing/overview/). **Task design**: Lead with `AssertionTask` before `LLMJudgeTask`. Use `condition=True` to skip expensive LLM calls when cheap preconditions fail. Set `expected_value` thresholds based on what you observed in offline evaluation runs, not guesses. diff --git a/docs/src/content/docs/agents/overview.md b/docs/src/content/docs/agents/overview.md index 42efeb8d7..a3aa93c9c 100644 --- a/docs/src/content/docs/agents/overview.md +++ b/docs/src/content/docs/agents/overview.md @@ -20,9 +20,9 @@ Scouter formalizes this split: **Workflow evaluation (the mechanic's view)** opens the hood. Each sub-agent in your pipeline emits structured `EvalRecord`s during execution — intermediate outputs, tool results, context data. 
Tasks on your `AgentEvalProfile` evaluate those records, giving you per-component health signals independent of whether the final output passed. -Both run in a single [`EvalOrchestrator`](/agents/offline-evaluation/) pass. You don't have to choose between them, and you'll want both. A passing scenario with failing workflow tasks means you got lucky, not that your agent is healthy. +Both run in a single [`EvalOrchestrator`](/scouter/agents/offline-evaluation/) pass. You don't have to choose between them, and you'll want both. A passing scenario with failing workflow tasks means you got lucky, not that your agent is healthy. -The tasks and profiles you define for these two views are the same components whether you're running offline batch evaluation or online production monitoring. Define your evaluation once, reuse it everywhere. See [reading your results](/agents/reading-results/) for how the two views show up in the output tables. +The tasks and profiles you define for these two views are the same components whether you're running offline batch evaluation or online production monitoring. Define your evaluation once, reuse it everywhere. See [reading your results](/scouter/agents/reading-results/) for how the two views show up in the output tables. --- @@ -34,7 +34,7 @@ Offline evaluation runs your agent against a fixed set of test scenarios before Use it to gate releases, catch regressions between model versions or prompt changes, and establish a quality baseline you can compare future runs against. The comparison API diffs pass rates across runs and flags regressions above a configurable threshold. -If you already have records from a previous run or a production log export and don't need a live agent, [`EvalDataset`](/agents/eval-dataset/) is the lighter-weight option. Same task engine, no orchestrator. 
+If you already have records from a previous run or a production log export and don't need a live agent, [`EvalDataset`](/scouter/agents/eval-dataset/) is the lighter-weight option. Same task engine, no orchestrator. ### Online @@ -50,17 +50,17 @@ The task definitions are identical across offline and online. Write them once an ## What you get -- **Four task types.** Deterministic assertions, LLM-powered semantic judges, OpenTelemetry span assertions, and agent-specific tool call / response checks. All four work in both offline and online modes. → [Evaluation tasks](/agents/tasks/) +- **Four task types.** Deterministic assertions, LLM-powered semantic judges, OpenTelemetry span assertions, and agent-specific tool call / response checks. All four work in both offline and online modes. → [Evaluation tasks](/scouter/agents/tasks/) - **Bring your own context.** `EvalRecord` takes a freeform dict. Put whatever you want in it: model outputs, metadata, ground truth labels, intermediate results. Tasks read from it via `context_path` (dot-notation into nested fields). No fixed schema to conform to. -- **Dependency chains and conditional gates.** Tasks can depend on upstream results and act as gates that short-circuit expensive downstream work. If a format check fails, the LLM judge never runs. → [Conditional gates](/agents/gates/) +- **Dependency chains and conditional gates.** Tasks can depend on upstream results and act as gates that short-circuit expensive downstream work. If a format check fails, the LLM judge never runs. → [Conditional gates](/scouter/agents/gates/) -- **Multi-agent evaluation.** One profile per sub-agent in your pipeline. Each gets its own task set, its own results, and its own pass rate. → [Multi-agent setup](/agents/offline-evaluation/#multi-agent-setup) +- **Multi-agent evaluation.** One profile per sub-agent in your pipeline. Each gets its own task set, its own results, and its own pass rate. 
→ [Multi-agent setup](/scouter/agents/offline-evaluation/#multi-agent-setup) -- **Regression comparison.** Save results from a known-good run, then diff against new runs. The comparison flags regressions above a configurable threshold and tells you which aliases degraded. → [Comparing runs](/agents/offline-evaluation/#saving-loading-and-comparing-results) +- **Regression comparison.** Save results from a known-good run, then diff against new runs. The comparison flags regressions above a configurable threshold and tells you which aliases degraded. → [Comparing runs](/scouter/agents/offline-evaluation/#saving-loading-and-comparing-results) -- **Scheduled alerting.** Online profiles evaluate on a cron schedule and dispatch alerts when pass rates drop below your baseline. Slack, OpsGenie, and Console are supported out of the box. → [Online evaluation](/agents/online-evaluation/) +- **Scheduled alerting.** Online profiles evaluate on a cron schedule and dispatch alerts when pass rates drop below your baseline. Slack, OpsGenie, and Console are supported out of the box. → [Online evaluation](/scouter/agents/online-evaluation/) - **Portable definitions.** Profiles, tasks, and thresholds move between offline batch runs and online production monitoring without modification. @@ -70,12 +70,12 @@ The task definitions are identical across offline and online. 
Write them once an | What you have | Where to go | |---|---| -| A callable agent function and test scenarios | [Offline evaluation](/agents/offline-evaluation/) | -| Scenarios in a file (JSONL, JSON, YAML) | [Loading from a file](/agents/offline-evaluation/#loading-scenarios-from-a-file) | -| Pre-generated records, no live agent | [EvalDataset](/agents/eval-dataset/) | -| A deployed agent you want to monitor | [Online evaluation](/agents/online-evaluation/) | +| A callable agent function and test scenarios | [Offline evaluation](/scouter/agents/offline-evaluation/) | +| Scenarios in a file (JSONL, JSON, YAML) | [Loading from a file](/scouter/agents/offline-evaluation/#loading-scenarios-from-a-file) | +| Pre-generated records, no live agent | [EvalDataset](/scouter/agents/eval-dataset/) | +| A deployed agent you want to monitor | [Online evaluation](/scouter/agents/online-evaluation/) | -If you're unsure, start with [offline evaluation](/agents/offline-evaluation/). It's the fastest way to see how tasks, scenarios, and results fit together. +If you're unsure, start with [offline evaluation](/scouter/agents/offline-evaluation/). It's the fastest way to see how tasks, scenarios, and results fit together. --- @@ -85,9 +85,9 @@ Tasks are what Scouter runs against your agent's outputs or records. 
They work t | Task | What it checks | Cost | |------|---------------|------| -| [`AssertionTask`](/agents/tasks/#assertiontask) | Deterministic rules: format, threshold, presence, pattern matching | None | -| [`LLMJudgeTask`](/agents/tasks/#llmjudgetask) | Semantic quality (relevance, faithfulness, tone) via an LLM call | One LLM call | -| [`TraceAssertionTask`](/agents/tasks/#traceassertiontask) | Span properties: execution order, retry counts, token budgets | None | -| [`AgentAssertionTask`](/agents/tasks/#agentassertiontask) | Tool calls and response structure: which tools ran, with what args, what they returned | None | +| [`AssertionTask`](/scouter/agents/tasks/#assertiontask) | Deterministic rules: format, threshold, presence, pattern matching | None | +| [`LLMJudgeTask`](/scouter/agents/tasks/#llmjudgetask) | Semantic quality (relevance, faithfulness, tone) via an LLM call | One LLM call | +| [`TraceAssertionTask`](/scouter/agents/tasks/#traceassertiontask) | Span properties: execution order, retry counts, token budgets | None | +| [`AgentAssertionTask`](/scouter/agents/tasks/#agentassertiontask) | Tool calls and response structure: which tools ran, with what args, what they returned | None | -Tasks can depend on each other and act as conditional gates to prevent expensive downstream work when preconditions fail. Full reference: [Evaluation tasks](/agents/tasks/) · [Conditional gates](/agents/gates/). +Tasks can depend on each other and act as conditional gates to prevent expensive downstream work when preconditions fail. Full reference: [Evaluation tasks](/scouter/agents/tasks/) · [Conditional gates](/scouter/agents/gates/). 
diff --git a/docs/src/content/docs/agents/tasks.md b/docs/src/content/docs/agents/tasks.md index 7fac5da49..c5edabbb5 100644 --- a/docs/src/content/docs/agents/tasks.md +++ b/docs/src/content/docs/agents/tasks.md @@ -1201,6 +1201,6 @@ client.register_profile(profile, set_active=True) Now that you understand evaluation tasks, explore how to build complete agent evaluation workflows: -- [Offline Evaluation](/agents/offline-evaluation/) - Batch evaluation with complex task chains -- [Online Monitoring](/agents/online-evaluation/) - Production monitoring setup -- [Distributed Tracing](/tracing/overview/) - Capture trace data that `TraceAssertionTask` evaluates +- [Offline Evaluation](/scouter/agents/offline-evaluation/) - Batch evaluation with complex task chains +- [Online Monitoring](/scouter/agents/online-evaluation/) - Production monitoring setup +- [Distributed Tracing](/scouter/tracing/overview/) - Capture trace data that `TraceAssertionTask` evaluates diff --git a/docs/src/content/docs/api/index.md b/docs/src/content/docs/api/index.md index 83a3de67b..33106fbb7 100644 --- a/docs/src/content/docs/api/index.md +++ b/docs/src/content/docs/api/index.md @@ -349,7 +349,7 @@ Profile for LLM evaluation and drift detection. #### `EvalRecord` ```python -EvalRecord(context: Context, id: Optional[str] = None, session_id: Optional[str] = None, trace_id: Optional[str] = None) +EvalRecord(context: Optional[Context] = None, record_id: Optional[str] = None, *, session_id: Optional[str] = None, media: Optional[List[Union[EvalMedia, ImageMedia, DocumentMedia]]] = None, profile_uid: Optional[str] = None, tags: Optional[List[str]] = None, trace_id: Optional[str] = None) ``` LLM record containing context tied to a Large Language Model interaction that is used to evaluate drift in LLM responses. @@ -517,3 +517,13 @@ ServiceMapMiddleware ``` No stub docstring was found for this export. 
+ +### Other exports + +#### `ScouterEnv` + +```python +class ScouterEnv +``` + +See the signature above and the guide docs for usage examples. diff --git a/docs/src/content/docs/architecture/overview.mdx b/docs/src/content/docs/architecture/overview.mdx index 06edebb75..401e752ab 100644 --- a/docs/src/content/docs/architecture/overview.mdx +++ b/docs/src/content/docs/architecture/overview.mdx @@ -3,7 +3,7 @@ title: "System architecture" description: "High-level layout of Scouter components and data flow." --- -Scouter is a dual-protocol Rust server — Axum HTTP on `:8000`, Tonic gRPC on `:50051` — backed by PostgreSQL for recent drift and evaluation data, and Delta Lake for long-term trace storage. Three subsystems (model monitoring, distributed tracing, and agentic evaluation) share a single ingestion pipeline. Records enter through the same channel, get routed in a single consumer layer, and diverge to storage at the `MessageHandler` fork point. Background workers then consume from storage on independent schedules. +Scouter is a dual-protocol Rust server — Axum HTTP on `:8000`, Tonic gRPC on `:50051` — backed by PostgreSQL for recent drift and evaluation data, and Delta Lake for long-term trace storage. Model monitoring, distributed tracing, and agent evaluation run in the same server process, but ingestion is split into typed channels after the request handler: server records, trace records, and tag records each have their own queue and worker pool. Background workers consume from storage on independent schedules. 
--- @@ -17,17 +17,23 @@ flowchart LR SCC[ScouterClient] end - subgraph SI ["Server shared ingestion"] + subgraph SI ["Server ingress"] GRPC["gRPC :50051\nMessageGrpcService"] HTTP["HTTP :8000\nPOST /scouter/message"] - FC["flume channel\nhttp_consumer_tx"] - WK["HttpConsumerWorkers\nN configurable"] - MH[MessageHandler] + OTLP["HTTP :8000\nPOST /scouter/v1/traces"] + ROUTER["record router"] + SR["server_record_tx\nbounded 1000"] + TR["trace_record_tx\nbounded 500"] + TG["tag_record_tx\nbounded 200"] + SW["server-record workers"] + TW["trace workers"] + GW["tag workers"] end subgraph BW ["Background workers"] DE["DriftExecutor\nN workers"] AP["AgentPoller\nN workers"] + INBOX["Trace inbox workers\ncommit events + sweeps"] end AD["Alert dispatch\nSlack / OpsGenie / Console"] @@ -39,23 +45,27 @@ flowchart LR SQ -->|"gRPC InsertMessageRequest"| GRPC SQ -->|"HTTP POST /scouter/message"| HTTP - SCT -->|"TraceServerRecord\nsame pipeline"| GRPC + SCT -->|"TraceServerRecord\ngRPC MessageService"| GRPC + SCT -->|"OTLP protobuf"| OTLP SCC -->|"register profiles\nquery results"| HTTP - GRPC --> FC - HTTP --> FC - FC --> WK - WK --> MH - MH -->|"drift + eval rows"| PG - MH -->|"TraceSpanService\nTraceSummaryService"| DL + GRPC --> ROUTER + HTTP --> ROUTER + OTLP --> TR + ROUTER --> SR & TR & TG + SR --> SW -->|"drift + eval rows"| PG + TR --> TW -->|"TraceSpanService\nTraceSummaryService"| DL + TG --> GW --> PG DE -->|"SELECT skip lock\ncompute drift\nalert if triggered"| PG AP -->|"SELECT pending eval\nexecute tasks\nstore results"| PG AP --> DL + INBOX -->|"flip awaiting_trace\nrecover missed anchors\ntimeout stale rows"| PG + INBOX -->|"reconcile anchors"| DL DE --> AD AP --> AD ``` -The important thing this diagram shows: `ScouterTracer` does not have a dedicated ingest path. A `TraceServerRecord` enters the same `flume::Sender` as an `SpcRecord` or an `EvalRecord`. The `MessageHandler` is the only place the paths diverge. 
Under write pressure, trace ingest competes with drift record ingest for consumer worker bandwidth. +The important thing this diagram shows: traces have dedicated routes and workers, but not a separate trace ingest service. The HTTP OTLP endpoint (`POST /scouter/v1/traces`) accepts protobuf `ExportTraceServiceRequest` batches directly. gRPC uses the generic `MessageService.InsertMessage`; there is no separate gRPC `TraceService` in the protobuf. Once a trace record reaches `AppState`, it goes to `trace_record_tx`, not the server-record queue used for drift and eval rows. --- @@ -69,9 +79,13 @@ Every HTTP handler, gRPC handler, and background worker holds an `Arc` | `auth_manager` | `AuthManager` | JWT verification on every authenticated request | | `task_manager` | `TaskManager` | Coordinates all background workers; `watch::channel` broadcasts shutdown | | `config` | `Arc` | Typed env-var config, frozen at startup | -| `http_consumer_tx` | `Sender` | flume channel sender; both HTTP and gRPC handlers drop records here | +| `server_record_tx` | `Sender` | Bounded channel for drift records, eval records, and profile updates | +| `trace_record_tx` | `Sender` | Bounded channel for OTLP trace batches | +| `tag_record_tx` | `Sender` | Bounded channel for indexed tag writes | | `trace_service` | `Arc` | Owns both Delta Lake actors; singleton for the lifetime of the server | | `trace_summary_service` | `Arc` | Hour-bucketed summary table; feeds the paginated trace list | +| `trace_dispatch_service` | `Arc` | Runs trace retention and dispatch queries | +| `genai_service` | `Arc` | Owns GenAI span extraction and query support | | `dataset_manager` | `Arc` | Bifrost offline dataset engine | | `eval_scenario_service` | `Arc` | Offline eval scenario orchestration | @@ -123,30 +137,38 @@ Queue insert errors are logged but not returned to the caller. 
The `ScouterQueue ```mermaid graph TD - CH["flume channel\nhttp_consumer_tx"] - W1["Worker 1"] - W2["Worker 2"] - WN["Worker N"] - PR["process_message_record()"] + GRPC["gRPC MessageService\nInsertMessage"] + MSG["HTTP /scouter/message"] + OTLP["HTTP /scouter/v1/traces\nOTLP protobuf"] + ROUTER["MessageRecord router"] + + SR["server_record_tx\nbounded 1000"] + TR["trace_record_tx\nbounded 500"] + TG["tag_record_tx\nbounded 200"] + + SW["server-record workers"] + TW["trace workers"] + GW["tag workers"] R1["SPC features → spc_drift"] R2["PSI features → psi_drift"] R3["Custom metrics → custom_drift"] R4["EvalRecord → agent_eval_record"] - R5["TraceServerRecord → Delta Lake\n(TraceSpanService + TraceSummaryService)"] - - CH --> W1 & W2 & WN - W1 & W2 & WN --> PR - PR --> R1 - PR --> R2 - PR --> R3 - PR --> R4 - PR --> R5 + R5["TraceServerRecord → Delta Lake\nTraceSpanService + TraceSummaryService"] + R6["TagRecord → PostgreSQL tag index"] + + GRPC --> ROUTER + MSG --> ROUTER + ROUTER --> SR & TR & TG + OTLP --> TR + SR --> SW --> R1 & R2 & R3 & R4 + TR --> TW --> R5 + TG --> GW --> R6 ``` -`MessageRecord` is an enum. The variant determines where the record goes. `TraceServerRecord` is the only type that routes to Delta Lake; everything else goes to PostgreSQL. The worker count is configurable at startup (`HTTP_CONSUMER_WORKERS`, defaults to 4). +The generic message routes deserialize `MessageRecord` and send each variant to the matching channel. The OTLP route skips the enum wrapper: it decodes protobuf bytes into `ExportTraceServiceRequest` and sends a `TraceServerRecord` directly to `trace_record_tx`. -There is no separate ingest service for traces. The same `HttpConsumerWorker` that handles drift records handles spans. If you're sizing worker pools, account for trace volume alongside drift and eval volume. +There is no separate trace ingest service or gRPC `TraceService`. There is a dedicated HTTP OTLP route and a dedicated trace worker pool. 
Size `TRACE_CONSUMER_WORKERS` for span volume separately from `SERVER_RECORD_CONSUMER_WORKERS`, which handles drift and eval records. --- @@ -188,28 +210,37 @@ Drift computation reads from PostgreSQL only. Delta Lake is trace-only. ## Agent evaluation: AgentPoller -N workers, same structure as drift workers. Agent eval is more complex because it may need to wait for trace spans to arrive before it can execute `TraceAssertionTask` entries. +Agent evaluation is split across three background paths: + +- `AgentPoller` claims `pending` eval records and executes the task DAG. +- The trace commit consumer writes anchor events emitted after Delta commits. +- The trace inbox worker flips `awaiting_trace` records to `pending`, reconciles dropped anchor events from Delta, recovers stale queue leases, and times out records whose traces never arrive. + +`AgentPoller` no longer waits and reschedules records while polling for trace spans. Trace readiness is handled before a record becomes pollable. ```mermaid flowchart TD - A([poll_for_tasks]) --> B["get_pending_agent_eval_record()\nstatus = Pending"] + A([poll_for_tasks]) --> B["get_pending_agent_eval_record()\nstatus = pending\nready_at <= now()"] B -- None --> C["sleep 1s"] C --> A B -- "Some(record)" --> D["fetch AgentEvalProfile\nfrom Postgres"] D --> E{has TraceAssertionTasks?} E -- No --> H - E -- Yes --> F["wait_for_trace_spans_with_reschedule()\nquery Delta Lake for trace_id"] - F -- Found --> H["execute task DAG\nrespecting depends_on order"] - F -- "Not found (attempt N)" --> G["exponential backoff\n100ms → 5s"] - G --> F - F -- "timeout exceeded" --> R["reschedule record\nstatus = Pending, next_run_at += delay"] - R --> A + E -- Yes --> F{trace_id + span_id present?} + F -- No --> X["mark failed\nmissing trace anchor"] + F -- Yes --> G["fetch spans by trace_id\nfrom TraceSpanService"] + G -- "no spans or anchor invalid" --> X + G -- Found --> H["execute task DAG\nrespecting depends_on order"] H --> I["AssertionTask 
(in-process)\nLLMJudgeTask (external LLM)\nAgentAssertionTask (vendor-agnostic)\nTraceAssertionTask (span-based)"] I --> J["insert_eval_task_results_batch()\ninsert_agent_eval_workflow_record()"] - J --> A + J --> K["mark record processed"] + X --> A + K --> A ``` -The trace wait exists because `EvalRecord`s often arrive before their associated OTEL spans — the eval record hits PostgreSQL fast, but spans go through Delta Lake writes with a 5-second flush interval. At default settings (`GENAI_TRACE_WAIT_TIMEOUT_SECS=10`, `GENAI_TRACE_BACKOFF_MILLIS=100`, `GENAI_TRACE_RESCHEDULE_DELAY_SECS=30`), worst-case latency from eval record insertion to stored result is around 90 seconds. Tune these based on your span arrival p95 in production. +The SQL poller only sees rows with `status = 'pending'`, `scheduled_at <= now()`, and `ready_at <= now()`. Records that need trace assertions are inserted as `awaiting_trace` until the inbox path proves the exact `(record_uid, trace_id, span_id)` anchor committed to Delta Lake. When the inbox worker completes the matching `trace_commit_event`, it flips that eval row to `pending` and sets `ready_at = now() + SCOUTER_TRACE_VISIBILITY_BUFFER_SECS`. + +This changes the failure mode. A record with `TraceAssertionTask` and no `trace_id` fails as `EvalRequiresTrace`. A record with `trace_id` but no `span_id` fails as `EvalRequiresAnchorSpan`. A record with a complete anchor that never commits stays `awaiting_trace` until the timeout sweep marks it `TraceArrivalTimeout`. Task execution follows the declared `depends_on` DAG. Each task sees base context plus explicitly declared dependency outputs — not the full context of every upstream task. This is intentional scoping. 
@@ -232,9 +263,9 @@ graph TB EX["ScouterSpanExporter\nbatch: 512 spans / 5s flush"] end - subgraph "Server — same ingest pipeline as drift" - MSG["MessageRecord::TraceServerRecord\nsame flume channel + HttpConsumerWorkers"] - MH["MessageHandler forks"] + subgraph "Server trace ingest" + MSG["TraceServerRecord\ntrace_record_tx"] + TW["trace workers"] TSS["TraceSpanService"] SUM["TraceSummaryService\nhour-bucketed summaries"] end @@ -248,8 +279,8 @@ graph TB OT --> EX EX -->|"TraceServerRecord\ngRPC or HTTP"| MSG - MSG --> MH - MH --> TSS & SUM + MSG --> TW + TW --> TSS & SUM TSS --> BUF BUF --> ENG ENG --> BB @@ -257,7 +288,7 @@ graph TB SUM --> DL ``` -For full schema, compaction schedule, bloom filter configuration, and multi-pod deployment details, see [Trace Storage Architecture](/tracing/storage-architecture/). +For full schema, compaction schedule, bloom filter configuration, and multi-pod deployment details, see [Trace Storage Architecture](/scouter/tracing/storage-architecture/). 
Two callouts that aren't in the storage doc: @@ -276,7 +307,7 @@ sequenceDiagram participant DL as Delta Lake (Parquet) participant DF as DataFusion SessionContext - CL->>RT: GET /scouter/trace/{trace_id} + CL->>RT: GET /scouter/v1/traces/{id}/spans RT->>TQ: get_trace_spans_by_id(trace_id) TQ->>DF: SQL with time filter first Note over DF: time filter prunes Delta log files\nbloom filter (FPP 0.01) skips row groups @@ -313,10 +344,15 @@ graph LR subgraph "Online — production traffic" SQ["ScouterQueue.insert(EvalRecord)\nnon-blocking, samples at sample_ratio"] + EVALPG[("PostgreSQL\nagent_eval_record")] + INBOX["Trace inbox\nonly for TraceAssertionTask"] AGP["AgentPoller\nN workers"] PG[("PostgreSQL\neval results + alert records")] AL["Alert dispatch"] - SQ --> AGP --> PG --> AL + SQ --> EVALPG + EVALPG -->|"no trace gate"| AGP + EVALPG -->|"awaiting_trace"| INBOX -->|"pending + ready_at"| AGP + AGP --> PG --> AL end AT & LJ & AA & TA -.->|same task types| EO @@ -363,7 +399,7 @@ Step 7 must come last. The trace cache flush at step 2 still needs a live pool. **`SKIP LOCKED` is not a durable job queue.** Drift profile tasks live in Postgres. The window between `SELECT ... SKIP LOCKED` and `update_drift_profile_run_dates()` is not transactional. If a drift worker crashes mid-computation, the task will be picked up again on the next tick — but double-computation is possible if the crash happens at exactly the right moment. -**Trace wait adds latency to agent eval.** At default config, worst-case lag from `EvalRecord` insertion to stored eval result is ~90 seconds (10s timeout × 3 retries + reschedule delays). Tune `GENAI_TRACE_WAIT_TIMEOUT_SECS`, `GENAI_TRACE_BACKOFF_MILLIS`, and `GENAI_TRACE_RESCHEDULE_DELAY_SECS` based on your span arrival p95. If your agents don't use `TraceAssertionTask`, this entire path is skipped. +**Trace-backed eval waits in the inbox, not inside `AgentPoller`.** Records without `TraceAssertionTask` go straight to `pending`. 
Records with trace assertions stay `awaiting_trace` until the matching anchor span commits to Delta and the inbox worker flips the row to `pending` with a visibility buffer. Tune `SCOUTER_TRACE_VISIBILITY_BUFFER_SECS`, `SCOUTER_INBOX_RECONCILE_INTERVAL_SECS`, and `TRACE_ARRIVAL_TIMEOUT_SECS` around your Delta refresh and span arrival behavior. **Queue errors are logged, not returned.** The `QueueBus` insert path intentionally swallows errors. In production, monitor server logs for `Error inserting entity into queue` — there's no counter exported by default. diff --git a/docs/src/content/docs/bifrost/overview.mdx b/docs/src/content/docs/bifrost/overview.mdx index b817f56ed..289ac2139 100644 --- a/docs/src/content/docs/bifrost/overview.mdx +++ b/docs/src/content/docs/bifrost/overview.mdx @@ -121,7 +121,7 @@ That said, the two are designed to converge. In a future release, you will be ab ## Next Steps -- [Quickstart](/bifrost/quickstart/) -- end-to-end write and read example -- [Writing Data](/bifrost/writing-data/) -- `DatasetProducer` configuration and patterns -- [Reading Data](/bifrost/reading-data/) -- `DatasetClient`, SQL queries, format conversions -- [Schema Reference](/bifrost/schema/) -- `TableConfig`, type mapping, fingerprinting +- [Quickstart](/scouter/bifrost/quickstart/) -- end-to-end write and read example +- [Writing Data](/scouter/bifrost/writing-data/) -- `DatasetProducer` configuration and patterns +- [Reading Data](/scouter/bifrost/reading-data/) -- `DatasetClient`, SQL queries, format conversions +- [Schema Reference](/scouter/bifrost/schema/) -- `TableConfig`, type mapping, fingerprinting diff --git a/docs/src/content/docs/bifrost/quickstart.md b/docs/src/content/docs/bifrost/quickstart.md index 4f9e7209b..844a9bdea 100644 --- a/docs/src/content/docs/bifrost/quickstart.md +++ b/docs/src/content/docs/bifrost/quickstart.md @@ -244,6 +244,6 @@ def predict(request: Request, payload: PredictRequest): ## Next Steps -- [Writing 
Data](/bifrost/writing-data/) -- batching behavior, backpressure, shutdown patterns -- [Reading Data](/bifrost/reading-data/) -- `DatasetClient`, `QueryResult`, SQL reference -- [Schema Reference](/bifrost/schema/) -- type mapping, fingerprinting, `TableConfig` utilities +- [Writing Data](/scouter/bifrost/writing-data/) -- batching behavior, backpressure, shutdown patterns +- [Reading Data](/scouter/bifrost/reading-data/) -- `DatasetClient`, `QueryResult`, SQL reference +- [Schema Reference](/scouter/bifrost/schema/) -- type mapping, fingerprinting, `TableConfig` utilities diff --git a/docs/src/content/docs/bifrost/reading-data.md b/docs/src/content/docs/bifrost/reading-data.md index 6785425d2..34c5a48bd 100644 --- a/docs/src/content/docs/bifrost/reading-data.md +++ b/docs/src/content/docs/bifrost/reading-data.md @@ -270,5 +270,5 @@ Writers (`DatasetProducer`) and readers (`DatasetClient`) can operate on the sam ## Next Steps -- [Writing Data](/bifrost/writing-data/) -- `DatasetProducer` configuration and patterns -- [Schema Reference](/bifrost/schema/) -- type mapping, fingerprinting, `TableConfig` utilities +- [Writing Data](/scouter/bifrost/writing-data/) -- `DatasetProducer` configuration and patterns +- [Schema Reference](/scouter/bifrost/schema/) -- type mapping, fingerprinting, `TableConfig` utilities diff --git a/docs/src/content/docs/evaluation-platform/comparison.md b/docs/src/content/docs/evaluation-platform/comparison.md index 9eea328eb..75e4538d5 100644 --- a/docs/src/content/docs/evaluation-platform/comparison.md +++ b/docs/src/content/docs/evaluation-platform/comparison.md @@ -6,7 +6,7 @@ description: "How Scouter compares to the surrounding evaluation tool landscape. This page compares Scouter's evaluation platform against five alternatives: LangSmith, Langfuse, MLflow, Datadog LLM Observability, and Google ADK. We focus on architectural differences and trade-offs rather than feature checklists. 
Every platform on this list does something well; the question is which set of trade-offs matches your requirements. -For the problem Scouter's evaluation platform solves and how it works, see the [evaluation platform overview](/evaluation-platform/). +For the problem Scouter's evaluation platform solves and how it works, see the [evaluation platform overview](/scouter/evaluation-platform/). --- diff --git a/docs/src/content/docs/evaluation-platform/discussion-and-tradeoffs.md b/docs/src/content/docs/evaluation-platform/discussion-and-tradeoffs.md index ee0f892be..180520e7d 100644 --- a/docs/src/content/docs/evaluation-platform/discussion-and-tradeoffs.md +++ b/docs/src/content/docs/evaluation-platform/discussion-and-tradeoffs.md @@ -4,7 +4,7 @@ description: "Design tradeoffs behind Scouter’s evaluation architecture." --- This section covers the things you need to know before running Scouter in production. The previous sections describe how the system works; this one describes where it gets interesting under load, what it costs, and where the sharp edges are. -For system-wide architecture constraints, see [Architecture](/architecture/overview/). For the regression comparison API, see [Comparing runs](/agents/offline-evaluation/#saving-loading-and-comparing-results). +For system-wide architecture constraints, see [Architecture](/scouter/architecture/overview/). For the regression comparison API, see [Comparing runs](/scouter/agents/offline-evaluation/#saving-loading-and-comparing-results). --- @@ -183,7 +183,7 @@ Scouter is designed to be extended at specific points: ## Known constraints -These are documented here and in the [architecture overview](/architecture/overview/). Linking for completeness: +These are documented here and in the [architecture overview](/scouter/architecture/overview/). Linking for completeness: - `EvalRunner.evaluate()` calls `block_on` internally. It checks for an existing async runtime and returns an error if one is detected. 
You cannot call it from inside an async context. Use `evaluate_async()` from async code, or call `evaluate()` from synchronous Python. - Queue insert errors are logged at ERROR level, not returned to the caller. No counter is exported by default. Monitor server logs for `Error inserting entity into queue`. diff --git a/docs/src/content/docs/evaluation-platform/eval-profiles-and-tasks.mdx b/docs/src/content/docs/evaluation-platform/eval-profiles-and-tasks.mdx index 420e43b13..a7a52c508 100644 --- a/docs/src/content/docs/evaluation-platform/eval-profiles-and-tasks.mdx +++ b/docs/src/content/docs/evaluation-platform/eval-profiles-and-tasks.mdx @@ -4,9 +4,9 @@ description: "Define profiles, tasks, and dependencies for the evaluation runtim --- Scouter's evaluation system is built from a small set of composable primitives. An `AgentEvalProfile` packages configuration and tasks into a single unit. Four task types cover different evaluation concerns. Tasks compose into dependency DAGs with conditional gates that control execution flow and cost. -This page covers the primitives and how they fit together. For full API reference with code examples, see [Evaluation tasks](/agents/tasks/). +This page covers the primitives and how they fit together. For full API reference with code examples, see [Evaluation tasks](/scouter/agents/tasks/). -> For the problem this solves, see the [evaluation platform overview](/evaluation-platform/). +> For the problem this solves, see the [evaluation platform overview](/scouter/evaluation-platform/). --- @@ -126,7 +126,7 @@ Use for: verifying correct tools were called, argument validation, call ordering The comparison engine handles type coercion automatically. Floats and ints normalize for comparison (`300.0` equals `300`). Strings parse to numbers when the other operand is numeric. Arrays extract their length for `HasLength*` operators. Template variable substitution (`${field.path}`) works with all operators. 
-For the full operator reference, see [Evaluation tasks](/agents/tasks/). +For the full operator reference, see [Evaluation tasks](/scouter/agents/tasks/). --- @@ -169,7 +169,7 @@ Tasks without `condition=True` are regular dependencies. Their downstream tasks Each task sees the base evaluation context plus the outputs of its declared dependencies. Not the full context of every upstream task. This scoping is intentional: it keeps task inputs predictable and avoids accidental coupling between unrelated branches of the DAG. -For detailed gate usage with code examples, see [Conditional gates](/agents/gates/). +For detailed gate usage with code examples, see [Conditional gates](/scouter/agents/gates/). ### Circular dependency detection @@ -204,4 +204,4 @@ Independent checks run in parallel regardless of layer. A token budget check (Tr | Is the response faithful to the source documents? | LLMJudgeTask | | Does the tone match the persona? | LLMJudgeTask | -The rest of this technical overview covers how these building blocks execute in [offline evaluation](/evaluation-platform/offline-evaluation/) and [online evaluation](/evaluation-platform/online-evaluation/). +The rest of this technical overview covers how these building blocks execute in [offline evaluation](/scouter/evaluation-platform/offline-evaluation/) and [online evaluation](/scouter/evaluation-platform/online-evaluation/). diff --git a/docs/src/content/docs/evaluation-platform/index.mdx b/docs/src/content/docs/evaluation-platform/index.mdx index 1bb7de2d4..25a55f20f 100644 --- a/docs/src/content/docs/evaluation-platform/index.mdx +++ b/docs/src/content/docs/evaluation-platform/index.mdx @@ -6,7 +6,7 @@ Traditional ML evaluation is a solved workflow. You have a labeled test set, you Agents break this model in ways that matter. -> For usage guide and quick start, see [Agent evaluation](/agents/overview/). +> For usage guide and quick start, see [Agent evaluation](/scouter/agents/overview/). 
--- @@ -116,7 +116,7 @@ Agent-specific assertions ("was this tool called with these arguments," "did the And then there's pricing. SaaS platforms like Datadog and LangSmith add per-span or per-trace costs that scale linearly with traffic. Self-hosted options like Langfuse and MLflow avoid that, but they lack the evaluation orchestration or online monitoring pieces. You get storage and visualization, not a complete eval system. -For a detailed feature-by-feature comparison, see [Platform comparison](/evaluation-platform/comparison/). +For a detailed feature-by-feature comparison, see [Platform comparison](/scouter/evaluation-platform/comparison/). --- @@ -148,8 +148,8 @@ The operator count matters less than what you can compose with it. Scouter's fou The rest of this document covers how each of these pieces works: -- [Building blocks: eval profiles and task types](/evaluation-platform/eval-profiles-and-tasks/) -- [Offline evaluation architecture](/evaluation-platform/offline-evaluation/) -- [Online evaluation architecture](/evaluation-platform/online-evaluation/) -- [Observability engine and trace-based evaluation](/evaluation-platform/observability-and-traces/) -- [Trade-offs and operational considerations](/evaluation-platform/discussion-and-tradeoffs/) +- [Building blocks: eval profiles and task types](/scouter/evaluation-platform/eval-profiles-and-tasks/) +- [Offline evaluation architecture](/scouter/evaluation-platform/offline-evaluation/) +- [Online evaluation architecture](/scouter/evaluation-platform/online-evaluation/) +- [Observability engine and trace-based evaluation](/scouter/evaluation-platform/observability-and-traces/) +- [Trade-offs and operational considerations](/scouter/evaluation-platform/discussion-and-tradeoffs/) diff --git a/docs/src/content/docs/evaluation-platform/observability-and-traces.mdx b/docs/src/content/docs/evaluation-platform/observability-and-traces.mdx index e977875ec..a5b899d72 100644 --- 
a/docs/src/content/docs/evaluation-platform/observability-and-traces.mdx +++ b/docs/src/content/docs/evaluation-platform/observability-and-traces.mdx @@ -4,11 +4,13 @@ description: "Tie evaluation decisions back to trace data and runtime evidence." --- Agents are multi-step pipelines. The final output tells you *what* happened. Traces tell you *how*. Without them, you can't answer: did the retriever call the right API? Did the agent retry too many times? Was the right model used? Did latency exceed the SLA? Did total token usage blow the budget? -Scouter treats traces as first-class evaluation data, not a separate observability concern. `TraceAssertionTask` runs assertions directly on OpenTelemetry span properties, and records attached to spans via `span.attach_eval()` correlate with traces automatically. The same span that shows up in your trace viewer feeds into your evaluation pipeline. +Scouter does not make every evaluation depend on tracing. `span.attach_eval()` creates an `EvalRecord` and records correlation metadata. The record still follows the normal online eval path unless its registered profile includes `TraceAssertionTask`. -For tracing setup and instrumentation, see [Distributed tracing](/tracing/overview/). For the full task type reference, see [Building blocks](/evaluation-platform/eval-profiles-and-tasks/). +When a profile does include trace assertions, spans become part of the eval input. `TraceAssertionTask` runs assertions directly on OpenTelemetry span properties, so Scouter uses the inbox pattern below to wait for the specific anchor span before the poller runs that workflow. -> For system-wide ingestion pipeline details, see [Architecture](/architecture/overview/). +For tracing setup and instrumentation, see [Distributed tracing](/scouter/tracing/overview/). For the full task type reference, see [Building blocks](/scouter/evaluation-platform/eval-profiles-and-tasks/). 
+ +> For system-wide ingestion pipeline details, see [Architecture](/scouter/architecture/overview/). --- @@ -55,7 +57,7 @@ Beyond standard OTEL attributes, Scouter adds: - `set_tag(key, value)`: indexed tags stored in PostgreSQL for fast lookup - `attach_eval(profile_uid, context, *, record_id, session_id, media, tags)`: insert an `EvalRecord` with trace correlation, setting `trace_id` and `span_id` from the active span -For the full instrumentor API, see [Instrumentor guide](/tracing/instrumentor/). +For the full instrumentor API, see [Instrumentor guide](/scouter/tracing/instrumentor/). ### Instrumentation approaches: where Scouter differs @@ -104,7 +106,7 @@ The attributes stored as dedicated columns: | Error | `error.type` | | Evaluation | `gen_ai.evaluation.result` events with `name`, `score.label`, `score.value`, `explanation` | -See the [GenAI semantic conventions](/tracing/genai-semantics/) page for Google ADK / Vertex vendor fallback details and sensitive-content redaction rules. +See the [GenAI semantic conventions](/scouter/tracing/genai-semantics/) page for Google ADK / Vertex vendor fallback details and sensitive-content redaction rules. These attributes are set by your application code on spans. Scouter doesn't auto-instrument LLM calls; your framework or manual instrumentation provides the attributes. What Scouter does is store them in a columnar layout where each attribute is a typed column with dictionary encoding on high-repetition fields (`provider_name`, `request_model`, `response_model`, `operation_name`, `output_type`). Token columns use delta encoding. The result: queries like "average input tokens by model over the last 24 hours" scan compressed, sorted columns instead of parsing JSON blobs. @@ -297,40 +299,111 @@ The DataFusion `SessionContext` is tuned for trace workloads: ## Trace-aware evaluation architecture -Scouter connects traces to evaluation through an inbox pattern. 
When you call `span.attach_eval()`, the evaluation record references the active span's `trace_id` and `span_id`. The server stores the record in one of three states based on trace arrival status. +Evaluation does not depend on tracing by default. `span.attach_eval()` creates an `EvalRecord`, fills in `trace_id` and `span_id` from the active span, and stamps the span with `scouter.eval.record_uid` and `scouter.eval.profile_uid`. That is correlation metadata. It does not, by itself, make the record wait for trace storage. + +The server decides whether trace availability gates the workflow by reading the registered `AgentEvalProfile`. If the profile has no `TraceAssertionTask`, the record is inserted as `pending` immediately and evaluated like any other online record. The trace IDs remain useful for later lookup, but they do not slow down evaluation. + +The inbox pattern exists for one case: profiles that include `TraceAssertionTask`. Those tasks read spans from Delta Lake, so Scouter must make sure the specific anchor span is committed before the poller runs the workflow. For those profiles, readiness means the exact `(record_uid, trace_id, span_id)` anchor has committed to Delta Lake, not merely that some span with the same `trace_id` exists. + +The important split for trace-assertion workflows is this: Delta Lake is the source of truth, PostgreSQL is the eval workflow state machine, and the in-memory channel is only a latency path. If the channel drops an anchor event, correctness still comes from reconciliation scanning Delta and recreating the queue row. + +### End-to-end inbox flow ```mermaid -graph LR - subgraph "Client" - S["span.attach_eval
(profile_uid, context)"] +flowchart TD + subgraph APP["Application process"] + SP["Active span"] + AE["span.attach_eval()
creates EvalRecord
stamps span metadata"] + ER["EvalRecord
context + trace_id + span_id"] + END["span.end()"] + SP --> AE --> ER + SP --> END end - subgraph "Server ingestion" - S -->|"if trace exists in Delta"| P["Record → pending"] - S -->|"if trace not yet committed"| A["Record → awaiting_trace"] + subgraph EVAL_INGEST["Eval record ingestion"] + MR["Message router"] + INSERT["insert agent_eval_record"] + NEEDS_TRACE{"workflow includes
TraceAssertionTask?"} + PROBE["probe trace_commit_event
by anchor tuple"] + FAIL_TRACE["failed(EvalRequiresTrace)"] + FAIL_ANCHOR["failed(EvalRequiresAnchorSpan)"] + FAIL_TIMEOUT["failed(TraceArrivalTimeout)"] + AWAIT["agent_eval_record
status = awaiting_trace"] + PENDING_FAST["agent_eval_record
status = pending"] + PENDING_TRACE["agent_eval_record
status = pending
ready_at = now + visibility buffer"] + end + + subgraph TRACE_INGEST["Trace ingestion"] + EXPORT["BatchSpanProcessor / exporter"] + TRACE_CH["trace channel"] + BUFFER["TraceSpanService
buffer actor"] + ENGINE["TraceSpanDBEngine
Arrow + Delta append"] + EXTRACT["extract committed anchors
only spans with record_uid + profile_uid"] end - subgraph "Inbox worker (scheduled)" - TCE["trace_commit_event
arrives for trace_id"] - AT["awaiting_trace rows
for that trace_id"] - TCE -->|"match on trace_id"| AT - AT -->|"flip status"| P + subgraph STORES["Durable stores"] + DELTA["Delta Lake trace_spans
source of truth"] + QUEUE["PostgreSQL trace_commit_event
durable queue + audit log"] end - subgraph "Timeout sweep" - TM["awaiting_trace rows
older than 5 min"] - TM -->|"no trace commit event"| F["failed(TraceArrivalTimeout)"] + subgraph LIVE["Live latency path"] + COMMIT_TX["bounded commit_tx
try_send, may drop"] + WRITER["inbox writer
ON CONFLICT DO NOTHING"] end - subgraph "AgentPoller" - P -->|"SELECT FOR UPDATE"| AP["Evaluation worker"] - AP --> RS["Results + alerts"] + subgraph WORKERS["Background workers"] + DRAIN["inbox drain
claim pending rows
FOR UPDATE SKIP LOCKED"] + COMPLETE["complete owned rows
claim_token fenced"] + RECON["reconciliation sweep
old awaiting_trace rows"] + LEASE["lease recovery
stale processing rows"] + TIMEOUT["timeout sweep
TraceArrivalTimeout"] + POLLER["AgentPoller
SELECT pending rows"] + EVAL["task DAG + TraceAssertionTask"] + RESULTS["workflow results + alerts"] end + + ER --> MR --> INSERT --> NEEDS_TRACE + NEEDS_TRACE -->|"no"| PENDING_FAST + NEEDS_TRACE -->|"yes, missing trace_id"| FAIL_TRACE + NEEDS_TRACE -->|"yes, trace_id present,
missing span_id"| FAIL_ANCHOR + NEEDS_TRACE -->|"yes, complete anchor"| PROBE + PROBE -->|"exact anchor queued"| PENDING_TRACE + PROBE -->|"not seen yet"| AWAIT + + END --> EXPORT --> TRACE_CH --> BUFFER --> ENGINE --> DELTA + ENGINE --> EXTRACT --> COMMIT_TX + COMMIT_TX -->|"delivered"| WRITER --> QUEUE + COMMIT_TX -.->|"dropped / pod died"| RECON + + AWAIT --> RECON + RECON -->|"query Delta by record_uid,
trace_id, span_id, time window"| DELTA + RECON -->|"insert missing anchors"| QUEUE + + QUEUE --> DRAIN --> COMPLETE + COMPLETE -->|"flip matching anchor tuple only"| PENDING_TRACE + QUEUE --> LEASE -->|"retry or dead_lettered"| QUEUE + AWAIT --> TIMEOUT -->|"older than timeout"| FAIL_TIMEOUT + + PENDING_FAST --> POLLER + PENDING_TRACE -->|"after ready_at"| POLLER --> EVAL + EVAL -->|"fetch spans by trace_id"| DELTA + EVAL --> RESULTS ``` -### Two paths for record insertion +Read the diagram left to right by concern: + +- **Eval record ingestion** writes `agent_eval_record`. If the workflow has no `TraceAssertionTask`, the record goes straight to `pending`. If trace assertions are present, ingestion never guesses that a trace is ready because some span in the trace exists; it either finds the exact anchor tuple or leaves the record in `awaiting_trace`. +- **Trace ingestion** writes spans to Delta first. Only after the Delta commit succeeds does the engine extract anchor spans and send them toward the inbox writer. +- **The live path** is deliberately best-effort. `commit_tx` is bounded and uses `try_send`; dropping an event is acceptable because the event can be rebuilt from Delta. +- **The durable queue** is `trace_commit_event`. The inbox worker claims queue rows, completes the rows it owns, and flips only eval rows that match the returned `(record_uid, trace_id, span_id)` anchor. +- **Reconciliation** is the correctness path. It scans older `awaiting_trace` rows, queries Delta for their anchor spans, and inserts any missing queue rows. It does not update `agent_eval_record` directly. +- **The poller** evaluates pending records. Only records that were gated for trace assertions receive the `ready_at` visibility buffer; normal eval records do not wait on tracing. + +### Two server-side paths after insertion -**Path A — Trace-attached (with span correlation):** +Both examples below create eval records for the same online evaluation system. 
The difference is whether the record includes trace correlation metadata. + +**Trace-correlated record:** ```python with tracer.start_as_current_span("agent.callback") as span: @@ -344,13 +417,13 @@ with tracer.start_as_current_span("agent.callback") as span: ) ``` -- `trace_id` and `span_id` are set once by the Rust constructor from the active span context. -- Server immediately checks if trace has committed to Delta Lake (within ~5 seconds of span.attach_eval call). -- If trace found: record inserted as `pending` for immediate evaluation. -- If trace not yet committed: record inserted as `awaiting_trace` with a reference timestamp. -- `ready_at` column provides a visibility window before the poller processes the record, allowing time for task-upstream data to stabilize. +- `attach_eval()` creates the `EvalRecord`; the server still owns workflow routing. +- If the registered profile has no `TraceAssertionTask`, the record is inserted as `pending`. +- If the profile has `TraceAssertionTask`, the server checks `trace_commit_event` for the exact `(record_uid, trace_id, span_id)` anchor. +- If the anchor queue row already exists, the record is inserted as `pending` with a visibility delay. +- If the anchor has not been delivered yet, the record is inserted as `awaiting_trace`. -**Path B — Standalone/content-only (no trace needed):** +**Content-only record:** ```python queue.insert(EvalRecord( @@ -361,29 +434,44 @@ queue.insert(EvalRecord( )) ``` -- No `trace_id`, no `span_id`. Record inserted as `pending` immediately. -- If profile has `TraceAssertionTask` but record has no `trace_id`: record fails with `failed(EvalRequiresTrace)` during evaluation. +- No `trace_id` or `span_id` is attached. +- If the profile has no `TraceAssertionTask`, the record is inserted as `pending`. +- If the profile has `TraceAssertionTask`, the record fails with `failed(EvalRequiresTrace)` because the workflow asked for span data that this record cannot provide. 
### Trace arrival coordination -The `trace_commit_event` table records when traces land in Delta Lake. After a successful Delta append, the server writes a commit event with the trace's `trace_id`, `partition_date`, and timestamp. The inbox worker polls for new commit events and flips matching `awaiting_trace` rows to `pending`. +The `trace_commit_event` table is a durable queue and audit log for anchor spans. It is only used to unblock eval records waiting on trace assertions. Rows move through `pending → processing → processed | dead_lettered` and carry `attempt_count`, `claimed_at`, `claimed_by`, `claim_token`, and `last_error`. The inbox worker claims rows with `FOR UPDATE SKIP LOCKED`, completes owned queue rows, then flips only eval rows that match the returned anchor tuple. + +Delta Lake remains the source of truth. The live channel from the engine actor is a latency optimization: it uses bounded `try_send`, may drop when full or closed, and increments `scouter_trace_commit_event_channel_drop_total`. A reconciliation sweep recovers correctness by scanning old `awaiting_trace` eval rows, querying Delta with bounded time predicates, and inserting missing queue rows with `ON CONFLICT DO NOTHING`. Reconciliation never writes `agent_eval_record`. 
**Failure modes:** | Scenario | Behavior | |---|---| -| Trace commits before record arrives | Short-circuit: record inserted as `pending` directly (0 retry latency) | -| Trace commits after record inserted as `awaiting_trace` | Inbox worker matches trace_id and flips to `pending` within 1-2 poll intervals | +| Record has trace IDs but workflow has no trace assertions | Record inserts as `pending`; tracing is correlation metadata only | +| Anchor commits before record arrives | Short-circuit: record inserted as `pending` by probing the exact anchor | +| Anchor commits after record inserted as `awaiting_trace` | Inbox worker completes the owned queue row, then flips the eval row that matches `(record_uid, trace_id, span_id)` to `pending` | +| Live anchor channel drops | Reconciliation finds the anchor in Delta and inserts the missing queue row | +| Worker dies while processing | Lease recovery returns the row to `pending` or moves it to `dead_lettered` after max attempts | | Trace never commits (client crashed, network partition) | Record remains `awaiting_trace` for 5 minutes, then marked `failed(TraceArrivalTimeout)` | -| Profile requires trace assertions but record has no trace_id | Record fails immediately with `failed(EvalRequiresTrace)` during evaluation | +| Profile requires trace assertions but record has no trace_id | Record fails immediately with `failed(EvalRequiresTrace)` | +| Profile requires trace assertions and record has trace_id but no span_id | Record fails immediately with `failed(EvalRequiresAnchorSpan)` | ### Scheduled maintenance -**Inbox worker:** Polls for `trace_commit_event` rows matching `awaiting_trace` records. Configurable poll interval (default every 10 seconds). Flips status to `pending` and marks the trace_commit_event as processed. +**Inbox worker:** Claims `pending` queue rows, completes rows fenced by `claim_token`, and flips only eval rows matching the returned `(record_uid, trace_id, span_id)` anchors to `pending`. 
+ +**Reconciliation sweep:** Scans old `awaiting_trace` rows and queries Delta for their anchor spans. It inserts missing `trace_commit_event` rows only. + +**Lease recovery sweep:** Recovers stuck `processing` rows after the lease TTL. Rows are retried until `max_attempts`, then kept as `dead_lettered` for operator triage. **Timeout sweep:** Background task that scans `awaiting_trace` rows older than 5 minutes (configurable via `TRACE_ARRIVAL_TIMEOUT_SECS`). Marks them as `failed(TraceArrivalTimeout)` and emits a metric for alerting. -**Prune sweep:** Removes old `trace_commit_event` rows after processing (default: 24 hours). Keeps the table bounded. +**Prune sweep:** Removes old `processed` queue rows after 24 hours. Pending, processing, and dead-lettered rows are not pruned automatically. + +Operators should alert on sustained channel drops, non-zero dead-lettered rows, rising pending count, and oldest pending age above the reconciliation interval. The main metrics are `scouter_trace_commit_event_pending_count`, `scouter_trace_commit_event_oldest_pending_age_seconds`, `scouter_trace_commit_event_channel_drop_total`, `scouter_trace_commit_event_reconciled_total`, `scouter_trace_commit_event_dead_lettered_total`, `scouter_trace_commit_event_lease_recovered_total`, and the claim/flip histograms. + +Migration note: `20260512000000_trace_commit_anchor_queue.sql` is forward-only. Stop the server during the migration; the old trace-level inbox rows are intentionally dropped and any remaining `awaiting_trace` rows are recovered by reconciliation or failed by the timeout sweep. ### Configuration @@ -392,13 +480,19 @@ The `trace_commit_event` table records when traces land in Delta Lake. 
After a s | `TRACE_ARRIVAL_TIMEOUT_SECS` | 300 | How long to wait for a trace to commit before failing the record | | `INBOX_POLL_INTERVAL_SECS` | 10 | How often the inbox worker checks for new commit events | | `TRACE_COMMIT_EVENT_RETENTION_HOURS` | 24 | How long to keep processed commit event rows before pruning | +| `SCOUTER_TRACE_REFRESH_INTERVAL_SECS` | 10 | How often each pod refreshes its Delta snapshot | +| `SCOUTER_TRACE_VISIBILITY_BUFFER_SECS` | refresh + 2 | Delay before pending trace evals are polled; startup fails if below refresh + 2 | +| `SCOUTER_INBOX_RECONCILE_AFTER_SECS` | 15 | How old an `awaiting_trace` row must be before reconciliation scans for it | +| `SCOUTER_INBOX_RECONCILE_LOOKBACK_SECS` | 86400 | Maximum supported anchor span start lookback used by reconciliation Delta queries | +| `SCOUTER_INBOX_RECONCILE_INTERVAL_SECS` | 60 | Reconciliation tick interval | +| `SCOUTER_INBOX_RECONCILE_BATCH` | 200 | Awaiting rows scanned per reconciliation tick | --- ## Same data, unified evaluation -Scouter's observability engine and evaluation pipeline share the same trace data. Spans in Delta Lake feed `TraceAssertionTask` evaluations. An evaluation record attached to a trace is evaluated with that trace's spans available. Results flow into the alert system. +Scouter's observability engine and evaluation pipeline share the same trace data, but only trace-assertion workflows are gated on it. Spans in Delta Lake feed `TraceAssertionTask` evaluations. Records for profiles without trace assertions can still carry trace IDs for lookup, but they are evaluated without waiting for trace storage. -The trace storage, evaluation engine, and alert pipeline are stages of a single data path. Spans arrive, get stored, records correlate with traces, evaluations run, results get stored, and alerts fire. 
+For trace-assertion workflows, the trace storage, evaluation engine, and alert pipeline become one data path: spans arrive, anchor events unblock matching records, evaluations run, results get stored, and alerts fire. -For Delta Lake schema details and compaction internals, see [Storage architecture](/tracing/storage-architecture/). +For Delta Lake schema details and compaction internals, see [Storage architecture](/scouter/tracing/storage-architecture/). diff --git a/docs/src/content/docs/evaluation-platform/offline-evaluation.mdx b/docs/src/content/docs/evaluation-platform/offline-evaluation.mdx index 6f935262e..b7694ee57 100644 --- a/docs/src/content/docs/evaluation-platform/offline-evaluation.mdx +++ b/docs/src/content/docs/evaluation-platform/offline-evaluation.mdx @@ -6,9 +6,9 @@ Offline evaluation gates releases. You run your agent against a suite of test sc Scouter splits this into two layers: a Python orchestrator that handles the messy parts (calling your agent, managing sessions, simulating users) and a Rust evaluation engine that handles the compute (parallel task execution, comparison operators, trace matching). The orchestrator collects data; the engine evaluates it. -For the complete usage guide with code examples, see [Offline evaluation](/agents/offline-evaluation/). +For the complete usage guide with code examples, see [Offline evaluation](/scouter/agents/offline-evaluation/). -> For how these building blocks (task types, DAGs, operators) work, see [Building blocks](/evaluation-platform/eval-profiles-and-tasks/). +> For how these building blocks (task types, DAGs, operators) work, see [Building blocks](/scouter/evaluation-platform/eval-profiles-and-tasks/). --- @@ -207,4 +207,4 @@ When you already have records (from a previous run, a production log export, or Use for: re-evaluating historical data with updated task definitions, evaluating exported production records offline, quick iteration on tasks without running your agent. 
-For details, see [EvalDataset](/agents/eval-dataset/). +For details, see [EvalDataset](/scouter/agents/eval-dataset/). diff --git a/docs/src/content/docs/evaluation-platform/online-evaluation.mdx b/docs/src/content/docs/evaluation-platform/online-evaluation.mdx index cdeb0dbbf..b40c12680 100644 --- a/docs/src/content/docs/evaluation-platform/online-evaluation.mdx +++ b/docs/src/content/docs/evaluation-platform/online-evaluation.mdx @@ -4,9 +4,9 @@ description: "Evaluate sampled production traffic and raise alerts from live beh --- Online evaluation monitors agents in production. You sample live traffic, evaluate it asynchronously on the server, and fire alerts when quality degrades. The application never blocks on evaluation; the monitoring infrastructure is invisible to your users. -This page walks through the full data path, from client-side queue insertion to alert dispatch. For setup and configuration, see [Online evaluation](/agents/online-evaluation/). +This page walks through the full data path, from client-side queue insertion to alert dispatch. For setup and configuration, see [Online evaluation](/scouter/agents/online-evaluation/). -> For system-wide ingestion pipeline details, see [Architecture](/architecture/overview/). +> For system-wide ingestion pipeline details, see [Architecture](/scouter/architecture/overview/). --- @@ -51,11 +51,11 @@ Step by step: 4. On the server, the message router dispatches records to dedicated flume channels by type. Eval records and drift profiles go to the server records channel (bounded at 1000, `SERVER_RECORD_CONSUMER_WORKERS` workers). Trace spans go to a separate trace channel (bounded at 500, `TRACE_CONSUMER_WORKERS` workers). Tags go to a third channel (bounded at 200). Each channel has its own worker pool, so traffic in one type doesn't starve the others. -5. Server record workers process the `EvalRecord` and insert it into PostgreSQL's `agent_eval_record` table. 
The status is set based on trace arrival: `awaiting_trace` (if record has trace_id and trace not yet committed), `pending` (ready for evaluation), or `failed(EvalRequiresTrace)` (if profile needs trace assertions but record has no trace_id). +5. Server record workers process the `EvalRecord` and inspect the registered profile. Trace IDs on the record are correlation metadata unless the workflow includes `TraceAssertionTask`. Records for profiles without trace assertions are inserted as `pending` immediately. Profiles with trace assertions use the inbox path: `awaiting_trace` (complete anchor, not committed yet), `pending` (ready for evaluation), `failed(EvalRequiresTrace)`, or `failed(EvalRequiresAnchorSpan)`. 6. `AgentPoller` workers continuously poll for pending records using `SELECT ... FOR UPDATE SKIP LOCKED`. A record locked by one worker is invisible to others. No external job queue needed. -7. If the profile has `TraceAssertionTask` instances, the poller queries Delta Lake for spans matching the record's `trace_id`. Spans often arrive after eval records (Delta Lake has a 5-second flush interval), so the poller uses exponential backoff: 100ms, 200ms, 400ms, up to a 5-second cap. If spans still aren't available after the timeout, the record is rescheduled with a 30-second delay and retried later. +7. If the profile has `TraceAssertionTask` instances, the poller queries Delta Lake for spans matching the record's `trace_id`. The inbox path already waited for the anchor span; the poller still uses short backoff while the local Delta snapshot catches up to the committed trace data. If spans still aren't available after the timeout, the record is rescheduled with a 30-second delay and retried later. 8. The task DAG executes. Same engine as offline evaluation: topological sort into stages, parallel execution within stages via Tokio `JoinSet`, sequential across stages. 
@@ -87,15 +87,15 @@ The polling query uses PostgreSQL's `FOR UPDATE SKIP LOCKED`, which means a reco ### Trace availability and record status -EvalRecords attached to traces arrive before their spans reach Delta Lake (Delta has a ~5-second flush interval). Records are stored in one of three states: +EvalRecords can carry `trace_id` and `span_id`, but evals run independently of tracing unless the workflow asks for trace data. The inbox path only applies to profiles with `TraceAssertionTask`. Profiles without trace assertions go straight to `pending`, even when the record carries trace correlation IDs. -**Awaiting trace.** If a record references a `trace_id` that hasn't yet committed to Delta Lake, it's inserted as `awaiting_trace`. The inbox worker monitors `trace_commit_event` rows and flips matching records to `pending` when the trace arrives. If the trace doesn't arrive within 5 minutes (configurable), the record fails with `TraceArrivalTimeout`. +**Awaiting trace.** If a trace-assertion profile receives a record that references a trace anchor whose span has not yet committed to Delta Lake, it's inserted as `awaiting_trace`. The engine emits inbox events only for spans stamped with both `scouter.eval.record_uid` and `scouter.eval.profile_uid`; the inbox worker completes owned queue rows, then flips only eval rows that match the returned `(record_uid, trace_id, span_id)` anchor. If the anchor does not arrive within 5 minutes, the record fails with `TraceArrivalTimeout`. -**Pending.** Ready for evaluation. Either the trace already committed when the record arrived (short-circuit), or the inbox worker flipped it from `awaiting_trace` after the trace committed. +**Pending.** Ready for evaluation. Either the exact anchor already committed when the record arrived, or the inbox worker flipped it from `awaiting_trace` after the anchor committed. 
**Failed (EvalRequiresTrace).** If a profile has `TraceAssertionTask` definitions but the record has no `trace_id`, evaluation fails immediately. This prevents incomplete evaluations. -If your profiles don't use `TraceAssertionTask`, records go straight to `pending` (no trace correlation needed). Evaluation latency is just the task DAG execution: nanoseconds for assertion-only profiles, seconds for profiles with `LLMJudgeTask`. +If your profiles don't use `TraceAssertionTask`, records go straight to `pending`; trace correlation can still be stored, but it does not gate evaluation. Evaluation latency is just the task DAG execution: nanoseconds for assertion-only profiles, seconds for profiles with `LLMJudgeTask`. ### Configuration @@ -105,8 +105,10 @@ If your profiles don't use `TraceAssertionTask`, records go straight to `pending | `GENAI_MAX_RETRIES` | 3 | Max reschedule attempts per record | | `TRACE_ARRIVAL_TIMEOUT_SECS` | 300 | How long to wait for a trace to commit before failing the record | | `INBOX_POLL_INTERVAL_SECS` | 10 | How often the inbox worker checks for new commit events | +| `SCOUTER_TRACE_VISIBILITY_BUFFER_SECS` | refresh + 2 | Delay before trace-backed evals are polled; startup fails below `SCOUTER_TRACE_REFRESH_INTERVAL_SECS + 2` | +| `SCOUTER_INBOX_RECONCILE_INTERVAL_SECS` | 60 | How often the reconciliation sweep recovers dropped live inbox events from Delta | -Typical latency: records without trace assertions are evaluated in <1 second. Records awaiting traces are processed once the trace commits (usually within 1-5 seconds). If a trace doesn't commit within 5 minutes, the record fails with `TraceArrivalTimeout`. +Typical latency: records without trace assertions are evaluated in less than 1 second. Records awaiting traces are processed once the trace commits (usually within 1-5 seconds). If a trace doesn't commit within 5 minutes, the record fails with `TraceArrivalTimeout`. 
--- diff --git a/docs/src/content/docs/index.mdx b/docs/src/content/docs/index.mdx index 247111c51..7bdf1b04b 100644 --- a/docs/src/content/docs/index.mdx +++ b/docs/src/content/docs/index.mdx @@ -10,7 +10,7 @@ hero: file: "../../assets/scouter-icon.svg" actions: - text: "Get started" - link: "/installation/" + link: "/scouter/installation/" icon: right-arrow variant: primary - text: "GitHub" @@ -82,7 +82,7 @@ The server is Rust. PostgreSQL stores recent operational data. DataFusion handle ## Reference -- [API reference](/api/) — top-level Python exports -- [Architecture](/architecture/overview/) — runtime components and data flow -- [Tech specs](/specs/) — design notes for queueing, archival, gRPC, and evaluation internals -- [Evaluation platform](/evaluation-platform/) — deep dive on online and offline evaluation +- [API reference](/scouter/api/) — top-level Python exports +- [Architecture](/scouter/architecture/overview/) — runtime components and data flow +- [Tech specs](/scouter/specs/) — design notes for queueing, archival, gRPC, and evaluation internals +- [Evaluation platform](/scouter/evaluation-platform/) — deep dive on online and offline evaluation diff --git a/docs/src/content/docs/monitoring/inference.md b/docs/src/content/docs/monitoring/inference.md index 4905ed5e0..090abcc16 100644 --- a/docs/src/content/docs/monitoring/inference.md +++ b/docs/src/content/docs/monitoring/inference.md @@ -45,7 +45,7 @@ In addition to the HTTP transport config, Scouter also support the following tra - **RabbitMQ**: `from_path(path, transport_config=RabbitMQConfig())` - **Redis**: `from_path(path, transport_config=RedisConfig())` -For more information on how to configure these transports, please refer to the [queue](/api/) documentation and the server documentation. +For more information on how to configure these transports, please refer to the [queue](/scouter/api/) documentation and the server documentation. 
## Inserting data @@ -230,8 +230,8 @@ qa_record = QARecord( record = EvalRecord(context=qa_record) ``` -For more information on creating `EvalRecord` objects, please refer to the [Agent Tasks Documentation](/agents/tasks/). +For more information on creating `EvalRecord` objects, please refer to the [Agent Tasks Documentation](/scouter/agents/tasks/). ### Ready to go! -And that's all you need to get started for real-time model monitoring with Scouter. For more technical discussion on the ScouterQueue, please refer to the [ScouterQueue](/specs/scouter-queue/) documentation. +And that's all you need to get started for real-time model monitoring with Scouter. For more technical discussion on the ScouterQueue, please refer to the [ScouterQueue](/scouter/specs/scouter-queue/) documentation. diff --git a/docs/src/content/docs/monitoring/spc/theory.mdx b/docs/src/content/docs/monitoring/spc/theory.mdx index 52196fc18..27627ade1 100644 --- a/docs/src/content/docs/monitoring/spc/theory.mdx +++ b/docs/src/content/docs/monitoring/spc/theory.mdx @@ -57,7 +57,7 @@ Where $k$ is the number of standard deviations from the grand mean. Typically, $ Out of the box, `Scouter` will calculate the center line and control limits for 3 zones ($\pm{1}$, $\pm{2}$ and $\pm{3}$). This is based on the 3 sigma rule in process control. 
The resulting chart and zones would appear as follows if plotted: -SPC control chart +SPC control chart Each dot on the chart represents the mean of a sample from the process being monitored @@ -104,7 +104,7 @@ In addition to the 8 digit rule, `Scouter` will also check for a consecutive tre ### Example Alert -SPC control chart alert example +SPC control chart alert example ### Custom Alerts diff --git a/docs/src/content/docs/server/index.md b/docs/src/content/docs/server/index.md index d6bc085c5..5c765fd94 100644 --- a/docs/src/content/docs/server/index.md +++ b/docs/src/content/docs/server/index.md @@ -112,7 +112,7 @@ There are a few different ways to deploy the Scouter server in production. ### Prerequisites -Scouter requires a **PostgreSQL 16.3+** database with the `pg_partman` and `pg_cron` extensions. See the [PostgreSQL setup guide](/server/postgres/) for details. +Scouter requires a **PostgreSQL 16.3+** database with the `pg_partman` and `pg_cron` extensions. See the [PostgreSQL setup guide](/scouter/server/postgres/) for details. 
Set the following environment variable before starting the server: diff --git a/docs/src/content/docs/specs/data-archive.md b/docs/src/content/docs/specs/data-archive.md index 1806f2161..273e1fe10 100644 --- a/docs/src/content/docs/specs/data-archive.md +++ b/docs/src/content/docs/specs/data-archive.md @@ -32,7 +32,7 @@ In addition to `ObjectStorageSettings`, `DatabaseSettings` now takes on an addit ## Component Architecture -Data Archive Architecture +Data Archive Architecture ## Implementation Details diff --git a/docs/src/content/docs/specs/scouter-queue.md b/docs/src/content/docs/specs/scouter-queue.md index ece37f1f0..f7008bda8 100644 --- a/docs/src/content/docs/specs/scouter-queue.md +++ b/docs/src/content/docs/specs/scouter-queue.md @@ -7,7 +7,7 @@ The Scouter Queue is the primary interface for sending real-time data to the Sco ## Component Architecture -Scouter Queue Architecture +Scouter Queue Architecture ## How it works diff --git a/docs/src/content/docs/tracing/genai-semantics.mdx b/docs/src/content/docs/tracing/genai-semantics.mdx index b398ac615..2aff0483b 100644 --- a/docs/src/content/docs/tracing/genai-semantics.mdx +++ b/docs/src/content/docs/tracing/genai-semantics.mdx @@ -290,7 +290,7 @@ LangchainInstrumentor().instrument() -See the [ScouterInstrumentor guide](/tracing/instrumentor/) for full setup and framework coverage. +See the [ScouterInstrumentor guide](/scouter/tracing/instrumentor/) for full setup and framework coverage. --- @@ -480,4 +480,4 @@ Scouter's GenAI span data is consumed by [OpsML](https://github.com/demml/opsml) ## TraceAssertionTask -GenAI spans stored in Scouter are queryable by `TraceAssertionTask` during offline or online evaluation — useful for writing assertions against token budgets, latency SLAs, or model behavior across production traces. See [TraceAssertionTask](/agents/tasks/#traceassertiontask) for details. 
+GenAI spans stored in Scouter are queryable by `TraceAssertionTask` during offline or online evaluation — useful for writing assertions against token budgets, latency SLAs, or model behavior across production traces. See [TraceAssertionTask](/scouter/agents/tasks/#traceassertiontask) for details. diff --git a/docs/src/content/docs/tracing/instrumentor.md b/docs/src/content/docs/tracing/instrumentor.md index 3b51e979b..690850789 100644 --- a/docs/src/content/docs/tracing/instrumentor.md +++ b/docs/src/content/docs/tracing/instrumentor.md @@ -4,7 +4,7 @@ description: "Reference for the OpenTelemetry-compatible Scouter instrumentor." --- `ScouterInstrumentor` implements the standard OpenTelemetry `BaseInstrumentor` interface. Once instrumented, it registers Scouter's `TracerProvider` as the global OTEL provider, so any library that calls `opentelemetry.trace.get_tracer()` routes spans through Scouter automatically. -See the [overview](/tracing/overview/#scouterinstrumentor-lifecycle) for details on the `ScouterInstrumentor` lifecycle. +See the [overview](/scouter/tracing/overview/#scouterinstrumentor-lifecycle) for details on the `ScouterInstrumentor` lifecycle. For normal application code, this is the recommended tracing entrypoint. Call `instrument()` once at startup, get tracers from `opentelemetry.trace`, and shut the provider down once at process exit. diff --git a/docs/src/content/docs/tracing/overview.mdx b/docs/src/content/docs/tracing/overview.mdx index 25f353ffb..c0a70c072 100644 --- a/docs/src/content/docs/tracing/overview.mdx +++ b/docs/src/content/docs/tracing/overview.mdx @@ -6,7 +6,7 @@ import { Tabs, TabItem } from "@astrojs/starlight/components"; Scouter provides OpenTelemetry-compatible distributed tracing built on a Rust core with Python OTEL wrappers. The recommended setup is to install Scouter as the process-wide OpenTelemetry `TracerProvider` with `ScouterInstrumentor()`, then get tracers from `opentelemetry.trace`. 
-Traces captured by Scouter can be evaluated offline or in production using [`TraceAssertionTask`](/agents/tasks/#traceassertiontask) — validating span execution order, latency SLAs, token budgets, and more. +Traces captured by Scouter can be evaluated offline or in production using [`TraceAssertionTask`](/scouter/agents/tasks/#traceassertiontask) — validating span execution order, latency SLAs, token budgets, and more. ## Architecture @@ -161,7 +161,7 @@ Scouter's tracing layer is built on top of the OpenTelemetry SDK. You can use Sc `ScouterInstrumentor` is a singleton — calling it multiple times returns the same instance. -Full API reference, framework integration examples (OpenAI Agents SDK, LangChain, LlamaIndex, CrewAI, FastAPI), default attributes, and local span capture patterns are covered in the [ScouterInstrumentor guide](/tracing/instrumentor/). +Full API reference, framework integration examples (OpenAI Agents SDK, LangChain, LlamaIndex, CrewAI, FastAPI), default attributes, and local span capture patterns are covered in the [ScouterInstrumentor guide](/scouter/tracing/instrumentor/). 
```python from scouter.tracing import ScouterInstrumentor diff --git a/docs/src/styles/custom.css b/docs/src/styles/custom.css index e74e3712a..fe116f05f 100644 --- a/docs/src/styles/custom.css +++ b/docs/src/styles/custom.css @@ -253,13 +253,18 @@ h3 { /* ═══════════════════════════════════════════════════════ TABLES ═══════════════════════════════════════════════════════ */ -table { - display: table; - width: 100%; +.sl-markdown-content table { + display: block; + width: max-content; + min-width: 100%; + max-width: 100%; + overflow-x: auto; + -webkit-overflow-scrolling: touch; border-collapse: collapse; + white-space: nowrap; } -table th { +.sl-markdown-content table th { background: var(--sl-color-bg-nav); font-weight: 600; letter-spacing: 0.03em; @@ -269,24 +274,24 @@ table th { text-align: left; } -table td { +.sl-markdown-content table td { padding: 0.65rem 1rem; } -table td, -table th { +.sl-markdown-content table td, +.sl-markdown-content table th { border-bottom: 1px solid var(--sl-color-hairline-light); } -table tr:last-child td { +.sl-markdown-content table tr:last-child td { border-bottom: none; } -table tbody tr { +.sl-markdown-content table tbody tr { transition: background 100ms ease; } -table tbody tr:hover { +.sl-markdown-content table tbody tr:hover { background: color-mix(in srgb, var(--sl-color-accent) 5%, transparent); }