diff --git a/internal/rating/rater_test.go b/internal/rating/rater_test.go index 00738b8..c828d45 100644 --- a/internal/rating/rater_test.go +++ b/internal/rating/rater_test.go @@ -440,9 +440,12 @@ func supersededReconcileStore() (*oracleStore, time.Time, time.Time) { // TestRater_RoutineReconcileDeleteLogsError pins the LOUD half of the reconcile // observability contract (option (c)): a ROUTINE run (windowExplicit == false) that // reconcile-DELETES a previously-billed rollup must emit an ERROR line (page someone), -// even with NO other anomaly. RED before FIX 1: the old code logged the reconcile-delete -// UNCONDITIONALLY at INFO (testLogger discards INFO), so the ERROR stream was empty and -// this assertion failed — the page never fired on a routine bill rewrite. +// even with NO other anomaly. It uses captureLogger() (which buffers BOTH the INFO and +// ERROR streams, so the assertion is on WHICH severity the line was emitted at — not on +// a level filter discarding it). RED before FIX 1: the old code logged the +// reconcile-delete UNCONDITIONALLY at INFO, so the captured ERROR stream stayed empty +// and the errBuf.Len()==0 assertion failed — the page never fired on a routine bill +// rewrite. func TestRater_RoutineReconcileDeleteLogsError(t *testing.T) { store, ws, we := supersededReconcileStore() log, infoBuf, errBuf := captureLogger() diff --git a/internal/rating/store_integration_test.go b/internal/rating/store_integration_test.go index 6392ef7..c202fc4 100644 --- a/internal/rating/store_integration_test.go +++ b/internal/rating/store_integration_test.go @@ -935,51 +935,51 @@ func TestIntegration_RatingInstantIndexServesScan(t *testing.T) { exec(t, db, schemaDDL) exec(t, db, "SET enable_seqscan = off") - rows, err := db.Query(`EXPLAIN SELECT * FROM billing_event + plan := explainPlan(t, db, `EXPLAIN SELECT * FROM billing_event WHERE COALESCE(event_ts, created_at) >= '2026-06-08T10:00:00Z' AND COALESCE(event_ts, created_at) < '2026-06-08T11:00:00Z'`) - if err != nil { - t.Fatalf("explain: %v", err) - } - defer rows.Close() - var plan string - for rows.Next() { - var line string - if err := rows.Scan(&line); err != nil { - t.Fatalf("scan: %v", err) - } - plan += line + "\n" - } - if err := rows.Err(); err != nil { - t.Fatalf("rows: %v", err) - } if !strings.Contains(plan, "billing_event_rating_instant_ix") { t.Fatalf("plan does not use billing_event_rating_instant_ix:\n%s", plan) } } -// TestIntegration_ReconcileDeleteUsesWindowStartIndex: the reconcile DELETE (the +// TestIntegration_ReconcileDeleteCanUseWindowStartIndex: the reconcile DELETE (the // `deleted` CTE) filters rated_usage on window_start ALONE. Every other index leads -// with auth_id, leaving window_start trailing → no index can serve a -// window_start-only range scan, so without rated_usage_window_start_ix the reconcile -// seq-scans rated_usage (and takes a full-trailing-window lock footprint) on every -// run. This pins that the window_start-leading index exists AND the planner uses it -// for the reconcile's EXACT predicate. +// with auth_id, leaving window_start trailing → no auth-leading index can serve a +// window_start-only range scan as a tight slice. This pins the STRUCTURAL invariant +// that the dedicated window_start-leading index exists AND can serve the reconcile's +// EXACT predicate — proven by forcing the planner's hand with `enable_seqscan=off`. +// +// CORRECTNESS (hard) vs PERFORMANCE (soft) — what each pass actually proves: +// +// - HARD (t.Fatalf): with seqscan DISABLED, the plan MUST scan rated_usage via +// rated_usage_window_start_ix. This is a structural/correctness invariant — it +// proves the index CAN serve the window_start-only predicate at all. Its failure +// mode (no usable index for the predicate) is a real defect, so it gates. +// +// - SOFT (t.Logf only): whether the planner PREFERS the dedicated index at DEFAULT +// cost (seqscan enabled) over the auth-leading composite is a PERFORMANCE signal, +// not a correctness gate. Empirically the planner picks rated_usage_window_start_ix +// here (dropping it falls back to the costlier composite full-index scan, ~280 vs +// ~8), but that choice is the planner's cost model — version- and GUC-fragile. Its +// failure mode is a SLOW reconcile, not a wrong bill, so per rigor-by-cost-of- +// being-wrong it must NOT be a fragile correctness-style gate. We log the plan and +// which index it chose so a regression is visible, without flaking the suite when +// Postgres's costing shifts across major versions. // // REAL PROOF, not vacuous: an earlier version EXPLAINed a `SELECT 1` against a // freshly-created EMPTY rated_usage. On an empty (or tiny) table the planner // seqscans regardless of any index — and the standalone SELECT does not even // resemble the reconcile DELETE — so a passing assertion proved nothing about the // actual statement. Here we (a) POPULATE rated_usage with many rows across many -// hours so a narrow window-slice is genuinely cheaper by index than a seqscan, then -// (b) EXPLAIN the ACTUAL reconcile DELETE from store.go (the `deleted` CTE's -// rated_usage table-access: the window_start range plus the NOT EXISTS anti-join -// against this run's priced rows) and assert it scans rated_usage via -// rated_usage_window_start_ix. We assert the index is chosen by DEFAULT cost -// (seqscan ENABLED) — that is the load-bearing proof the index PAYS OFF for the -// reconcile's predicate, not just that one CAN be forced. A seqscan-disabled pass -// follows as a belt-and-braces check that the index can serve the predicate at all. -func TestIntegration_ReconcileDeleteUsesWindowStartIndex(t *testing.T) { +// hours so a narrow window-slice is genuinely index-servable, then (b) EXPLAIN the +// ACTUAL reconcile DELETE from store.go (the `deleted` CTE's rated_usage +// table-access: the window_start range plus the NOT EXISTS anti-join against this +// run's priced rows). The default-cost pass is checked by index NAME via an index +// scan node — `Index Scan using rated_usage_window_start_ix on rated_usage ru` — +// not a bare substring that the composite would also match; that discriminating +// check is the SOFT signal. The seqscan-off pass is the HARD structural gate. +func TestIntegration_ReconcileDeleteCanUseWindowStartIndex(t *testing.T) { dsn := os.Getenv("PHOEBE_TEST_DATABASE_URL") if dsn == "" { t.Skip("PHOEBE_TEST_DATABASE_URL not set; skipping live-Postgres conformance") @@ -1037,29 +1037,33 @@ func TestIntegration_ReconcileDeleteUsesWindowStartIndex(t *testing.T) { AND p.window_start = ru.window_start )` - // (1) Default-cost plan (seqscan ENABLED): the LOAD-BEARING proof the index PAYS - // OFF — the planner chooses rated_usage_window_start_ix on cost, not because - // seqscan was forbidden. This is discriminating: verified empirically, dropping - // the index makes the planner fall back to the auth-leading composite - // (auth_id, window_start) as a far costlier full-index scan (~280 vs ~8), so this - // assertion FAILS without the window_start index — it is not vacuous. (The - // fallback is an index scan, not a literal Seq Scan, because the composite still - // covers window_start as a non-leading column; the point stands — the dedicated - // window_start index is what the reconcile's window-only predicate should use.) + // (1) Default-cost plan (seqscan ENABLED) — SOFT PERFORMANCE SIGNAL, not a gate. + // Whether the planner PREFERS the dedicated window_start index over the auth-leading + // composite is a cost-model decision that is fragile across PG major versions / GUCs; + // its failure mode is a slow reconcile, not a wrong bill. So we LOG (never t.Fatalf) + // whether the default plan uses the dedicated index by NAME via an index scan node + // — "using rated_usage_window_start_ix" — which is discriminating (it would NOT match + // the auth-leading composite). Empirically the planner picks it here; if a future PG + // stops preferring it, this log makes the regression visible without flaking the run. plan := explainPlan(t, db, reconcileDelete) - if !strings.Contains(plan, "rated_usage_window_start_ix") { - t.Fatalf("reconcile DELETE did not choose rated_usage_window_start_ix at DEFAULT cost on a populated table (a window_start-only slice should prefer the dedicated index):\n%s", plan) - } - if strings.Contains(plan, "Seq Scan on rated_usage ru") { - t.Fatalf("reconcile DELETE seq-scans rated_usage at default cost — the window_start index is not serving the range predicate:\n%s", plan) - } - - // (2) Belt-and-braces: with seqscan disabled the index must STILL be able to serve - // the predicate (a remaining seqscan would mean no usable index at all). + if strings.Contains(plan, "using rated_usage_window_start_ix") { + t.Logf("default-cost reconcile DELETE plan uses the dedicated window_start index (perf signal OK):\n%s", plan) + } else { + t.Logf("PERF SIGNAL: default-cost reconcile DELETE did NOT choose rated_usage_window_start_ix "+ + "(planner cost model may have shifted; reconcile still correct, possibly slower):\n%s", plan) + } + + // (2) Seqscan DISABLED — HARD STRUCTURAL/CORRECTNESS GATE. With seqscan forbidden the + // plan MUST scan rated_usage through rated_usage_window_start_ix, proving the index + // CAN serve the window_start-only predicate at all. A remaining seqscan (or any other + // index) here means no usable index for the reconcile's exact predicate — a real + // defect — so THIS is what we t.Fatalf on. We assert by NAME via the index scan node + // ("using rated_usage_window_start_ix") so a plan that fell through to the composite + // would NOT satisfy it. exec(t, db, "SET enable_seqscan = off") plan2 := explainPlan(t, db, reconcileDelete) - if !strings.Contains(plan2, "rated_usage_window_start_ix") { - t.Fatalf("reconcile DELETE predicate cannot be served by rated_usage_window_start_ix even with seqscan off:\n%s", plan2) + if !strings.Contains(plan2, "using rated_usage_window_start_ix") { + t.Fatalf("reconcile DELETE predicate cannot be served by rated_usage_window_start_ix even with seqscan off (no usable window_start index for the window-only predicate):\n%s", plan2) } }