diff --git a/examples/gastown/maintenance_scripts_test.go b/examples/gastown/maintenance_scripts_test.go index 23f74d071f..646c6146ff 100644 --- a/examples/gastown/maintenance_scripts_test.go +++ b/examples/gastown/maintenance_scripts_test.go @@ -7335,3 +7335,338 @@ exit 0 t.Fatalf("cross-rig-deps summary missing or wrong (subshell counter regression?)\nwant substring: %q\ngot output:\n%s\nbd log:\n%s", want, out, logData) } } + +// reaperEscalationEnv builds the env map shared by the escalation-dedupe tests. +// The session-prune anomaly path is the simplest way to drive reaper.sh through +// the ESCALATION branch: writeMaintenanceBdStub honors BD_PRUNE_COUNT, and any +// value > 1000 triggers a record_anomaly call. +func reaperEscalationEnv(cityDir, binDir, stateDir, doltLog, bdLog, gcLog string) map[string]string { + return map[string]string{ + "BD_CALL_LOG": bdLog, + "BD_PRUNE_COUNT": "1500", + "DOLT_ARGS_LOG": doltLog, + "DOLT_DBS": "beads", + "GC_CALL_LOG": gcLog, + "GC_CITY": cityDir, + "GC_CITY_PATH": cityDir, + "GC_DOLT_HOST": "127.0.0.1", + "GC_DOLT_PORT": "3307", + "GC_DOLT_USER": "root", + "GC_DOLT_PASSWORD": "", + "GC_PACK_STATE_DIR": stateDir, + "PATH": binDir + string(os.PathListSeparator) + os.Getenv("PATH"), + } +} + +// countEscalationMails returns the number of ESCALATION lines in the gc log +// targeting the reaper subject. Existing tests rely on substring matching on a +// single line, so the same approach generalises here. 
+func countEscalationMails(t *testing.T, gcLog, subject string) int { + t.Helper() + data, err := os.ReadFile(gcLog) + if err != nil { + t.Fatalf("ReadFile(%s): %v", gcLog, err) + } + count := 0 + for _, line := range strings.Split(string(data), "\n") { + if strings.Contains(line, "mail send mayor/") && strings.Contains(line, subject) { + count++ + } + } + return count +} + +func TestReaperEscalationSuppressesRepeatAnomalyWithinCooldown(t *testing.T) { + cityDir := t.TempDir() + writeCityBeadsMetadata(t, cityDir, "beads") + binDir := t.TempDir() + stateDir := t.TempDir() + doltLog := filepath.Join(t.TempDir(), "dolt-args.log") + bdLog := filepath.Join(t.TempDir(), "bd.log") + gcLog := filepath.Join(t.TempDir(), "gc.log") + + writeMaintenanceDoltStub(t, filepath.Join(binDir, "dolt")) + writeMaintenanceBdStub(t, filepath.Join(binDir, "bd")) + writeExecutable(t, filepath.Join(binDir, "gc"), `#!/bin/sh +printf '%s\n' "$*" >> "$GC_CALL_LOG" +exit 0 +`) + + env := reaperEscalationEnv(cityDir, binDir, stateDir, doltLog, bdLog, gcLog) + + script := filepath.Join(exampleDir(), "packs", "maintenance", "assets", "scripts", "reaper.sh") + runScript(t, script, env) + runScript(t, script, env) + + subject := "ESCALATION: Reaper anomalies detected [MEDIUM]" + if got := countEscalationMails(t, gcLog, subject); got != 1 { + t.Fatalf("expected one ESCALATION send across two ticks (dedupe), got %d:\n%s", got, mustReadFile(t, gcLog)) + } + + stateFile := filepath.Join(stateDir, "reaper-state.json") + stateBytes, err := os.ReadFile(stateFile) + if err != nil { + t.Fatalf("ReadFile(state): %v", err) + } + var state struct { + Escalations map[string]struct { + Subject string `json:"subject"` + LastSentAt string `json:"last_sent_at"` + SuppressedCount int `json:"suppressed_count"` + } `json:"escalations"` + } + if err := json.Unmarshal(stateBytes, &state); err != nil { + t.Fatalf("Unmarshal state: %v\n%s", err, stateBytes) + } + if len(state.Escalations) != 1 { + t.Fatalf("expected 
exactly one dedupe entry, got %d:\n%s", len(state.Escalations), stateBytes) + } + var only struct { + Subject string + SuppressedCount int + } + for _, v := range state.Escalations { + only.Subject = v.Subject + only.SuppressedCount = v.SuppressedCount + } + if only.Subject != subject { + t.Fatalf("dedupe entry subject mismatch: want %q, got %q", subject, only.Subject) + } + if only.SuppressedCount != 1 { + t.Fatalf("expected suppressed_count=1 after one suppressed tick, got %d:\n%s", only.SuppressedCount, stateBytes) + } +} + +func TestReaperEscalationClearsStateWhenAnomaliesResolve(t *testing.T) { + cityDir := t.TempDir() + writeCityBeadsMetadata(t, cityDir, "beads") + binDir := t.TempDir() + stateDir := t.TempDir() + doltLog := filepath.Join(t.TempDir(), "dolt-args.log") + bdLog := filepath.Join(t.TempDir(), "bd.log") + gcLog := filepath.Join(t.TempDir(), "gc.log") + + writeMaintenanceDoltStub(t, filepath.Join(binDir, "dolt")) + writeMaintenanceBdStub(t, filepath.Join(binDir, "bd")) + writeExecutable(t, filepath.Join(binDir, "gc"), `#!/bin/sh +printf '%s\n' "$*" >> "$GC_CALL_LOG" +exit 0 +`) + + script := filepath.Join(exampleDir(), "packs", "maintenance", "assets", "scripts", "reaper.sh") + subject := "ESCALATION: Reaper anomalies detected [MEDIUM]" + + // Tick 1: anomaly present (BD_PRUNE_COUNT=1500 > 1000 threshold). Sends fresh. + env1 := reaperEscalationEnv(cityDir, binDir, stateDir, doltLog, bdLog, gcLog) + runScript(t, script, env1) + if got := countEscalationMails(t, gcLog, subject); got != 1 { + t.Fatalf("tick 1 should send a fresh escalation, got %d in log:\n%s", got, mustReadFile(t, gcLog)) + } + + // Tick 2: no anomaly (BD_PRUNE_COUNT below threshold). clear_escalation_state + // must wipe the dedupe entry so the next anomaly escalates fresh. 
+ env2 := reaperEscalationEnv(cityDir, binDir, stateDir, doltLog, bdLog, gcLog) + env2["BD_PRUNE_COUNT"] = "0" + runScript(t, script, env2) + + stateFile := filepath.Join(stateDir, "reaper-state.json") + stateBytes, err := os.ReadFile(stateFile) + if err != nil { + t.Fatalf("ReadFile(state): %v", err) + } + var stateAfterClear struct { + Escalations map[string]any `json:"escalations"` + } + if err := json.Unmarshal(stateBytes, &stateAfterClear); err != nil { + t.Fatalf("Unmarshal: %v\n%s", err, stateBytes) + } + if len(stateAfterClear.Escalations) != 0 { + t.Fatalf("expected escalations to be cleared after no-anomaly tick, got %d entries:\n%s", len(stateAfterClear.Escalations), stateBytes) + } + + // Tick 3: anomaly returns. Must escalate fresh (no suppression carried over). + env3 := reaperEscalationEnv(cityDir, binDir, stateDir, doltLog, bdLog, gcLog) + runScript(t, script, env3) + if got := countEscalationMails(t, gcLog, subject); got != 2 { + t.Fatalf("expected 2 total escalations after fresh anomaly tick, got %d:\n%s", got, mustReadFile(t, gcLog)) + } +} + +func TestReaperEscalationLabelsBeadAfterSend(t *testing.T) { + cityDir := t.TempDir() + writeCityBeadsMetadata(t, cityDir, "beads") + binDir := t.TempDir() + stateDir := t.TempDir() + doltLog := filepath.Join(t.TempDir(), "dolt-args.log") + bdLog := filepath.Join(t.TempDir(), "bd.log") + gcLog := filepath.Join(t.TempDir(), "gc.log") + + writeMaintenanceDoltStub(t, filepath.Join(binDir, "dolt")) + writeMaintenanceBdStub(t, filepath.Join(binDir, "bd")) + // gc stub mirrors real `gc mail send`'s success line so the helper's + // awk parser can extract the bead id and follow up with `bd label add`. 
+ writeExecutable(t, filepath.Join(binDir, "gc"), `#!/bin/sh +printf '%s\n' "$*" >> "$GC_CALL_LOG" +if [ "$1" = "mail" ] && [ "$2" = "send" ]; then + printf 'Sent message %s to mayor/\n' "${GC_STUB_MAIL_ID:-stub-id}" +fi +exit 0 +`) + + env := reaperEscalationEnv(cityDir, binDir, stateDir, doltLog, bdLog, gcLog) + env["GC_STUB_MAIL_ID"] = "stub-anomaly-id" + + runScript(t, filepath.Join(exampleDir(), "packs", "maintenance", "assets", "scripts", "reaper.sh"), env) + + bdData, err := os.ReadFile(bdLog) + if err != nil { + t.Fatalf("ReadFile(bd log): %v", err) + } + bdText := string(bdData) + want := "label add stub-anomaly-id wisp_type:escalation" + if !strings.Contains(bdText, want) { + t.Fatalf("reaper did not label the escalation bead with wisp_type:escalation; bd log:\n%s", bdText) + } +} + +func TestReaperEscalationReleaseFooterReportsSuppressedCount(t *testing.T) { + cityDir := t.TempDir() + writeCityBeadsMetadata(t, cityDir, "beads") + binDir := t.TempDir() + stateDir := t.TempDir() + doltLog := filepath.Join(t.TempDir(), "dolt-args.log") + bdLog := filepath.Join(t.TempDir(), "bd.log") + gcLog := filepath.Join(t.TempDir(), "gc.log") + + writeMaintenanceDoltStub(t, filepath.Join(binDir, "dolt")) + writeMaintenanceBdStub(t, filepath.Join(binDir, "bd")) + writeExecutable(t, filepath.Join(binDir, "gc"), `#!/bin/sh +printf '%s\n' "$*" >> "$GC_CALL_LOG" +exit 0 +`) + + subject := "ESCALATION: Reaper anomalies detected [MEDIUM]" + env := reaperEscalationEnv(cityDir, binDir, stateDir, doltLog, bdLog, gcLog) + script := filepath.Join(exampleDir(), "packs", "maintenance", "assets", "scripts", "reaper.sh") + + // Run once so the helper records its own dedupe entry — we use this + // rather than a hand-computed sha256 so the test is robust to changes in + // the helper's exact payload normalisation. 
+ runScript(t, script, env) + if got := countEscalationMails(t, gcLog, subject); got != 1 { + t.Fatalf("first run should send one fresh escalation, got %d", got) + } + + // Mutate the state file in place: backdate last_sent_at to far past the + // cooldown window and pump suppressed_count to 3. The next tick must + // release with the cadence footer. + stateFile := filepath.Join(stateDir, "reaper-state.json") + stateBytes, err := os.ReadFile(stateFile) + if err != nil { + t.Fatalf("ReadFile(state): %v", err) + } + var state map[string]map[string]map[string]any + if err := json.Unmarshal(stateBytes, &state); err != nil { + t.Fatalf("Unmarshal: %v\n%s", err, stateBytes) + } + entries, ok := state["escalations"] + if !ok || len(entries) != 1 { + t.Fatalf("expected one dedupe entry after first run, got %v:\n%s", entries, stateBytes) + } + staleISO := "2025-01-01T00:00:00Z" + for key := range entries { + entries[key]["last_sent_at"] = staleISO + entries[key]["suppressed_count"] = 3 + } + mutated, err := json.Marshal(state) + if err != nil { + t.Fatalf("Marshal: %v", err) + } + if err := os.WriteFile(stateFile, mutated, 0o644); err != nil { + t.Fatalf("WriteFile(state): %v", err) + } + + // Truncate the gc log so the next-run assertion sees only the released send. 
+ if err := os.WriteFile(gcLog, nil, 0o644); err != nil { + t.Fatalf("Truncate(gc log): %v", err) + } + + runScript(t, script, env) + gcData, err := os.ReadFile(gcLog) + if err != nil { + t.Fatalf("ReadFile(gc log): %v", err) + } + gcText := string(gcData) + if !strings.Contains(gcText, subject) { + t.Fatalf("reaper did not send the released ESCALATION:\n%s", gcText) + } + if !strings.Contains(gcText, "Suppressed 3 time(s)") { + t.Fatalf("released ESCALATION missing suppressed-count footer in gc log:\n%s", gcText) + } + if !strings.Contains(gcText, staleISO) { + t.Fatalf("released ESCALATION footer missing stale last_sent_at %q:\n%s", staleISO, gcText) + } +} + +func TestJsonlSpikeEscalationSuppressesRepeats(t *testing.T) { + cityDir := t.TempDir() + binDir := t.TempDir() + stateDir := t.TempDir() + archiveRepo := filepath.Join(t.TempDir(), "archive") + gcLog := filepath.Join(t.TempDir(), "gc.log") + mailLog := filepath.Join(t.TempDir(), "gc-mail.log") + + // Seed the archive with a small prev count so the next export's record + // count drives a 100%+ percentage delta (well above the 20% threshold). + initSeedArchive(t, archiveRepo, 10) + // Current export reports 50 rows — a 400% delta against prev_count=10. + writeMultiRecordDoltStub(t, binDir, 50) + writeJsonlExportGCStub(t, binDir) + writeExecutable(t, filepath.Join(binDir, "bd"), `#!/bin/sh +if [ -n "${BD_LOG:-}" ]; then + printf '%s\n' "$*" >> "$BD_LOG" +fi +exit 0 +`) + linkTestPathTool(t, binDir, "git") + linkTestPathTool(t, binDir, "jq") + linkTestPathTool(t, binDir, "awk") + linkTestPathTool(t, binDir, "mktemp") + linkTestPathTool(t, binDir, "shasum") + + env := jsonlExportEnv(t, cityDir, binDir, stateDir, archiveRepo, gcLog, mailLog) + // The legacy path makes new STATE_FILE coexist with $CITY/.gc/...; our + // helper only sees STATE_FILE, so we need the script to land on the new + // PACK_STATE_DIR path. jsonlExportEnv already sets GC_PACK_STATE_DIR. 
+ + script := filepath.Join(exampleDir(), "packs", "maintenance", "assets", "scripts", "jsonl-export.sh") + runScript(t, script, env) + runScript(t, script, env) + + subject := "ESCALATION: JSONL spike detected [HIGH]" + if got := countEscalationMails(t, mailLog, subject); got != 1 { + t.Fatalf("expected one JSONL spike escalation across two runs (dedupe), got %d:\n%s", got, mustReadFile(t, mailLog)) + } + + stateBytes, err := os.ReadFile(filepath.Join(stateDir, "jsonl-export-state.json")) + if err != nil { + t.Fatalf("ReadFile(state): %v", err) + } + if !strings.Contains(string(stateBytes), `"escalations"`) { + t.Fatalf("expected escalations key in jsonl-export state after first send:\n%s", stateBytes) + } + if !strings.Contains(string(stateBytes), subject) { + t.Fatalf("expected stored subject %q in jsonl-export state:\n%s", subject, stateBytes) + } +} + +// mustReadFile returns the contents of path, or a "(read PATH: ERR)" placeholder string when the read fails (it never fails the test itself) — handy for one-line assertion error +// messages that need to dump the failing log alongside the diagnosis. +func mustReadFile(t *testing.T, path string) string { + t.Helper() + data, err := os.ReadFile(path) + if err != nil { + return fmt.Sprintf("(read %s: %v)", path, err) + } + return string(data) +} diff --git a/examples/gastown/packs/maintenance/assets/scripts/escalation.sh b/examples/gastown/packs/maintenance/assets/scripts/escalation.sh new file mode 100755 index 0000000000..60b61552f5 --- /dev/null +++ b/examples/gastown/packs/maintenance/assets/scripts/escalation.sh @@ -0,0 +1,254 @@ +#!/usr/bin/env bash +# escalation — shared helper for ESCALATION mails from maintenance scripts. +# +# Wraps `gc mail send` with three layers: +# (1) per-(subject,body) dedupe state — sha256(subject + body) is the key, +# a configurable cooldown (default 6h, GC_ESCALATION_COOLDOWN_SECONDS +# env override) suppresses repeats. Suppressed sends increment a counter +# that the next fresh send appends as a footer so the operator sees the +# cadence.
+# (2) best-effort bd labelling — after a successful send, parse the new +# bead id from `gc mail send`'s "Sent message ID to RECIPIENT" stdout +# line and apply wisp_type:escalation so wisp-compact treats the bead +# as long-lived (7d) instead of the default (24h). +# (3) auto-clear hook — callers signal "underlying condition cleared" by +# invoking clear_escalation_state, which drops dedupe entries so the +# next firing escalates fresh. +# +# This file is sourced by reaper.sh and jsonl-export.sh; do not run it +# directly. + +# Wisp-compact (packs/maintenance/assets/scripts/wisp-compact.sh) treats +# beads carrying this label as the 7d "escalation" class. Without it, +# escalation mails fall into the 24h default bucket and get reaped early. +ESCALATION_LABEL="wisp_type:escalation" + +# Default cooldown is 6 hours; operators can tune per-factory via +# GC_ESCALATION_COOLDOWN_SECONDS without editing the scripts. +_escalation_cooldown_seconds() { + local raw="${GC_ESCALATION_COOLDOWN_SECONDS:-21600}" + case "$raw" in + ''|*[!0-9]*) + printf '%s\n' "21600" + ;; + *) + printf '%s\n' "$raw" + ;; + esac +} + +# sha256 over the subject, a newline, and the body. Output is just the hex digest, no +# trailing filename. Tries sha256sum (Linux), then shasum (macOS), then openssl; +# if none of them is on PATH, falls back to a weaker cksum-based key. +_escalation_key() { + local subject="$1" + local body="$2" + local payload + payload=$(printf '%s\n%s' "$subject" "$body") + if command -v sha256sum >/dev/null 2>&1; then + printf '%s' "$payload" | sha256sum | awk '{print $1}' + elif command -v shasum >/dev/null 2>&1; then + printf '%s' "$payload" | shasum -a 256 | awk '{print $1}' + elif command -v openssl >/dev/null 2>&1; then + printf '%s' "$payload" | openssl dgst -sha256 | awk '{print $NF}' + else + # Last-resort: hash via cksum — collision-prone but better than + # disabling dedupe entirely. We log to stderr so a smoke-test env + # missing all of {sha256sum,shasum,openssl} is visible.
+ echo "escalation: sha256 unavailable, falling back to cksum (dedupe weakened)" >&2 + printf '%s' "$payload" | cksum | awk '{print $1 "-" $2}' + fi +} + +# Read the state JSON object from a path, returning '{}' if the file is +# missing or unparseable. Kept independent of jsonl-export's read_state_json +# so reaper.sh (which has no state-IO helpers of its own) can use it too. +_escalation_read_state() { + local path="$1" + if [ -f "$path" ] && command -v jq >/dev/null 2>&1; then + if jq -e 'type == "object"' "$path" >/dev/null 2>&1; then + cat "$path" + return + fi + fi + printf '%s\n' '{}' +} + +# Atomic JSON state write: tmp file in the same dir, then mv. Mirrors the +# pattern jsonl-export.sh uses so concurrent ticks can't tear the file. +_escalation_write_state() { + local path="$1" + local content="$2" + local dir + local tmpfile + dir=$(dirname "$path") + mkdir -p "$dir" 2>/dev/null || true + if ! tmpfile=$(mktemp "${path}.tmp.XXXXXX" 2>/dev/null); then + echo "escalation: failed to create tmp state file under $dir" >&2 + return 1 + fi + if ! printf '%s\n' "$content" > "$tmpfile"; then + rm -f "$tmpfile" + return 1 + fi + if ! mv -f "$tmpfile" "$path"; then + rm -f "$tmpfile" + return 1 + fi +} + +# Convert an RFC3339 UTC timestamp to a unix-epoch second count. Tolerates +# the macOS and GNU date dialects; on failure echoes 0 so the caller sees +# "very old, cooldown elapsed". +_escalation_epoch() { + local ts="$1" + [ -z "$ts" ] && { printf '%s\n' "0"; return; } + date -d "$ts" +%s 2>/dev/null \ + || date -ju -f "%Y-%m-%dT%H:%M:%SZ" "$ts" +%s 2>/dev/null \ + || printf '%s\n' "0" +} + +# Send an ESCALATION mail to mayor/, deduped against a script-local state +# JSON. The caller chooses the state file so two scripts (reaper.sh and +# jsonl-export.sh) cannot cross-suppress each other's escalations. 
+# +# Args: +# $1 — state file path (will be created with mode-default perms if absent) +# $2 — subject (used both as the mail subject and as part of the dedupe key) +# $3 — body +# +# Returns: +# 0 if a mail was sent (fresh or released-after-suppression). +# 1 if the mail was suppressed by the cooldown. +# 2 on a tooling failure (gc send failed, jq errored, or the state file could not be written). In this case +# the dedupe state is not updated, so the next tick will retry. A missing jq binary is not a failure: the mail is sent without dedupe. +send_escalation_mail() { + local state_file="$1" + local subject="$2" + local body="$3" + local cooldown + local key + local now_ts + local now_iso + local state_json + local last_ts + local last_epoch + local suppressed_count + local final_body + local sent_line + local bead_id + + if ! command -v jq >/dev/null 2>&1; then + echo "escalation: jq required but missing; sending without dedupe" >&2 + gc mail send mayor/ -s "$subject" -m "$body" 2>/dev/null || return 2 + return 0 + fi + + cooldown=$(_escalation_cooldown_seconds) + key=$(_escalation_key "$subject" "$body") + now_ts=$(date -u +%s) + now_iso=$(date -u +%Y-%m-%dT%H:%M:%SZ) + state_json=$(_escalation_read_state "$state_file") + + last_ts=$(printf '%s' "$state_json" | jq -r --arg k "$key" '.escalations[$k].last_sent_at // ""') + suppressed_count=$(printf '%s' "$state_json" | jq -r --arg k "$key" '.escalations[$k].suppressed_count // 0') + + if [ -n "$last_ts" ]; then + last_epoch=$(_escalation_epoch "$last_ts") + if [ "$((now_ts - last_epoch))" -lt "$cooldown" ]; then + # Still inside the cooldown window — bump the suppressed counter + # and return without sending. The next caller outside the window + # will report the cadence in the released-after-suppression footer.
+ state_json=$( + printf '%s' "$state_json" \ + | jq -c \ + --arg k "$key" \ + --arg subject "$subject" \ + --argjson n "$((suppressed_count + 1))" \ + '.escalations[$k] = ( + (.escalations[$k] // {}) + + { subject: $subject, suppressed_count: $n } + )' + ) || return 2 + _escalation_write_state "$state_file" "$state_json" || return 2 + return 1 + fi + fi + + # Cooldown elapsed (or first send). Build the final body: if we had any + # suppressed sends, append a one-line footer naming the cadence. + final_body="$body" + if [ "$suppressed_count" -gt 0 ] && [ -n "$last_ts" ]; then + final_body="$body + +[Suppressed $suppressed_count time(s) since $last_ts; cooldown ${cooldown}s.]" + fi + + if ! sent_line=$(gc mail send mayor/ -s "$subject" -m "$final_body" 2>/dev/null); then + # Mail send itself failed. Leave dedupe state alone so the next tick + # retries fresh; do not increment suppressed_count (that would lie + # about cadence). Return 2 so the caller can distinguish "tooling + # broke" from "intentionally suppressed". + return 2 + fi + + # Reset state for this key: counter back to zero, last_sent_at now. NOTE(review): a failure here returns 2 before the labelling below runs, although the mail has already been sent. + state_json=$( + printf '%s' "$state_json" \ + | jq -c \ + --arg k "$key" \ + --arg subject "$subject" \ + --arg at "$now_iso" \ + '.escalations[$k] = { subject: $subject, last_sent_at: $at, suppressed_count: 0 }' + ) || return 2 + _escalation_write_state "$state_file" "$state_json" || return 2 + + # Best-effort labelling. `gc mail send` writes "Sent message ID to RECIPIENT" + # to stdout; parse the id and apply wisp_type:escalation so wisp-compact + # treats this bead as the 7d retention class. + bead_id=$(printf '%s\n' "$sent_line" | awk '/^Sent message / {print $3; exit}') + if [ -n "$bead_id" ]; then + bd label add "$bead_id" "$ESCALATION_LABEL" >/dev/null 2>&1 || true + fi + return 0 +} + +# Drop dedupe state for a script. If a subject is provided, only entries +# whose stored subject matches are removed; without a subject, all dedupe +# entries are cleared.
Callers invoke this when they know the underlying +# condition is no longer active (reaper sees empty anomalies; jsonl-export +# pushes successfully or clears a pending spike). +clear_escalation_state() { + local state_file="$1" + local subject_filter="${2:-}" + local state_json + local updated + + if [ ! -f "$state_file" ] || ! command -v jq >/dev/null 2>&1; then + return 0 + fi + state_json=$(_escalation_read_state "$state_file") + if [ -z "$subject_filter" ]; then + updated=$(printf '%s' "$state_json" | jq -c 'del(.escalations)') + else + updated=$( + printf '%s' "$state_json" \ + | jq -c \ + --arg subject "$subject_filter" \ + ' + if (.escalations // {}) | length == 0 then + . + else + .escalations |= with_entries(select(.value.subject != $subject)) + | if (.escalations // {}) == {} then + del(.escalations) + else + . + end + end + ' + ) + fi + [ -z "$updated" ] && return 0 + _escalation_write_state "$state_file" "$updated" || return 1 +} diff --git a/examples/gastown/packs/maintenance/assets/scripts/jsonl-export.sh b/examples/gastown/packs/maintenance/assets/scripts/jsonl-export.sh index 0c9e0544b6..a31d761a38 100755 --- a/examples/gastown/packs/maintenance/assets/scripts/jsonl-export.sh +++ b/examples/gastown/packs/maintenance/assets/scripts/jsonl-export.sh @@ -11,6 +11,7 @@ set -euo pipefail CITY="${GC_CITY:-.}" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" . "$SCRIPT_DIR/dolt-target.sh" +. "$SCRIPT_DIR/escalation.sh" # jq is a hard dependency: count_jsonl_rows below relies on it, and a missing # jq would silently zero every record count and could mask spikes on a stale @@ -209,7 +210,10 @@ truncate_push_stderr_for_state() { # Record a successful push in state so `gc doctor` can surface a timestamp for # the archive health check. Clears any stale stderr from previous failures and -# any prior escalation marker so the next failure-cycle escalates fresh. 
+# any prior escalation marker so the next failure-cycle escalates fresh — +# including the send_escalation_mail dedupe entries for the push-failure +# subject so the next push-failure cycle escalates without spurious suppression +# carried over from the resolved one. record_archive_push_success() { local now now=$(date -u +%Y-%m-%dT%H:%M:%SZ) @@ -223,6 +227,7 @@ record_archive_push_success() { | .last_push_at = $now | del(.last_push_stderr)' )" + clear_escalation_state "$STATE_FILE" "ESCALATION: JSONL push failed [HIGH]" || true } set_pending_archive_push() { @@ -406,9 +411,9 @@ send_spike_alert() { local delta="$4" local threshold="$5" - gc mail send mayor/ -s "ESCALATION: JSONL spike detected [HIGH]" \ - -m "Database: $db, prev: $prev_count, current: $current_count, delta: ${delta}%, threshold: ${threshold}%" \ - 2>/dev/null + send_escalation_mail "$STATE_FILE" \ + "ESCALATION: JSONL spike detected [HIGH]" \ + "Database: $db, prev: $prev_count, current: $current_count, delta: ${delta}%, threshold: ${threshold}%" } retry_pending_spike_alert() { @@ -529,9 +534,9 @@ Remediation: - See docs/getting-started/troubleshooting.md#jsonl-archive-push-failures ESCALATION ) - if gc mail send mayor/ -s "ESCALATION: JSONL push failed [HIGH]" \ - -m "$body" \ - 2>/dev/null; then + if send_escalation_mail "$STATE_FILE" \ + "ESCALATION: JSONL push failed [HIGH]" \ + "$body"; then mark_push_failure_escalated fi fi diff --git a/examples/gastown/packs/maintenance/assets/scripts/reaper.sh b/examples/gastown/packs/maintenance/assets/scripts/reaper.sh index 83b4b176df..cf3aeacf81 100755 --- a/examples/gastown/packs/maintenance/assets/scripts/reaper.sh +++ b/examples/gastown/packs/maintenance/assets/scripts/reaper.sh @@ -11,6 +11,7 @@ set -euo pipefail CITY="${GC_CITY_PATH:-${GC_CITY:-.}}" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" . "$SCRIPT_DIR/dolt-target.sh" +. 
"$SCRIPT_DIR/escalation.sh" CITY_ABS="$(cd "$CITY" 2>/dev/null && pwd -P || printf '%s\n' "$CITY")" CITY_BEADS_DIR="$CITY_ABS/.beads" @@ -22,6 +23,14 @@ SESSION_PURGE_AGE="${GC_REAPER_SESSION_PURGE_AGE:-720h}" ALERT_THRESHOLD="${GC_REAPER_ALERT_THRESHOLD:-500}" DRY_RUN="${GC_REAPER_DRY_RUN:-}" +# Dedupe + retention state for ESCALATION mails. send_escalation_mail (in +# escalation.sh) reads/writes this file; ticks within the cooldown window +# suppress repeat sends so an unchanging anomaly condition does not flood +# the mayor inbox. +PACK_STATE_DIR="${GC_PACK_STATE_DIR:-${GC_CITY_RUNTIME_DIR:-$CITY/.gc/runtime}/packs/maintenance}" +REAPER_STATE_FILE="$PACK_STATE_DIR/reaper-state.json" +REAPER_ANOMALY_SUBJECT="ESCALATION: Reaper anomalies detected [MEDIUM]" + # Convert Go durations to SQL INTERVAL hours for Dolt. duration_to_hours() { local dur="$1" @@ -550,10 +559,14 @@ if [ "$HAD_DATABASES" -eq 0 ] && [ "$SESSION_PRUNE_ATTEMPTED" -eq 0 ]; then exit 0 fi -# Report. +# Report. send_escalation_mail handles per-(subject,body) dedupe so an +# unchanging anomaly condition (e.g. an hq schema gap that persists across +# many ticks) does not flood the mayor inbox. When anomalies clear we wipe +# the dedupe state so the next real anomaly escalates fresh. if [ -n "$ANOMALIES" ]; then - gc mail send mayor/ -s "ESCALATION: Reaper anomalies detected [MEDIUM]" \ - -m "$ANOMALIES" 2>/dev/null || true + send_escalation_mail "$REAPER_STATE_FILE" "$REAPER_ANOMALY_SUBJECT" "$ANOMALIES" || true +else + clear_escalation_state "$REAPER_STATE_FILE" "$REAPER_ANOMALY_SUBJECT" || true fi SUMMARY="reaper — stale_wisps:$TOTAL_STALE_WISPS, closed_wisps:$TOTAL_CLOSED_WISPS, purged:$TOTAL_PURGED, sessions-pruned:$TOTAL_SESSIONS_PRUNED, closed:$TOTAL_ISSUES_CLOSED, skipped_non_city_issues:$TOTAL_STALE_ISSUES_SKIPPED"