diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3760e5b62..2134fa442 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -230,6 +230,51 @@ jobs: path: fuzz/artifacts retention-days: 14 + # ┌──────────────────────────────────────────────────────────────┐ + # │ Group C′ — cargo audit: surface RUSTSEC advisories on the │ + # │ whole workspace including the `tui` feature surface, which │ + # │ pulls 12 optional crates (color-eyre, crossterm, ratatui, │ + # │ tui-input, tui-big-text, …) outside the default build's │ + # │ coverage. Treats every advisory as a warning; the job │ + # │ fails only on `--deny warnings` so a maintainer can react │ + # │ before the next release lands. │ + # └──────────────────────────────────────────────────────────────┘ + cargo-audit: + name: cargo audit + runs-on: ubuntu-latest + timeout-minutes: 10 + steps: + - name: Install system dependencies + run: sudo apt-get update && sudo apt-get install -y protobuf-compiler + + - name: Checkout sources + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + + - name: Install rust 1.88.0 + uses: dtolnay/rust-toolchain@1.88.0 + + - name: Cache cargo target + uses: Swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2 + with: + cache-all-crates: true + prefix-key: ci-audit + workspaces: . -> target/ + + - name: Install cargo-audit + run: cargo install cargo-audit --locked + + - name: Run cargo audit (default features) + run: cargo audit --deny warnings + + - name: Run cargo audit (all features incl. tui) + run: | + # `cargo audit` reads `Cargo.lock`, which already includes the + # `tui`-feature transitive deps since this PR landed them as + # workspace-locked entries. Re-running with the same lockfile + # produces a consistent advisory set; the duplicate invocation + # is kept symmetrical with the per-feature build matrix above. 
+ cargo audit --deny warnings + # ┌──────────────────────────────────────────────────────────────┐ # │ Group D — Bombardier load benches (folded from benchmark.yml) │ # └──────────────────────────────────────────────────────────────┘ diff --git a/CHANGELOG.md b/CHANGELOG.md index c80a5aeb5..77c737422 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -69,6 +69,134 @@ upgrade. See `doc/upgrade/1.x-to-2.0.md` for the full migration guide. `*.com`) is intentionally not implemented — that is a browser-policy concern, not a reverse-proxy responsibility. +- **`QueryMetrics` now folds single-worker fleets correctly**: the master's + `QueryMetricsTask::on_finish` (`bin/src/command/requests.rs`) used to gate + `AggregatedMetrics::merge_metrics()` on `server.workers.len() > 1`, leaving + one-worker setups with empty `clusters` / `proxying` maps and stranding all + per-worker data in `workers`. Every CLI/TUI consumer that reads the merged + shape (`sozu metrics get` without `--workers`, every `sozu top` pane) silently + showed zero data. The fix drops the `> 1` guard — `merge_metrics()` correctly + handles one worker (the relocation loop runs once) and zero workers (no-op). + Regression test `merge_relocates_single_worker_to_top_level` in + `command/src/proto/mod.rs` pins the contract. +- **`sozu top` BACKENDS bandwidth column shows a per-second rate in Mbps**: + the column was previously a cumulative byte count since worker start — + `1.4G/5.6G` after a few hours, monotonically increasing, almost never + what an operator needs while monitoring live traffic. Mirror the + cluster-RPS pattern: the shared `RateCalculator` derives per-second + deltas from the cumulative `BYTES_IN`/`OUT` counters under + `__backend...` keys, the App + caches the rates on `backend_rate_in_bps` / `backend_rate_out_bps`, + and the renderer formats `bytes/sec × 8` as Kbps / Mbps / Gbps + base-1000 to match the networking convention (`nload`, `iftop`, + Grafana panels). 
Stale per-backend keys are pruned on every snapshot + alongside the cluster-RPS sweep so `RateCalculator.history` cannot + grow unbounded as a fleet churns. Column header changes from + `bw down/up B` to `bw down/up Mbps`; the BACKENDS-sort `Bandwidth` + comparator now uses the rate. +- **`sozu top` CLUSTERS rps column auto-scales to K/M/G**: dashboards + with traffic over 1k req/s were truncating the `rps` cell at the + 8-character width. The column is now 14 wide and the value renders + with a base-1000 suffix (`94.8K req/s`, `1.20M req/s`) so the cell + stays readable for any cluster scale. +- **`sozu top` BACKENDS pane "bw down/up B" now populates**: the renderer + was reading `names::backend::BACK_BYTES_IN` / `BACK_BYTES_OUT` from the + per-backend filing, but those keys are emitted as no-label proxy + counters via `count!(key, value)` at every byte-flow site in + `lib/src/protocol/{pipe,kawa_h1/mod,mux/mod,proxy_protocol/{send,relay}}.rs` + — they land in `worker.proxy[…]` and are never tagged + `(cluster_id, backend_id)`. The per-backend cumulative bytes flow + through `record_backend_metrics!` (`lib/src/metrics/mod.rs`) at request + end under the keys `BYTES_IN` / `BYTES_OUT` with `$bin` / `$bout` + sourced from `SessionMetrics.backend_bin` / `backend_bout` — i.e. + the same backend-socket accumulators. The TUI now reads those. +- **`sozu top` overview sparklines fill the full pane width**: the + default `RenderDirection::LeftToRight` left-anchored samples, which + made the graph appear to use only the left half of its cell until + the ring filled out to column-width. Switch to `RightToLeft` so the + newest sample sits on the right edge and history scrolls leftward + as new ticks arrive — the familiar `vmstat` / `htop` direction. +- **`sozu top` F-key bar actually does things**: F1 Help and F10 Quit + were wired but F2-F9 were placeholders. 
F2 cycles the resolved + glyph mode (Block → Braille → Tty); F3 and F4 open the colon + palette so operators can type `:cluster <id>` etc.; F5 (also `p`) + pauses snapshot ingest without dropping the transport lease so the + TUI freezes on the last frame for closer inspection, and the label + flips to "Resume" while paused; F6 cycles the active pane's sort + column. F7/F8/F9 remain reserved slots (`·`) so the bar width + stays stable across builds. +- **`sozu top` CLUSTERS column now shows req/s, not cumulative counter**: + the column header was `rps` but the cell rendered the cumulative + `names::backend::REQUESTS` count, which only changes when a backend + responds. Per-cluster `RateCalculator` keyed by `__cluster.<cluster_id>.requests` + derives the per-second delta once per ingest and the renderer prints + `<n> req/s`. Stale cluster keys are pruned on each snapshot so the + rate history cannot grow unbounded as clusters churn. +- **`sozu top` BACKENDS up/total now reads the cluster rollup gauge**: + the renderer was reading `names::backend::AVAILABLE` (a per-backend + gauge set on backend up/down transitions) at the cluster level, which + was always `0` because the gauge is keyed `(cluster, backend)` and + only fires on transitions. A new `names::cluster::AVAILABLE_BACKENDS` + constant captures the canonical per-cluster rollup (`available` + argument of `gauge!` in `lib/src/backends.rs::notify_availability`), + which is the authoritative aggregate and refreshes every health-check + tick. Per-backend `backend.available` (under backend-detail filing) + remains a fallback for the very first snapshot. +- **`sozu top` OVERVIEW replaces 5xx panel with sozu service-time + p99**: the 5xx cell duplicated the LATENCY p99 cell's intent (both + read worst-case quality). Operators want sozu's own request-processing + time distinct from backend latency, so the bottom-left OVERVIEW cell + now plots p99 of `names::event_loop::SERVICE_TIME` from the proxy + map.
The critical-banner threshold reuses `latency_p99_critical_ms` + with the `SOZU SLOW` headline so a stalled event loop still trips + the alert overlay. +- **`sozu top` CERTS pane renders addresses as `ip:port`**: the address + column was printing `{:?}` of the proto `SocketAddress` (`SocketAddress + { ip: IpAddress { … } }`), which is unreadable. The pane now goes + through the existing `From<SocketAddress> for SocketAddr` conversion + so v4 prints as `1.2.3.4:443` and v6 as `[::1]:443`. +- **`sozu top` H2 pane trend column actually plots a trend**: the + trend cells rendered as `"—"` placeholders. Each H2-pane metric now + has a 60-sample `SparkRing` populated by `App::fold_h2_trends` per + ingest; the renderer prints Unicode bars (`▁▂▃▄▅▆▇█`) scaled to the + ring's max sample so even zero-flat series stay distinguishable from + the cold-start placeholder. +- **`sozu top` OVERVIEW and CLUSTERS read counters under backend-detail + filing**: the TUI auto-leases `MetricDetail::Backend` on startup so the + BACKENDS pane has per-row data, which routes every per-cluster counter + (`requests`, `http.status.5xx`, `backend_response_time`) into the worker's + `clusters[].backends[].metrics` map rather than `clusters[].cluster`. + The OVERVIEW and CLUSTERS panes were reading the cluster-level map + exclusively, so the REQUESTS / SEC big-numeral, 5xx ratio, latency p99, and + every per-cluster row read 0 the moment the lease elevated detail. New + helpers in `bin/src/ctl/top/app.rs` (`cluster_count_total`, `cluster_p99_max`, + `cluster_p50_max`) sum across both filings; magic-string keys are replaced + with `sozu_lib::metrics::names::*` constants so a future metric rename cannot + silently zero the TUI again. The `proxying` fallback now uses + `names::http::REQUESTS` (`http.requests`, incremented at request-receive + time) so traffic surfaces even before the first backend round-trip + completes.
+- **`SetMetricDetail` no longer flaps systemd `RELOADING=1` every TTL/2**: the + cardinality-lease verb was in the `is_mutating_verb` allowlist + (`bin/src/command/requests.rs`), which brackets each call with + `sd_notify(STATE_RELOADING)` / `READY=1`. The TUI auto-renews its lease every + `ttl/2` seconds (≈ 30 s by default) to keep the lease alive, so a long-lived + `sozu top` session caused the unit to flap `reloading`/`active` every 30 s in + `systemctl status` and external monitors. `SetMetricDetail` is now excluded: + it does not change cluster/listener/cert state or fleet topology, so it does + not belong in the systemd bracket set. The audit-log trail + (`EventKind::METRIC_DETAIL_CHANGED`, proto tag 30) still records every apply, + clear, and TTL expiry, so SOC visibility is unaffected. Regression test + `set_metric_detail_is_not_mutating` in `bin/src/command/requests.rs` pins + membership. +- **Cross-worker `Percentiles` aggregation no longer silently drops on + `merge_metrics()`**: `command/src/proto/mod.rs::is_mergeable` returned `false` + for `Inner::Percentiles`, so any cluster metric emitted in percentile shape + (notably `backend_response_time`) was discarded when more than one worker + contributed. The merge now propagates the element-wise max per quantile and + accumulates `samples` + `sum`, preserving the "worst observed quantile" + upper bound across workers. The companion `_histogram` value remains + the source of truth for statistically accurate aggregation. - **Response-side header edits no longer re-injected on every prepare cycle (`H2BlockConverter::finalize` "out buffer not empty" leak)**: `apply_response_header_edits` was called inconditionally before @@ -346,6 +474,53 @@ upgrade. See `doc/upgrade/1.x-to-2.0.md` for the full migration guide. 
### 🌟 Added +- **`sozu top` btop/htop-style live operator TUI**: a new clap subcommand + gated by the optional `tui` Cargo feature on `bin/` + (default builds remain lean — `sozu --version` reports `+tui` when the + subcommand is linked in). Surfaces metrics that Sōzu already emits in + seven panes — OVERVIEW (sparklines + big numerals), CLUSTERS / BACKENDS + (sortable tables), LISTENERS (5 s ListListeners poll), H2 (streams + + flow control + CVE flood mitigations), CERTS (15 s certificates + poll), EVENTS (colour-coded SubscribeEvents tail). Built on + `ratatui = "0.30"` + `crossterm = "0.29"` over four synchronous + transport threads (snapshots, listeners, certs, events — no `tokio` + runtime in v1 by design); render loop is capped at 30 fps with + DEC-mode-2026 synchronized output for flicker-free rendering under tmux. The TUI auto-elevates + metric cardinality via a `SetMetricDetail` TTL lease (`client_id`- + keyed; default 60 s TTL, server clamp 300 s; auto-renewed at half-TTL; + self-expires server-side on crash). A 4-tick pulse tint surfaces + cluster appearance / disappearance / backend-went-down transitions + visually. A `tui-big-text` alert banner overlays the active pane when + the threshold table fires (p99 > 500 ms / 5xx > 1 % / saturation > + 80 %). `--skin <name>` (with `SOZU_TOP_SKIN` env override) resolves + TOML skins under `$XDG_CONFIG_HOME/sozu/skins/`. Default Okabe-Ito + colour-blind safe categorical + Viridis-shaped continuous ramp; + trend glyphs `▲ ▼ ●` back up the colour cue. Three glyph modes + (Braille / Block / TTY-ASCII) auto-detect against `TERM` / `LANG`. + Full operator guide: [`doc/sozu-top.md`](doc/sozu-top.md). + +- **`SetMetricDetail` runtime cardinality verb** + (`command/src/command.proto:55`, `ResponseContent::MetricDetailStatus = + 16`, `EventKind::METRIC_DETAIL_CHANGED = 30`): operators (and any TUI + client) can elevate the worker's `MetricDetail` floor at runtime via + a TTL-bounded lease.
Multiple clients lease independently — the + effective level is `max(configured, max(active leases))`. Leases + self-expire server-side after `ttl_seconds` so a crashed client + cannot permanently elevate cardinality. Workers that pre-date the + verb return the standard `unknown request type` error, which surfaces + in the normal fan-out error tally on `MetricDetailStatus`; production + deployments keep master + workers in sync via the `UpgradeMain` + hot-upgrade flow, so this mixed-version state is transient. Full + semantics in `command.proto`'s `SetMetricDetail` doc comment. Audit-scope: every + cardinality transition emits `EventKind::METRIC_DETAIL_CHANGED` — + operator-initiated transitions are logged by the master at the + dispatch site, AND the worker now emits the same event for janitor- + driven lease expiries and post-fan-out worker-arm apply/clear via + the new worker→master audit IPC, so the audit log carries the full + history regardless of origin. Lease ownership is bound to the + connecting peer's PID + session ULID (`SO_PEERCRED`-derived); + cross-operator `clear` requests are refused at the worker. + - **Pre-built binaries for tagged releases ([#1089](https://github.com/sozu-proxy/sozu/issues/1089))**: a new `.github/workflows/release.yml` triggers on tag push (`X.Y.Z` and diff --git a/Cargo.lock b/Cargo.lock index 059545cd2..1fd6957c5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,15 @@ # It is not intended for manual editing. 
version = 4 +[[package]] +name = "addr2line" +version = "0.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b5d307320b3181d6d7954e663bd7c774a838b8220fe0593c86d9fb09f498b4b" +dependencies = [ + "gimli", +] + [[package]] name = "adler2" version = "2.0.1" @@ -17,6 +26,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + [[package]] name = "anes" version = "0.1.6" @@ -124,6 +139,15 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "atomic" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a89cbf775b137e9b968e67227ef7f775587cde3fd31b0d8599dbd0f598a48340" +dependencies = [ + "bytemuck", +] + [[package]] name = "atomic-waker" version = "1.1.2" @@ -173,6 +197,21 @@ dependencies = [ "fs_extra", ] +[[package]] +name = "backtrace" +version = "0.3.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb531853791a215d7c62a30daf0dde835f381ab5de4589cfe7c649d2cbe92bd6" +dependencies = [ + "addr2line", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", + "windows-link", +] + [[package]] name = "base64" version = "0.21.7" @@ -191,7 +230,7 @@ version = "0.72.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895" dependencies = [ - "bitflags", + "bitflags 2.11.1", "cexpr", "clang-sys", "itertools 0.10.5", @@ -205,12 +244,42 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "bit-set" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.6.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + [[package]] name = "bitflags" version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + [[package]] name = "block-buffer" version = "0.12.0" @@ -220,12 +289,27 @@ dependencies = [ "hybrid-array", ] +[[package]] +name = "block2" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdeb9d870516001442e364c5220d3574d2da8dc765554b4a617230d33fa58ef5" +dependencies = [ + "objc2", +] + [[package]] name = "bumpalo" version = "3.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" +[[package]] +name = "bytemuck" +version = "1.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec" + [[package]] name = "byteorder" version = "1.5.0" @@ -244,6 +328,15 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" +[[package]] +name = "castaway" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dec551ab6e7578819132c713a93c022a05d60159dc86e7a7050223577484c55a" +dependencies = [ + "rustversion", +] + [[package]] name = "cc" version = 
"1.2.60" @@ -284,7 +377,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601" dependencies = [ "cfg-if", - "cpufeatures", + "cpufeatures 0.3.0", "rand_core 0.10.1", ] @@ -381,18 +474,68 @@ dependencies = [ "cc", ] +[[package]] +name = "color-eyre" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5920befb47832a6d61ee3a3a846565cfa39b331331e68a3b1d1116630f2f26d" +dependencies = [ + "backtrace", + "color-spantrace", + "eyre", + "indenter", + "once_cell", + "owo-colors", + "tracing-error", +] + +[[package]] +name = "color-spantrace" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8b88ea9df13354b55bc7234ebcce36e6ef896aca2e42a15de9e10edce01b427" +dependencies = [ + "once_cell", + "owo-colors", + "tracing-core", + "tracing-error", +] + [[package]] name = "colorchoice" version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" +[[package]] +name = "compact_str" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdb1325a1cece981e8a296ab8f0f9b63ae357bd0784a9faaf548cc7b480707a" +dependencies = [ + "castaway", + "cfg-if", + "itoa", + "rustversion", + "ryu", + "static_assertions", +] + [[package]] name = "const-oid" version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a6ef517f0926dd24a1582492c791b6a4818a4d94e789a334894aa15b0d12f55c" +[[package]] +name = "convert_case" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "633458d4ef8c78b72454de2d54fd6ab2e60f9e02be22f3c6104cdc8a4e0fceb9" +dependencies = [ + "unicode-segmentation", +] + [[package]] name = "cookie-factory" version = "0.3.3" @@ -402,6 +545,15 @@ dependencies = [ "futures", ] +[[package]] 
+name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + [[package]] name = "cpufeatures" version = "0.3.0" @@ -490,12 +642,49 @@ version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" +[[package]] +name = "crossterm" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8b9f2e4c67f833b660cdb0a3523065869fb35570177239812ed4c905aeff87b" +dependencies = [ + "bitflags 2.11.1", + "crossterm_winapi", + "derive_more", + "document-features", + "mio", + "parking_lot", + "rustix", + "signal-hook", + "signal-hook-mio", + "winapi", +] + +[[package]] +name = "crossterm_winapi" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acdd7c62a3665c7f6830a51635d9ac9b23ed385797f70a83bb8bafe9c572ab2b" +dependencies = [ + "winapi", +] + [[package]] name = "crunchy" version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" +[[package]] +name = "crypto-common" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" +dependencies = [ + "generic-array", + "typenum", +] + [[package]] name = "crypto-common" version = "0.2.1" @@ -505,12 +694,143 @@ dependencies = [ "hybrid-array", ] +[[package]] +name = "csscolorparser" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb2a7d3066da2de787b7f032c736763eb7ae5d355f81a68bab2675a96008b0bf" +dependencies = [ + "lab", + "phf", +] + +[[package]] +name = "ctrlc" +version = "3.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "e0b1fab2ae45819af2d0731d60f2afe17227ebb1a1538a236da84c93e9a60162" +dependencies = [ + "dispatch2", + "nix 0.31.2", + "windows-sys 0.61.2", +] + +[[package]] +name = "darling" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee" +dependencies = [ + "darling_core 0.20.11", + "darling_macro 0.20.11", +] + +[[package]] +name = "darling" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9cdf337090841a411e2a7f3deb9187445851f91b309c0c0a29e05f74a00a48c0" +dependencies = [ + "darling_core 0.21.3", + "darling_macro 0.21.3", +] + +[[package]] +name = "darling" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25ae13da2f202d56bd7f91c25fba009e7717a1e4a1cc98a76d844b65ae912e9d" +dependencies = [ + "darling_core 0.23.0", + "darling_macro 0.23.0", +] + +[[package]] +name = "darling_core" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d00b9596d185e565c2207a0b01f8bd1a135483d02d9b7b0a54b11da8d53412e" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn 2.0.117", +] + +[[package]] +name = "darling_core" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1247195ecd7e3c85f83c8d2a366e4210d588e802133e1e355180a9870b517ea4" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn 2.0.117", +] + +[[package]] +name = "darling_core" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9865a50f7c335f53564bb694ef660825eb8610e0a53d3e11bf1b0d3df31e03b0" +dependencies = [ + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn 2.0.117", +] + +[[package]] +name = "darling_macro" +version = "0.20.11" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" +dependencies = [ + "darling_core 0.20.11", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "darling_macro" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d38308df82d1080de0afee5d069fa14b0326a88c14f15c5ccda35b4a6c414c81" +dependencies = [ + "darling_core 0.21.3", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "darling_macro" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" +dependencies = [ + "darling_core 0.23.0", + "quote", + "syn 2.0.117", +] + [[package]] name = "data-encoding" version = "2.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d7a1e2f27636f116493b8b860f5546edb47c8d8f8ea73e1d2a20be88e28d1fea" +[[package]] +name = "deltae" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5729f5117e208430e437df2f4843f5e5952997175992d1414f94c57d61e270b4" + [[package]] name = "der-parser" version = "10.0.0" @@ -534,15 +854,101 @@ dependencies = [ "powerfmt", ] +[[package]] +name = "derive-getters" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74ef43543e701c01ad77d3a5922755c6a1d71b22d942cb8042be4994b380caff" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "derive_builder" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947" +dependencies = [ + "derive_builder_macro", +] + +[[package]] +name = "derive_builder_core" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8" 
+dependencies = [ + "darling 0.20.11", + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "derive_builder_macro" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" +dependencies = [ + "derive_builder_core", + "syn 2.0.117", +] + +[[package]] +name = "derive_more" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d751e9e49156b02b44f9c1815bcb94b984cdcc4396ecc32521c739452808b134" +dependencies = [ + "derive_more-impl", +] + +[[package]] +name = "derive_more-impl" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "799a97264921d8623a957f6c3b9011f3b5492f557bbb7a5a19b7fa6d06ba8dcb" +dependencies = [ + "convert_case", + "proc-macro2", + "quote", + "rustc_version", + "syn 2.0.117", +] + +[[package]] +name = "derive_setters" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7e6f6fa1f03c14ae082120b84b3c7fbd7b8588d924cf2d7c3daf9afd49df8b9" +dependencies = [ + "darling 0.21.3", + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer 0.10.4", + "crypto-common 0.1.7", +] + [[package]] name = "digest" version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4850db49bf08e663084f7fb5c87d202ef91a3907271aff24a94eb97ff039153c" dependencies = [ - "block-buffer", + "block-buffer 0.12.0", "const-oid", - "crypto-common", + "crypto-common 0.2.1", ] [[package]] @@ -566,6 +972,18 @@ dependencies = [ "winapi", ] +[[package]] +name = "dispatch2" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"1e0e367e4e7da84520dedcac1901e4da967309406d1e51017ae1abfb97adbd38" +dependencies = [ + "bitflags 2.11.1", + "block2", + "libc", + "objc2", +] + [[package]] name = "displaydoc" version = "0.2.5" @@ -577,6 +995,15 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "document-features" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4b8a88685455ed29a21542a33abd9cb6510b6b129abadabdcef0f4c55bc8f61" +dependencies = [ + "litrs", +] + [[package]] name = "dunce" version = "1.0.5" @@ -631,18 +1058,70 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "euclid" +version = "0.22.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1a05365e3b1c6d1650318537c7460c6923f1abdd272ad6842baa2b509957a06" +dependencies = [ + "num-traits", +] + +[[package]] +name = "eyre" +version = "0.6.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7cd915d99f24784cdc19fd37ef22b97e3ff0ae756c7e492e9fbfe897d61e2aec" +dependencies = [ + "indenter", + "once_cell", +] + +[[package]] +name = "fancy-regex" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b95f7c0680e4142284cf8b22c14a476e87d61b004a3a0861872b32ef7ead40a2" +dependencies = [ + "bit-set", + "regex", +] + [[package]] name = "fastrand" version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" +[[package]] +name = "filedescriptor" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e40758ed24c9b2eeb76c35fb0aebc66c626084edd827e07e1552279814c6682d" +dependencies = [ + "libc", + "thiserror 1.0.69", + "winapi", +] + [[package]] name = "find-msvc-tools" version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" +[[package]] 
+name = "finl_unicode" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9844ddc3a6e533d62bba727eb6c28b5d360921d5175e9ff0f1e621a5c590a4d5" + +[[package]] +name = "fixedbitset" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" + [[package]] name = "fixedbitset" version = "0.5.7" @@ -671,6 +1150,18 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + +[[package]] +name = "font8x8" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "875488b8711a968268c7cf5d139578713097ca4635a76044e8fe8eedf831d07e" + [[package]] name = "foreign-types" version = "0.3.2" @@ -780,6 +1271,16 @@ dependencies = [ "slab", ] +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + [[package]] name = "getrandom" version = "0.2.17" @@ -817,6 +1318,12 @@ dependencies = [ "wasip3", ] +[[package]] +name = "gimli" +version = "0.32.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e629b9b98ef3dd8afe6ca2bd0f89306cec16d43d907889945bc5d6687f2f13c7" + [[package]] name = "glob" version = "0.3.3" @@ -859,7 +1366,18 @@ version = "0.15.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" dependencies = [ - "foldhash", + "foldhash 0.1.5", +] + +[[package]] +name = "hashbrown" 
+version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash 0.2.0", ] [[package]] @@ -1101,6 +1619,12 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + [[package]] name = "idna" version = "1.1.0" @@ -1122,6 +1646,12 @@ dependencies = [ "icu_properties", ] +[[package]] +name = "indenter" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "964de6e86d545b246d84badc0fef527924ace5134f30641c203ef52ba83f58d5" + [[package]] name = "indexmap" version = "2.14.0" @@ -1134,6 +1664,39 @@ dependencies = [ "serde_core", ] +[[package]] +name = "indoc" +version = "2.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706" +dependencies = [ + "rustversion", +] + +[[package]] +name = "insta" +version = "1.47.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b4a6248eb93a4401ed2f37dfe8ea592d3cf05b7cf4f8efa867b6895af7e094e" +dependencies = [ + "once_cell", + "similar", + "tempfile", +] + +[[package]] +name = "instability" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5eb2d60ef19920a3a9193c3e371f726ec1dafc045dac788d0fb3704272458971" +dependencies = [ + "darling 0.23.0", + "indoc", + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "is-terminal" version = "0.4.17" @@ -1215,6 +1778,17 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "kasuari" +version = "0.4.12" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bde5057d6143cc94e861d90f591b9303d6716c6b9602309150bd068853c10899" +dependencies = [ + "hashbrown 0.16.1", + "portable-atomic", + "thiserror 2.0.18", +] + [[package]] name = "kawa" version = "0.6.8" @@ -1225,6 +1799,12 @@ dependencies = [ "nom", ] +[[package]] +name = "lab" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf36173d4167ed999940f804952e6b08197cae5ad5d572eb4db150ce8ad5d58f" + [[package]] name = "lazy_static" version = "1.5.0" @@ -1262,6 +1842,15 @@ dependencies = [ "libc", ] +[[package]] +name = "line-clipping" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f50e8f47623268b5407192d26876c4d7f89d686ca130fdc53bced4814cd29f8" +dependencies = [ + "bitflags 2.11.1", +] + [[package]] name = "linux-raw-sys" version = "0.12.1" @@ -1274,6 +1863,12 @@ version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0" +[[package]] +name = "litrs" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11d3d7f243d5c5a8b9bb5d6dd2b1602c0cb0b9db1621bafc7ed66e35ff9fe092" + [[package]] name = "lock_api" version = "0.4.14" @@ -1299,12 +1894,37 @@ dependencies = [ "tracing", ] +[[package]] +name = "lru" +version = "0.16.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f66e8d5d03f609abc3a39e6f08e4164ebf1447a732906d39eb9b99b7919ef39" +dependencies = [ + "hashbrown 0.16.1", +] + +[[package]] +name = "mac_address" +version = "1.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0aeb26bf5e836cc1c341c8106051b573f1766dfa05aa87f0b98be5e51b02303" +dependencies = [ + "nix 0.29.0", + "winapi", +] + [[package]] name = "memchr" version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" +[[package]] +name = "memmem" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a64a92489e2744ce060c349162be1c5f33c6969234104dbd99ddb5feb08b8c15" + [[package]] name = "memoffset" version = "0.9.1" @@ -1348,13 +1968,26 @@ version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" +[[package]] +name = "nix" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71e2746dc3a24dd78b3cfcb7be93368c6de9963d30f43a6a73998a9cf4b17b46" +dependencies = [ + "bitflags 2.11.1", + "cfg-if", + "cfg_aliases", + "libc", + "memoffset", +] + [[package]] name = "nix" version = "0.31.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5d6d0705320c1e6ba1d912b5e37cf18071b6c2e9b7fa8215a1e8a7651966f5d3" dependencies = [ - "bitflags", + "bitflags 2.11.1", "cfg-if", "cfg_aliases", "libc", @@ -1387,6 +2020,17 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c6673768db2d862beb9b39a78fdcb1a69439615d5794a1be50caa9bc92c81967" +[[package]] +name = "num-derive" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "num-integer" version = "0.1.46" @@ -1415,12 +2059,45 @@ dependencies = [ "libc", ] +[[package]] +name = "num_threads" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c7398b9c8b70908f6371f47ed36737907c87c52af34c268fed0bf0ceb92ead9" +dependencies = [ + "libc", +] + [[package]] name = "numtoa" version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"6aa2c4e539b869820a2b82e1aef6ff40aa85e65decdd5185e83fb4b1249cd00f" +[[package]] +name = "objc2" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a12a8ed07aefc768292f076dc3ac8c48f3781c8f2d5851dd3d98950e8c5a89f" +dependencies = [ + "objc2-encode", +] + +[[package]] +name = "objc2-encode" +version = "4.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef25abbcd74fb2609453eb695bd2f860d389e457f67dc17cafc8b8cbc89d0c33" + +[[package]] +name = "object" +version = "0.37.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe" +dependencies = [ + "memchr", +] + [[package]] name = "oid-registry" version = "0.8.1" @@ -1454,7 +2131,7 @@ version = "0.10.78" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f38c4372413cdaaf3cc79dd92d29d7d9f5ab09b51b10dded508fb90bb70b9222" dependencies = [ - "bitflags", + "bitflags 2.11.1", "cfg-if", "foreign-types", "libc", @@ -1486,6 +2163,21 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "ordered-float" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7bb71e1b3fa6ca1c61f383464aaf2bb0e2f8e772a1f01d486832464de363b951" +dependencies = [ + "num-traits", +] + +[[package]] +name = "owo-colors" +version = "4.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d211803b9b6b570f68772237e415a029d5a50c65d382910b879fb19d3271f94d" + [[package]] name = "parking_lot" version = "0.12.5" @@ -1509,12 +2201,6 @@ dependencies = [ "windows-link", ] -[[package]] -name = "paste" -version = "1.0.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" - [[package]] name = "paw" version = "1.0.0" @@ -1542,17 +2228,112 @@ version = "1.0.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "7f0b59668fe80c5afe998f0c0bf93322bf2cd66cafeeb80581f291716f3467f2" +[[package]] +name = "pest" +version = "2.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0848c601009d37dfa3430c4666e147e49cdcf1b92ecd3e63657d8a5f19da662" +dependencies = [ + "memchr", + "ucd-trie", +] + +[[package]] +name = "pest_derive" +version = "2.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11f486f1ea21e6c10ed15d5a7c77165d0ee443402f0780849d1768e7d9d6fe77" +dependencies = [ + "pest", + "pest_generator", +] + +[[package]] +name = "pest_generator" +version = "2.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8040c4647b13b210a963c1ed407c1ff4fdfa01c31d6d2a098218702e6664f94f" +dependencies = [ + "pest", + "pest_meta", + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "pest_meta" +version = "2.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89815c69d36021a140146f26659a81d6c2afa33d216d736dd4be5381a7362220" +dependencies = [ + "pest", + "sha2 0.10.9", +] + [[package]] name = "petgraph" version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455" dependencies = [ - "fixedbitset", + "fixedbitset 0.5.7", "hashbrown 0.15.5", "indexmap", ] +[[package]] +name = "phf" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" +dependencies = [ + "phf_macros", + "phf_shared", +] + +[[package]] +name = "phf_codegen" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" 
+version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" +dependencies = [ + "phf_shared", + "rand 0.8.6", +] + +[[package]] +name = "phf_macros" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216" +dependencies = [ + "phf_generator", + "phf_shared", + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "phf_shared" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" +dependencies = [ + "siphasher", +] + [[package]] name = "pin-project-lite" version = "0.2.17" @@ -1599,6 +2380,12 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c7ac1531a0016945992b4e816e81538dfad0b9f00d280bcb707d711839f1536d" +[[package]] +name = "portable-atomic" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" + [[package]] name = "potential_utf" version = "0.1.5" @@ -1652,7 +2439,7 @@ dependencies = [ "is-terminal", "lazy_static", "term", - "unicode-width", + "unicode-width 0.1.14", ] [[package]] @@ -1794,6 +2581,91 @@ version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "63b8176103e19a2643978565ca18b50549f6101881c443590420e4dc998a3c69" +[[package]] +name = "ratatui" +version = "0.30.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1ce67fb8ba4446454d1c8dbaeda0557ff5e94d39d5e5ed7f10a65eb4c8266bc" +dependencies = [ + "instability", + "ratatui-core", + "ratatui-crossterm", + "ratatui-macros", + "ratatui-termwiz", + "ratatui-widgets", +] + +[[package]] +name = "ratatui-core" +version = "0.1.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ef8dea09a92caaf73bff7adb70b76162e5937524058a7e5bff37869cbbec293" +dependencies = [ + "bitflags 2.11.1", + "compact_str", + "hashbrown 0.16.1", + "indoc", + "itertools 0.14.0", + "kasuari", + "lru", + "strum", + "thiserror 2.0.18", + "unicode-segmentation", + "unicode-truncate", + "unicode-width 0.2.2", +] + +[[package]] +name = "ratatui-crossterm" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "577c9b9f652b4c121fb25c6a391dd06406d3b092ba68827e6d2f09550edc54b3" +dependencies = [ + "cfg-if", + "crossterm", + "instability", + "ratatui-core", +] + +[[package]] +name = "ratatui-macros" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7f1342a13e83e4bb9d0b793d0ea762be633f9582048c892ae9041ef39c936f4" +dependencies = [ + "ratatui-core", + "ratatui-widgets", +] + +[[package]] +name = "ratatui-termwiz" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f76fe0bd0ed4295f0321b1676732e2454024c15a35d01904ddb315afd3d545c" +dependencies = [ + "ratatui-core", + "termwiz", +] + +[[package]] +name = "ratatui-widgets" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7dbfa023cd4e604c2553483820c5fe8aa9d71a42eea5aa77c6e7f35756612db" +dependencies = [ + "bitflags 2.11.1", + "hashbrown 0.16.1", + "indoc", + "instability", + "itertools 0.14.0", + "line-clipping", + "ratatui-core", + "strum", + "time", + "unicode-segmentation", + "unicode-width 0.2.2", +] + [[package]] name = "rayon" version = "1.12.0" @@ -1820,7 +2692,7 @@ version = "0.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" dependencies = [ - "bitflags", + "bitflags 2.11.1", ] [[package]] @@ -1877,12 +2749,27 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = 
"rustc-demangle" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b50b8869d9fc858ce7266cce0194bd74df58b9d0e3f6df3a9fc8eb470d95c09d" + [[package]] name = "rustc-hash" version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe" +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + [[package]] name = "rusticata-macros" version = "4.1.0" @@ -1898,7 +2785,7 @@ version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" dependencies = [ - "bitflags", + "bitflags 2.11.1", "errno", "libc", "linux-raw-sys", @@ -1935,15 +2822,6 @@ dependencies = [ "zeroize", ] -[[package]] -name = "rustls-pemfile" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50" -dependencies = [ - "rustls-pki-types", -] - [[package]] name = "rustls-pki-types" version = "1.14.0" @@ -1982,6 +2860,12 @@ dependencies = [ "time", ] +[[package]] +name = "ryu" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" + [[package]] name = "same-file" version = "1.0.6" @@ -2096,6 +2980,17 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "sha2" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +dependencies = [ + "cfg-if", + "cpufeatures 0.2.17", + "digest 0.10.7", +] + [[package]] name = "sha2" version = "0.11.0" @@ -2103,8 +2998,17 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "446ba717509524cb3f22f17ecc096f10f4822d76ab5c0b9822c5f9c284e825f4" dependencies = [ "cfg-if", - "cpufeatures", - "digest", + "cpufeatures 0.3.0", + "digest 0.11.2", +] + +[[package]] +name = "sharded-slab" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" +dependencies = [ + "lazy_static", ] [[package]] @@ -2113,12 +3017,55 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +[[package]] +name = "signal-hook" +version = "0.3.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d881a16cf4426aa584979d30bd82cb33429027e42122b169753d6ef1085ed6e2" +dependencies = [ + "libc", + "signal-hook-registry", +] + +[[package]] +name = "signal-hook-mio" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b75a19a7a740b25bc7944bdee6172368f988763b744e3d4dfe753f6b4ece40cc" +dependencies = [ + "libc", + "mio", + "signal-hook", +] + +[[package]] +name = "signal-hook-registry" +version = "1.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4db69cba1110affc0e9f7bcd48bbf87b3f4fc7c61fc9155afd4c469eb3d6c1b" +dependencies = [ + "errno", + "libc", +] + [[package]] name = "simd-adler32" version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214" +[[package]] +name = "similar" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbbb5d9659141646ae647b42fe094daf6c6192d1620870b449d9557f748b2daa" + +[[package]] +name = "siphasher" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"8ee5873ec9cce0195efcb7a4e9507a04cd49aec9c83d0389df45b1ef7ba2e649" + [[package]] name = "slab" version = "0.4.12" @@ -2146,25 +3093,38 @@ name = "sozu" version = "1.1.1" dependencies = [ "clap", + "color-eyre", + "crossbeam-channel", + "crossterm", + "ctrlc", + "insta", "jemallocator", "libc", "log", "mio", - "nix", + "nix 0.31.2", "nom", "num_cpus", "paw", "prost", + "ratatui", "regex", "rusty_ulid", "serde", "serde_json", - "sha2", + "sha2 0.11.0", "sozu-command-lib", "sozu-lib", "tempfile", "termion", "thiserror 2.0.18", + "throbber-widgets-tui", + "toml", + "tui-big-text", + "tui-input", + "tui-popup", + "tui-scrollview", + "tui-tree-widget", ] [[package]] @@ -2176,7 +3136,7 @@ dependencies = [ "log", "memchr", "mio", - "nix", + "nix 0.31.2", "nom", "pool", "poule", @@ -2188,7 +3148,7 @@ dependencies = [ "serde", "serde_json", "serial_test", - "sha2", + "sha2 0.11.0", "tempfile", "thiserror 2.0.18", "time", @@ -2209,10 +3169,9 @@ dependencies = [ "hyper-util", "libc", "mio", - "paste", "rustls", "serial_test", - "sha2", + "sha2 0.11.0", "sozu-command-lib", "sozu-lib", "tempfile", @@ -2244,10 +3203,9 @@ dependencies = [ "regex", "rustls", "rustls-openssl", - "rustls-pemfile", "rusty_ulid", "serial_test", - "sha2", + "sha2 0.11.0", "slab", "socket2", "sozu-command-lib", @@ -2263,12 +3221,39 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + [[package]] name = "strsim" version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" +[[package]] +name = "strum" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum 
= "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf" +dependencies = [ + "strum_macros", +] + +[[package]] +name = "strum_macros" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7695ce3845ea4b33927c055a39dc438a45b059f7c1b3d91d38d10355fb8cbca7" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "subtle" version = "2.6.1" @@ -2332,6 +3317,18 @@ dependencies = [ "winapi", ] +[[package]] +name = "terminfo" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4ea810f0692f9f51b382fff5893887bb4580f5fa246fde546e0b13e7fcee662" +dependencies = [ + "fnv", + "nom", + "phf", + "phf_codegen", +] + [[package]] name = "termion" version = "4.0.6" @@ -2342,6 +3339,57 @@ dependencies = [ "numtoa", ] +[[package]] +name = "termios" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "411c5bf740737c7918b8b1fe232dca4dc9f8e754b8ad5e20966814001ed0ac6b" +dependencies = [ + "libc", +] + +[[package]] +name = "termwiz" +version = "0.23.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4676b37242ccbd1aabf56edb093a4827dc49086c0ffd764a5705899e0f35f8f7" +dependencies = [ + "anyhow", + "base64 0.22.1", + "bitflags 2.11.1", + "fancy-regex", + "filedescriptor", + "finl_unicode", + "fixedbitset 0.4.2", + "hex", + "lazy_static", + "libc", + "log", + "memmem", + "nix 0.29.0", + "num-derive", + "num-traits", + "ordered-float", + "pest", + "pest_derive", + "phf", + "sha2 0.10.9", + "signal-hook", + "siphasher", + "terminfo", + "termios", + "thiserror 1.0.69", + "ucd-trie", + "unicode-segmentation", + "vtparse", + "wezterm-bidi", + "wezterm-blob-leases", + "wezterm-color-types", + "wezterm-dynamic", + "wezterm-input-types", + "winapi", +] + [[package]] name = "thiserror" version = "1.0.69" @@ -2382,6 +3430,24 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = 
"thread_local" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "throbber-widgets-tui" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1e6941f74491a80911cb8821cb1f55f7e13bca867b28a2b14e5a1daaf691eb3" +dependencies = [ + "ratatui", +] + [[package]] name = "time" version = "0.3.47" @@ -2390,7 +3456,9 @@ checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c" dependencies = [ "deranged", "itoa", + "libc", "num-conv", + "num_threads", "powerfmt", "serde_core", "time-core", @@ -2568,6 +3636,28 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" dependencies = [ "once_cell", + "valuable", +] + +[[package]] +name = "tracing-error" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b1581020d7a273442f5b45074a6a57d5757ad0a47dac0e9f0bd57b81936f3db" +dependencies = [ + "tracing", + "tracing-subscriber", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb7f578e5945fb242538965c2d0b04418d38ec25c79d160cd279bf0731c8d319" +dependencies = [ + "sharded-slab", + "thread_local", + "tracing-core", ] [[package]] @@ -2576,24 +3666,112 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" +[[package]] +name = "tui-big-text" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6833ec23415d48753f28caec76fa149d0d319ebaedec77ad7d09f7e2094bee8a" +dependencies = [ + "derive_builder", + "font8x8", + "itertools 0.14.0", + "ratatui-core", + "ratatui-widgets", +] + 
+[[package]] +name = "tui-input" +version = "0.15.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bd014a652e31cf25ea68d11b10a7b09549863449b19387505c9933f11eb05fa" +dependencies = [ + "ratatui", + "unicode-segmentation", + "unicode-width 0.2.2", +] + +[[package]] +name = "tui-popup" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "440ccb456a6e4c6141e985c37b9b93378c0f108303c0448f64dfc7959ed7fa3a" +dependencies = [ + "derive-getters", + "derive_setters", + "document-features", + "ratatui-core", + "ratatui-widgets", +] + +[[package]] +name = "tui-scrollview" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94a94f467c7ac7c291039b0733e3b2d379c77884e34fc27d167921fc1ab4842f" +dependencies = [ + "indoc", + "ratatui-core", + "ratatui-widgets", +] + +[[package]] +name = "tui-tree-widget" +version = "0.24.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "deca119555009eee2e0cfb9c020f39f632444dc4579918d5fc009d51d75dff92" +dependencies = [ + "ratatui-core", + "ratatui-widgets", + "unicode-width 0.2.2", +] + [[package]] name = "typenum" version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" +[[package]] +name = "ucd-trie" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971" + [[package]] name = "unicode-ident" version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" +[[package]] +name = "unicode-segmentation" +version = "1.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9629274872b2bfaf8d66f5f15725007f635594914870f65218920345aa11aa8c" + +[[package]] +name = 
"unicode-truncate" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16b380a1238663e5f8a691f9039c73e1cdae598a30e9855f541d29b08b53e9a5" +dependencies = [ + "itertools 0.14.0", + "unicode-segmentation", + "unicode-width 0.2.2", +] + [[package]] name = "unicode-width" version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" +[[package]] +name = "unicode-width" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" + [[package]] name = "unicode-xid" version = "0.2.6" @@ -2618,12 +3796,45 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" +[[package]] +name = "uuid" +version = "1.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd74a9687298c6858e9b88ec8935ec45d22e8fd5e6394fa1bd4e99a87789c76" +dependencies = [ + "atomic", + "getrandom 0.4.2", + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "valuable" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" + [[package]] name = "vcpkg" version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "vtparse" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d9b2acfb050df409c972a37d3b8e08cdea3bddb0c09db9d53137e504cfabed0" +dependencies = [ + 
"utf8parse", +] + [[package]] name = "walkdir" version = "2.5.0" @@ -2740,7 +3951,7 @@ version = "0.244.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" dependencies = [ - "bitflags", + "bitflags 2.11.1", "hashbrown 0.15.5", "indexmap", "semver", @@ -2765,6 +3976,78 @@ dependencies = [ "rustls-pki-types", ] +[[package]] +name = "wezterm-bidi" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0a6e355560527dd2d1cf7890652f4f09bb3433b6aadade4c9b5ed76de5f3ec" +dependencies = [ + "log", + "wezterm-dynamic", +] + +[[package]] +name = "wezterm-blob-leases" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "692daff6d93d94e29e4114544ef6d5c942a7ed998b37abdc19b17136ea428eb7" +dependencies = [ + "getrandom 0.3.4", + "mac_address", + "sha2 0.10.9", + "thiserror 1.0.69", + "uuid", +] + +[[package]] +name = "wezterm-color-types" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7de81ef35c9010270d63772bebef2f2d6d1f2d20a983d27505ac850b8c4b4296" +dependencies = [ + "csscolorparser", + "deltae", + "lazy_static", + "wezterm-dynamic", +] + +[[package]] +name = "wezterm-dynamic" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f2ab60e120fd6eaa68d9567f3226e876684639d22a4219b313ff69ec0ccd5ac" +dependencies = [ + "log", + "ordered-float", + "strsim", + "thiserror 1.0.69", + "wezterm-dynamic-derive", +] + +[[package]] +name = "wezterm-dynamic-derive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46c0cf2d539c645b448eaffec9ec494b8b19bd5077d9e58cb1ae7efece8d575b" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "wezterm-input-types" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "7012add459f951456ec9d6c7e6fc340b1ce15d6fc9629f8c42853412c029e57e" +dependencies = [ + "bitflags 1.3.2", + "euclid", + "lazy_static", + "serde", + "wezterm-dynamic", +] + [[package]] name = "winapi" version = "0.3.9" @@ -2948,7 +4231,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" dependencies = [ "anyhow", - "bitflags", + "bitflags 2.11.1", "indexmap", "log", "serde", diff --git a/Cargo.toml b/Cargo.toml index 00dda00a3..8c929d9b1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,7 +38,6 @@ quickcheck = "^1.1.0" rand = "^0.10.1" regex = "^1.12.3" rustls-openssl = { version = "^0.3.0", default-features = false, features = ["tls12"] } -rustls-pemfile = "^2.2.0" rustls = { version = "^0.23.38", default-features = false, features = [ "logging", "std", @@ -63,6 +62,22 @@ tokio = "^1.52.1" toml = { version = "^1.1.2", features = ["preserve_order"] } x509-parser = "^0.18.1" +# Dependencies for the `sozu top` TUI subcommand. All gated behind the +# opt-in `tui` Cargo feature on `bin/`; default `sozu` builds do not pull +# any of them in. Versions pinned 2026-05-10 against crates.io HEAD; bump +# the patch component locally if cargo update lands a fix. 
+color-eyre = "^0.6.5" +crossbeam-channel = "^0.5.15" +crossterm = { version = "^0.29.0", default-features = false } +ctrlc = "^3.4.5" +ratatui = { version = "^0.30.0", default-features = false } +throbber-widgets-tui = "^0.11.0" +tui-big-text = "^0.8.4" +tui-input = "^0.15.3" +tui-popup = "^0.7.4" +tui-scrollview = "^0.6.4" +tui-tree-widget = "^0.24.0" + [profile.release] lto = true codegen-units = 1 diff --git a/README.md b/README.md index 398b7c821..3bf4b47de 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ Two key dependencies have been optimized in this way: - [Kawa](https://github.com/CleverCloud/kawa) is a generic HTTP representation library that parses and translates HTTP messages with zero copy - [Rustls](https://github.com/rustls/rustls) is a TLS library that encrypts/decrypts TLS traffic with as little intermediate memory usage as it gets +- **Live operator TUI:** `sozu top` (build with `--features tui`) is a btop/htop-style live dashboard over the existing command socket — sparklines, sortable cluster + backend tables, H2 flood-mitigation counters, and a colour-coded event tail in a single screen. See [`doc/sozu-top.md`](doc/sozu-top.md). To get started check out our [documentation](./doc/README.md) ! diff --git a/bin/Cargo.toml b/bin/Cargo.toml index d76445782..4a78d6c2f 100644 --- a/bin/Cargo.toml +++ b/bin/Cargo.toml @@ -46,8 +46,31 @@ thiserror = { workspace = true } sozu-command-lib = { workspace = true } sozu-lib = { workspace = true } +# `sozu top` TUI dependencies. All optional; activated only by the `tui` Cargo +# feature below. Default `sozu` builds do not pull any of these in, keeping the +# production binary lean. None of these crates pull in a `tokio` runtime — the +# TUI is intentionally built on synchronous transport threads + crossterm +# events to stay aligned with Sōzu's no-`async` lib/ posture. 
+color-eyre = { workspace = true, optional = true } +crossbeam-channel = { workspace = true, optional = true } +crossterm = { workspace = true, optional = true, features = ["events", "bracketed-paste"] } +ctrlc = { workspace = true, optional = true, features = ["termination"] } +ratatui = { workspace = true, optional = true, features = ["crossterm", "macros", "underline-color"] } +throbber-widgets-tui = { workspace = true, optional = true } +toml = { workspace = true, optional = true } +tui-big-text = { workspace = true, optional = true } +tui-input = { workspace = true, optional = true } +tui-popup = { workspace = true, optional = true } +tui-scrollview = { workspace = true, optional = true } +tui-tree-widget = { workspace = true, optional = true } + [dev-dependencies] regex = { workspace = true } +tempfile = { workspace = true } +# `insta` is gated to `cfg(feature = "tui")` snapshot tests under `bin/src/ctl/top/`. +# Auditor-locked: keep it on `[dev-dependencies]` only — never a production dep. +# Tests run without `cargo insta review`; snapshots fail loudly the first time. +insta = { version = "^1.43.2", default-features = false } [target.'cfg(target_os = "linux")'.dependencies] num_cpus = { workspace = true } @@ -77,3 +100,23 @@ opentelemetry = ["sozu-lib/opentelemetry"] tolerant-http1-parser = ["sozu-lib/tolerant-http1-parser"] simd = ["sozu-lib/simd"] splice = ["sozu-lib/splice"] + +# `sozu top` btop/htop-style TUI subcommand. Opt-in: production builds typically +# omit this. Pulls ratatui + crossterm + crossbeam-channel + ctrlc + a curated +# set of TUI polish crates. Deliberately does NOT pull `tokio` — the TUI runs on +# two synchronous threads (collector + events) plus the UI thread, matching the +# existing `bin/` style and avoiding an async runtime for v1. 
+tui = [ + "dep:color-eyre", + "dep:crossbeam-channel", + "dep:crossterm", + "dep:ctrlc", + "dep:ratatui", + "dep:throbber-widgets-tui", + "dep:toml", + "dep:tui-big-text", + "dep:tui-input", + "dep:tui-popup", + "dep:tui-scrollview", + "dep:tui-tree-widget", +] diff --git a/bin/README.md b/bin/README.md index 5dfe4d503..f8be56577 100644 --- a/bin/README.md +++ b/bin/README.md @@ -36,3 +36,12 @@ reconfig fan-out, FD-passing for hot upgrades — is planned alongside the in-tree code in `bin/src/command/`. Until that lands the canonical sources are the module-level comments in `bin/src/command/{mod,server,sessions}.rs` and the control-plane audit-log section of `doc/observability.md`. + +## `sozu top` — live operator TUI + +Build with `--features tui` to get a btop/htop-style live console. `sozu top` +talks to the same command socket as the rest of the CLI and renders clusters, +backends, listeners, certificates, H2 flood counters, and a `SubscribeEvents` +tail in a single screen. The TUI is read-only and self-clears its cardinality +lease on exit. See [`../doc/sozu-top.md`](../doc/sozu-top.md) and the flag +reference in [`../doc/configure_cli.md`](../doc/configure_cli.md). diff --git a/bin/build.rs b/bin/build.rs index e67844d3d..6ff618834 100644 --- a/bin/build.rs +++ b/bin/build.rs @@ -49,6 +49,11 @@ fn main() { ("LOGS_DEBUG", "logs-debug"), ("LOGS_TRACE", "logs-trace"), ("UNSTABLE", "unstable"), + // `tui` gates the `sozu top` btop/htop-style subcommand. Surfaced in + // `--version` so operators can spot whether ratatui/crossterm and the + // polish crates were linked in (it materially changes the binary + // size and dep graph). 
+ ("TUI", "tui"), ]; // Crypto-provider features follow the runtime precedence chain diff --git a/bin/src/cli.rs b/bin/src/cli.rs index 8584fc49f..4eb3d1803 100644 --- a/bin/src/cli.rs +++ b/bin/src/cli.rs @@ -238,6 +238,77 @@ pub enum SubCmd { #[clap(subcommand)] cmd: ConnectionLimitCmd, }, + /// Live operator TUI: btop/htop-style overview of clusters, backends, + /// listeners, and H2 health. Built behind the `tui` Cargo feature so + /// production binaries stay lean. v1 is read-only; the cardinality lease + /// is auto-applied (DETAIL_BACKEND, TTL ~60s) and self-clears on exit. + #[cfg(feature = "tui")] + #[clap( + name = "top", + about = "live operator TUI (btop/htop-style) for clusters, backends, listeners, H2" + )] + Top { + /// Data poll interval in milliseconds; the render loop is capped + /// independently at 30 fps regardless of this value. + #[clap(long = "refresh-ms", default_value_t = 1000)] + refresh_ms: u64, + /// Disable mouse capture. Useful inside multiplexers that mis-route + /// SGR mouse events. + #[clap(long = "no-mouse")] + no_mouse: bool, + /// Skin name. Looked up under `$XDG_CONFIG_HOME/sozu/skins/.toml`; + /// `SOZU_TOP_SKIN` env var takes precedence when both are set. + #[clap(long = "skin")] + skin: Option, + /// Override the cardinality lease level. Default: `Backend` + /// (auto-elevate, lease self-expires server-side). + #[clap(long = "detail", value_enum)] + detail: Option, + /// Lease TTL in seconds; the renewer halves this for renewals. + /// Server clamps at 300s. + #[clap(long = "lease-ttl-seconds", default_value_t = 60)] + lease_ttl_seconds: u32, + /// Render N frames to stdout and exit (test affordance, no terminal + /// control). Mutually exclusive with interactive mode. + #[clap(long = "snapshot")] + snapshot: Option, + /// Drive one data tick + one render tick and exit (test affordance). + #[clap(long = "tick-once")] + tick_once: bool, + /// Force a glyph mode; auto-detect by default + /// (Braille → Block → TTY-ASCII). 
+ #[clap(long = "glyphs", value_enum)] + glyphs: Option, + }, +} + +/// `--detail` clap value enum for `sozu top`. Mirrors `MetricDetailLevel` +/// without leaking the proto-generated type into the CLI surface. +#[cfg(feature = "tui")] +#[derive(clap::ValueEnum, PartialEq, Eq, Clone, Copy, Debug)] +pub enum TopDetail { + /// Proxy-only counters (smallest keyspace). + Process, + /// Adds per-listener (frontend) breakdown. + Frontend, + /// Adds per-cluster aggregation (current Sōzu default). + Cluster, + /// Adds per-backend aggregation (cluster + backend, highest cardinality). + Backend, +} + +/// `--glyphs` clap value enum for `sozu top`. Three modes mirroring btop: +/// Braille (highest density), Block (compatible Unicode), TTY-ASCII fallback. +#[cfg(feature = "tui")] +#[derive(clap::ValueEnum, PartialEq, Eq, Clone, Copy, Debug)] +pub enum TopGlyphs { + /// Highest-density Unicode Braille mosaics. Default when the terminal + /// reports Unicode-capable locale + adequate font. + Braille, + /// Plain Unicode block elements; broadest Unicode terminal compatibility. + Block, + /// 7-bit ASCII fallback for `linux`/`dumb` TERMs and serial consoles. 
+ Tty, } #[derive(Subcommand, PartialEq, Eq, Clone, Debug)] diff --git a/bin/src/command/requests.rs b/bin/src/command/requests.rs index b52342ad9..aa8fdd64c 100644 --- a/bin/src/command/requests.rs +++ b/bin/src/command/requests.rs @@ -27,11 +27,12 @@ use sozu_command_lib::{ parser::parse_several_requests, proto::command::{ AggregatedMetrics, AvailableMetrics, CertificatesWithFingerprints, ClusterHashes, - ClusterInformations, Event, EventKind, FrontendFilters, HardStop, MetricsConfiguration, - QueryCertificatesFilters, QueryHealthChecks, QueryMetricsOptions, Request, ResponseContent, - ResponseStatus, RunState, SoftStop, Status, UpdateHttpListenerConfig, - UpdateHttpsListenerConfig, UpdateTcpListenerConfig, WorkerInfo, WorkerInfos, WorkerRequest, - WorkerResponses, request::RequestType, response_content::ContentType, + ClusterInformations, Event, EventKind, FrontendFilters, HardStop, MetricDetail, + MetricDetailStatus, MetricsConfiguration, QueryCertificatesFilters, QueryHealthChecks, + QueryMetricsOptions, Request, ResponseContent, ResponseStatus, RunState, SetMetricDetail, + SoftStop, Status, UpdateHttpListenerConfig, UpdateHttpsListenerConfig, + UpdateTcpListenerConfig, WorkerInfo, WorkerInfos, WorkerRequest, WorkerResponses, + request::RequestType, response_content::ContentType, }, sd_notify, }; @@ -42,7 +43,7 @@ use crate::command::{ DefaultGatherer, Gatherer, GatheringTask, MessageClient, Server, ServerState, Timeout, WorkerId, }, - sessions::{ClientSession, OptionalClient, sanitize_for_audit}, + sessions::{ClientSession, OptionalClient, sanitize_for_audit, sanitize_for_audit_kv}, upgrade::{upgrade_main, upgrade_worker}, }; @@ -84,7 +85,7 @@ macro_rules! audit_verb { /// second audit line. macro_rules! 
audit_log_context { ($server:expr, $client:expr, $request_id:expr, $entry:expr, $result:expr) => {{ - use $crate::command::sessions::sanitize_for_audit; + use $crate::command::sessions::{sanitize_for_audit, sanitize_for_audit_kv}; let (open, reset, grey, gray, white) = ::sozu_command_lib::logging::ansi_palette(); let log_ctx = ::sozu_command_lib::logging::LogContext { session_id: $client.session_ulid, @@ -148,6 +149,38 @@ macro_rules! audit_log_context { hash = hash, )); } + if let Some(lease_id) = $entry.extras.metric_detail_lease_id.as_deref() { + let sanitized = sanitize_for_audit_kv(lease_id); + let truncated = if sanitized.chars().count() > AUDIT_LEASE_ID_MAX_CHARS { + let cut: String = sanitized.chars().take(AUDIT_LEASE_ID_MAX_CHARS).collect(); + format!("{cut}…") + } else { + sanitized + }; + extras.push_str(&format!( + ", {gray}lease_id{reset}={white}{value}{reset}", + gray = gray, + reset = reset, + white = white, + value = truncated, + )); + } + if let Some(detail_reason) = $entry.extras.metric_detail_reason.as_deref() { + let sanitized = sanitize_for_audit_kv(detail_reason); + let truncated = if sanitized.chars().count() > AUDIT_REASON_MAX_CHARS { + let cut: String = sanitized.chars().take(AUDIT_REASON_MAX_CHARS).collect(); + format!("{cut}…") + } else { + sanitized + }; + extras.push_str(&format!( + ", {gray}metric_detail_reason{reset}={white}{value}{reset}", + gray = gray, + reset = reset, + white = white, + value = truncated, + )); + } let now_ts = rfc3339_utc(std::time::SystemTime::now()); let connect_ts = $client.connect_ts_display(); format!( @@ -186,6 +219,15 @@ macro_rules! audit_log_context { /// against the change. Read-only verbs (Status / List* / Query* / /// CountRequests / SubscribeEvents / QueryMaxConnectionsPerIp) return /// `false` because they're dashboard polls, not transitions. 
+/// +/// `SetMetricDetail` is deliberately excluded: it is a runtime +/// observability knob, not a state transition, and the `sozu top` TUI +/// renews its lease every `ttl/2` seconds (≈ 30 s by default). Including +/// it in this set would flap the systemd unit through `reloading` +/// every renewal for the whole TUI session lifetime. The audit trail +/// for the verb still flows through the special-case inline emission +/// (`EventKind::MetricDetailChanged`, proto tag 30) so SOC visibility +/// is preserved without flapping the unit state. fn is_mutating_verb(req: &RequestType) -> bool { matches!( req, @@ -352,6 +394,16 @@ impl Server { RequestType::SetMaxConnectionsPerIp(_) | RequestType::QueryMaxConnectionsPerIp(_) => { worker_request(self, client, request_type); } + // `sozu top`'s runtime cardinality lease verb. Each worker maintains + // its own lease table and recomputes the effective `MetricDetail` as + // `max(configured, max(active leases))`. The master fans the verb out + // through a dedicated dispatcher that synthesises the aggregate + // `MetricDetailStatus` reply, captures the master's own + // configured/effective view, and emits the attempt-time + completion + // audit rows alongside the per-worker fan-out. + RequestType::SetMetricDetail(req) => { + set_metric_detail_request(self, client, req); + } } if mutating { @@ -977,6 +1029,17 @@ pub(crate) struct AuditExtras { /// Set for verbs that flow through `worker_request`; `None` for inline /// verbs that don't carry a payload worth hashing. pub(crate) request_sha256: Option, + /// Operator-supplied `SetMetricDetail.client_id` (the lease key). Set + /// only for the `MetricDetailChanged` audit verb. Distinct from the + /// connection-scoped `ClientSession.id` rendered in the outer audit + /// envelope: this one identifies the lease, that one identifies the + /// command-socket caller. Rendered as a dedicated `lease_id=…` field + /// so attacker-supplied `:` / `=` cannot smuggle a fake column. 
+ pub(crate) metric_detail_lease_id: Option, + /// Operator-supplied `SetMetricDetail.reason`. Free-form human note. + /// Sanitised via [`sanitize_for_audit_kv`] (control bytes + `,` + `=` + /// stripped) and truncated to [`AUDIT_REASON_MAX_CHARS`]. + pub(crate) metric_detail_reason: Option, } /// A control-plane mutation, broken down into the pieces the audit trail @@ -1010,6 +1073,12 @@ struct AuditEntry { /// Render-only helper; see [`audit_log_context!`] for inclusion. const AUDIT_REASON_MAX_CHARS: usize = 256; +/// Hard cap on the rendered length of `lease_id` (operator-supplied +/// `SetMetricDetail.client_id`) in the audit log. The legitimate TUI +/// format is `top::<8-hex>` ≤ 24 bytes; 64 leaves headroom for +/// other operator-side scrapers while keeping the audit line bounded. +const AUDIT_LEASE_ID_MAX_CHARS: usize = 64; + /// Compile-time sozu version tag — rendered in every audit line so /// operators correlate which binary emitted which log during upgrades. const SOZU_VERSION: &str = env!("CARGO_PKG_VERSION"); @@ -1414,9 +1483,136 @@ fn audit_emit(server: &mut Server, client: &ClientSession, entry: AuditEntry, re cluster_id: entry.cluster_id, backend_id: entry.backend_id, address: entry.address, + // Master-emitted audit events do not carry the + // `metric_detail` transition; that field is populated by + // workers via `WorkerResponse::Event` on lease-tick / worker- + // arm transitions only. See `command.proto`'s `Event` and + // `MetricDetailTransition` comments. + metric_detail: None, }); } +/// Audit a worker-local `METRIC_DETAIL_CHANGED` transition. Workers emit +/// these via the `Event` channel when the polled lease janitor retires a +/// lease, or when the worker arm of `SetMetricDetail` applies / clears a +/// lease. The master folds them into the same audit log used for +/// operator-initiated transitions so SOC tooling sees a complete picture +/// of cardinality changes regardless of origin. 
+/// +/// Distinct from [`audit_emit`] / [`audit_emit_inline`] because there is +/// no `ClientSession` behind the event: the actor is the worker itself. +/// The line uses a `worker_id=` field in place of the +/// `actor_uid` / `actor_pid` / `client_id` block; everything else +/// (timestamps, sozu_version, fan-out to subscribers) matches the +/// canonical envelope. Both the text sink and the JSON sink receive a +/// record so SIEM ingest stays unified. +pub fn audit_worker_metric_detail_transition( + server: &mut Server, + worker_id: crate::command::server::WorkerId, + transition: &sozu_command_lib::proto::command::MetricDetailTransition, +) { + use sozu_command_lib::proto::command::MetricDetail; + + let (verb, counter) = audit_verb!("metric_detail_changed_worker_local"); + count!(counter, 1); + + let prev_label = MetricDetail::try_from(transition.previous_effective) + .map(|m| format!("{m:?}")) + .unwrap_or_else(|_| "".into()); + let eff_label = MetricDetail::try_from(transition.effective) + .map(|m| format!("{m:?}")) + .unwrap_or_else(|_| "".into()); + let kind_sanitized = sanitize_for_audit_kv(&transition.transition_kind); + let target = format!("metric_detail:{prev_label}->{eff_label}"); + let now_ts = rfc3339_utc(std::time::SystemTime::now()); + + // Truncate the optional lease client_id with the same cap used in the + // operator-initiated audit line so SIEM consumers see a consistent + // upper bound. + let lease_id = transition.client_id.as_deref().map(|c| { + let sanitized = sanitize_for_audit_kv(c); + let truncated: String = sanitized.chars().take(AUDIT_LEASE_ID_MAX_CHARS).collect(); + truncated + }); + + // Render the text-sink line. Match the operator-initiated envelope's + // KV shape so a SOC analyst can correlate worker-local and operator + // lines with a single regex. `worker_id` stands in for the + // `client_id=` block since the worker is its own + // actor. 
+ let mut text = format!( + "[worker:{worker_id} request:- cluster:- backend:-]\tAUDIT\tCommand(ts={now_ts}, verb={verb}, \ + actor_uid=-, actor_gid=-, actor_pid=-, actor_role=worker, actor_user=sozu-worker, \ + actor_comm=sozu-worker, worker_id={worker_id}, socket=(worker-ipc), \ + target={target}, result=ok, transition_kind={kind_sanitized}", + ); + if let Some(id) = lease_id.as_deref() { + text.push_str(&format!(", lease_id={id}")); + } + text.push_str(&format!( + ", sozu_version={SOZU_VERSION}, build_git_sha={SOZU_BUILD_GIT_SHA}, boot_generation={})", + server.boot_generation, + )); + info!("{}", text); + server.append_audit_line(&text); + + if server.audit_log_json_writer.is_some() { + let mut record = serde_json::Map::new(); + record.insert("ts".to_owned(), serde_json::Value::String(now_ts.clone())); + record.insert( + "boot_generation".to_owned(), + serde_json::json!(server.boot_generation), + ); + record.insert( + "verb".to_owned(), + serde_json::Value::String(verb.to_owned()), + ); + record.insert( + "worker_id".to_owned(), + serde_json::json!(worker_id.to_string()), + ); + record.insert( + "actor".to_owned(), + serde_json::json!({ + "role": "worker", + "comm": "sozu-worker", + }), + ); + record.insert( + "target".to_owned(), + serde_json::Value::String(target.clone()), + ); + record.insert( + "result".to_owned(), + serde_json::Value::String("ok".to_owned()), + ); + record.insert( + "transition_kind".to_owned(), + serde_json::Value::String(kind_sanitized.clone()), + ); + record.insert( + "previous_effective".to_owned(), + serde_json::Value::String(prev_label.clone()), + ); + record.insert( + "effective".to_owned(), + serde_json::Value::String(eff_label.clone()), + ); + if let Some(id) = lease_id { + record.insert("lease_id".to_owned(), serde_json::Value::String(id)); + } + record.insert( + "sozu_version".to_owned(), + serde_json::Value::String(SOZU_VERSION.to_owned()), + ); + record.insert( + "build_git_sha".to_owned(), + 
serde_json::Value::String(SOZU_BUILD_GIT_SHA.to_owned()), + ); + server.append_audit_json(&serde_json::Value::Object(record).to_string()); + } +} + /// Build a single-line JSON record mirroring the audit line. Schema is /// stable: every key always present, missing values rendered as JSON /// `null`. Used by the dedicated JSON sink (`audit_logs_json_target`). @@ -1482,6 +1678,19 @@ fn audit_record_to_json( if let Some(hash) = entry.extras.request_sha256.as_deref() { map.insert("request_sha256".to_owned(), Value::String(hash.to_owned())); } + if let Some(lease_id) = entry.extras.metric_detail_lease_id.as_deref() { + // Operator-controlled. Sanitise with the strict KV helper and + // truncate so JSON consumers that re-emit flat (TSV/CSV) cannot + // forge an adjacent column. + let sanitized = sanitize_for_audit_kv(lease_id); + let truncated: String = sanitized.chars().take(AUDIT_LEASE_ID_MAX_CHARS).collect(); + map.insert("lease_id".to_owned(), Value::String(truncated)); + } + if let Some(detail_reason) = entry.extras.metric_detail_reason.as_deref() { + let sanitized = sanitize_for_audit_kv(detail_reason); + let truncated: String = sanitized.chars().take(AUDIT_REASON_MAX_CHARS).collect(); + map.insert("metric_detail_reason".to_owned(), Value::String(truncated)); + } Value::Object(map) }; // INFO-1: free-form attacker-influenced fields go through @@ -1598,13 +1807,106 @@ struct WorkerTask { /// and target. `None` for non-audited verbs (same filter as /// `audit_entry_for`). audit: Option, + /// Inline-audit target for verbs whose audit factory does not produce + /// a full `AuditEntry` (`ConfigureMetrics`, `SetMetricDetail`). The + /// completion handler emits a second `audit_emit_inline` line with + /// fan-out outcome attached. `None` for verbs that already carry an + /// `AuditEntry` in `audit`. 
+ inline_audit: Option, + /// Operator-controlled SetMetricDetail audit fields (lease_id, + /// reason) folded into the completion-time `AuditExtras` so the + /// post-fanout audit row also carries the operator-supplied lease key + /// and human note in their dedicated columns rather than smuggling + /// them through `target`. `None` for any verb that is not + /// `SetMetricDetail`. + metric_detail_audit: Option, +} + +/// Carry the per-verb metadata needed to emit a completion-time audit +/// row for verbs that don't go through `audit_entry_for`. Mirrors the +/// shape `audit_emit_inline` expects (both `verb` and `counter` are +/// `&'static str` produced together by the `audit_verb!` macro so they +/// can never drift). +#[derive(Debug)] +struct InlineAuditTarget { + kind: EventKind, + verb: &'static str, + counter: &'static str, + target: String, +} + +/// Captured audit fields for `SetMetricDetail` whose operator-controlled +/// values flow into dedicated audit extras (NOT into `target`) so that +/// `:` / `=` / `,` smuggled by an attacker cannot forge an adjacent +/// audit column. `target` itself is kept master-controlled +/// (`metric_detail:<level>` only). +#[derive(Debug, Clone)] +struct MetricDetailAuditFields { + /// `metric_detail:<level>` — fully master-controlled (level is an enum). + target: String, + /// Operator-supplied `SetMetricDetail.client_id`. Sanitised at render + /// time via [`sanitize_for_audit_kv`] and truncated to + /// [`AUDIT_LEASE_ID_MAX_CHARS`]. + lease_id: String, + /// Operator-supplied `SetMetricDetail.reason` (free-form human note). + /// Sanitised + truncated at render time. + reason: Option, +} + +impl MetricDetailAuditFields { + /// Build an `AuditExtras` skeleton carrying the operator fields. The + /// caller layers `elapsed_ms` / `error_code` / `reason` (failure + /// reason — distinct from `metric_detail_reason`) on top as needed. 
+ fn into_extras(self) -> AuditExtras { + AuditExtras { + metric_detail_lease_id: Some(self.lease_id), + metric_detail_reason: self.reason, + ..Default::default() + } + } } pub fn worker_request( server: &mut Server, client: &mut ClientSession, - request_content: RequestType, + mut request_content: RequestType, ) { + // Master-only enrichment: populate `SetMetricDetail`'s peer binding + // from the connecting `ClientSession` so the worker can authorise + // subsequent `clear` requests against the apply-time owner. Clients + // never set these fields themselves — see the proto comment on + // `SetMetricDetail.peer_pid` / `peer_session_ulid` for the trust + // model. A `None` actor_pid (non-Linux build, missing SO_PEERCRED) + // degrades to "binding unknown" on the worker side, which accepts + // any clear for backward compat. + if let RequestType::SetMetricDetail(req) = &mut request_content { + req.peer_pid = client.actor_pid; + req.peer_session_ulid = Some(client.session_ulid.to_string()); + // Master-side pre-validation: reject obviously bogus inputs + // BEFORE fan-out so a malicious or buggy caller cannot fan its + // mistake across every worker (N rejected fan-outs + N audit + // lines per request). The worker dispatch path still enforces + // these limits as defence-in-depth, but failing fast here saves + // the audit-noise amplifier and gives the operator a single + // clear error rather than N. 
+ if req.client_id.len() > sozu_lib::metrics::LEASE_CLIENT_ID_MAX_BYTES { + client.finish_failure(format!( + "SetMetricDetail: client_id length {} exceeds {} bytes", + req.client_id.len(), + sozu_lib::metrics::LEASE_CLIENT_ID_MAX_BYTES, + )); + return; + } + if let Some(t) = req.ttl_seconds + && u64::from(t) > sozu_lib::metrics::LEASE_TTL_MAX.as_secs() + { + client.finish_failure(format!( + "SetMetricDetail: ttl_seconds={t} exceeds LEASE_TTL_MAX={}", + sozu_lib::metrics::LEASE_TTL_MAX.as_secs(), + )); + return; + } + } // Snapshot the audit entry before consuming `request_content` so we can // emit even when `state.dispatch` rejects the request AND so the // completion handler can re-emit with fanout + elapsed_ms. @@ -1622,6 +1924,33 @@ pub fn worker_request( None }; + // Special-case SetMetricDetail — the same shape as ConfigureMetrics above + // (no dedicated audit factory in `audit_entry_for`), so we synthesise the + // entry inline against the new `EventKind::MetricDetailChanged` variant. + // + // The `target` field captures the level only (`metric_detail:Backend` / + // `metric_detail:clear`); the operator-supplied `client_id` (lease key) + // and free-form `reason` flow into dedicated audit extras + // (`metric_detail_lease_id`, `metric_detail_reason`) so attacker-supplied + // `:` / `=` / `,` cannot smuggle a forged column into the audit log. 
+ let metric_detail_audit = if let RequestType::SetMetricDetail(req) = &request_content { + let level = if req.clear.unwrap_or(false) { + "clear".to_owned() + } else { + req.detail + .and_then(|d| MetricDetail::try_from(d).ok()) + .map(|d| format!("{d:?}")) + .unwrap_or_else(|| "".into()) + }; + Some(MetricDetailAuditFields { + target: format!("metric_detail:{level}"), + lease_id: req.client_id.clone(), + reason: req.reason.clone().filter(|s| !s.is_empty()), + }) + } else { + None + }; + let request: sozu_command_lib::proto::command::Request = request_content.into(); let request_sha256 = compute_request_sha256(&request); @@ -1650,6 +1979,23 @@ pub fn worker_request( ..Default::default() }, ); + } else if let Some(fields) = metric_detail_audit.clone() { + let (verb, counter) = audit_verb!("metric_detail_changed"); + let target = fields.target.clone(); + let mut extras = fields.into_extras(); + extras.elapsed_ms = Some(elapsed_ms(started_at)); + extras.error_code = Some(AuditErrorCode::DispatchError); + extras.reason = Some(reason.clone()); + audit_emit_inline( + server, + client, + EventKind::MetricDetailChanged, + verb, + counter, + target, + AuditResult::Err, + extras, + ); } client.finish_failure(format!( "could not dispatch request on the main process state: {error}", @@ -1665,6 +2011,36 @@ pub fn worker_request( cloned.extras.request_sha256 = Some(request_sha256.clone()); cloned }); + // Stash an inline-audit target for the completion handler when the + // verb doesn't carry a full `AuditEntry`. The attempt-time line below + // emits with `AuditResult::Ok` (state.dispatch accepted); on_finish + // re-emits with the worker fan-out outcome. 
+ let inline_audit = if let Some(target) = metrics_target.as_ref() { + let (verb, counter) = audit_verb!("metrics_configured"); + Some(InlineAuditTarget { + kind: EventKind::MetricsConfigured, + verb, + counter, + target: target.clone(), + }) + } else { + metric_detail_audit.as_ref().map(|fields| { + let (verb, counter) = audit_verb!("metric_detail_changed"); + InlineAuditTarget { + kind: EventKind::MetricDetailChanged, + verb, + counter, + target: fields.target.clone(), + } + }) + }; + // Operator-controlled SetMetricDetail fields (lease_id + reason) we + // need to thread into both the attempt-time Ok line below and the + // completion-time line in `on_finish`. Cloning once keeps the + // emission sites symmetric without re-deriving from `request_content` + // (already moved into `request: sozu_command_lib...::Request` above). + let metric_detail_audit_completion = metric_detail_audit.clone(); + if let Some(mut entry) = audit { entry.extras.request_sha256 = Some(request_sha256); audit_emit(server, client, entry, AuditResult::Ok); @@ -1680,6 +2056,20 @@ pub fn worker_request( AuditResult::Ok, AuditExtras::default(), ); + } else if let Some(fields) = metric_detail_audit { + let (verb, counter) = audit_verb!("metric_detail_changed"); + let target = fields.target.clone(); + let extras = fields.into_extras(); + audit_emit_inline( + server, + client, + EventKind::MetricDetailChanged, + verb, + counter, + target, + AuditResult::Ok, + extras, + ); } client.return_processing("Processing worker request..."); @@ -1691,6 +2081,8 @@ pub fn worker_request( gatherer: DefaultGatherer::default(), started_at, audit: audit_for_task, + inline_audit, + metric_detail_audit: metric_detail_audit_completion, }), Timeout::Default, None, @@ -1718,7 +2110,19 @@ impl GatheringTask for WorkerTask { match ResponseStatus::try_from(response.status) { Ok(ResponseStatus::Ok) => messages.push(format!("{worker_id}: OK")), Ok(ResponseStatus::Failure) | Ok(ResponseStatus::Processing) | Err(_) => { - 
messages.push(format!("{worker_id}: {}", response.message)) + // Worker error strings are partially operator- + // influenced (request-derived fields, IPC payloads). + // Run them through the column-boundary-aware + // sanitiser before joining into `extras.reason` so + // a `,` / `=` inside one worker's message cannot + // forge an additional audit-row column when a SIEM + // splits on `, ` / `=`. The strict variant also + // strips the bidi class so a Trojan-Source-flavoured + // payload cannot visually reorder the reason field. + messages.push(format!( + "{worker_id}: {}", + sanitize_for_audit_kv(&response.message) + )) } } } @@ -1732,26 +2136,28 @@ impl GatheringTask for WorkerTask { AuditResult::Ok }; + let fanout_status = if timed_out { + FanoutStatus::Timeout + } else if errors > 0 { + FanoutStatus::Partial + } else if expected == 0 { + FanoutStatus::LocalOnly + } else { + FanoutStatus::Ok + }; + let fanout_summary = FanoutSummary { + status: fanout_status, + workers_ok: u32::try_from(ok).unwrap_or(u32::MAX), + workers_err: u32::try_from(errors).unwrap_or(u32::MAX), + workers_expected: u32::try_from(expected).unwrap_or(u32::MAX), + }; + // Completion-time audit: attributes the same verb as the attempt-time // line but with fanout / worker counts / elapsed_ms filled in. Skip // when the client disconnected or the verb is not audited. 
if let (Some(client_ref), Some(mut entry)) = (client.as_deref(), self.audit) { - let fanout_status = if timed_out { - FanoutStatus::Timeout - } else if errors > 0 { - FanoutStatus::Partial - } else if expected == 0 { - FanoutStatus::LocalOnly - } else { - FanoutStatus::Ok - }; entry.extras.elapsed_ms = Some(elapsed_ms(self.started_at)); - entry.extras.fanout = Some(FanoutSummary { - status: fanout_status, - workers_ok: u32::try_from(ok).unwrap_or(u32::MAX), - workers_err: u32::try_from(errors).unwrap_or(u32::MAX), - workers_expected: u32::try_from(expected).unwrap_or(u32::MAX), - }); + entry.extras.fanout = Some(fanout_summary); if matches!(result, AuditResult::Err) { entry.extras.error_code = Some(if timed_out { AuditErrorCode::WorkerTimeout @@ -1761,6 +2167,39 @@ impl GatheringTask for WorkerTask { entry.extras.reason = Some(messages.join(", ")); } audit_emit(server, client_ref, entry, result); + } else if let (Some(client_ref), Some(inline)) = (client.as_deref(), self.inline_audit) { + // Completion-time inline audit for ConfigureMetrics + + // SetMetricDetail. Same shape as the entry-bearing arm above + // but routed through `audit_emit_inline` since these verbs + // don't synthesise a full `AuditEntry` at attempt time. The + // operator-supplied `SetMetricDetail` lease_id / reason live + // in their own audit columns (see `MetricDetailAuditFields`); + // pre-fill them when present, then layer the completion + // metadata on top. 
+ let mut extras = self + .metric_detail_audit + .map(MetricDetailAuditFields::into_extras) + .unwrap_or_default(); + extras.elapsed_ms = Some(elapsed_ms(self.started_at)); + extras.fanout = Some(fanout_summary); + if matches!(result, AuditResult::Err) { + extras.error_code = Some(if timed_out { + AuditErrorCode::WorkerTimeout + } else { + AuditErrorCode::WorkerFailure + }); + extras.reason = Some(messages.join(", ")); + } + audit_emit_inline( + server, + client_ref, + inline.kind, + inline.verb, + inline.counter, + inline.target, + result, + extras, + ); } if errors > 0 || timed_out { @@ -1826,6 +2265,304 @@ fn clone_entry(entry: &AuditEntry) -> AuditEntry { } } +// ========================================================= +// SetMetricDetail — dedicated dispatcher. +// +// Performs master-side length / TTL pre-validation that mirrors +// `worker_request`, populates the peer binding from the connecting +// `ClientSession`, emits the attempt-time audit row, and fans the +// request out to every worker via the standard scatter path. Workers +// that pre-date this verb return `WorkerResponse::error("unknown +// request type")` which folds into the standard fan-out error tally +// (`extras.fanout.workers_err`); operators see "succeeded with errors" +// rather than a dedicated capability-skip list. Production keeps +// master + workers in sync via `UpgradeMain`, so the mixed-version +// state is transient. + +/// Gathers per-worker `SetMetricDetail` responses, synthesises a +/// `MetricDetailStatus` reply for the client, and audits the +/// completion alongside operator-initiated transitions. Wraps the +/// generic worker-task fields (`audit`, `inline_audit`, +/// `metric_detail_audit`) so the existing audit pipeline keeps +/// emitting the same shape it does for any other audited verb. 
+#[derive(Debug)] +struct SetMetricDetailTask { + pub client_token: Token, + pub gatherer: DefaultGatherer, + started_at: Instant, + /// Master-side `(configured, effective_before)` captured pre-apply + /// so the response can carry the `previous_effective` field that + /// `MetricDetailStatus` advertises. The master also runs an + /// `Aggregator`; its `effective` participates in operator-visible + /// cardinality alongside per-worker leases. + master_configured: MetricDetail, + master_previous_effective: MetricDetail, + /// Completion-time inline-audit target so the post-fanout audit + /// row carries the same `target` / verb shape as the attempt-time + /// line. Cloned from the `inline_audit` slot the generic + /// `worker_request` path uses. + inline_audit: InlineAuditTarget, + /// Operator-controlled audit fields (lease_id + reason) carried + /// into the completion-time `AuditExtras` so the post-fan-out + /// audit row also surfaces the lease key + free-form note in + /// their dedicated columns. + metric_detail_audit: MetricDetailAuditFields, +} + +/// Dispatch a `SetMetricDetail` request. Performs the same length / +/// TTL pre-validation that `worker_request` does, populates the peer +/// binding from the connecting `ClientSession`, emits the attempt-time +/// audit row, and fans out unconditionally to every worker through the +/// standard scatter path. +pub fn set_metric_detail_request( + server: &mut Server, + client: &mut ClientSession, + mut req: SetMetricDetail, +) { + // Master-side enrichment + pre-validation (mirrors `worker_request`). 
+ req.peer_pid = client.actor_pid; + req.peer_session_ulid = Some(client.session_ulid.to_string()); + if req.client_id.len() > sozu_lib::metrics::LEASE_CLIENT_ID_MAX_BYTES { + client.finish_failure(format!( + "SetMetricDetail: client_id length {} exceeds {} bytes", + req.client_id.len(), + sozu_lib::metrics::LEASE_CLIENT_ID_MAX_BYTES, + )); + return; + } + if let Some(t) = req.ttl_seconds + && u64::from(t) > sozu_lib::metrics::LEASE_TTL_MAX.as_secs() + { + client.finish_failure(format!( + "SetMetricDetail: ttl_seconds={t} exceeds LEASE_TTL_MAX={}", + sozu_lib::metrics::LEASE_TTL_MAX.as_secs(), + )); + return; + } + + // Capture master-side cardinality view BEFORE we touch anything. + let (master_configured, master_previous_effective) = METRICS.with(|m| { + let m = m.borrow(); + ( + MetricDetail::from(m.detail_configured()), + MetricDetail::from(m.detail_effective()), + ) + }); + + // Build the audit-field skeleton used by both the attempt-time and + // completion-time emissions. Mirrors `worker_request`. + let level_label = if req.clear.unwrap_or(false) { + "clear".to_owned() + } else { + req.detail + .and_then(|d| MetricDetail::try_from(d).ok()) + .map(|d| format!("{d:?}")) + .unwrap_or_else(|| "".into()) + }; + let metric_detail_audit = MetricDetailAuditFields { + target: format!("metric_detail:{level_label}"), + lease_id: req.client_id.clone(), + reason: req.reason.clone().filter(|s| !s.is_empty()), + }; + + let started_at = Instant::now(); + let request: Request = RequestType::SetMetricDetail(req).into(); + + // Attempt-time dispatch gate (mirrors `state.dispatch` in worker_request). 
+ if let Err(error) = server.state.dispatch(&request) { + let reason = error.to_string(); + let (verb, counter) = audit_verb!("metric_detail_changed"); + let target = metric_detail_audit.target.clone(); + let mut extras = metric_detail_audit.into_extras(); + extras.elapsed_ms = Some(elapsed_ms(started_at)); + extras.error_code = Some(AuditErrorCode::DispatchError); + extras.reason = Some(reason.clone()); + audit_emit_inline( + server, + client, + EventKind::MetricDetailChanged, + verb, + counter, + target, + AuditResult::Err, + extras, + ); + client.finish_failure(format!( + "could not dispatch request on the main process state: {error}", + )); + return; + } + + // Attempt-time audit Ok. + let (verb, counter) = audit_verb!("metric_detail_changed"); + { + let target = metric_detail_audit.target.clone(); + let extras = metric_detail_audit.clone().into_extras(); + audit_emit_inline( + server, + client, + EventKind::MetricDetailChanged, + verb, + counter, + target, + AuditResult::Ok, + extras, + ); + } + + client.return_processing("Processing SetMetricDetail..."); + + let inline_audit = InlineAuditTarget { + kind: EventKind::MetricDetailChanged, + verb, + counter, + target: metric_detail_audit.target.clone(), + }; + + // Fan out unconditionally to every worker through the standard + // scatter path. Workers that pre-date `SetMetricDetail` reply with + // `WorkerResponse::error("unknown request type")` which folds into + // the standard fan-out error tally; `on_finish` surfaces them via + // the existing fanout summary rather than a dedicated skip list. 
+ let task = Box::new(SetMetricDetailTask { + client_token: client.token, + gatherer: DefaultGatherer::default(), + started_at, + master_configured, + master_previous_effective, + inline_audit, + metric_detail_audit, + }); + server.scatter(request, task, Timeout::Default, None); +} + +impl GatheringTask for SetMetricDetailTask { + fn client_token(&self) -> Option { + Some(self.client_token) + } + + fn get_gatherer(&mut self) -> &mut dyn Gatherer { + &mut self.gatherer + } + + fn on_finish( + self: Box, + server: &mut Server, + client: &mut OptionalClient, + timed_out: bool, + ) { + // Per-worker status: each worker now returns its own + // `WorkerMetricDetailStatus` payload via + // `ContentType::WorkerMetricDetailStatus` in the SetMetricDetail + // ok-with-content response (lib/src/server.rs::notify). Pull + // each worker's actual quartet — workers hold independent + // `Aggregator`s, so the master's view is NOT a reliable + // stand-in for the per-worker state. Workers that returned an + // error (e.g. peer-binding refusal) get skipped; the operator + // sees `MetricDetailStatus.workers` populated only for the + // ACK'd subset. + let mut workers_map = BTreeMap::new(); + for (worker_id, response) in &self.gatherer.responses { + if !matches!( + ResponseStatus::try_from(response.status), + Ok(ResponseStatus::Ok) + ) { + continue; + } + if let Some(ResponseContent { + content_type: Some(ContentType::WorkerMetricDetailStatus(status)), + }) = response.content.as_ref() + { + // `WorkerMetricDetailStatus` is `Copy` (four `i32`s + + // one `u32`); dereferencing avoids the `clippy::clone_on_copy` + // lint that CI's `-D warnings` rejects. 
+ workers_map.insert(worker_id.to_string(), *status); + } + } + + let master_effective = METRICS.with(|m| MetricDetail::from(m.borrow().detail_effective())); + let status = MetricDetailStatus { + configured: self.master_configured as i32, + effective: master_effective as i32, + previous_effective: self.master_previous_effective as i32, + workers: workers_map, + }; + + // Completion-time audit row. Same shape as the generic WorkerTask + // completion path; reuses `metric_detail_audit` for the + // `lease_id` / `metric_detail_reason` columns and folds the + // fan-out summary on top. + let errors = self.gatherer.errors; + let ok = self.gatherer.ok; + let expected = self.gatherer.expected_responses; + let result = if errors > 0 || timed_out { + AuditResult::Err + } else { + AuditResult::Ok + }; + let fanout_status = if timed_out { + FanoutStatus::Timeout + } else if errors > 0 { + FanoutStatus::Partial + } else if expected == 0 { + FanoutStatus::LocalOnly + } else { + FanoutStatus::Ok + }; + let fanout_summary = FanoutSummary { + status: fanout_status, + workers_ok: u32::try_from(ok).unwrap_or(u32::MAX), + workers_err: u32::try_from(errors).unwrap_or(u32::MAX), + workers_expected: u32::try_from(expected).unwrap_or(u32::MAX), + }; + if let Some(client_ref) = client.as_deref() { + let mut extras = self.metric_detail_audit.into_extras(); + extras.elapsed_ms = Some(elapsed_ms(self.started_at)); + extras.fanout = Some(fanout_summary); + if matches!(result, AuditResult::Err) { + extras.error_code = Some(if timed_out { + AuditErrorCode::WorkerTimeout + } else { + AuditErrorCode::WorkerFailure + }); + let mut msgs = Vec::new(); + for (worker_id, response) in &self.gatherer.responses { + // Same column-boundary sanitisation as + // `WorkerTask::on_finish` above. + // `SetMetricDetail` is itself the + // operator-controlled verb most likely to be probed + // for SIEM column smuggling, so this site is the + // higher-leverage of the two reason-join paths. 
+ msgs.push(format!( + "{worker_id}: {}", + sanitize_for_audit_kv(&response.message) + )); + } + extras.reason = Some(msgs.join(", ")); + } + audit_emit_inline( + server, + client_ref, + self.inline_audit.kind, + self.inline_audit.verb, + self.inline_audit.counter, + self.inline_audit.target, + result, + extras, + ); + } + + client.finish_ok_with_content( + ContentType::MetricDetailStatus(status).into(), + if errors > 0 || timed_out { + "SetMetricDetail completed with worker errors" + } else { + "Successfully applied SetMetricDetail to all workers" + }, + ); + } +} + // ========================================================= // Query Metrics @@ -1862,7 +2599,7 @@ impl GatheringTask for QueryMetricsTask { fn on_finish( self: Box, - server: &mut Server, + _server: &mut Server, client: &mut OptionalClient, _timed_out: bool, ) { @@ -1921,7 +2658,14 @@ impl GatheringTask for QueryMetricsTask { proxying: BTreeMap::new(), }; - if !self.options.workers && server.workers.len() > 1 { + // Always fold when the caller asked for merged data, regardless of + // worker count. `merge_metrics` relocates each worker's `clusters` + // and `proxying` into the top-level maps via `std::mem::take`; the + // previous `> 1` guard left single-worker fleets with empty + // top-level maps and stranded the per-worker data in `workers`, + // which silently zeroed every CLI/TUI consumer that reads + // `m.clusters` / `m.proxying`. + if !self.options.workers { aggregated_metrics.merge_metrics(); } @@ -2526,8 +3270,9 @@ mod audit_format_tests { //! empty strings and the rendered line is ANSI-free — stable to match //! with a plain regex. 
use super::{ - AUDIT_REASON_MAX_CHARS, AuditEntry, AuditErrorCode, AuditExtras, AuditResult, FanoutStatus, - FanoutSummary, SOZU_BUILD_GIT_SHA, SOZU_VERSION, actor_role, rfc3339_utc, + AUDIT_LEASE_ID_MAX_CHARS, AUDIT_REASON_MAX_CHARS, AuditEntry, AuditErrorCode, AuditExtras, + AuditResult, FanoutStatus, FanoutSummary, SOZU_BUILD_GIT_SHA, SOZU_VERSION, actor_role, + rfc3339_utc, }; use regex::Regex; use rusty_ulid::Ulid; @@ -2749,4 +3494,63 @@ mod audit_format_tests { "unexpected SOZU_BUILD_GIT_SHA: {s:?}" ); } + + #[test] + fn worker_message_join_sanitises_smuggled_kv_pair() { + // Both `WorkerTask::on_finish` and `SetMetricDetailTask::on_finish` + // route each worker's `response.message` through + // `sanitize_for_audit_kv` before joining into `extras.reason`. + // Verify the call-site shape catches the canonical SIEM-column- + // smuggling attempt — `,` and `=` inside the operator-influenced + // worker payload — without relying on a full Server / Gatherer + // ceremony to drive the on_finish path end-to-end. + let worker_id = 7u32; + let attacker_payload = "x,actor_user=mallory,sozu_version=hijacked"; + let formatted = format!( + "{worker_id}: {}", + super::sanitize_for_audit_kv(attacker_payload) + ); + assert!( + !formatted.contains("actor_user=mallory"), + "sanitised worker message must not propagate `=` into the \ + reason column (column-boundary forge defence)" + ); + assert!( + !formatted.contains(",actor_user"), + "sanitised worker message must not propagate `,` into the \ + reason column (column-boundary forge defence)" + ); + // The replacement character does survive — operators still see + // SOMETHING in the slot so the failure mode is visible. + assert!(formatted.contains('?')); + } +} + +#[cfg(test)] +mod mutating_verb_policy_tests { + //! Regression guard for the systemd `RELOADING=1` bracket policy: + //! `is_mutating_verb` must NOT include `SetMetricDetail`. The TUI + //! auto-renews its cardinality lease every `ttl/2` seconds, and a + //! 
mutating-verb bracket on each renewal would flap the systemd + //! unit state every 30 s for the whole TUI session lifetime. + use super::is_mutating_verb; + use sozu_command_lib::proto::command::{MetricDetail, SetMetricDetail, request::RequestType}; + + #[test] + fn set_metric_detail_is_not_mutating() { + let req = RequestType::SetMetricDetail(SetMetricDetail { + client_id: "top:1:abcdef01".to_owned(), + detail: Some(MetricDetail::DetailBackend as i32), + ttl_seconds: Some(60), + reason: Some("operator dashboard".to_owned()), + clear: Some(false), + peer_pid: None, + peer_session_ulid: None, + }); + assert!( + !is_mutating_verb(&req), + "SetMetricDetail is an observability knob, not a state transition; \ + keeping it out of is_mutating_verb prevents RELOADING flap on lease renewal" + ); + } } diff --git a/bin/src/command/server.rs b/bin/src/command/server.rs index e5c0db8fc..7d16b3dd5 100644 --- a/bin/src/command/server.rs +++ b/bin/src/command/server.rs @@ -34,14 +34,16 @@ use sozu_command_lib::{ channel::Channel, config::Config, proto::command::{ - Event, Request, ResponseContent, ResponseStatus, RunState, Status, WorkerRequest, - WorkerResponse, request::RequestType, response_content::ContentType, + Event, EventKind, Request, ResponseContent, ResponseStatus, RunState, Status, + WorkerRequest, WorkerResponse, request::RequestType, response_content::ContentType, }, ready::Ready, scm_socket::{Listeners, ScmSocket, ScmSocketError}, state::ConfigState, }; +use sozu_lib::metrics::names; + use super::upgrade::SerializedWorkerSession; use crate::{ command::{ @@ -544,6 +546,23 @@ impl CommandHub { content_type: Some(ContentType::Event(event)), }) = response.content { + // Worker-local METRIC_DETAIL_CHANGED transitions (lease tick + // expiry, worker arm apply/clear) are emitted by the worker + // via the same `Event` channel that carries backend health + // signals; the master folds them into the audit log here + // alongside operator-initiated transitions audited from 
+ // `requests.rs::worker_request`. Without this, the worker's + // polled janitor expiring a lease would leave no audit + // trail, masking implicit cardinality changes from SOC tools. + if event.kind == EventKind::MetricDetailChanged as i32 { + if let Some(transition) = event.metric_detail.as_ref() { + crate::command::requests::audit_worker_metric_detail_transition( + &mut self.server, + worker_id, + transition, + ); + } + } for client_token in &self.server.event_subscribers { if let Some(client) = self.clients.get_mut(client_token) { client.return_processing_with_content( @@ -877,9 +896,12 @@ impl Server { /// count backends and frontends in the cache, update gauge metrics pub fn update_counts(&mut self) { - gauge!("configuration.clusters", self.state.clusters.len()); - gauge!("configuration.backends", self.state.count_backends()); - gauge!("configuration.frontends", self.state.count_frontends()); + gauge!(names::configuration::CLUSTERS, self.state.clusters.len()); + gauge!(names::configuration::BACKENDS, self.state.count_backends()); + gauge!( + names::configuration::FRONTENDS, + self.state.count_frontends() + ); } /// Queue an audit event for fan-out to subscribed clients. 
Drained by @@ -1328,9 +1350,9 @@ mod tests { // initially empty server.update_counts(); - assert_eq!(read_gauge("configuration.clusters"), Some(0)); - assert_eq!(read_gauge("configuration.backends"), Some(0)); - assert_eq!(read_gauge("configuration.frontends"), Some(0)); + assert_eq!(read_gauge(names::configuration::CLUSTERS), Some(0)); + assert_eq!(read_gauge(names::configuration::BACKENDS), Some(0)); + assert_eq!(read_gauge(names::configuration::FRONTENDS), Some(0)); // add a cluster server @@ -1390,12 +1412,12 @@ mod tests { .expect("Could not add TCP frontend"); // gauges are still stale until update_counts() is called - assert_eq!(read_gauge("configuration.clusters"), Some(0)); + assert_eq!(read_gauge(names::configuration::CLUSTERS), Some(0)); // update_counts should refresh gauges server.update_counts(); - assert_eq!(read_gauge("configuration.clusters"), Some(1)); - assert_eq!(read_gauge("configuration.backends"), Some(3)); - assert_eq!(read_gauge("configuration.frontends"), Some(2)); + assert_eq!(read_gauge(names::configuration::CLUSTERS), Some(1)); + assert_eq!(read_gauge(names::configuration::BACKENDS), Some(3)); + assert_eq!(read_gauge(names::configuration::FRONTENDS), Some(2)); } } diff --git a/bin/src/command/sessions.rs b/bin/src/command/sessions.rs index 5e1b8b56a..e9d15ea2a 100644 --- a/bin/src/command/sessions.rs +++ b/bin/src/command/sessions.rs @@ -183,20 +183,66 @@ impl ClientSession { /// audit field contains a literal `\t` or `\n`. Applied at render time by /// the `audit_log_context!` macro. pub fn sanitize_for_audit(s: &str) -> String { - if s.bytes().all(|b| b >= 0x20 && b != 0x7f) { + if s.chars().all(|c| !is_unsafe_line(c)) { return s.to_owned(); } s.chars() - .map(|c| { - if (c as u32) < 0x20 || c == '\x7f' { - '?' - } else { - c - } - }) + .map(|c| if is_unsafe_line(c) { '?' } else { c }) + .collect() +} + +/// Strict sanitiser for audit-log fields whose values participate in +/// column-boundary parsing, i.e. 
anything rendered as `, key={value}` in +/// the text sink. On top of [`sanitize_for_audit`]'s control-byte strip, +/// this also replaces `,` and `=` with `?` so an attacker-controlled value +/// cannot forge a fake adjacent KV pair when a SIEM splits on `, ` / `=`. +/// +/// Use this for any audit field whose source is operator-controlled +/// (request-derived strings) rather than master-controlled metadata. +/// Does NOT strip `:` because legitimate values (e.g. `target=address:...`) +/// use `:` as an in-value separator. +pub fn sanitize_for_audit_kv(s: &str) -> String { + if s.chars().all(|c| !is_unsafe_kv(c)) { + return s.to_owned(); + } + s.chars() + .map(|c| if is_unsafe_kv(c) { '?' } else { c }) .collect() } +/// Characters that would break the audit log's row-or-line shape if they +/// reached the text sink unsanitised. Covers the full Unicode control +/// category (`char::is_control()` matches C0, DEL, and C1 — NEL/CSI are in +/// C1 and would otherwise survive a byte-only `< 0x20 || == 0x7f` check), +/// three non-control codepoints that some SIEM normalisers treat as line +/// breaks (U+FEFF BOM, U+2028 LINE SEPARATOR, U+2029 PARAGRAPH SEPARATOR), +/// and the bidirectional override / isolate controls U+202A..=U+202E + +/// U+2066..=U+2069. The bidi class is Trojan-Source-flavoured (CVE-2021- +/// 42574): a Right-to-Left Override inside an audit value visually +/// reverses the bytes that follow when an operator tails the log in a +/// Unicode-aware terminal (`less`, `cat` under a UTF-8 locale, +/// `journalctl`), so the row appears to attribute the action to a +/// different field than it actually carries. The byte-based fast path +/// is gone on purpose: every problematic codepoint above U+007F is +/// multi-byte UTF-8 with every byte `>= 0x80`, so a byte-only `>= 0x20` +/// check would let them through. 
+#[inline] +fn is_unsafe_line(c: char) -> bool { + c.is_control() + || c == '\u{feff}' + || c == '\u{2028}' + || c == '\u{2029}' + || matches!(c, '\u{202A}'..='\u{202E}' | '\u{2066}'..='\u{2069}') +} + +/// Strict variant: line-unsafe characters plus the column separators (`,` +/// and `=`) that a SIEM consumer splits on. Does NOT strip `:` — see +/// [`sanitize_for_audit_kv`] for the legitimate-value rationale. +#[inline] +fn is_unsafe_kv(c: char) -> bool { + is_unsafe_line(c) || c == ',' || c == '=' +} + /// QW8 helper: render `Option` for audit output. `Some(v)` becomes /// `v.to_string()`, `None` becomes the literal `"unknown"`. Used by the /// `actor_*_display` accessors on `ClientSession` so the five near- @@ -437,3 +483,180 @@ pub fn wants_to_tick(channel: &Channel) -> bool { (channel.readiness.is_writable() && channel.back_buf.available_data() > 0) || (channel.readiness.is_hup() || channel.readiness.is_error()) } + +#[cfg(test)] +mod tests { + use super::{sanitize_for_audit, sanitize_for_audit_kv}; + + // ----------------------------------------------------------------- + // sanitize_for_audit_kv: strict, used for column-boundary fields + // ----------------------------------------------------------------- + + #[test] + fn kv_strips_column_comma() { + // Comma is the row-separator a SIEM splits the audit line on; an + // operator-supplied value containing `,` would forge a sibling KV + // pair against `, key=value` parsers. + assert_eq!(sanitize_for_audit_kv("x,y"), "x?y"); + } + + #[test] + fn kv_strips_column_equals() { + // Equals is the column separator inside a `key=value` pair. + assert_eq!(sanitize_for_audit_kv("x=y"), "x?y"); + } + + #[test] + fn kv_strips_c1_nel() { + // U+0085 NEL is a C1 control byte some normalisers treat as a + // line break. A byte-only `< 0x20 || == 0x7f` predicate would let + // it through because UTF-8 encodes NEL as `c2 85` (both >= 0x80). 
+ assert_eq!(sanitize_for_audit_kv("x\u{0085}y"), "x?y"); + } + + #[test] + fn kv_strips_c1_csi() { + // U+009B CSI is the ANSI escape introducer — terminals interpret + // it as the start of a control sequence. Same C1 / byte-only- + // predicate trap as NEL above. + assert_eq!(sanitize_for_audit_kv("x\u{009B}y"), "x?y"); + } + + #[test] + fn kv_strips_bom() { + // U+FEFF is non-control by category but some pipelines treat a + // leading BOM as a delimiter; reject it conservatively. + assert_eq!(sanitize_for_audit_kv("x\u{FEFF}y"), "x?y"); + } + + #[test] + fn kv_strips_line_separator() { + // U+2028 LINE SEPARATOR splits the audit row in any consumer that + // honours the Unicode line-break property. + assert_eq!(sanitize_for_audit_kv("x\u{2028}y"), "x?y"); + } + + #[test] + fn kv_strips_paragraph_separator() { + // U+2029 PARAGRAPH SEPARATOR — same rationale as LINE SEPARATOR. + assert_eq!(sanitize_for_audit_kv("x\u{2029}y"), "x?y"); + } + + #[test] + fn kv_preserves_safe_ascii() { + // No control / column-boundary / line-break character present: + // the fast path returns the original string unchanged. + assert_eq!(sanitize_for_audit_kv("safe-id_42"), "safe-id_42"); + } + + #[test] + fn kv_preserves_in_value_colon() { + // `:` is not a column separator at the audit-line level (legit + // values like `target=host:8080` rely on it). + assert_eq!(sanitize_for_audit_kv("host:8080"), "host:8080"); + } + + // ----------------------------------------------------------------- + // sanitize_for_audit: line-only, no column-boundary stripping + // ----------------------------------------------------------------- + + #[test] + fn line_keeps_comma() { + // The weak variant feeds fields rendered outside the `, key=value` + // shape (the `reason=` column is one big quoted blob), so `,` is + // legal text and must survive sanitisation. 
+ assert_eq!(sanitize_for_audit("x,y"), "x,y"); + } + + #[test] + fn line_keeps_equals() { + // Same reasoning as `line_keeps_comma`: `=` is legal text inside + // a quoted reason payload. + assert_eq!(sanitize_for_audit("x=y"), "x=y"); + } + + #[test] + fn line_strips_c1_nel() { + // The weak sanitiser MUST still catch C1 controls — the prior + // byte-only predicate let them through. + assert_eq!(sanitize_for_audit("x\u{0085}y"), "x?y"); + } + + #[test] + fn line_strips_c1_csi() { + assert_eq!(sanitize_for_audit("x\u{009B}y"), "x?y"); + } + + #[test] + fn line_strips_bom() { + assert_eq!(sanitize_for_audit("x\u{FEFF}y"), "x?y"); + } + + #[test] + fn line_strips_line_separator() { + assert_eq!(sanitize_for_audit("x\u{2028}y"), "x?y"); + } + + #[test] + fn line_strips_paragraph_separator() { + assert_eq!(sanitize_for_audit("x\u{2029}y"), "x?y"); + } + + #[test] + fn line_strips_c0_control() { + // C0 controls (tab, LF, NUL, etc.) were the original target of + // the byte-based predicate; the rewrite must keep covering them. + assert_eq!(sanitize_for_audit("x\ty\nz\0"), "x?y?z?"); + } + + #[test] + fn line_strips_del() { + // DEL (U+007F) is `char::is_control()` true. + assert_eq!(sanitize_for_audit("x\u{007F}y"), "x?y"); + } + + // ----------------------------------------------------------------- + // bidirectional override / isolate class — Trojan-Source defence + // ----------------------------------------------------------------- + + #[test] + fn line_strips_rtl_override() { + // U+202E RIGHT-TO-LEFT OVERRIDE visually reverses the bytes that + // follow when an operator tails the audit log in a Unicode-aware + // terminal. The CVE-2021-42574 class — strip before render. + assert_eq!(sanitize_for_audit("a\u{202E}b"), "a?b"); + assert_eq!(sanitize_for_audit_kv("a\u{202E}b"), "a?b"); + } + + #[test] + fn line_strips_bidi_override_range() { + // U+202A..=U+202E are the bidi override controls (LRE, RLE, PDF, + // LRO, RLO). All have the same audit-row reorder hazard. 
+ for c in ['\u{202A}', '\u{202B}', '\u{202C}', '\u{202D}', '\u{202E}'] { + let input = format!("a{c}b"); + assert_eq!(sanitize_for_audit(&input), "a?b"); + assert_eq!(sanitize_for_audit_kv(&input), "a?b"); + } + } + + #[test] + fn line_strips_bidi_isolate_range() { + // U+2066..=U+2069 are the bidi isolate controls (LRI, RLI, FSI, + // PDI). Same hazard as the override class. + for c in ['\u{2066}', '\u{2067}', '\u{2068}', '\u{2069}'] { + let input = format!("a{c}b"); + assert_eq!(sanitize_for_audit(&input), "a?b"); + assert_eq!(sanitize_for_audit_kv(&input), "a?b"); + } + } + + #[test] + fn line_preserves_legitimate_bidi_text() { + // Plain RTL script content (Hebrew / Arabic) must round-trip + // through both sanitisers — only the explicit override / isolate + // controls are rejected. + let input = "héllo שלום مرحبا"; + assert_eq!(sanitize_for_audit(input), input); + assert_eq!(sanitize_for_audit_kv(input), input); + } +} diff --git a/bin/src/ctl/mod.rs b/bin/src/ctl/mod.rs index eb5b57600..a0f4d6491 100644 --- a/bin/src/ctl/mod.rs +++ b/bin/src/ctl/mod.rs @@ -3,6 +3,8 @@ mod command; mod request_builder; +#[cfg(feature = "tui")] +mod top; use std::time::Duration; @@ -60,6 +62,13 @@ pub enum CtlError { SetupLogging(LogError), #[error("could not resolve path for {0} : {1}")] ResolvePath(String, std::io::Error), + #[cfg(feature = "tui")] + #[error("failed to spawn thread `{label}`: {source}")] + SpawnFailed { + label: &'static str, + #[source] + source: std::io::Error, + }, } pub struct CommandManager { @@ -213,6 +222,26 @@ impl CommandManager { SubCmd::Config { cmd: _ } => Ok(()), // noop, handled at the beginning of the method SubCmd::Events => self.events(), SubCmd::ConnectionLimit { cmd } => self.connection_limit_command(cmd), + #[cfg(feature = "tui")] + SubCmd::Top { + refresh_ms, + no_mouse, + skin, + detail, + lease_ttl_seconds, + snapshot, + tick_once, + glyphs, + } => self.run_top(top::TopArgs { + refresh_ms, + no_mouse, + skin, + detail, + 
lease_ttl_seconds, + snapshot, + tick_once, + glyphs, + }), rest => { panic!("that command should have been handled earlier: {rest:x?}") } diff --git a/bin/src/ctl/top/app.rs b/bin/src/ctl/top/app.rs new file mode 100644 index 000000000..dbfec3412 --- /dev/null +++ b/bin/src/ctl/top/app.rs @@ -0,0 +1,1354 @@ +//! Application state for `sozu top`. +//! +//! Pure data — no I/O, no rendering. The render loop reads `App` to draw a +//! frame; the transport threads push `Snapshot`s and `TopEvent`s that +//! `App::ingest_*` folds into the ring buffers, rate calculator, threshold +//! table, and pulse tracker. +//! +//! Three derived state primitives: +//! +//! - [`SparkRing`] — fixed-capacity VecDeque for sparkline samples (60 by +//! default, one per second of history at the default `--refresh-ms = 1000`). +//! - [`RateCalculator`] — turns Sōzu's cumulative `Count` metrics into +//! per-second deltas. Detects the hourly `LocalDrain::clear` (which drops +//! `Count`/`Time` while preserving Gauges) by looking for monotonic-decrease +//! between samples and emits `0` for that tick instead of a negative spike. +//! - [`ThresholdTable`] — colour-coding rules: any time a value crosses a +//! threshold (5xx ratio > 1 %, slab.usage_percent > 80, h2 flood counters +//! non-zero) the relevant pane flips to a warning/critical hue. + +use std::collections::{HashMap, HashSet, VecDeque}; +use std::time::Instant; + +use sozu_command_lib::proto::command::{ + AggregatedMetrics, ClusterMetrics, FilteredMetrics, filtered_metrics, +}; +use sozu_lib::metrics::names; + +use super::theme::GlyphMode; +use super::transport::{CertsSnapshot, ListenersSnapshot, Snapshot, TopEvent}; + +/// Default ring depth for the on-screen sparkline series. 60 samples = one +/// minute of history at the default 1 s data tick. Matches the proto +/// `FilteredTimeSerie.last_minute` cadence so swapping to a server-side +/// time series later is a one-line change. 
+pub const SPARKLINE_DEPTH: usize = 60; + +/// Default capacity for the in-memory recent-events ring shown in the +/// EVENTS pane / overview footer. Larger than the transport's `bounded(64)` +/// so the UI can keep an audit-style scrollback without forcing the +/// transport to back-pressure on its own bound. +pub const EVENT_RING_DEPTH: usize = 200; + +/// Number of consecutive frames a pulse stays visible. The tick decrement +/// is `if value == 0 { drop } else { value -= 1 }`, so a pulse inserted +/// with `N` visits `N → N-1 → … → 1 → 0` (dropped on the next tick). +/// That is `N + 1` rendered frames; `5` renders ~5 s of background tint +/// at the default 1 s data-tick cadence — long enough to catch the eye +/// without monopolising the colour-coding when multiple events land +/// close together. The name reflects the observed frame count rather +/// than the seed value to avoid the off-by-one trap. +pub const PULSE_PERSIST_FRAMES: u32 = 5; + +/// Top-level pane selector mirrored to numbered tabs at the top of the +/// screen. Numbers match the keymap (`1 OVERVIEW`, `2 CLUSTERS`, …) so +/// muscle memory from hatop / btop / k9s carries over. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ActiveTab { + Overview, + Clusters, + Backends, + Listeners, + Certs, + H2, + Events, +} + +impl ActiveTab { + pub const ALL: &'static [Self] = &[ + Self::Overview, + Self::Clusters, + Self::Backends, + Self::Listeners, + Self::Certs, + Self::H2, + Self::Events, + ]; + + pub fn label(self) -> &'static str { + match self { + Self::Overview => "OVERVIEW", + Self::Clusters => "CLUSTERS", + Self::Backends => "BACKENDS", + Self::Listeners => "LISTENERS", + Self::Certs => "CERTS", + Self::H2 => "H2", + Self::Events => "EVENTS", + } + } + + /// Map a 1-based key (`1`..`7`) to a tab. Returns `None` for keys + /// outside the range so the caller can silently ignore. + pub fn from_digit(d: u8) -> Option { + Self::ALL.get(d.checked_sub(1)? 
as usize).copied() + } + + /// Resolve a `:command-palette` alias to a tab. Centralises the + /// alias table so adding a new tab only touches `ActiveTab` (label + + /// from_alias) instead of also patching `apply_palette`. + pub fn from_alias(alias: &str) -> Option { + match alias { + "overview" | "o" => Some(Self::Overview), + "cluster" | "clusters" | "c" => Some(Self::Clusters), + "backend" | "backends" | "b" => Some(Self::Backends), + "listener" | "listeners" | "l" => Some(Self::Listeners), + "cert" | "certs" => Some(Self::Certs), + "h2" => Some(Self::H2), + "event" | "events" | "e" => Some(Self::Events), + _ => None, + } + } + + pub fn cycle(self, forward: bool) -> Self { + let idx = Self::ALL.iter().position(|t| *t == self).unwrap_or(0); + let len = Self::ALL.len(); + let next = if forward { + (idx + 1) % len + } else { + (idx + len - 1) % len + }; + Self::ALL[next] + } +} + +/// Fixed-capacity sparkline ring. Newest sample at the back of the deque. +#[derive(Debug, Clone)] +pub struct SparkRing { + samples: VecDeque, + capacity: usize, +} + +impl SparkRing { + pub fn new(capacity: usize) -> Self { + Self { + samples: VecDeque::with_capacity(capacity), + capacity, + } + } + + pub fn push(&mut self, value: u64) { + if self.samples.len() == self.capacity { + self.samples.pop_front(); + } + self.samples.push_back(value); + } + + pub fn samples(&self) -> std::collections::vec_deque::Iter<'_, u64> { + self.samples.iter() + } + + pub fn last(&self) -> Option { + self.samples.back().copied() + } + + pub fn max(&self) -> u64 { + self.samples.iter().copied().max().unwrap_or(0) + } + + pub fn is_empty(&self) -> bool { + self.samples.is_empty() + } + + /// Render the ring as a `Vec` so ratatui's `Sparkline` widget can + /// consume it without holding a borrow. + pub fn to_vec(&self) -> Vec { + self.samples.iter().copied().collect() + } +} + +/// Derives per-second rates from Sōzu's cumulative `Count` metrics. Stores +/// the previous (value, instant) per metric key. 
`record` returns the rate +/// or `None` when: +/// +/// - this is the first observation (no baseline yet), +/// - or the new value is less than the stored one (the hourly +/// `LocalDrain::clear` reset the counter — emit `0` rather than a +/// negative spike). +#[derive(Debug, Default)] +pub struct RateCalculator { + history: HashMap, +} + +impl RateCalculator { + pub fn record(&mut self, key: &str, value: i64, sampled_at: Instant) -> Option { + let result = match self.history.get(key) { + Some((prev_value, prev_at)) if value >= *prev_value => { + let dt = sampled_at.saturating_duration_since(*prev_at).as_secs_f64(); + if dt > 0.0 { + Some(((value - *prev_value) as f64) / dt) + } else { + Some(0.0) + } + } + Some(_) => Some(0.0), // hourly reset: emit 0, not a negative + None => None, // first observation: caller can show "—" + }; + self.history.insert(key.to_owned(), (value, sampled_at)); + result + } + + /// Drop history entries whose key the predicate rejects. Called from + /// `App::ingest_snapshot` after recording with the per-cluster keys + /// present in the freshest snapshot so disappearing clusters do not + /// leave `(prev_value, prev_at)` lingering forever. + pub fn retain bool>(&mut self, mut keep: F) { + self.history.retain(|k, _| keep(k.as_str())); + } +} + +/// What-changed pulse tracker. Snapshot-to-snapshot diffs surface as a +/// short-lived tint on the affected rows so the eye catches transitions +/// even when the operator looked away for a moment. +/// +/// Three classes of pulse: +/// +/// - `ClusterDisappeared(cluster_id)` fires when a cluster id present in +/// the previous `AggregatedMetrics.clusters` map is absent in the new +/// one. Operationally meaningful: the master reconfig dropped a +/// cluster, or the worker fleet went silent on it. +/// - `BackendWentDown(cluster_id, backend_id)` fires when a backend's +/// `backend.available` gauge transitioned from `>= 1` to `0`. 
The +/// companion `EventKind::BackendDown` event lands in the EVENTS pane; +/// this pulse highlights the BACKENDS row in lockstep. +/// - `ClusterAppeared(cluster_id)` fires when a new cluster id arrives — +/// surfaces fresh additions without burying them at the bottom of the +/// sort order. Lower-priority pulse (uses `cool` tier, not `hot`). +#[derive(Debug, Default)] +pub struct PulseTracker { + cluster_disappeared: HashMap, + cluster_appeared: HashMap, + backend_down: HashMap<(String, String), u32>, + last_clusters: HashSet, + last_backend_up: HashSet<(String, String)>, +} + +impl PulseTracker { + /// Decrement every active pulse by one render frame. Drop entries that + /// reach zero. Called once per render tick by `App::tick_pulses`. + fn tick(&mut self) { + Self::tick_map(&mut self.cluster_disappeared); + Self::tick_map(&mut self.cluster_appeared); + Self::tick_map(&mut self.backend_down); + } + + /// In-place retain: zero-aged entries are filtered out and survivors + /// decrement by one. Same semantics for all three pulse maps; generic + /// over the key so `(String, String)` (backend_down) reuses it. + fn tick_map(map: &mut HashMap) { + map.retain(|_, v| { + if *v == 0 { + false + } else { + *v -= 1; + true + } + }); + } + + /// Diff a new snapshot against the previous seen set and emit fresh + /// pulses for every transition. Called from `App::ingest_snapshot` + /// BEFORE the snapshot replaces `last_metrics`. + fn diff(&mut self, m: &AggregatedMetrics) { + let mut new_clusters: HashSet = HashSet::new(); + let mut new_backend_up: HashSet<(String, String)> = HashSet::new(); + for (cluster_id, cm) in &m.clusters { + new_clusters.insert(cluster_id.clone()); + for bm in &cm.backends { + let available = bm + .metrics + .get(names::backend::AVAILABLE) + .and_then(|m| match m.inner.as_ref()? 
{ + filtered_metrics::Inner::Gauge(v) => Some(*v), + _ => None, + }) + .unwrap_or(0); + if available >= 1 { + new_backend_up.insert((cluster_id.clone(), bm.backend_id.clone())); + } + } + } + // Cluster disappear: in last set, not in new set. Skip on the very + // first snapshot to avoid pulsing for "every cluster appeared" on + // startup (the first `diff` runs against an empty last_clusters). + if !self.last_clusters.is_empty() { + for missing in self.last_clusters.difference(&new_clusters) { + self.cluster_disappeared + .insert(missing.clone(), PULSE_PERSIST_FRAMES); + } + // Cluster appear: in new set, not in last set. + for fresh in new_clusters.difference(&self.last_clusters) { + self.cluster_appeared + .insert(fresh.clone(), PULSE_PERSIST_FRAMES); + } + } + // Backend down: was up in last snapshot, not up in new snapshot + // (either backend.available dropped to 0 OR the backend row went + // missing). Both cases surface as "this backend can't take traffic + // right now" so they share a pulse class. + for prev_up in &self.last_backend_up { + if !new_backend_up.contains(prev_up) { + self.backend_down + .insert(prev_up.clone(), PULSE_PERSIST_FRAMES); + } + } + self.last_clusters = new_clusters; + self.last_backend_up = new_backend_up; + } + + pub fn cluster_pulse(&self, cluster_id: &str) -> Option { + self.cluster_disappeared + .contains_key(cluster_id) + .then_some(PulseKind::Disappeared) + .or_else(|| { + self.cluster_appeared + .contains_key(cluster_id) + .then_some(PulseKind::Appeared) + }) + } + + pub fn backend_pulse(&self, cluster_id: &str, backend_id: &str) -> Option { + self.backend_down + .contains_key(&(cluster_id.to_owned(), backend_id.to_owned())) + .then_some(PulseKind::WentDown) + } + + /// True when at least one pulse is still animating. 
The render loop + /// uses this together with `App::take_dirty` to decide whether to + /// repaint on a frame that received no fresh snapshot or event — + /// without it, the fading background tint would freeze on screen + /// between snapshots. + pub fn has_active(&self) -> bool { + !self.cluster_disappeared.is_empty() + || !self.cluster_appeared.is_empty() + || !self.backend_down.is_empty() + } +} + +/// Visual class of a pulse, mapped to a skin tier by the renderer. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum PulseKind { + /// Cluster row vanished — operationally surprising. Hot tier. + Disappeared, + /// Cluster row appeared. Cool tier. + Appeared, + /// Backend transitioned from up to down. Hot tier. + WentDown, +} + +/// Threshold table used by the colour-coding rules. Each field is the +/// boundary above which the relevant pane flips its row/cell into a +/// warning/critical hue. Defaults are documented in `doc/sozu-top.md` +/// (week 4); for now we ship sane fixed values and revisit once operators +/// see the TUI in anger. +#[derive(Debug, Clone)] +pub struct ThresholdTable { + /// % of requests that returned 5xx in the most recent tick. Rows above + /// this go critical. + pub error_ratio_critical_pct: f64, + /// `slab.usage_percent` value above which the saturation sparkline goes hot. + pub slab_critical_pct: f64, + /// p99 in ms above which the latency sparkline goes hot. + pub latency_p99_critical_ms: f64, +} + +impl Default for ThresholdTable { + fn default() -> Self { + Self { + error_ratio_critical_pct: 1.0, + slab_critical_pct: 80.0, + latency_p99_critical_ms: 500.0, + } + } +} + +impl ThresholdTable { + /// Return a short critical-banner headline if any of the configured + /// thresholds is currently crossed. Used by the renderer to drive the + /// big-text alert overlay. `None` means "everything's fine"; the + /// renderer skips the banner row in that case. 
+ pub fn critical_message(&self, overview: &OverviewState) -> Option<&'static str> { + if overview + .latency_p99_ms + .last() + .is_some_and(|v| v as f64 >= self.latency_p99_critical_ms) + { + return Some("HIGH LATENCY"); + } + if overview + .service_time_p99_ms + .last() + .is_some_and(|v| v as f64 >= self.latency_p99_critical_ms) + { + return Some("SOZU SLOW"); + } + if overview + .saturation_pct + .last() + .is_some_and(|v| v as f64 >= self.slab_critical_pct) + { + return Some("SATURATION"); + } + None + } +} + +/// HTTP 5xx error-status counters Sōzu synthesises as default answers +/// (500, 502, 503, 504, 507). Hoisted out of the cluster_rows / +/// fold_overview iterators so both call sites share one source of +/// truth for "what counts as a 5xx error in operator dashboards" and a +/// new error variant can be added in one place. +const ERRORS_5XX: [&str; 5] = [ + names::http_status::S500, + names::http_status::S502, + names::http_status::S503, + names::http_status::S504, + names::http_status::S507, +]; + +/// Synthetic `RateCalculator` key for the aggregate overview RPS. +/// Namespaced so it cannot collide with the per-cluster keys +/// `cluster_rate_key` produces. Hidden from operator-facing surfaces. +const OVERVIEW_REQUESTS_KEY: &str = "__overview.requests"; + +/// Build the `RateCalculator` key for a cluster's requests counter. +/// Same `__cluster.` namespace as other future per-cluster series. +fn cluster_rate_key(cluster_id: &str) -> String { + format!("__cluster.{cluster_id}.requests") +} + +/// Build the `RateCalculator` key for a per-(cluster, backend) counter. +/// `suffix` distinguishes `bytes_in` vs `bytes_out` so both rates share +/// one namespace without aliasing. +fn backend_rate_key(cluster_id: &str, backend_id: &str, suffix: &str) -> String { + format!("__backend.{cluster_id}.{backend_id}.{suffix}") +} + +/// Proxy-level metric keys the H2 pane plots in its trend column. 
+/// Kept in one place so a key rename in `lib::metrics::names` cannot +/// silently drop a sparkline from the pane. +const H2_TRACKED_KEYS: &[&str] = &[ + names::h2::CONNECTION_ACTIVE_STREAMS, + names::http::ALPN_H2, + names::http::ALPN_HTTP11, + names::client::CONNECTIONS, + names::h2::CONNECTION_WINDOW_BYTES, + names::h2::CONNECTION_PENDING_WINDOW_UPDATES, + names::h2::FLOW_CONTROL_STALL, + names::h2::FRAMES_TX_WINDOW_UPDATE, + names::h2::FRAMES_TX_RST_STREAM, + names::h2::FRAMES_TX_GOAWAY, + names::h2::HEADERS_REJECTED_BUDGET_OVERRUN, + names::h2::FLOOD_VIOLATION_GLITCH_WINDOW, + names::h2::FLOOD_VIOLATION_RAPID_RESET, + names::h2::FLOOD_VIOLATION_CONTINUATION, + names::h2::FLOOD_VIOLATION_MADE_YOU_RESET, + names::h2::FLOOD_VIOLATION_PING, + names::h2::FLOOD_VIOLATION_SETTINGS, + names::h2::FLOOD_VIOLATION_PRIORITY, + names::h2::WINDOW_UPDATE_DROPPED, + names::h2::CLOSE_WITH_ACTIVE_STREAMS, +]; + +/// Inline sparkline as a single-line glyph string. Used in table cells +/// where the ratatui `Sparkline` widget is overkill. `alphabet` is the +/// glyph ramp from `GlyphMode::trend_alphabet` (Block / Braille / Tty); +/// samples are scaled to the ring's max so a flat-zero series prints +/// as the lowest glyph on every position. +fn render_spark_bars>(samples: I, alphabet: &[char]) -> String { + let samples: Vec = samples.into_iter().collect(); + if samples.is_empty() || alphabet.is_empty() { + return "—".to_owned(); + } + let max = samples.iter().copied().max().unwrap_or(0).max(1); + let last_idx = alphabet.len() - 1; + samples + .iter() + .map(|v| { + let idx = ((v * last_idx as u64) / max) as usize; + alphabet[idx.min(last_idx)] + }) + .collect() +} + +/// Top-level UI state. Pure data; the render loop snapshots it for each +/// frame and the transport threads push into it via `App::ingest_*`. 
#[derive(Debug)]
pub struct App {
    pub active_tab: ActiveTab,
    pub overview: OverviewState,
    pub events: VecDeque<TopEvent>,
    pub thresholds: ThresholdTable,
    pub last_snapshot_at: Option<Instant>,
    /// Most recent `AggregatedMetrics` retained verbatim so panes that need
    /// the live cluster / backend / listener map (CLUSTERS, BACKENDS, …)
    /// don't have to maintain their own derivation. The OVERVIEW pane
    /// reads ring buffers, not this; this is for table-shaped panes.
    pub last_metrics: Option<AggregatedMetrics>,
    /// Most recent `ListenersList` from the listeners-collector thread.
    /// Polled at a slower cadence than metrics (5 s) because listener
    /// state changes are operator-paced.
    pub last_listeners: Option<ListenersSnapshot>,
    /// Most recent certificate inventory from the certs-collector thread.
    /// Polled every 30 s — cert mutations flow through the EVENTS pane
    /// in real time, so the slow refresh is fine for the inventory view.
    pub last_certs: Option<CertsSnapshot>,
    pub status: String,
    pub should_quit: bool,
    pub help_visible: bool,
    /// Cluster-table sort column. Default surfaces unhealthy/error-rate
    /// first, then RPS — operators want the failing clusters at the top of
    /// the pane so the eye lands on them without scrolling.
    pub cluster_sort: ClusterSortKey,
    /// Reverse the cluster-table sort ordering when `true`.
    pub cluster_sort_reverse: bool,
    /// Backend-table sort column. Default `Bandwidth` (busiest backend at
    /// the top).
    pub backend_sort: BackendSortKey,
    /// Reverse the backend-table sort ordering when `true`.
    pub backend_sort_reverse: bool,
    /// "What changed" tracker — surfaces disappearing clusters and newly-
    /// unhealthy backends as a short-lived row tint.
    pub pulse: PulseTracker,
    /// Resolved glyph mode (Braille / Block / TTY-ASCII). Set once at
    /// startup by `GlyphMode::resolve(cfg.glyphs)`. Panes that draw
    /// sparkline-adjacent custom glyphs read this to pick a ramp; the
    /// rest of the UI uses ratatui's built-in `Sparkline` widget which
    /// has its own internal ramp.
    pub glyphs: GlyphMode,
    /// k9s-style colon palette state. When `palette_open` is true the
    /// renderer replaces the function-key bar with a one-line input
    /// box; `palette_input` carries the in-progress text.
    pub palette_open: bool,
    pub palette_input: tui_input::Input,
    /// Last unknown command or recoverable error from the palette. The
    /// renderer surfaces this on the function-key bar so the operator
    /// sees why their command bounced.
    pub palette_error: Option<String>,
    /// Dirty flag for the render loop: set whenever an `ingest_*` call
    /// folds new data into the state OR a pulse decrements in
    /// `tick_pulses`. The renderer checks this via `take_dirty` and
    /// skips the synchronized-update + draw cycle when neither path
    /// produced a visible change AND no pulse is mid-animation. Avoids
    /// burning ~2-3 % of one core for an idle TUI on a quiet system.
    /// Initialised `true` so the first frame paints unconditionally.
    is_dirty: bool,
    /// Shared cumulative-counter → per-second-rate converter.
    rates: RateCalculator,
    /// Per-cluster requests-per-second derived from `names::backend::REQUESTS`
    /// counters. Computed once per `ingest_snapshot` via `RateCalculator`
    /// and consumed by `cluster_rows` so the CLUSTERS pane shows a rate,
    /// not a cumulative counter.
    cluster_rps: HashMap<String, u64>,
    /// Per-(cluster, backend) bytes-per-second download rate derived
    /// from cumulative `names::backend::BYTES_IN` counters. Stored as
    /// bytes/sec (despite the `_bps` suffix); the renderer scales to
    /// bits/sec at format time so the column reads in Kbps / Mbps /
    /// Gbps. Cleared and rebuilt on every `ingest_snapshot`.
    backend_rate_in_bps: HashMap<(String, String), f64>,
    /// Per-(cluster, backend) bytes-per-second upload rate derived
    /// from cumulative `names::backend::BYTES_OUT`.
    backend_rate_out_bps: HashMap<(String, String), f64>,
    /// Per-metric sample rings driving the H2 pane's trend column.
    /// Populated by `fold_h2_trends` once per `ingest_snapshot`. Keys
    /// are the canonical `names::*` strings the pane reads, so a key
    /// rename can never silently freeze a trend column.
    h2_trends: HashMap<&'static str, SparkRing>,
    /// Snapshot ingest is suspended when `true`. The transport keeps
    /// polling and the cardinality lease keeps renewing, but the App
    /// ignores incoming `Snapshot`s so the visible state is frozen on
    /// the last frame. Toggled by F5 / `p`.
    pub paused: bool,
}

/// Columns the CLUSTERS pane can sort by. Cycled with `s`; reversed with
/// `S`. Default: `ErrorRate` desc → unhealthy clusters surface first.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ClusterSortKey {
    ClusterId,
    Rps,
    ErrorRate,
    LatencyP99,
    BackendsAvailable,
}

impl ClusterSortKey {
    /// Cycle order used by `cycle`; starts at the default `ErrorRate`.
    pub const ALL: &'static [Self] = &[
        Self::ErrorRate,
        Self::Rps,
        Self::LatencyP99,
        Self::BackendsAvailable,
        Self::ClusterId,
    ];

    /// Short column caption for the sort indicator.
    pub fn label(self) -> &'static str {
        match self {
            Self::ClusterId => "id",
            Self::Rps => "rps",
            Self::ErrorRate => "err%",
            Self::LatencyP99 => "p99",
            Self::BackendsAvailable => "backends",
        }
    }

    /// Advance to the next sort key in `ALL`, wrapping at the end.
    pub fn cycle(self) -> Self {
        let idx = Self::ALL.iter().position(|k| *k == self).unwrap_or(0);
        Self::ALL[(idx + 1) % Self::ALL.len()]
    }
}

#[derive(Debug, Default)]
pub struct OverviewState {
    pub rps: SparkRing,
    pub latency_p99_ms: SparkRing,
    /// p99 of sozu's own `service_time` (request processing inside the
    /// proxy, NOT including the backend round-trip). Sparkline kept as
    /// raw milliseconds for direct comparison with `latency_p99_ms`.
    pub service_time_p99_ms: SparkRing,
    pub saturation_pct: SparkRing,
    /// Active session gauge (`http.active_requests` or fallback). Surfaced
    /// as a big numeral above the saturation sparkline.
    pub active_sessions: u64,
    /// `client.connections` gauge. Surfaced as a big numeral above the
    /// RPS sparkline.
    pub client_connections: u64,
}

impl Default for SparkRing {
    /// A ring sized for the standard sparkline window.
    fn default() -> Self {
        Self::new(SPARKLINE_DEPTH)
    }
}

impl App {
    /// Build the initial UI state: OVERVIEW tab, empty rings and maps,
    /// default thresholds and sort keys, and the dirty flag raised so the
    /// first frame always paints.
    pub fn new() -> Self {
        Self {
            active_tab: ActiveTab::Overview,
            overview: OverviewState::default(),
            events: VecDeque::with_capacity(EVENT_RING_DEPTH),
            thresholds: ThresholdTable::default(),
            last_snapshot_at: None,
            last_metrics: None,
            last_listeners: None,
            last_certs: None,
            status: String::new(),
            should_quit: false,
            help_visible: false,
            cluster_sort: ClusterSortKey::ErrorRate,
            cluster_sort_reverse: false,
            backend_sort: BackendSortKey::Bandwidth,
            backend_sort_reverse: false,
            pulse: PulseTracker::default(),
            glyphs: GlyphMode::Block,
            palette_open: false,
            palette_input: tui_input::Input::default(),
            palette_error: None,
            is_dirty: true,
            rates: RateCalculator::default(),
            cluster_rps: HashMap::new(),
            backend_rate_in_bps: HashMap::new(),
            backend_rate_out_bps: HashMap::new(),
            h2_trends: HashMap::new(),
            paused: false,
        }
    }

    /// Read-and-clear the dirty flag for the renderer. Returns `true` when
    /// the App folded new data or aged a pulse since the last frame; the
    /// renderer combines this with `PulseTracker::has_active()` to decide
    /// whether to repaint.
    pub fn take_dirty(&mut self) -> bool {
        std::mem::replace(&mut self.is_dirty, false)
    }

    /// Force the next frame to redraw. Used by the render loop when it
    /// drains an out-of-band status message (renewer error) into
    /// `App::status` and the operator needs the F-key bar repainted on
    /// the next tick even though no snapshot landed.
    pub fn mark_dirty(&mut self) {
        self.is_dirty = true;
    }

    /// Fold an inbound transport `Snapshot` into the ring buffers. Called by
    /// the render loop on every drain of the `crossbeam_channel`. Cheap (one
    /// linear scan over the proxy / cluster maps).
+ pub fn ingest_snapshot(&mut self, snap: &Snapshot) { + if self.paused { + // F5 / `p` freezes the visible state without dropping the + // transport lease. Skip the entire fold so sparklines and + // tables stay on the last frame instead of advancing. + return; + } + self.last_snapshot_at = Some(snap.received_at); + self.fold_overview(&snap.metrics, snap.received_at); + self.fold_h2_trends(&snap.metrics); + // Diff vs the last seen snapshot BEFORE replacing it, so the + // pulse tracker can emit ClusterDisappeared / BackendWentDown + // transitions against the previous set. + self.pulse.diff(&snap.metrics); + // Keep the latest `AggregatedMetrics` for the CLUSTERS / BACKENDS + // panes. Cloning isn't cheap for very-high-cardinality fleets + // (>1000 clusters), but in practice the master already paid this + // cost on the wire; revisit only if profile data justifies it. + self.last_metrics = Some(snap.metrics.clone()); + self.is_dirty = true; + } + + /// Push the latest sample for every H2-pane-tracked metric into its + /// SparkRing. Reads gauges and counters uniformly so flow-control + /// gauges and frame counters share one trend renderer. New keys are + /// allocated on first observation; the SparkRing rolls a 60-sample + /// window forward as snapshots arrive. + fn fold_h2_trends(&mut self, m: &AggregatedMetrics) { + for key in H2_TRACKED_KEYS { + let value = gauge_value(m.proxying.get(*key)) + .or_else(|| count_value(m.proxying.get(*key)).map(|c| c.max(0) as u64)) + .unwrap_or(0); + self.h2_trends + .entry(*key) + .or_insert_with(|| SparkRing::new(SPARKLINE_DEPTH)) + .push(value); + } + } + + /// Render the trend bars for an H2-pane metric. Returns "—" when no + /// samples have landed (cold start). Otherwise produces a string of + /// characters from the resolved `GlyphMode` alphabet, scaled to the + /// largest sample in the ring so a flat-line zero series prints as + /// the minimum-height character. 
+ pub fn h2_trend_bars(&self, key: &str) -> String { + match self.h2_trends.get(key) { + Some(ring) if !ring.is_empty() => { + render_spark_bars(ring.samples().copied(), self.glyphs.trend_alphabet()) + } + _ => "—".to_owned(), + } + } + + /// Advance the pulse tracker by one render frame. Called from the + /// render loop's draw step so pulses decay even when no new snapshot + /// arrived this frame. Sets the dirty flag when at least one pulse + /// was still active before the tick so the next frame paints the + /// fading tint instead of freezing it on screen. + pub fn tick_pulses(&mut self) { + if self.pulse.has_active() { + self.is_dirty = true; + } + self.pulse.tick(); + } + + /// Build the per-cluster summary rows for the CLUSTERS pane. Computed + /// on demand (renderer call) rather than cached so the sort + reverse + /// state can change between frames without a full re-fold. + pub fn cluster_rows(&self) -> Vec { + let metrics = match self.last_metrics.as_ref() { + Some(m) => m, + None => return Vec::new(), + }; + let mut rows: Vec = metrics + .clusters + .iter() + .map(|(id, cm)| { + let requests = cluster_count_total(cm, names::backend::REQUESTS); + let errors_5xx: i64 = ERRORS_5XX.iter().map(|k| cluster_count_total(cm, k)).sum(); + let p99_ms = cluster_p99_max(cm); + let p50_ms = cluster_p50_max(cm); + // Backends-up vs backends-total. Three reading paths, + // tried in order, so the column populates whatever + // metric_detail level or worker lifecycle stage the + // operator is in. + // + // 1. `cluster.total_backends` / `cluster.available_backends` + // cluster-level rollup gauges. These are the + // authoritative per-cluster aggregates published by + // `BackendMap::record_cluster_availability` whenever a + // backend registers, transitions, or first serves a + // request. Survives the hourly counter clear. + // 2. Per-backend `backend.available` gauge summed across + // `cm.backends[].metrics`. 
Populated under backend- + // detail filing whenever a backend up/down event + // fires. Independent of the cluster-level rollup so a + // fresh worker before the first `record_cluster_…` + // call still surfaces something. + // 3. `cm.backends.len()` as a last-resort total. Under + // backend-detail every backend that emitted any + // metric (bytes, response_time, requests) lands in + // this Vec, so the cardinality is a useful lower + // bound on "backends the worker has seen this + // minute" even before any cluster-level gauge + // arrives. + let rollup_total = + gauge_value(cm.cluster.get(names::cluster::TOTAL_BACKENDS)).unwrap_or(0) as u32; + let rollup_available = + gauge_value(cm.cluster.get(names::cluster::AVAILABLE_BACKENDS)).unwrap_or(0) + as u32; + let backend_available_sum: u32 = cm + .backends + .iter() + .filter_map(|b| gauge_value(b.metrics.get(names::backend::AVAILABLE))) + .map(|v| v as u32) + .sum(); + let backends_total = if rollup_total > 0 { + rollup_total + } else { + cm.backends.len() as u32 + }; + // Final fallback: if the rollup gauge hasn't been + // published yet and no per-backend gauge is + // present either, assume every backend the worker + // has observed is up. The first health-check + // failure / `record_cluster_availability` call + // refreshes this with the authoritative value. 
+ let backends_available = if rollup_total == 0 && backend_available_sum == 0 { + cm.backends.len() as u32 + } else { + rollup_available.max(backend_available_sum) + }; + let error_rate_pct = if requests > 0 { + (errors_5xx as f64 / requests as f64) * 100.0 + } else { + 0.0 + }; + ClusterRow { + cluster_id: id.clone(), + rps: self.cluster_rps.get(id).copied().unwrap_or(0), + error_rate_pct, + p50_ms, + p99_ms, + backends_total, + backends_available, + } + }) + .collect(); + rows.sort_by(|a, b| { + use std::cmp::Ordering; + let ord = match self.cluster_sort { + ClusterSortKey::ClusterId => a.cluster_id.cmp(&b.cluster_id), + ClusterSortKey::Rps => a.rps.cmp(&b.rps).reverse(), + ClusterSortKey::ErrorRate => a + .error_rate_pct + .partial_cmp(&b.error_rate_pct) + .unwrap_or(Ordering::Equal) + .reverse(), + ClusterSortKey::LatencyP99 => a.p99_ms.cmp(&b.p99_ms).reverse(), + ClusterSortKey::BackendsAvailable => { + a.backends_available.cmp(&b.backends_available) + } + }; + if self.cluster_sort_reverse { + ord.reverse() + } else { + ord + } + }); + rows + } + + /// Build the per-backend rows for the BACKENDS pane. Flattens every + /// `(cluster_id, BackendMetrics)` pair across the freshest snapshot. + /// Sorted per `backend_sort` / `backend_sort_reverse`. + pub fn backend_rows(&self) -> Vec { + let metrics = match self.last_metrics.as_ref() { + Some(m) => m, + None => return Vec::new(), + }; + let mut rows: Vec = Vec::new(); + for (cluster_id, cm) in &metrics.clusters { + for bm in &cm.backends { + // Per-second rate from the cumulative `BYTES_IN`/`OUT` + // counters. The rate map is populated by `fold_overview` + // on every snapshot so the lookup is constant-time and + // the row stays in lock-step with the freshest poll. 
+ let key = (cluster_id.clone(), bm.backend_id.clone()); + let bw_in_bps = self.backend_rate_in_bps.get(&key).copied().unwrap_or(0.0); + let bw_out_bps = self.backend_rate_out_bps.get(&key).copied().unwrap_or(0.0); + rows.push(BackendRow { + cluster_id: cluster_id.clone(), + backend_id: bm.backend_id.clone(), + bw_in_bps, + bw_out_bps, + connections: gauge_value( + bm.metrics.get(names::backend::CONNECTIONS_PER_BACKEND), + ) + .unwrap_or(0), + p50_ms: percentile_p50_ms(bm.metrics.get(names::backend::RESPONSE_TIME)) + .unwrap_or(0), + p99_ms: percentile_p99_ms(bm.metrics.get(names::backend::RESPONSE_TIME)) + .unwrap_or(0), + requests_total: count_value(bm.metrics.get(names::backend::REQUESTS)) + .unwrap_or(0) as u64, + }); + } + } + rows.sort_by(|a, b| { + use std::cmp::Ordering; + let ord = match self.backend_sort { + BackendSortKey::ClusterId => a + .cluster_id + .cmp(&b.cluster_id) + .then(a.backend_id.cmp(&b.backend_id)), + BackendSortKey::BackendId => a.backend_id.cmp(&b.backend_id), + BackendSortKey::Bandwidth => { + let abw = a.bw_in_bps + a.bw_out_bps; + let bbw = b.bw_in_bps + b.bw_out_bps; + abw.partial_cmp(&bbw).unwrap_or(Ordering::Equal).reverse() + } + BackendSortKey::Connections => a.connections.cmp(&b.connections).reverse(), + BackendSortKey::LatencyP99 => a.p99_ms.cmp(&b.p99_ms).reverse(), + BackendSortKey::Requests => a.requests_total.cmp(&b.requests_total).reverse(), + }; + // `Ordering::reverse` chains nicely with `.then()` above; if all + // primary keys tie we drop to (cluster_id, backend_id) lex order + // for a deterministic on-screen layout. The comparator MUST + // return `Equal` on full-tie; the reverse direction is applied + // by `rows.reverse()` after the sort. Returning `Greater` on + // self-comparison would violate strict-weak-ordering. 
+ ord.then_with(|| a.cluster_id.cmp(&b.cluster_id)) + .then_with(|| a.backend_id.cmp(&b.backend_id)) + }); + if self.backend_sort_reverse { + rows.reverse(); + } + rows + } + + fn fold_overview(&mut self, m: &AggregatedMetrics, sampled_at: Instant) { + // Sōzu emits the canonical per-request counter (`names::backend::REQUESTS`) + // from `record_backend_metrics!`. The cardinality knob routes the + // counter to either `cm.cluster[REQUESTS]` (detail = cluster, default) + // or `cm.backends[i].metrics[REQUESTS]` (detail = backend, the level + // `sozu top` auto-leases on startup). `cluster_count_total` handles + // both filings transparently so the OVERVIEW stays correct regardless + // of the active detail level. + let mut total_requests: i64 = 0; + let mut total_requests_observed: i64 = 0; + // Per-cluster RPS via the same RateCalculator. Stored on + // `cluster_rps` so the CLUSTERS pane can render rates without + // re-deriving from the cumulative counter. Keys are namespaced + // `__cluster..requests` to keep them separate from the + // overview-aggregate keys. + self.cluster_rps.clear(); + self.backend_rate_in_bps.clear(); + self.backend_rate_out_bps.clear(); + let mut live_rate_keys: HashSet = HashSet::new(); + live_rate_keys.insert(OVERVIEW_REQUESTS_KEY.to_owned()); + for (id, cm) in &m.clusters { + let cluster_requests = cluster_count_total(cm, names::backend::REQUESTS); + if cluster_requests > 0 { + total_requests = total_requests.saturating_add(cluster_requests); + total_requests_observed = total_requests_observed.saturating_add(cluster_requests); + } + let key = cluster_rate_key(id); + let rate = self + .rates + .record(&key, cluster_requests, sampled_at) + .unwrap_or(0.0) + .max(0.0); + self.cluster_rps.insert(id.clone(), rate as u64); + live_rate_keys.insert(key); + // Per-backend bytes-in / bytes-out rates. 
`record_backend_metrics!` + // (`lib/src/metrics/mod.rs`) emits the cumulative byte counters + // per (cluster, backend) at request end; the RateCalculator + // turns those cumulative counts into a per-second delta the + // BACKENDS pane renders as Mbps. First observation returns + // `None` so the row prints `0.00 / 0.00` until the second + // snapshot lands — matches the cluster_rps shape. + for bm in &cm.backends { + let bid = &bm.backend_id; + let bytes_in = count_value(bm.metrics.get(names::backend::BYTES_IN)).unwrap_or(0); + let bytes_out = count_value(bm.metrics.get(names::backend::BYTES_OUT)).unwrap_or(0); + let in_key = backend_rate_key(id, bid, "bytes_in"); + let out_key = backend_rate_key(id, bid, "bytes_out"); + let rate_in = self + .rates + .record(&in_key, bytes_in, sampled_at) + .unwrap_or(0.0) + .max(0.0); + let rate_out = self + .rates + .record(&out_key, bytes_out, sampled_at) + .unwrap_or(0.0) + .max(0.0); + let map_key = (id.clone(), bid.clone()); + self.backend_rate_in_bps.insert(map_key.clone(), rate_in); + self.backend_rate_out_bps.insert(map_key, rate_out); + live_rate_keys.insert(in_key); + live_rate_keys.insert(out_key); + } + } + // Drop history entries for clusters / backends that disappeared + // this tick; otherwise `RateCalculator.history` would accumulate + // stale `(prev_value, prev_at)` for every (cluster, backend) + // tuple ever seen. + self.rates.retain(|k| live_rate_keys.contains(k)); + // Final fallback when no per-cluster counter is populated yet (no + // backend round-trip completed since the worker started, OR + // `metric_detail = process` strips both labels). `http.requests` is + // incremented at request-receive time without cluster_id, so it + // surfaces traffic the user is generating even before the first + // response cycle finishes. 
+ if total_requests_observed == 0 { + if let Some(v) = count_value(m.proxying.get(names::http::REQUESTS)) { + total_requests = total_requests.saturating_add(v); + } + } + + // Per-second rate from the cumulative counter. The first observation + // returns `None` and shows as 0 in the ring (nothing useful to plot + // before we have a baseline). + let rps = self + .rates + .record(OVERVIEW_REQUESTS_KEY, total_requests, sampled_at) + .unwrap_or(0.0) + .max(0.0); + self.overview.rps.push(rps as u64); + + // Sozu's own request-processing time (`service_time`, distinct + // from `backend_response_time`). p99 milliseconds matches the + // shape of the LATENCY p99 cell so operators can compare the + // two at a glance. + let service_p99 = + percentile_p99_ms(m.proxying.get(names::event_loop::SERVICE_TIME)).unwrap_or(0); + self.overview.service_time_p99_ms.push(service_p99); + + // Latency p99 — sum of cluster `backend_response_time` percentiles + // is not meaningful (you cannot average percentiles), so we take + // the max p99 across clusters. Operators reading the OVERVIEW want + // "is anyone slow?" not "average latency". `cluster_p99_max` also + // walks per-backend filings so the answer holds under any detail + // level. + let max_p99_ms = m.clusters.values().map(cluster_p99_max).max().unwrap_or(0); + self.overview.latency_p99_ms.push(max_p99_ms); + + // Saturation: prefer `slab.usage_percent`; fall back to + // `client.connections` / `client.connections_max` ratio when the + // gauge isn't surfaced. Both are gauges so they survive the hourly + // counter clear unchanged. 
+ let saturation = gauge_value(m.proxying.get(names::slab::USAGE_PERCENT)) + .or_else(|| gauge_value(m.proxying.get(names::buffer::USAGE_PERCENT))) + .map(|v| v.min(100)) + .unwrap_or(0); + self.overview.saturation_pct.push(saturation as u64); + + self.overview.active_sessions = + gauge_value(m.proxying.get(names::http::ACTIVE_REQUESTS)).unwrap_or(0) as u64; + self.overview.client_connections = + gauge_value(m.proxying.get(names::client::CONNECTIONS)).unwrap_or(0) as u64; + } + + /// Append a transport-published `TopEvent` into the recent-events ring. + /// Drops the oldest when capacity is reached so the UI shows a sliding + /// window of the last `EVENT_RING_DEPTH` events. + pub fn ingest_event(&mut self, event: TopEvent) { + if self.events.len() == EVENT_RING_DEPTH { + self.events.pop_front(); + } + self.events.push_back(event); + self.is_dirty = true; + } + + /// Replace the cached listener inventory with a fresh snapshot. + pub fn ingest_listeners(&mut self, snap: ListenersSnapshot) { + self.last_listeners = Some(snap); + self.is_dirty = true; + } + + /// Replace the cached certificate inventory with a fresh snapshot. + pub fn ingest_certs(&mut self, snap: CertsSnapshot) { + self.last_certs = Some(snap); + self.is_dirty = true; + } + + /// Open the colon palette and clear any pending error. Called by the + /// renderer when the operator presses `:`. Marks the App dirty so the + /// render loop's dirty-gate paints the palette line on the next frame + /// instead of waiting for the next snapshot tick (~1 s on a quiet + /// system). + pub fn open_palette(&mut self) { + self.palette_open = true; + self.palette_input = tui_input::Input::default(); + self.palette_error = None; + self.is_dirty = true; + } + + /// Close the palette without applying the in-progress command. + /// Called on Escape / Ctrl-C while the palette is open. Marks dirty + /// so the F-key bar redraws over the dismissed palette immediately. 
+ pub fn cancel_palette(&mut self) { + self.palette_open = false; + self.palette_input = tui_input::Input::default(); + self.is_dirty = true; + } + + /// Apply the in-progress palette command. Recognised commands: + /// + /// - `:overview` / `:o` — jump to OVERVIEW. + /// - `:cluster` / `:clusters` / `:c` — jump to CLUSTERS. + /// - `:backend` / `:backends` / `:b` — jump to BACKENDS. + /// - `:listener` / `:listeners` / `:l` — jump to LISTENERS. + /// - `:cert` / `:certs` — jump to CERTS. + /// - `:h2` — jump to H2. + /// - `:event` / `:events` / `:e` — jump to EVENTS. + /// - `:help` / `:h` / `:?` — toggle help. + /// - `:quit` / `:q` — exit cleanly. + /// + /// Unknown commands flip `palette_error`; the renderer surfaces + /// the message on the function-key bar. + pub fn apply_palette(&mut self) { + let raw = self.palette_input.value().trim().to_owned(); + let cmd = raw.trim_start_matches(':'); + if let Some(tab) = ActiveTab::from_alias(cmd) { + self.active_tab = tab; + } else { + match cmd { + "help" | "h" | "?" => self.help_visible = !self.help_visible, + "quit" | "q" => self.should_quit = true, + "" => {} // empty command — just close the palette + other => { + self.palette_error = Some(format!("unknown command: :{other}")); + self.palette_open = false; + // Clear the input on the unknown-command path too so + // the next `:` keypress opens a fresh palette rather + // than re-populating with the operator's previous + // typo. Mirrors the success path (line below). + self.palette_input = tui_input::Input::default(); + // palette_error is rendered on the F-key bar; the + // dirty-gate would otherwise hide the message until + // the next snapshot or pulse-tick. 
+ self.is_dirty = true; + return; + } + } + } + self.palette_open = false; + self.palette_input = tui_input::Input::default(); + self.palette_error = None; + // Apply always mutates visible state (active_tab / help_visible / + // should_quit / palette_error), so the next frame must repaint + // regardless of which branch ran above. + self.is_dirty = true; + } +} + +impl Default for App { + fn default() -> Self { + Self::new() + } +} + +pub(super) fn count_value(metric: Option<&FilteredMetrics>) -> Option { + let inner = metric?.inner.as_ref()?; + match inner { + filtered_metrics::Inner::Count(v) => Some(*v), + _ => None, + } +} + +pub(super) fn gauge_value(metric: Option<&FilteredMetrics>) -> Option { + let inner = metric?.inner.as_ref()?; + match inner { + filtered_metrics::Inner::Gauge(v) => Some(*v), + _ => None, + } +} + +fn percentile_p99_ms(metric: Option<&FilteredMetrics>) -> Option { + let inner = metric?.inner.as_ref()?; + match inner { + filtered_metrics::Inner::Percentiles(p) => Some(p.p_99), + _ => None, + } +} + +fn percentile_p50_ms(metric: Option<&FilteredMetrics>) -> Option { + let inner = metric?.inner.as_ref()?; + match inner { + filtered_metrics::Inner::Percentiles(p) => Some(p.p_50), + _ => None, + } +} + +/// Sum a count counter across both filings of the cardinality knob: the +/// cluster-level entry (`metric_detail = cluster`, default) AND the +/// per-backend entries (`metric_detail = backend`, the level the TUI +/// auto-elevates to on startup). The two filings are disjoint under any +/// given detail level — only one is populated at a time — so summing +/// both yields the correct cluster-wide total in either configuration. 
+fn cluster_count_total(cm: &ClusterMetrics, key: &str) -> i64 { + let cluster_level = count_value(cm.cluster.get(key)).unwrap_or(0); + let backend_sum: i64 = cm + .backends + .iter() + .filter_map(|b| count_value(b.metrics.get(key))) + .sum(); + cluster_level.saturating_add(backend_sum) +} + +/// Max of the cluster-level percentile and every per-backend percentile. +/// Under cluster-detail the backends list is empty; under backend-detail +/// only the per-backend entries are populated. Taking the max matches +/// "is anyone slow?" — the operator-facing intent of the OVERVIEW pane. +fn cluster_p99_max(cm: &ClusterMetrics) -> u64 { + let cluster_level = percentile_p99_ms(cm.cluster.get(names::backend::RESPONSE_TIME)); + let backend_max = cm + .backends + .iter() + .filter_map(|b| percentile_p99_ms(b.metrics.get(names::backend::RESPONSE_TIME))) + .max(); + cluster_level + .into_iter() + .chain(backend_max) + .max() + .unwrap_or(0) +} + +fn cluster_p50_max(cm: &ClusterMetrics) -> u64 { + let cluster_level = percentile_p50_ms(cm.cluster.get(names::backend::RESPONSE_TIME)); + let backend_max = cm + .backends + .iter() + .filter_map(|b| percentile_p50_ms(b.metrics.get(names::backend::RESPONSE_TIME))) + .max(); + cluster_level + .into_iter() + .chain(backend_max) + .max() + .unwrap_or(0) +} + +/// Per-cluster row produced by `App::cluster_rows`. Pure data; the renderer +/// turns this into a `Row` widget. Pulse-tint state comes in week 3. +#[derive(Debug, Clone)] +pub struct ClusterRow { + pub cluster_id: String, + /// Per-cluster requests-per-second derived once on `ingest_snapshot` + /// via the shared `RateCalculator` and cached on `App.cluster_rps`. + /// `0` until two snapshots have landed for this cluster. + pub rps: u64, + pub error_rate_pct: f64, + pub p50_ms: u64, + pub p99_ms: u64, + pub backends_total: u32, + pub backends_available: u32, +} + +/// Per-backend row produced by `App::backend_rows`. 
Flattens every +/// `(cluster_id, BackendMetrics)` pair into a single sortable list. +#[derive(Debug, Clone)] +pub struct BackendRow { + pub cluster_id: String, + pub backend_id: String, + /// Per-second bytes received from the backend (`BYTES_IN`). The + /// renderer scales `× 8` to bits/sec and formats with a Kbps / + /// Mbps / Gbps suffix. `0.0` on the first observation and after + /// the hourly counter clear (the RateCalculator emits `Some(0.0)` + /// on monotonic decrease rather than a negative spike). + pub bw_in_bps: f64, + /// Per-second bytes sent to the backend (`BYTES_OUT`). + pub bw_out_bps: f64, + pub connections: u64, + pub p50_ms: u64, + pub p99_ms: u64, + pub requests_total: u64, +} + +/// Sort columns for the BACKENDS pane. Default `Bandwidth` (back_bytes_out +/// — the most operationally-loaded backend at the top). +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum BackendSortKey { + ClusterId, + BackendId, + Bandwidth, + Connections, + LatencyP99, + Requests, +} + +impl BackendSortKey { + pub const ALL: &'static [Self] = &[ + Self::Bandwidth, + Self::LatencyP99, + Self::Connections, + Self::Requests, + Self::ClusterId, + Self::BackendId, + ]; + + pub fn label(self) -> &'static str { + match self { + Self::ClusterId => "cluster", + Self::BackendId => "backend", + Self::Bandwidth => "bw", + Self::Connections => "conn", + Self::LatencyP99 => "p99", + Self::Requests => "req", + } + } + + pub fn cycle(self) -> Self { + let idx = Self::ALL.iter().position(|k| *k == self).unwrap_or(0); + Self::ALL[(idx + 1) % Self::ALL.len()] + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn spark_ring_drops_oldest_at_capacity() { + let mut r = SparkRing::new(3); + r.push(1); + r.push(2); + r.push(3); + r.push(4); + assert_eq!(r.to_vec(), vec![2, 3, 4]); + assert_eq!(r.last(), Some(4)); + assert_eq!(r.max(), 4); + } + + #[test] + fn rate_calculator_first_observation_returns_none() { + let mut rc = RateCalculator::default(); + let now = 
Instant::now(); + assert!(rc.record("k", 100, now).is_none()); + } + + #[test] + fn rate_calculator_handles_monotonic_increase() { + let mut rc = RateCalculator::default(); + let t0 = Instant::now(); + let _ = rc.record("k", 100, t0); + let t1 = t0 + std::time::Duration::from_secs(1); + let r = rc.record("k", 150, t1).unwrap(); + assert!((r - 50.0).abs() < 0.001); + } + + #[test] + fn rate_calculator_emits_zero_on_hourly_reset() { + // Sōzu's `LocalDrain::clear` drops `Count`/`Time` every hour; a + // counter going backwards must not produce a negative rate. + let mut rc = RateCalculator::default(); + let t0 = Instant::now(); + let _ = rc.record("k", 500, t0); + let t1 = t0 + std::time::Duration::from_secs(1); + let r = rc.record("k", 10, t1).unwrap(); + assert_eq!(r, 0.0); + } + + #[test] + fn active_tab_round_trips_digits_and_cycle() { + assert_eq!(ActiveTab::from_digit(1), Some(ActiveTab::Overview)); + assert_eq!(ActiveTab::from_digit(7), Some(ActiveTab::Events)); + assert_eq!(ActiveTab::from_digit(8), None); + assert_eq!(ActiveTab::Overview.cycle(true), ActiveTab::Clusters); + assert_eq!(ActiveTab::Overview.cycle(false), ActiveTab::Events); + } +} diff --git a/bin/src/ctl/top/cardinality.rs b/bin/src/ctl/top/cardinality.rs new file mode 100644 index 000000000..dc2087e74 --- /dev/null +++ b/bin/src/ctl/top/cardinality.rs @@ -0,0 +1,601 @@ +//! Runtime cardinality lease lifecycle for `sozu top`. +//! +//! On startup the TUI elevates the metrics drain to `MetricDetail::Backend` +//! via the `SetMetricDetail` proto verb. The lease is `client_id`-keyed +//! with a configurable TTL; a renewer thread re-sends every `ttl/2` seconds +//! so the lease stays alive while the TUI runs. On Drop (clean shutdown, +//! panic, SIGINT/SIGTERM via `ctrlc::set_handler` registered by the +//! renderer) we send a best-effort `clear: true` revoke. Crash safety: the +//! lease self-expires server-side after `ttl_seconds` so a dead `sozu top` +//! never permanently elevates cardinality. 
+//! +//! # Single-owner topology +//! +//! One owner thread owns the `Channel<Request, Response>`. Every write — +//! initial apply, periodic renew, and final clear — flows through that +//! thread via a `crossbeam_channel::Sender<DetailRequest>`. The renewer +//! thread holds only the sender clone and a stop flag; it owns no +//! `Channel` of its own. +//! +//! Why: the master stamps a fresh `peer_session_ulid` on every +//! `ClientSession` it accepts. The earlier design opened one `Channel` for +//! the apply path and a separate `Channel` for the renewer thread, so the +//! renewer's writes carried a *different* session ulid than the apply. +//! The worker's `PeerBinding` table was rebound on each renewal, with two +//! consequences this module structurally prevents now: +//! +//! 1. After the first renewal (~ttl/2 seconds), the apply channel's own +//! `clear`-on-Drop failed as `Unauthorized` because the lease's bound +//! `peer_session_ulid` no longer matched the apply channel. The TUI +//! could not revoke its own lease; the worker had to wait for TTL +//! expiry on every exit. +//! 2. A same-UID actor who guessed the `client_id` format +//! (`top:<pid>:<8 hex>`; PID observable, 8 hex needs ~4 billion +//! attempts) could send their own renewal from a separate session, +//! overwrite the binding, and then clear the lease. The peer-credential +//! binding's stated purpose is exactly to stop same-UID actors from +//! clearing another lease; the renewal path defeated that promise. +//! +//! Routing every write through one owner thread means the worker always +//! sees the same `peer_session_ulid` for a given guard, so binding +//! overwrite from the legitimate path is structurally impossible. 
+ +use std::process; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::{Arc, Mutex}; +use std::thread::JoinHandle; +use std::time::Duration; + +use crossbeam_channel::{Receiver, Sender, after, bounded, select}; +use sozu_command_lib::{ + channel::Channel, + config::Config, + proto::command::{ + MetricDetail, Request, Response, ResponseStatus, SetMetricDetail, request::RequestType, + }, +}; + +use crate::cli::TopDetail; +use crate::ctl::create_channel; + +use super::CtlError; + +/// Shared status slot used by background threads (renewer, transport +/// collectors) to surface error / degraded-mode notes to the operator. +/// The render loop drains it once per tick and copies the most recent +/// message into `App::status` so the F-key bar shows it on the next +/// frame. Wrapped in `Arc>` because the writers (background +/// threads) and the reader (render-loop thread) live in different +/// scheduling contexts; contention is rare (one write on error, one +/// read per tick) so the lock is uncontended in practice. +pub type StatusSlot = Arc>>; + +/// Build an empty shared status slot. +pub fn new_status_slot() -> StatusSlot { + Arc::new(Mutex::new(None)) +} + +/// Atomically take the latest pending status message, if any. Used by +/// the render loop's per-tick drain. Returns `None` when no background +/// thread has published since the last drain. Silently passes a poisoned +/// lock through `into_inner` — a poisoned mutex here means a background +/// thread panicked while holding it, and we want to surface the residual +/// message rather than swallow it. +pub fn take_status(slot: &StatusSlot) -> Option { + match slot.lock() { + Ok(mut g) => g.take(), + Err(poison) => poison.into_inner().take(), + } +} + +/// Publish a status message from a background thread. 
Drops the +/// previous message if it had not yet been drained — render-loop +/// cadence (~30 fps) is much faster than realistic background-thread +/// error rates so overwriting is the right policy. A poisoned lock is +/// recovered the same way as `take_status`. +pub(super) fn publish_status(slot: &StatusSlot, msg: String) { + let mut g = match slot.lock() { + Ok(g) => g, + Err(poison) => poison.into_inner(), + }; + *g = Some(msg); +} + +/// Owner-thread mailbox messages. The owner thread holds the one +/// `Channel` for this guard; every write goes +/// through one of these variants. See module docs for the rationale. +enum DetailRequest { + /// Initial apply. The reply oneshot carries the outcome so + /// `DetailGuard::apply` can fail fast and surface the error to the + /// caller (worker too old, transport rejected the verb, …) before + /// returning the guard. + Apply { reply: Sender> }, + /// Periodic renew, sent by the renewer thread once per `ttl/2`. + /// The owner thread reconstructs the request from its cached + /// `client_id` / `detail` / `ttl_seconds` / `reason`. On failure + /// the owner publishes a status note and exits; that breaks the + /// renewer's send loop too (its sender clone fails). + Renew, + /// Best-effort revoke, sent by `Drop`. The owner attempts the + /// write and ignores the outcome — TTL expiry is the backstop. + Clear, + /// Graceful owner-thread exit, sent by `Drop` after `Clear`. + Stop, +} + +/// RAII guard that holds a runtime cardinality lease while the TUI runs. +/// Drop clears the lease (best-effort) so the worker drops back to its +/// configured floor. Crash-safe: even if Drop never runs, the lease +/// self-expires after `ttl_seconds`. +pub struct DetailGuard { + /// Mailbox to the owner thread. `Apply` / `Renew` / `Clear` / `Stop` + /// all flow through this one sender so every wire-level write is + /// emitted on a single `Channel` connection (and therefore a single + /// master-assigned `peer_session_ulid`). 
See module docs. + tx: Sender, + /// Owner-thread join handle. Joined on Drop after `Stop` is sent. + owner_handle: Option>, + /// Renewer-thread stop flag. The renewer checks this before each + /// send so a Drop racing with the timer's wake-up does not emit a + /// stale `Renew`. + renewer_stop: Arc, + /// Fast-wake signal for the renewer thread. The renewer waits on + /// `select!(after(ttl/2), recv(renewer_wake_rx))`; dropping the + /// sender wakes it immediately so Drop does not block for `ttl/2`. + renewer_wake_tx: Option>, + /// Renewer-thread join handle. + renewer_handle: Option>, + /// Stable identifier for this `sozu top` instance, of the shape + /// `top::`. Kept for any debug surface; the owner + /// thread caches its own copy for request construction. + #[allow(dead_code)] + client_id: String, + /// Shared status slot the renewer thread publishes degraded-mode + /// messages into. Stored on the guard so it stays alive for the + /// background threads' lifetime; the render loop drains it via the + /// free `take_status` function once per tick. + #[allow(dead_code)] + status: StatusSlot, +} + +impl DetailGuard { + /// Open a fresh `Channel` to the master, hand it to a single owner + /// thread, send the initial `SetMetricDetail` apply over that + /// thread, and spawn the renewer. Returns `Ok` once the master + /// acknowledges the apply; if the master rejects (e.g. mixed-version + /// fleet without the verb) `Err` is returned and the caller shows + /// the "lease unsupported" warning in the status bar. The `status` + /// slot is shared with the render loop so the renewer can surface + /// degraded-mode messages without writing to the wiped alt-screen. 
+ pub fn apply( + config: &Config, + detail: TopDetail, + ttl_seconds: u32, + reason: impl Into, + status: StatusSlot, + ) -> Result { + let client_id = format!("top:{}:{}", process::id(), short_random_suffix()); + let proto_detail = match detail { + TopDetail::Process => MetricDetail::DetailProcess, + TopDetail::Frontend => MetricDetail::DetailFrontend, + TopDetail::Cluster => MetricDetail::DetailCluster, + TopDetail::Backend => MetricDetail::DetailBackend, + }; + let reason = reason.into(); + // One `Channel` connection for this guard's entire lifetime. + // Handed off to the owner thread, which is the sole writer. + let channel = create_channel(config)?; + + // Mailbox to the owner thread (apply / renew / clear / stop). + // Unbounded: traffic is one apply at startup, one renew every + // ttl/2, one clear + one stop at drop. The renewer never + // out-paces the owner. + let (tx, rx) = crossbeam_channel::unbounded::(); + + let owner_handle = spawn_owner( + channel, + rx, + client_id.clone(), + proto_detail, + ttl_seconds, + reason.clone(), + Arc::clone(&status), + ); + + // Initial apply: send through the mailbox and wait for the + // owner's reply so the caller observes the master's verdict + // before we return the guard. + let (apply_reply_tx, apply_reply_rx) = bounded::>(1); + if tx + .send(DetailRequest::Apply { + reply: apply_reply_tx, + }) + .is_err() + { + // Owner thread refused the message — it has already exited. + // Wait for the handle so we surface its outcome cleanly, + // then return a transport error. 
+ let _ = owner_handle.join(); + return Err(CtlError::WriteRequest( + sozu_command_lib::channel::ChannelError::Connection(None), + )); + } + let apply_result = match apply_reply_rx.recv() { + Ok(r) => r, + Err(_) => { + let _ = owner_handle.join(); + return Err(CtlError::WriteRequest( + sozu_command_lib::channel::ChannelError::Connection(None), + )); + } + }; + if let Err(e) = apply_result { + // Owner thread exits on apply failure; reap it so we don't + // leave a zombie behind. + let _ = owner_handle.join(); + return Err(e); + } + + // Spawn the renewer now that the apply succeeded. + let renewer_stop = Arc::new(AtomicBool::new(false)); + let (renewer_wake_tx, renewer_wake_rx) = bounded::<()>(0); + let renewer_handle = spawn_renewer( + tx.clone(), + ttl_seconds, + Arc::clone(&renewer_stop), + renewer_wake_rx, + ); + + Ok(Self { + tx, + owner_handle: Some(owner_handle), + renewer_stop, + renewer_wake_tx: Some(renewer_wake_tx), + renewer_handle: Some(renewer_handle), + client_id, + status, + }) + } +} + +impl Drop for DetailGuard { + fn drop(&mut self) { + // 1. Stop the renewer before issuing the final write so its + // next tick cannot enqueue a `Renew` that would race the + // `Clear`. The atomic short-circuits the post-wake send; + // dropping the wake sender breaks the select! sleep so the + // renewer exits in <1 ms instead of waiting for ttl/2. + self.renewer_stop.store(true, Ordering::Relaxed); + drop(self.renewer_wake_tx.take()); + if let Some(handle) = self.renewer_handle.take() { + let _ = handle.join(); + } + + // 2. Best-effort revoke over the same owner-thread channel + // that did the apply. Sender errors are ignored: if the + // owner has already exited (apply or renew failed before + // Drop), TTL expiry is the backstop. + let _ = self.tx.send(DetailRequest::Clear); + + // 3. Graceful owner exit. 
+ let _ = self.tx.send(DetailRequest::Stop); + if let Some(handle) = self.owner_handle.take() { + let _ = handle.join(); + } + } +} + +/// Spawn the owner thread. It owns the `Channel`, caches the lease +/// parameters, and dispatches `DetailRequest`s into wire-level +/// `SetMetricDetail` writes. +fn spawn_owner( + mut channel: Channel, + rx: Receiver, + client_id: String, + detail: MetricDetail, + ttl_seconds: u32, + reason: String, + status: StatusSlot, +) -> JoinHandle<()> { + std::thread::Builder::new() + .name("sozu-top-detail-owner".into()) + .spawn(move || { + while let Ok(msg) = rx.recv() { + match msg { + DetailRequest::Apply { reply } => { + let result = send_set_detail( + &mut channel, + &client_id, + Some(detail), + Some(ttl_seconds), + Some(&reason), + false, + ); + let failed = result.is_err(); + // Reply may be dropped if `apply` was cancelled + // (the caller already returned). Swallow. + let _ = reply.send(result); + if failed { + // The initial apply is load-bearing; if it + // failed there is no point keeping the + // owner alive. The renewer is not spawned + // yet (apply gates renewer spawn), so this + // is a clean exit. + return; + } + } + DetailRequest::Renew => { + if let Err(e) = send_set_detail( + &mut channel, + &client_id, + Some(detail), + Some(ttl_seconds), + Some(&format!("{reason} (renew)")), + false, + ) { + publish_status( + &status, + format!( + "renewer dropped: {e}; cardinality lapses in ≤ {ttl_seconds}s" + ), + ); + return; + } + } + DetailRequest::Clear => { + // Best-effort: server-side TTL expiry covers + // dropped revokes. We do not surface failures + // through the status slot because the TUI is + // already tearing down by this point. + let _ = send_set_detail( + &mut channel, + &client_id, + None, + None, + Some(&format!("{reason} (clear)")), + true, + ); + } + DetailRequest::Stop => return, + } + } + // Sender dropped without sending Stop: treat as implicit + // shutdown. 
+ }) + .expect("spawn sozu-top owner") +} + +/// Spawn the renewer thread. It holds only a sender clone and a stop +/// flag — no `Channel` of its own. Every renewal write traverses the +/// owner thread, which preserves the apply-time `peer_session_ulid` and +/// therefore the worker's `PeerBinding`. +fn spawn_renewer( + tx: Sender, + ttl_seconds: u32, + stop: Arc, + wake_rx: Receiver<()>, +) -> JoinHandle<()> { + let renew_after = Duration::from_secs((ttl_seconds.max(2) / 2) as u64); + std::thread::Builder::new() + .name("sozu-top-detail-renewer".into()) + .spawn(move || { + loop { + let timer = after(renew_after); + select! { + recv(timer) -> _ => { + // Post-wake stop check: Drop may have set the + // flag between the timer's wake and our send. + if stop.load(Ordering::Relaxed) { + return; + } + if tx.send(DetailRequest::Renew).is_err() { + // Owner thread has exited (apply / renew + // failure, or Drop completed). Nothing + // more to do here; status was already + // published by the owner. + return; + } + } + recv(wake_rx) -> _ => { + // Wake-channel sender dropped (Drop path). + // The atomic is already set; exit cleanly. + return; + } + } + } + }) + .expect("spawn sozu-top renewer") +} + +fn send_set_detail( + channel: &mut Channel, + client_id: &str, + detail: Option, + ttl_seconds: Option, + reason: Option<&str>, + clear: bool, +) -> Result<(), CtlError> { + let req = Request { + request_type: Some(RequestType::SetMetricDetail(SetMetricDetail { + client_id: client_id.to_owned(), + detail: detail.map(|d| d as i32), + ttl_seconds, + clear: Some(clear), + reason: reason.map(|r| r.to_owned()), + // Master-populated fields; clients leave them empty. The + // master fills them in `worker_request` before fan-out from + // the connecting `ClientSession`. + peer_pid: None, + peer_session_ulid: None, + })), + }; + channel + .write_message(&req) + .map_err(CtlError::WriteRequest)?; + // Drain processing replies until the terminal Ok/Failure. 
SetMetricDetail + // is a quick fan-out; 5 s gives enough headroom for a slow worker. + loop { + let resp = channel + .read_message_blocking_timeout(Some(Duration::from_secs(5))) + .map_err(CtlError::ReadBlocking)?; + match resp.status() { + ResponseStatus::Processing => continue, + ResponseStatus::Failure => return Err(CtlError::WrongResponse(resp)), + ResponseStatus::Ok => return Ok(()), + } + } +} + +/// 8 hex chars used as the random portion of the lease `client_id`. On +/// Linux uses the `getrandom(2)` syscall directly via the `libc` crate +/// (already in the workspace), which is non-blocking, has no fs +/// dependency, and surfaces failure modes (`EAGAIN` while the entropy +/// pool is uninitialised, `ENOSYS` on ancient kernels) as a `-1` return. +/// On non-Linux Unix targets we fall back to a `/dev/urandom` read; +/// `getrandom`'s shape is OS-specific (FreeBSD: `getrandom(2)`, OpenBSD: +/// `getentropy(2)`, macOS: `SecRandomCopyBytes`) and the fs path is the +/// portable lowest common denominator. +/// +/// Endianness: we use `u32::from_le_bytes` for cross-arch reproducibility +/// of the rendered hex, independent of which source actually delivered +/// the bytes. +/// +/// On total entropy failure (`getrandom` returned `-1` AND the +/// `/dev/urandom` read failed) the function falls back to +/// `SystemTime::now().subsec_nanos()` and the caller observes the +/// degraded mode via the `app.status` line surfaced by `DetailGuard`. +/// Cryptographic strength is not required — the value only needs to be +/// unguessable enough to avoid lease-id collisions across concurrent +/// `sozu top` instances on the same host. 
+fn short_random_suffix() -> String { + let mut buf = [0u8; 4]; + if read_csprng_bytes(&mut buf) { + let n = u32::from_le_bytes(buf); + return format!("{n:08x}"); + } + use std::time::{SystemTime, UNIX_EPOCH}; + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .map(|d| d.subsec_nanos()) + .unwrap_or(0); + format!("{nanos:08x}") +} + +/// Fill `buf` from the kernel CSPRNG. Returns `true` on success, `false` +/// on any error so the caller can fall through to the `subsec_nanos` +/// fallback. +/// +/// Linux: `libc::getrandom(buf, len, GRND_NONBLOCK)`. The flag asks the +/// kernel to return `EAGAIN` rather than block when the entropy pool is +/// not yet initialised — extraordinarily rare on real hosts but matters +/// inside fresh containers and at boot. We treat any short read or +/// negative return as failure and fall through. +/// +/// Non-Linux Unix targets (macOS / *BSD): `getrandom(2)` exists under +/// different ABIs (e.g. OpenBSD's `getentropy(2)` caps at 256 bytes; +/// FreeBSD's `getrandom(2)` has the same signature as Linux's but +/// belongs to `` rather than ``). For +/// portability across the platforms Sōzu builds on, fall back to a +/// `/dev/urandom` read — present and readable on every supported +/// non-Linux Unix target. +fn read_csprng_bytes(buf: &mut [u8]) -> bool { + #[cfg(target_os = "linux")] + { + // SAFETY: `libc::getrandom` accepts a mutable byte pointer + length + // and writes up to `len` bytes. We pass our owned `buf`'s pointer + // and full length; both are valid for the duration of the call. + // The `GRND_NONBLOCK` flag is `0x0001`, well-defined on Linux. + let ret = unsafe { + libc::getrandom( + buf.as_mut_ptr().cast::(), + buf.len(), + libc::GRND_NONBLOCK, + ) + }; + if ret as usize == buf.len() { + return true; + } + // fall through to `/dev/urandom` below; some kernels (very old + // 3.x or seccomp-restricted sandboxes) refuse the syscall. 
+ } + use std::io::Read; + if let Ok(mut f) = std::fs::File::open("/dev/urandom") + && f.read_exact(buf).is_ok() + { + return true; + } + false +} + +#[cfg(test)] +mod tests { + //! Mailbox-level tests for the single-owner topology. We cannot run + //! the owner thread end-to-end without a live master, so the tests + //! here exercise the renewer cadence and the Drop wake-up shape + //! against the public mailbox surface. The invariant "every wire + //! write traverses one `Channel` connection" is enforced + //! structurally — there is no second `create_channel` call site for + //! the renewer to reach — so it does not require a behavioural test + //! beyond reading the code at `spawn_renewer`. + use super::*; + use std::time::Instant; + + /// `ttl_seconds = 2` minimum (clamped by `ttl_seconds.max(2)`) yields + /// a 1 s renewer cadence. The first `Renew` should land on the + /// mailbox in roughly 1 s — we accept a wide upper bound (3 s) for + /// CI scheduling noise. + #[test] + fn renewer_sends_renew_after_ttl_half() { + let (tx, rx) = crossbeam_channel::unbounded::(); + let stop = Arc::new(AtomicBool::new(false)); + let (wake_tx, wake_rx) = bounded::<()>(0); + + let start = Instant::now(); + let handle = spawn_renewer(tx, 2, Arc::clone(&stop), wake_rx); + + // First renew should arrive within ttl/2 (=1 s) + slack. + let msg = rx + .recv_timeout(Duration::from_secs(3)) + .expect("renewer produced no Renew within 3 s"); + assert!( + matches!(msg, DetailRequest::Renew), + "first mailbox message must be Renew" + ); + assert!( + start.elapsed() >= Duration::from_millis(900), + "renewer fired too early: {:?}", + start.elapsed() + ); + + // Tell the renewer to stop and verify it exits promptly. + stop.store(true, Ordering::Relaxed); + drop(wake_tx); + handle.join().expect("renewer panicked"); + } + + /// Dropping the wake sender mid-sleep must wake the renewer in + /// well under `ttl/2`. 
This guards the Drop-fast-path promise — if + /// `select!` regresses to a pure timer, this assertion catches it. + #[test] + fn renewer_wakes_on_drop() { + let (tx, _rx) = crossbeam_channel::unbounded::(); + let stop = Arc::new(AtomicBool::new(false)); + let (wake_tx, wake_rx) = bounded::<()>(0); + + // ttl_seconds = 60 → renewer would otherwise sleep 30 s. + let handle = spawn_renewer(tx, 60, Arc::clone(&stop), wake_rx); + + // Give the renewer time to reach the select!. + std::thread::sleep(Duration::from_millis(50)); + stop.store(true, Ordering::Relaxed); + drop(wake_tx); + + let start = Instant::now(); + handle.join().expect("renewer panicked"); + assert!( + start.elapsed() < Duration::from_secs(2), + "renewer did not wake on drop: {:?}", + start.elapsed() + ); + } +} diff --git a/bin/src/ctl/top/mod.rs b/bin/src/ctl/top/mod.rs new file mode 100644 index 000000000..a8794b4f5 --- /dev/null +++ b/bin/src/ctl/top/mod.rs @@ -0,0 +1,169 @@ +//! `sozu top` — live operator TUI for the Sōzu reverse proxy. +//! +//! The TUI surfaces metrics already collected by Sōzu (`QueryMetrics` over +//! the existing unix command socket) plus an `Event` stream subscription, +//! in a btop/htop-style layout — sparklines, sortable tables, function-key +//! bar, vim navigation, k9s-style colon palette, three glyph modes +//! (Braille / Block / TTY-ASCII). +//! +//! # Architecture +//! +//! Four synchronous transport threads (no `tokio` runtime in v1, by design): +//! +//! 1. The **collector** thread polls `RequestType::QueryMetrics` on the +//! `--refresh-ms` ticker over its own `Channel` and pushes each +//! `AggregatedMetrics` into a `crossbeam_channel::bounded::(1)`. +//! 2. The **events** thread subscribes to `RequestType::SubscribeEvents` +//! over a SEPARATE `Channel` (the unix `Channel` has no message-id +//! correlation; multiplexing query and subscribe on one socket is +//! unsafe) and forwards each `Event` into a `bounded::(64)`. +//! 3. 
impl CommandManager {
    /// Entry point for the `sozu top` subcommand.
    ///
    /// Spins up the four transport threads (collector, events, listeners,
    /// certs), tries to apply the cardinality lease, then hands the four
    /// receivers to `render::run` and blocks until it returns. Afterwards
    /// the lease is dropped, the shutdown flag is raised and every
    /// transport thread is joined before the render result is returned.
    ///
    /// NOTE(review): the terminal / signal / panic-hook restore guards are
    /// not installed in this function; presumably `render::run` owns them —
    /// confirm against `render.rs`.
    ///
    /// # Errors
    ///
    /// * Propagates the error of any `spawn_*` call that fails (`?`).
    /// * A render-loop failure is wrapped in `std::io::Error::other` and
    ///   reported as `CtlError::ResolvePath("sozu top render loop", …)`.
    ///
    /// A failure to apply the metric-detail lease is NOT an error: it is
    /// turned into a status string and passed to the renderer.
    pub fn run_top(&mut self, args: TopArgs) -> Result<(), CtlError> {
        // Transport threads own their own `Channel` connections — see the
        // module-level docs for why we don't reuse `self.channel`.
        //
        // The shutdown flag is the canonical wake-up for the events thread:
        // dropping its receiver cannot propagate across the
        // unix socket. The three poll-driven threads still exit on
        // receiver-drop, but we join them all on the way out for symmetry.
        let shutdown = std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false));
        // Shared status slot every background thread (lease renewer +
        // four transport collectors) publishes degraded-mode notes
        // into. The render loop drains it once per tick (see
        // `cardinality::take_status`) and surfaces the message on the
        // F-key bar instead of the wiped alt-screen.
        let lease_status_slot = cardinality::new_status_slot();
        // Each spawn returns (receiver, join handle); a clone of the
        // status slot is handed to every thread.
        let (snapshot_rx, collector) = spawn_collector(
            self.config.clone(),
            args.refresh_ms,
            std::sync::Arc::clone(&lease_status_slot),
        )?;
        // Only the events thread receives the shutdown flag.
        let (events_rx, events) = spawn_events(
            self.config.clone(),
            std::sync::Arc::clone(&shutdown),
            std::sync::Arc::clone(&lease_status_slot),
        )?;
        let (listeners_rx, listeners) = spawn_listeners(
            self.config.clone(),
            std::sync::Arc::clone(&lease_status_slot),
        )?;
        let (certs_rx, certs) = spawn_certs(
            self.config.clone(),
            std::sync::Arc::clone(&lease_status_slot),
        )?;

        // Apply the runtime cardinality lease. If the master/worker is too
        // old to decode `SetMetricDetail`, we surface the failure but keep
        // the TUI running — operators on degraded fleets still get the
        // cluster-level data the worker is already emitting.
        //
        // The failure path stashes a diagnostic into a string that the
        // renderer surfaces via `App.status`, rather than writing to
        // `stderr` directly: an `eprintln!` from this site would land
        // *between* the spawn calls above and `enable_raw_mode` inside the
        // renderer, leaving a one-line warning the operator never sees
        // (alt-screen wipes it on entry) and that pollutes the shell
        // scrollback on exit.
        //
        // `--detail` defaults to `TopDetail::Backend` when not given.
        let detail = args.detail.unwrap_or(TopDetail::Backend);
        let (lease, lease_status) = match DetailGuard::apply(
            &self.config,
            detail,
            args.lease_ttl_seconds,
            "sozu top",
            std::sync::Arc::clone(&lease_status_slot),
        ) {
            // Lease held: keep the guard alive for the whole render loop.
            Ok(g) => (Some(g), None),
            // No lease: continue, carrying only the diagnostic text.
            Err(e) => (
                None,
                Some(format!(
                    "could not elevate metric detail (continuing without lease): {e}"
                )),
            ),
        };

        // `lease_status_slot` is moved into the config here; the
        // background threads keep their own clones.
        let render_cfg = RenderConfig {
            mouse: !args.no_mouse,
            tick_once: args.tick_once,
            snapshot_frames: args.snapshot,
            skin: args.skin.clone(),
            glyphs: args.glyphs,
            initial_status: lease_status,
            lease_status: lease_status_slot,
        };
        // Blocks until the render loop returns; the result is held back
        // so the teardown below always runs first.
        let result = render::run(render_cfg, snapshot_rx, events_rx, listeners_rx, certs_rx);

        // Drop order: lease first (issues the best-effort `clear`), then
        // flip the shutdown flag so the events thread observes it on the
        // next bounded read, then join all four transport handles so we
        // never return with background threads still queueing into a
        // detached aggregator.
        drop(lease);
        shutdown.store(true, std::sync::atomic::Ordering::Relaxed);
        // Join results are deliberately ignored: a panicked transport
        // thread must not mask the render-loop result.
        let _ = collector.join();
        let _ = listeners.join();
        let _ = certs.join();
        let _ = events.join();

        result.map_err(|e| {
            CtlError::ResolvePath("sozu top render loop".to_owned(), std::io::Error::other(e))
        })
    }
}
The \ + SetMetricDetail lease auto-elevates when supported; check the EVENTS \ + pane (tab 7) for METRIC_DETAIL_CHANGED to confirm.", + ) + .style(Style::default().fg(skin.secondary)); + f.render_widget(body, inner); + return; + } + + let reverse = app.backend_sort_reverse; + let active = |key: BackendSortKey| app.backend_sort == key; + let header = Row::new(vec![ + sort_header("cluster", active(BackendSortKey::ClusterId), reverse, skin), + sort_header("backend", active(BackendSortKey::BackendId), reverse, skin), + sort_header( + "bw down/up Mbps", + active(BackendSortKey::Bandwidth), + reverse, + skin, + ), + sort_header("conn", active(BackendSortKey::Connections), reverse, skin), + sort_header("p50", active(BackendSortKey::LatencyP99), reverse, skin), + sort_header("p99", active(BackendSortKey::LatencyP99), reverse, skin), + sort_header("req", active(BackendSortKey::Requests), reverse, skin), + ]) + .style( + Style::default() + .fg(skin.primary) + .add_modifier(Modifier::BOLD), + ); + + let body: Vec> = rows + .iter() + .map(|row| { + let critical = row.p99_ms as f64 >= app.thresholds.latency_p99_critical_ms; + // Pulse takes precedence over the steady tint so transitions + // catch the eye even on rows that are already red. 
/// Format a throughput given in bytes per second as a compact bit rate.
///
/// The input is clamped at zero (negative and NaN inputs become `0`),
/// multiplied by 8, then scaled base-1000 (1 Mbps = 1_000_000 bps, the
/// networking convention used by nload / iftop / Grafana):
///
/// * `>= 1 Gbps`   → two decimals with a `G` suffix (`"2.50G"`)
/// * `>= 0.1 Mbps` → two decimals, no suffix — Mbps is the column's
///   native unit (`"1.00"`)
/// * `>= 1 Kbps`   → one decimal with a `K` suffix (`"8.0K"`)
/// * `>= 1 bps`    → whole bits with a `b` suffix (`"800b"`)
/// * below 1 bps   → the literal `"0.00"`, so idle rows stay compact
fn format_rate_bps(bytes_per_sec: f64) -> String {
    // `f64::max` returns the non-NaN operand, so NaN collapses to 0 too.
    let bits = bytes_per_sec.max(0.0) * 8.0;

    let giga = bits / 1_000_000_000.0;
    if giga >= 1.0 {
        return format!("{giga:.2}G");
    }

    let mega = bits / 1_000_000.0;
    if mega >= 0.1 {
        return format!("{mega:.2}");
    }

    let kilo = bits / 1_000.0;
    if kilo >= 1.0 {
        return format!("{kilo:.1}K");
    }

    if bits >= 1.0 {
        format!("{bits:.0}b")
    } else {
        String::from("0.00")
    }
}
CERTS pane — TLS certificate inventory grouped by bound address. +//! +//! Pulled from `QueryCertificatesFromTheState` on a 30 s ticker (cert +//! lifecycle is operator-paced; every transition also lands as a +//! `CERTIFICATE_ADDED` / `CERTIFICATE_REMOVED` / `CERTIFICATE_REPLACED` +//! event on the EVENTS pane). Each row is one certificate with the bound +//! address, the SNI it serves, and a truncated fingerprint suffix the +//! operator can match against `sozu certificate list` output. + +use std::net::SocketAddr; + +use ratatui::Frame; +use ratatui::layout::{Constraint, Rect}; +use ratatui::style::{Modifier, Style}; +use ratatui::widgets::{Block, BorderType, Borders, Cell, Paragraph, Row, Table}; +use sozu_command_lib::proto::command::SocketAddress; + +use super::super::app::App; +use super::super::theme::Skin; + +pub fn render(f: &mut Frame<'_>, area: Rect, app: &App, skin: &Skin) { + let block = Block::default() + .borders(Borders::ALL) + .border_type(BorderType::Rounded) + .title(format!( + " CERTS · refresh 30 s · {} ", + app.last_certs + .as_ref() + .map(|_| "live".to_owned()) + .unwrap_or_else(|| "no snapshot yet".into()) + )) + .style(Style::default().fg(skin.muted)); + + let certs = match app.last_certs.as_ref() { + Some(c) => &c.list, + None => { + let inner = block.inner(area); + f.render_widget(block, area); + f.render_widget( + Paragraph::new( + "Polling QueryCertificatesFromTheState every 30 s. First snapshot \ + lands shortly after `sozu top` starts; check the EVENTS pane (tab \ + 7) for CERTIFICATE_ADDED / REMOVED / REPLACED transitions in the \ + meantime.", + ) + .style(Style::default().fg(skin.secondary)), + inner, + ); + return; + } + }; + + // Flatten every (address, summary) pair so the table sorts naturally + // by address. Fingerprint suffix only — long hex digests bury the + // domain column on narrow terminals and the operator can always + // match against `sozu certificate list` for the full hash. 
+ let mut rows: Vec> = Vec::new(); + let mut total_certs = 0u32; + for by_address in &certs.certificates { + let addr = format_socket_address(&by_address.address); + for summary in &by_address.certificate_summaries { + let fp_suffix = if summary.fingerprint.len() > 12 { + let tail = summary.fingerprint.len() - 12; + format!("…{}", &summary.fingerprint[tail..]) + } else { + summary.fingerprint.to_owned() + }; + rows.push( + Row::new(vec![ + Cell::from(addr.to_owned()), + Cell::from(summary.domain.to_owned()), + Cell::from(fp_suffix), + ]) + .style(Style::default().fg(skin.secondary)), + ); + total_certs += 1; + } + } + + if rows.is_empty() { + let inner = block.inner(area); + f.render_widget(block, area); + f.render_widget( + Paragraph::new("No certificates loaded.").style(Style::default().fg(skin.secondary)), + inner, + ); + return; + } + + let header = Row::new(vec![ + Cell::from(format!("address · {total_certs} cert(s)")), + Cell::from("domain (SNI)"), + Cell::from("fingerprint (suffix)"), + ]) + .style( + Style::default() + .fg(skin.primary) + .add_modifier(Modifier::BOLD), + ); + let widths = [ + Constraint::Min(28), + Constraint::Min(28), + Constraint::Length(20), + ]; + let table = Table::new(rows, widths).header(header).block(block); + f.render_widget(table, area); +} + +/// Render a proto `SocketAddress` as the familiar `ip:port` shape. The +/// proto carries an IP oneof and a port; falling back to `Debug` would +/// dump the proto struct fields (`SocketAddress { ip: IpAddress { … } }`) +/// which is unreadable. `From for SocketAddr` already +/// exists in `command/src/request.rs` and prints v4 as `1.2.3.4:443` +/// and v6 as `[::1]:443`. +fn format_socket_address(address: &SocketAddress) -> String { + SocketAddr::from(*address).to_string() +} diff --git a/bin/src/ctl/top/panes/clusters.rs b/bin/src/ctl/top/panes/clusters.rs new file mode 100644 index 000000000..feb353b20 --- /dev/null +++ b/bin/src/ctl/top/panes/clusters.rs @@ -0,0 +1,127 @@ +//! 
CLUSTERS pane — sortable table of clusters with one row per cluster_id. +//! Default sort: 5xx error rate descending, then RPS — operators want the +//! unhealthy clusters at the top so the eye lands on them first. +//! +//! Pulse-tint on cluster disappearance and new-unhealthy-backend transitions +//! is driven by [`crate::ctl::top::app::PulseTracker`]. + +use ratatui::Frame; +use ratatui::layout::{Constraint, Rect}; +use ratatui::style::{Modifier, Style}; +use ratatui::widgets::{Block, BorderType, Borders, Cell, Row, Table}; + +use super::super::app::{App, ClusterSortKey, PulseKind}; +use super::super::theme::Skin; +use super::sort_header; + +pub fn render(f: &mut Frame<'_>, area: Rect, app: &App, skin: &Skin) { + let rows = app.cluster_rows(); + let block = Block::default() + .borders(Borders::ALL) + .border_type(BorderType::Rounded) + .title(format!( + " CLUSTERS · sort: {} {} · {} cluster{} ", + app.cluster_sort.label(), + if app.cluster_sort_reverse { + "asc" + } else { + "desc" + }, + rows.len(), + if rows.len() == 1 { "" } else { "s" }, + )) + .style(Style::default().fg(skin.muted)); + + if rows.is_empty() { + let inner = block.inner(area); + f.render_widget(block, area); + let body = ratatui::widgets::Paragraph::new( + "No cluster metrics yet. 
The first poll lands within --refresh-ms; \ + if the screen stays empty, ensure the worker has \ + `metrics.detail = backend` (or auto-elevation via the \ + SetMetricDetail lease has acknowledged).", + ) + .style(Style::default().fg(skin.secondary)); + f.render_widget(body, inner); + return; + } + + let reverse = app.cluster_sort_reverse; + let active = |key: ClusterSortKey| app.cluster_sort == key; + let header = Row::new(vec![ + sort_header( + "cluster_id", + active(ClusterSortKey::ClusterId), + reverse, + skin, + ), + sort_header("rps", active(ClusterSortKey::Rps), reverse, skin), + sort_header("err %", active(ClusterSortKey::ErrorRate), reverse, skin), + sort_header("p50", active(ClusterSortKey::LatencyP99), reverse, skin), + sort_header("p99", active(ClusterSortKey::LatencyP99), reverse, skin), + sort_header( + "backends", + active(ClusterSortKey::BackendsAvailable), + reverse, + skin, + ), + ]) + .style( + Style::default() + .fg(skin.primary) + .add_modifier(Modifier::BOLD), + ); + + let body: Vec> = rows + .iter() + .map(|row| { + let row_critical = row.error_rate_pct >= app.thresholds.error_ratio_critical_pct + || row.p99_ms as f64 >= app.thresholds.latency_p99_critical_ms + || (row.backends_total > 0 && row.backends_available == 0); + // Pulse takes precedence over the steady "critical" tint so a + // transition catches the eye even on a row that's already red. 
/// Compact, base-1000 rendering of a per-second request rate.
///
/// * below 1 000          → the plain integer (`"999"`)
/// * below 1 000 000      → one decimal with `K` (`"1.5K"`)
/// * below 1 000 000 000  → two decimals with `M` (`"1.20M"`)
/// * everything above     → two decimals with `G` (`"2.50G"`)
///
/// The tier is picked from the unrounded value, so a rate just under a
/// boundary can print as e.g. `"1000.0K"`.
fn format_rate_count(rate: u64) -> String {
    // Scaling happens in floating point; the tier itself is selected on
    // the integer, which gives the same tiers as comparing the float.
    let scaled = |unit: f64| rate as f64 / unit;
    match rate {
        0..=999 => rate.to_string(),
        1_000..=999_999 => format!("{:.1}K", scaled(1_000.0)),
        1_000_000..=999_999_999 => format!("{:.2}M", scaled(1_000_000.0)),
        _ => format!("{:.2}G", scaled(1_000_000_000.0)),
    }
}
`METRIC_DETAIL_CHANGED`, …) carry whichever of the optional fields are +//! meaningful for the verb. + +use std::time::Instant; + +use ratatui::Frame; +use ratatui::layout::{Constraint, Rect}; +use ratatui::style::{Modifier, Style}; +use ratatui::widgets::{Block, BorderType, Borders, Cell, Paragraph, Row, Table}; +use sozu_command_lib::proto::command::EventKind; + +use super::super::app::App; +use super::super::theme::Skin; + +pub fn render(f: &mut Frame<'_>, area: Rect, app: &App, skin: &Skin) { + let block = Block::default() + .borders(Borders::ALL) + .border_type(BorderType::Rounded) + .title(format!(" EVENTS · {} retained ", app.events.len())) + .style(Style::default().fg(skin.muted)); + + if app.events.is_empty() { + let inner = block.inner(area); + f.render_widget(block, area); + let body = Paragraph::new( + "No events yet. The events thread subscribes on startup; the \ + first BACKEND_UP / CLUSTER_ADDED / METRIC_DETAIL_CHANGED arrives \ + whenever the master emits one.", + ) + .style(Style::default().fg(skin.secondary)); + f.render_widget(body, inner); + return; + } + + let header = Row::new(vec![ + Cell::from("when"), + Cell::from("event"), + Cell::from("cluster"), + Cell::from("backend"), + Cell::from("address"), + ]) + .style( + Style::default() + .fg(skin.primary) + .add_modifier(Modifier::BOLD), + ); + + let now = Instant::now(); + // Newest first — operators want "what just happened" at the top of the + // pane. The transport thread pushes onto the back of the VecDeque, so + // we iterate in reverse. 
+ let rows: Vec> = app + .events + .iter() + .rev() + .map(|topev| { + let kind = EventKind::try_from(topev.event.kind).ok(); + let kind_label = kind.map(event_kind_label).unwrap_or("UNKNOWN"); + let style = Style::default().fg(event_kind_color(kind, skin)); + Row::new(vec![ + Cell::from(format_relative_age(now, topev.received_at)), + Cell::from(kind_label.to_owned()), + Cell::from(topev.event.cluster_id.as_deref().unwrap_or("").to_owned()), + Cell::from(topev.event.backend_id.as_deref().unwrap_or("").to_owned()), + Cell::from( + topev + .event + .address + .as_ref() + .map(|a| format!("{a:?}")) + .unwrap_or_default(), + ), + ]) + .style(style) + }) + .collect(); + + let widths = [ + Constraint::Length(10), + Constraint::Length(28), + Constraint::Min(20), + Constraint::Min(16), + Constraint::Min(20), + ]; + let table = Table::new(rows, widths).header(header).block(block); + f.render_widget(table, area); +} + +fn event_kind_label(kind: EventKind) -> &'static str { + match kind { + EventKind::BackendDown => "BACKEND_DOWN", + EventKind::BackendUp => "BACKEND_UP", + EventKind::NoAvailableBackends => "NO_AVAILABLE_BACKENDS", + EventKind::RemovedBackendHasNoConnections => "REMOVED_BACKEND_HAS_NO_CONNECTIONS", + EventKind::ClusterAdded => "CLUSTER_ADDED", + EventKind::ClusterRemoved => "CLUSTER_REMOVED", + EventKind::FrontendAdded => "FRONTEND_ADDED", + EventKind::FrontendRemoved => "FRONTEND_REMOVED", + EventKind::CertificateAdded => "CERTIFICATE_ADDED", + EventKind::CertificateRemoved => "CERTIFICATE_REMOVED", + EventKind::CertificateReplaced => "CERTIFICATE_REPLACED", + EventKind::ListenerActivated => "LISTENER_ACTIVATED", + EventKind::ListenerDeactivated => "LISTENER_DEACTIVATED", + EventKind::ConfigurationReloaded => "CONFIGURATION_RELOADED", + EventKind::WorkerKilled => "WORKER_KILLED", + EventKind::WorkerRelaunched => "WORKER_RELAUNCHED", + EventKind::LoggingLevelChanged => "LOGGING_LEVEL_CHANGED", + EventKind::MetricsConfigured => "METRICS_CONFIGURED", + 
EventKind::ListenerUpdated => "LISTENER_UPDATED", + EventKind::StateLoaded => "STATE_LOADED", + EventKind::StateSaved => "STATE_SAVED", + EventKind::ListenerAdded => "LISTENER_ADDED", + EventKind::ListenerRemoved => "LISTENER_REMOVED", + EventKind::SozuStopRequested => "SOZU_STOP_REQUESTED", + EventKind::MainUpgraded => "MAIN_UPGRADED", + EventKind::WorkerUpgraded => "WORKER_UPGRADED", + EventKind::EventsSubscribed => "EVENTS_SUBSCRIBED", + EventKind::HealthCheckHealthy => "HEALTH_CHECK_HEALTHY", + EventKind::HealthCheckUnhealthy => "HEALTH_CHECK_UNHEALTHY", + EventKind::ClusterRecovered => "CLUSTER_RECOVERED", + EventKind::MetricDetailChanged => "METRIC_DETAIL_CHANGED", + } +} + +/// Map an event kind to the appropriate skin colour. "Bad" events (down / +/// no backends / unhealthy / killed / cluster removed) take the hot tier; +/// "good" events (up / recovered / healthy / added) take the cool tier; +/// audit-style mutations take the secondary muted colour so they don't +/// drown out actionable signals. 
/// Human-readable age of an event, measured from `received_at` to `now`.
///
/// * under a minute → `"{s}s ago"`
/// * under an hour  → `"{m}m{s}s"`
/// * otherwise      → `"{h}h{m}m"`
///
/// If `received_at` is later than `now` the age saturates to zero and
/// the result is `"0s ago"`.
fn format_relative_age(now: Instant, received_at: Instant) -> String {
    let total_secs = now.saturating_duration_since(received_at).as_secs();
    let hours = total_secs / 3600;
    let minutes = (total_secs / 60) % 60;
    let seconds = total_secs % 60;

    match total_secs {
        0..=59 => format!("{total_secs}s ago"),
        // Below one hour `minutes` equals `total_secs / 60`.
        60..=3599 => format!("{minutes}m{seconds}s"),
        _ => format!("{hours}h{minutes}m"),
    }
}
+ +use ratatui::Frame; +use ratatui::layout::{Constraint, Direction, Layout, Rect}; +use ratatui::style::{Modifier, Style}; +use ratatui::text::{Line, Span}; +use ratatui::widgets::{Block, BorderType, Borders, Cell, Paragraph, Row, Table}; +use sozu_command_lib::proto::command::AggregatedMetrics; +use sozu_lib::metrics::names; + +use super::super::app::{App, count_value as count, gauge_value as gauge}; +use super::super::theme::Skin; + +pub fn render(f: &mut Frame<'_>, area: Rect, app: &App, skin: &Skin) { + let block = Block::default() + .borders(Borders::ALL) + .border_type(BorderType::Rounded) + .title(" H2 · streams · flow control · flood mitigations ") + .style(Style::default().fg(skin.muted)); + + let metrics = match app.last_metrics.as_ref() { + Some(m) => m, + None => { + let inner = block.inner(area); + f.render_widget(block, area); + f.render_widget( + Paragraph::new( + "No snapshot yet. The H2 pane reads from the same QueryMetrics \ + poll as OVERVIEW; data appears once the first poll lands.", + ) + .style(Style::default().fg(skin.secondary)), + inner, + ); + return; + } + }; + + let inner = block.inner(area); + f.render_widget(block, area); + + let chunks = Layout::default() + .direction(Direction::Vertical) + .constraints([ + Constraint::Length(7), // streams + connections gauges + Constraint::Length(7), // flow control + frame counters + Constraint::Min(5), // flood mitigations + ]) + .split(inner); + + render_streams(f, chunks[0], app, skin, metrics); + render_flow(f, chunks[1], app, skin, metrics); + render_floods(f, chunks[2], app, skin, metrics); +} + +fn render_streams(f: &mut Frame<'_>, area: Rect, app: &App, skin: &Skin, m: &AggregatedMetrics) { + let active_streams = gauge(m.proxying.get(names::h2::CONNECTION_ACTIVE_STREAMS)).unwrap_or(0); + let alpn_h2 = count(m.proxying.get(names::http::ALPN_H2)).unwrap_or(0); + let alpn_http11 = count(m.proxying.get(names::http::ALPN_HTTP11)).unwrap_or(0); + let total_alpn = alpn_h2 + alpn_http11; + let 
h2_pct = if total_alpn > 0 { + (alpn_h2 as f64 / total_alpn as f64) * 100.0 + } else { + 0.0 + }; + + let big = Line::from(vec![ + Span::styled( + format!("{active_streams}"), + Style::default() + .fg(skin.primary) + .add_modifier(Modifier::BOLD), + ), + Span::styled(" active H2 streams · ", Style::default().fg(skin.secondary)), + Span::styled( + format!("{h2_pct:.1} %"), + Style::default() + .fg(skin.accent) + .add_modifier(Modifier::BOLD), + ), + Span::styled(" of accepts on H2", Style::default().fg(skin.secondary)), + ]); + + let header = Row::new(vec!["metric", "value", "trend (60 s)"]).style( + Style::default() + .fg(skin.primary) + .add_modifier(Modifier::BOLD), + ); + // Trend columns render a Unicode-bar sparkline from the per-key + // SparkRing populated each snapshot by `App::fold_h2_trends`. The + // first sample lands as a single bar; the ring fills out to the + // 60-sample width as more snapshots arrive. + let rows = [ + Row::new(vec![ + Cell::from("active streams"), + Cell::from(format!("{active_streams}")), + Cell::from(app.h2_trend_bars(names::h2::CONNECTION_ACTIVE_STREAMS)), + ]) + .style(Style::default().fg(skin.secondary)), + Row::new(vec![ + Cell::from("H2 connections accepted"), + Cell::from(format!("{alpn_h2}")), + Cell::from(app.h2_trend_bars(names::http::ALPN_H2)), + ]) + .style(Style::default().fg(skin.secondary)), + Row::new(vec![ + Cell::from("HTTP/1.1 accepted"), + Cell::from(format!("{alpn_http11}")), + Cell::from(app.h2_trend_bars(names::http::ALPN_HTTP11)), + ]) + .style(Style::default().fg(skin.secondary)), + Row::new(vec![ + Cell::from("client.connections (gauge)"), + Cell::from(format!("{}", app.overview.client_connections)), + Cell::from(app.h2_trend_bars(names::client::CONNECTIONS)), + ]) + .style(Style::default().fg(skin.secondary)), + ]; + + let chunks = Layout::default() + .direction(Direction::Vertical) + .constraints([Constraint::Length(1), Constraint::Min(3)]) + .split(area); + f.render_widget(Paragraph::new(big), 
chunks[0]);
    f.render_widget(
        Table::new(
            rows,
            [
                Constraint::Min(28),
                Constraint::Length(16),
                Constraint::Min(16),
            ],
        )
        .header(header),
        chunks[1],
    );
}

/// Render the flow-control table into `area`: one row per H2 flow-control
/// gauge / counter read from `m.proxying`, each with its current value and
/// the trend string returned by `App::h2_trend_bars`. Rows whose last
/// `metric_row` argument is `true` switch to the critical tint as soon as
/// their value is non-zero.
fn render_flow(f: &mut Frame<'_>, area: Rect, app: &App, skin: &Skin, m: &AggregatedMetrics) {
    let header = Row::new(vec!["flow control", "value", "trend (60 s)"]).style(
        Style::default()
            .fg(skin.primary)
            .add_modifier(Modifier::BOLD),
    );
    // Gauges are widened to `i64` so both extractors feed `metric_row`
    // with the same `Option<i64>` shape.
    let gauge_at = |key: &str| gauge(m.proxying.get(key)).map(|v| v as i64);
    let count_at = |key: &str| count(m.proxying.get(key));
    let rows = [
        metric_row(
            "connection.window_bytes",
            gauge_at(names::h2::CONNECTION_WINDOW_BYTES),
            app.h2_trend_bars(names::h2::CONNECTION_WINDOW_BYTES),
            skin,
            false,
        ),
        metric_row(
            "pending_window_updates",
            gauge_at(names::h2::CONNECTION_PENDING_WINDOW_UPDATES),
            app.h2_trend_bars(names::h2::CONNECTION_PENDING_WINDOW_UPDATES),
            skin,
            false,
        ),
        metric_row(
            "flow_control_stall",
            count_at(names::h2::FLOW_CONTROL_STALL),
            app.h2_trend_bars(names::h2::FLOW_CONTROL_STALL),
            skin,
            true,
        ),
        metric_row(
            "frames.tx.window_update",
            count_at(names::h2::FRAMES_TX_WINDOW_UPDATE),
            app.h2_trend_bars(names::h2::FRAMES_TX_WINDOW_UPDATE),
            skin,
            false,
        ),
        metric_row(
            "frames.tx.rst_stream",
            count_at(names::h2::FRAMES_TX_RST_STREAM),
            app.h2_trend_bars(names::h2::FRAMES_TX_RST_STREAM),
            skin,
            true,
        ),
        metric_row(
            "frames.tx.goaway",
            count_at(names::h2::FRAMES_TX_GOAWAY),
            app.h2_trend_bars(names::h2::FRAMES_TX_GOAWAY),
            skin,
            true,
        ),
        metric_row(
            "headers.rejected.budget_overrun",
            count_at(names::h2::HEADERS_REJECTED_BUDGET_OVERRUN),
            app.h2_trend_bars(names::h2::HEADERS_REJECTED_BUDGET_OVERRUN),
            skin,
            true,
        ),
    ];
    f.render_widget(
        Table::new(
            rows,
            [
                Constraint::Min(36),
                Constraint::Length(16),
                Constraint::Min(16),
            ],
        )
        .header(header),
        area,
    );
}

/// Render the flood-mitigation counters inside their own bordered block.
/// A missing counter is shown as `0`; any row with a value above zero is
/// drawn bold in the hot colour.
fn render_floods(f: &mut Frame<'_>, area: Rect, app: &App, skin: &Skin, m: &AggregatedMetrics) {
    // Critical-tier counters: any non-zero value is a documented attack
    // mitigation firing. Keep them in their own block with a hot-tier title
    // so the eye is drawn here when it should be.
    let block = Block::default()
        .borders(Borders::ALL)
        .border_type(BorderType::Rounded)
        .title(" flood mitigations · CVE-2023-44487 / CVE-2024-27316 / CVE-2025-8671 ")
        .style(Style::default().fg(skin.hot));
    let inner = block.inner(area);
    f.render_widget(block, area);

    let header = Row::new(vec!["counter", "value", "trend (60 s)"]).style(
        Style::default()
            .fg(skin.primary)
            .add_modifier(Modifier::BOLD),
    );

    // (metric key, row label) pairs, in display order.
    let candidates = [
        (names::h2::FLOOD_VIOLATION_GLITCH_WINDOW, "glitch_window"),
        (names::h2::FLOOD_VIOLATION_RAPID_RESET, "rapid_reset"),
        (
            names::h2::FLOOD_VIOLATION_CONTINUATION,
            "continuation_flood",
        ),
        (names::h2::FLOOD_VIOLATION_MADE_YOU_RESET, "made_you_reset"),
        (names::h2::FLOOD_VIOLATION_PING, "ping_flood"),
        (names::h2::FLOOD_VIOLATION_SETTINGS, "settings_flood"),
        (names::h2::FLOOD_VIOLATION_PRIORITY, "priority_flood"),
        (names::h2::WINDOW_UPDATE_DROPPED, "window_update_dropped"),
        (
            names::h2::CLOSE_WITH_ACTIVE_STREAMS,
            "close_with_active_streams",
        ),
    ];

    let rows: Vec<Row<'_>> = candidates
        .iter()
        .map(|(key, label)| {
            let v = count(m.proxying.get(*key)).unwrap_or(0);
            let style = if v > 0 {
                Style::default().fg(skin.hot).add_modifier(Modifier::BOLD)
            } else {
                Style::default().fg(skin.secondary)
            };
            Row::new(vec![
                Cell::from(*label),
                Cell::from(v.to_string()),
                Cell::from(app.h2_trend_bars(key)),
            ])
            .style(style)
        })
        .collect();

    f.render_widget(
        Table::new(
            rows,
            [
                Constraint::Min(28),
                Constraint::Length(16),
                Constraint::Min(16),
            ],
        )
        .header(header),
        inner,
    );
}

/// Render one numeric metric as a labelled table row. `value` is pre-
/// extracted (`gauge(...)` for gauges, `count(...)` for counters) and
/// widened to `i64` so the helper does not need to know the underlying
/// variant; `None` is displayed as an em dash. `trend_bars` is a
/// Unicode-bar sparkline string produced by `App::h2_trend_bars`.
/// `warn_when_nonzero` flips the row to the critical tint when the value
/// is a flood / error counter the operator should see.
fn metric_row<'a>(
    label: &'a str,
    value: Option<i64>,
    trend_bars: String,
    skin: &Skin,
    warn_when_nonzero: bool,
) -> Row<'a> {
    let style = row_style(skin, value.unwrap_or(0) > 0 && warn_when_nonzero);
    Row::new(vec![
        Cell::from(label),
        Cell::from(value.map_or_else(|| "—".to_owned(), |v| v.to_string())),
        Cell::from(trend_bars),
    ])
    .style(style)
}

/// Row style selector: the skin's critical style when `warn` is set,
/// otherwise the secondary foreground colour.
fn row_style(skin: &Skin, warn: bool) -> Style {
    if warn {
        skin.row_critical()
    } else {
        Style::default().fg(skin.secondary)
    }
}

// `gauge` / `count` helpers come from `super::super::app` (renamed at
// import time) so the H2 pane and the App-side rate calculators share
// one source of truth for `FilteredMetrics -> Option<{i64,u64}>`
// extraction.
diff --git a/bin/src/ctl/top/panes/listeners.rs b/bin/src/ctl/top/panes/listeners.rs
new file mode 100644
index 000000000..a11531ae4
--- /dev/null
+++ b/bin/src/ctl/top/panes/listeners.rs
@@ -0,0 +1,103 @@
//! LISTENERS pane — the static map of bound sockets.
//!
//! The listeners thread polls `RequestType::ListListeners` at a slower 5 s
//! cadence (operator-paced; listener state changes flow through the EVENTS
//! pane). This pane just renders the freshest snapshot as a flat table.
//!
//! Three columns per address: protocol (HTTP / HTTPS / TCP), address,
//! status hint (`active` flag, plus the ALPN list for HTTPS).
+ +use ratatui::Frame; +use ratatui::layout::{Constraint, Rect}; +use ratatui::style::{Modifier, Style}; +use ratatui::widgets::{Block, BorderType, Borders, Cell, Paragraph, Row, Table}; + +use super::super::app::App; +use super::super::theme::Skin; + +pub fn render(f: &mut Frame<'_>, area: Rect, app: &App, skin: &Skin) { + let block = Block::default() + .borders(Borders::ALL) + .border_type(BorderType::Rounded) + .title(format!( + " LISTENERS · refresh 5 s · {} ", + app.last_listeners + .as_ref() + .map(|_| "live".to_owned()) + .unwrap_or_else(|| "no snapshot yet".into()) + )) + .style(Style::default().fg(skin.muted)); + + let listeners = match app.last_listeners.as_ref() { + Some(l) => &l.list, + None => { + let inner = block.inner(area); + f.render_widget(block, area); + f.render_widget( + Paragraph::new( + "Polling ListListeners every 5 s. First snapshot lands shortly \ + after `sozu top` starts; see the EVENTS pane for \ + LISTENER_ADDED / LISTENER_REMOVED / LISTENER_UPDATED audit \ + transitions in the meantime.", + ) + .style(Style::default().fg(skin.secondary)), + inner, + ); + return; + } + }; + + let mut rows: Vec> = Vec::new(); + for (addr, cfg) in &listeners.http_listeners { + rows.push(Row::new(vec![ + Cell::from("HTTP"), + Cell::from(addr.to_owned()), + Cell::from(format!("active={}", cfg.active)), + ])); + } + for (addr, cfg) in &listeners.https_listeners { + let alpn = cfg.alpn_protocols.to_vec().join(","); + let alpn = if alpn.is_empty() { "—".into() } else { alpn }; + rows.push( + Row::new(vec![ + Cell::from("HTTPS"), + Cell::from(addr.to_owned()), + Cell::from(format!("active={} · alpn={}", cfg.active, alpn)), + ]) + .style(Style::default().fg(skin.secondary)), + ); + } + for (addr, cfg) in &listeners.tcp_listeners { + rows.push( + Row::new(vec![ + Cell::from("TCP"), + Cell::from(addr.to_owned()), + Cell::from(format!("active={}", cfg.active)), + ]) + .style(Style::default().fg(skin.secondary)), + ); + } + + if rows.is_empty() { + let inner = 
block.inner(area); + f.render_widget(block, area); + f.render_widget( + Paragraph::new("No listeners configured.").style(Style::default().fg(skin.secondary)), + inner, + ); + return; + } + + let header = Row::new(vec!["proto", "address", "status"]).style( + Style::default() + .fg(skin.primary) + .add_modifier(Modifier::BOLD), + ); + let widths = [ + Constraint::Length(8), + Constraint::Min(22), + Constraint::Min(40), + ]; + let table = Table::new(rows, widths).header(header).block(block); + f.render_widget(table, area); +} diff --git a/bin/src/ctl/top/panes/mod.rs b/bin/src/ctl/top/panes/mod.rs new file mode 100644 index 000000000..28dd93f18 --- /dev/null +++ b/bin/src/ctl/top/panes/mod.rs @@ -0,0 +1,42 @@ +//! Pane modules — one per `ActiveTab` variant. Each pane is pure rendering: +//! it takes a `&App` and a `Frame` slice and draws into it. State stays in +//! `App`; panes never mutate. + +pub mod backends; +pub mod certs; +pub mod clusters; +pub mod events; +pub mod h2; +pub mod listeners; +pub mod overview; + +use ratatui::style::{Modifier, Style}; +use ratatui::text::{Line, Span}; +use ratatui::widgets::Cell; + +use super::theme::Skin; + +/// Render a sortable table header cell. When `active` is true, the column is +/// the current sort key and gets the accent colour, bold, and an arrow glyph +/// (▲ for ascending / reversed, ▼ for descending / default). Otherwise the +/// label is drawn in the muted secondary tint. +/// +/// Shared by the CLUSTERS and BACKENDS panes; the call sites compute the +/// `active`/`reverse` booleans against their own enum (`ClusterSortKey`, +/// `BackendSortKey`) so this helper stays generic over the column type. 
+pub(super) fn sort_header(label: &str, active: bool, reverse: bool, skin: &Skin) -> Cell<'static> { + if active { + let arrow = if reverse { "▲" } else { "▼" }; + Cell::from(Line::from(vec![Span::styled( + format!("{label} {arrow}"), + Style::default() + .fg(skin.accent) + .add_modifier(Modifier::BOLD), + )])) + } else { + Cell::from(Line::from(Span::styled( + label.to_owned(), + Style::default().fg(skin.secondary), + ))) + } +} diff --git a/bin/src/ctl/top/panes/overview.rs b/bin/src/ctl/top/panes/overview.rs new file mode 100644 index 000000000..cf08a9af6 --- /dev/null +++ b/bin/src/ctl/top/panes/overview.rs @@ -0,0 +1,238 @@ +//! OVERVIEW pane — four sparklines (RPS, p99 latency, 5xx error %, slab +//! saturation) above a row of big numerals. The "blast factor" lives here: +//! it's the operator's first impression of the proxy's health. + +use ratatui::Frame; +use ratatui::layout::{Constraint, Direction, Layout, Rect}; +use ratatui::style::{Modifier, Style}; +use ratatui::text::{Line, Span}; +use ratatui::widgets::{Block, BorderType, Borders, Paragraph, RenderDirection, Sparkline}; + +use super::super::app::App; +use super::super::theme::{GLYPH_FALLING, GLYPH_RISING, GLYPH_STEADY, Skin}; + +/// Render the four-sparkline OVERVIEW grid into `area`. The layout is +/// `2 lines big-numeral header` × `flex sparkline body` per cell. +pub fn render(f: &mut Frame<'_>, area: Rect, app: &App, skin: &Skin) { + let bar_set = app.glyphs.sparkline_set(); + // 2x2 grid: top row RPS + p99, bottom row service-time + saturation. 
+ let outer = Layout::default() + .direction(Direction::Vertical) + .constraints([Constraint::Percentage(50), Constraint::Percentage(50)]) + .split(area); + let top = Layout::default() + .direction(Direction::Horizontal) + .constraints([Constraint::Percentage(50), Constraint::Percentage(50)]) + .split(outer[0]); + let bot = Layout::default() + .direction(Direction::Horizontal) + .constraints([Constraint::Percentage(50), Constraint::Percentage(50)]) + .split(outer[1]); + + render_cell( + f, + top[0], + skin, + "REQUESTS / SEC", + &format_rps(&app.overview.rps), + &subtitle_for_rps(app), + &app.overview.rps.to_vec(), + SparkScale::Auto, + &bar_set, + ); + render_cell( + f, + top[1], + skin, + "LATENCY p99 (ms)", + &format_latency(&app.overview.latency_p99_ms), + &subtitle_for_latency(app), + &app.overview.latency_p99_ms.to_vec(), + SparkScale::FixedMax(scale_for_latency(app)), + &bar_set, + ); + render_cell( + f, + bot[0], + skin, + "SERVICE TIME p99 (ms)", + &format_latency(&app.overview.service_time_p99_ms), + &subtitle_for_service_time(app), + &app.overview.service_time_p99_ms.to_vec(), + SparkScale::FixedMax(scale_for_service_time(app)), + &bar_set, + ); + render_cell( + f, + bot[1], + skin, + "SATURATION (%)", + &format_pct_simple(&app.overview.saturation_pct), + &subtitle_for_saturation(app), + &app.overview.saturation_pct.to_vec(), + SparkScale::FixedMax(100), + &bar_set, + ); +} + +enum SparkScale { + Auto, + FixedMax(u64), +} + +fn render_cell( + f: &mut Frame<'_>, + area: Rect, + skin: &Skin, + title: &str, + big: &str, + subtitle: &str, + samples: &[u64], + scale: SparkScale, + bar_set: &ratatui::symbols::bar::Set<'_>, +) { + let block = Block::default() + .borders(Borders::ALL) + .border_type(BorderType::Rounded) + .title(format!(" {title} ")) + .style(Style::default().fg(skin.muted)); + let inner = block.inner(area); + f.render_widget(block, area); + + let chunks = Layout::default() + .direction(Direction::Vertical) + .constraints([ + 
Constraint::Length(1), // big numeral + Constraint::Length(1), // subtitle + Constraint::Min(1), // sparkline + ]) + .split(inner); + + let big = Paragraph::new(Line::from(Span::styled( + big.to_owned(), + Style::default() + .fg(skin.primary) + .add_modifier(Modifier::BOLD), + ))); + f.render_widget(big, chunks[0]); + + let sub = Paragraph::new(Line::from(Span::styled( + subtitle.to_owned(), + Style::default().fg(skin.secondary), + ))); + f.render_widget(sub, chunks[1]); + + let max = match scale { + SparkScale::Auto => samples.iter().copied().max().unwrap_or(1).max(1), + SparkScale::FixedMax(m) => m.max(1), + }; + // Recolour the sparkline based on the latest value's position in the + // gradient. ratatui's Sparkline uses one style for the whole widget; + // we approximate the "gradient at peak" by colouring the whole sparkline + // at the position of its most recent sample. Per-bar gradient lands in + // week 3 once the Canvas-backed renderer is in place. + let last = samples.last().copied().unwrap_or(0); + let pos = (last as f32 / max as f32).clamp(0.0, 1.0); + // RightToLeft anchors the newest sample on the right edge so the + // sparkline scrolls history leftward as new ticks arrive. The + // default LeftToRight left-anchors and leaves the right side empty + // until the ring reaches column-width samples — looks like the + // graph is filling only half its pane on the operator's first + // minute of monitoring. 
+ let spark = Sparkline::default() + .data(samples) + .max(max) + .bar_set(bar_set.clone()) + .direction(RenderDirection::RightToLeft) + .style(Style::default().fg(skin.spark_color(pos))); + f.render_widget(spark, chunks[2]); +} + +fn format_rps(ring: &super::super::app::SparkRing) -> String { + match ring.last() { + Some(v) => format!("{v} req/s"), + None => "—".into(), + } +} + +fn format_latency(ring: &super::super::app::SparkRing) -> String { + match ring.last() { + Some(v) => format!("{v} ms"), + None => "—".into(), + } +} + +fn subtitle_for_service_time(app: &App) -> String { + let trend = trend_glyph(&app.overview.service_time_p99_ms); + if app.overview.service_time_p99_ms.is_empty() { + "no samples".into() + } else { + format!("sozu request-processing p99 · {trend}") + } +} + +fn scale_for_service_time(app: &App) -> u64 { + // Anchor the sparkline's max at the latency threshold so service-time + // spikes peg the cell the same way backend-latency spikes peg the + // sibling cell. Floors at 50 ms so a quiet system doesn't make the + // sparkline jitter on rounding noise. 
+ let threshold = app.thresholds.latency_p99_critical_ms.max(50.0) as u64; + threshold.max(app.overview.service_time_p99_ms.max()) +} + +fn format_pct_simple(ring: &super::super::app::SparkRing) -> String { + match ring.last() { + Some(v) => format!("{v} %"), + None => "—".into(), + } +} + +fn subtitle_for_rps(app: &App) -> String { + let trend = trend_glyph(&app.overview.rps); + format!( + "{} client conns · {} active sessions · {trend} 60 s", + app.overview.client_connections, app.overview.active_sessions, + ) +} + +fn subtitle_for_latency(app: &App) -> String { + let trend = trend_glyph(&app.overview.latency_p99_ms); + if app.overview.latency_p99_ms.is_empty() { + "no samples".into() + } else { + format!( + "max p99 across clusters · {} ms threshold · {trend}", + app.thresholds.latency_p99_critical_ms + ) + } +} + +fn subtitle_for_saturation(app: &App) -> String { + let trend = trend_glyph(&app.overview.saturation_pct); + format!( + "slab/buffer; warn at {:.0} % · {trend}", + app.thresholds.slab_critical_pct + ) +} + +fn trend_glyph(ring: &super::super::app::SparkRing) -> &'static str { + let last2: Vec = ring.samples().rev().take(2).copied().collect(); + if last2.len() < 2 { + GLYPH_STEADY + } else if last2[0] > last2[1] { + GLYPH_RISING + } else if last2[0] < last2[1] { + GLYPH_FALLING + } else { + GLYPH_STEADY + } +} + +fn scale_for_latency(app: &App) -> u64 { + // Anchor the sparkline's max at the threshold so a steady stream below + // the threshold draws short bars and a spike above the threshold pegs + // the cell. Floors at 100 ms so a slow startup with no traffic doesn't + // make the cell jiggle. + let threshold = app.thresholds.latency_p99_critical_ms.max(100.0) as u64; + threshold.max(app.overview.latency_p99_ms.max()) +} diff --git a/bin/src/ctl/top/render.rs b/bin/src/ctl/top/render.rs new file mode 100644 index 000000000..6fc0dc8e4 --- /dev/null +++ b/bin/src/ctl/top/render.rs @@ -0,0 +1,645 @@ +//! Render loop for `sozu top`. 
Synchronous (no tokio): the UI thread owns +//! the terminal, polls crossterm events with `event::poll(timeout)`, and +//! drains snapshot + event channels between input ticks. +//! +//! Frame cap: 30 fps. Data ticks fire as snapshots arrive on the +//! collector channel. Synchronized output (DEC mode 2026 via +//! `BeginSynchronizedUpdate` / `EndSynchronizedUpdate`) wraps each frame +//! so tmux + iTerm2 see a single atomic paint instead of per-cell flicker. + +use std::io; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::time::{Duration, Instant}; + +use crossbeam_channel::Receiver; +use crossterm::cursor::{Hide, Show}; +use crossterm::event::{ + DisableMouseCapture, EnableMouseCapture, Event as CtEvent, KeyCode, KeyEvent, KeyEventKind, + KeyModifiers, poll, read, +}; +use crossterm::execute; +use crossterm::terminal::{ + BeginSynchronizedUpdate, EndSynchronizedUpdate, EnterAlternateScreen, LeaveAlternateScreen, + disable_raw_mode, enable_raw_mode, +}; +use ratatui::Terminal; +use ratatui::backend::CrosstermBackend; +use ratatui::layout::{Alignment, Constraint, Direction, Layout, Rect}; +use ratatui::style::{Modifier, Style}; +use ratatui::text::{Line, Span}; +use ratatui::widgets::{Block, BorderType, Borders, Paragraph, Tabs}; +use tui_big_text::{BigText, PixelSize}; +use tui_input::backend::crossterm::EventHandler; + +use super::app::{ActiveTab, App}; +use super::panes; +use super::theme::{GlyphMode, Skin}; +use super::transport::{CertsSnapshot, ListenersSnapshot, Snapshot, TopEvent}; + +/// Cap the redraw rate at 30 fps regardless of how often new snapshots / +/// events arrive. Higher rates only burn CPU on tmux + non-Sixel +/// terminals; 33 ms is the documented btop-style upper bound. 
+const RENDER_INTERVAL: Duration = Duration::from_millis(33); + +pub struct RenderConfig { + pub mouse: bool, + pub tick_once: bool, + pub snapshot_frames: Option, + /// Optional `--skin ` override, threaded through from clap so + /// the renderer can call `Skin::resolve` once at startup. `None` + /// resolves to the built-in default unless `SOZU_TOP_SKIN` overrides. + pub skin: Option, + /// Optional `--glyphs` clap override. `None` runs `GlyphMode::resolve` + /// auto-detect against `TERM` / `LC_ALL` / `LC_CTYPE` / `LANG`. + pub glyphs: Option, + /// Pre-seed for `App.status`. The caller threads any + /// terminal-entry-time diagnostic (e.g. lease elevation failure) + /// through here so the renderer surfaces it on the first frame + /// instead of writing to `stderr` (which the alt-screen wipes). + pub initial_status: Option, + /// Shared status slot the lease renewer (and, in the future, the + /// four transport collectors) push degraded-mode notes into. The + /// render loop drains it once per tick and feeds `App::status` so + /// the operator sees the message on the F-key bar instead of the + /// wiped alt-screen. See `cardinality::StatusSlot` for the type. + pub lease_status: crate::ctl::top::cardinality::StatusSlot, +} + +/// Drive the TUI to completion. Returns when the user quits, the data +/// channels close, or `tick_once` / `snapshot_frames` exhausts. +pub fn run( + cfg: RenderConfig, + snapshots: Receiver, + events: Receiver, + listeners: Receiver, + certs: Receiver, +) -> io::Result<()> { + // Panic hook: explicitly leave alternate screen and disable raw mode + // before the prior hook prints the panic message. The `RawModeGuard` + // Drop also restores the terminal on the unwinding path, but installing + // the hook here means the panic banner lands in the operator's normal + // shell scrollback instead of inside the alt-screen (which the OS + // discards when the program exits). 
+ // + // `PanicHookGuard` restores the prior hook on clean return so repeated + // `run` calls in the same process (tests, embedded callers) do not + // stack hook layers indefinitely. + let _panic_guard = PanicHookGuard::install(|| { + let _ = disable_raw_mode(); + let _ = execute!(io::stdout(), LeaveAlternateScreen, Show); + }); + + // SIGINT/SIGTERM handler: flips a shared flag the loop checks every + // tick. The terminal restore happens via `RawModeGuard::Drop` regardless + // of how we exit (clean quit, panic, or signal-driven exit). A failed + // install is degraded gracefully — the crossterm event loop already + // observes Ctrl-C as a keypress, so the handler is belt-and-braces + // rather than the primary path. The previous `.expect` aborted the + // TUI on programmatic re-entry (a second `run` in the same process + // address space returned `MultipleHandlers`); falling through with a + // status-bar note preserves Ctrl-C-as-keypress and keeps embedded + // callers viable. + let signal_quit = Arc::new(AtomicBool::new(false)); + let mut signal_handler_status: Option = None; + if let Err(err) = ctrlc::set_handler({ + let signal_quit = Arc::clone(&signal_quit); + move || signal_quit.store(true, Ordering::SeqCst) + }) { + signal_handler_status = Some(format!( + "ctrlc handler install failed ({err}); Ctrl-C via keypress still works" + )); + } + + let mut app = App::new(); + let (skin, skin_status) = Skin::resolve(cfg.skin.as_deref()); + let glyphs = GlyphMode::resolve(cfg.glyphs); + app.glyphs = glyphs; + // Status-bar precedence: an `--skin` parse failure is a direct + // response to the operator's explicit override and overrides the + // pre-seeded lease diagnostic; otherwise the lease-elevation note + // wins. Either way we never reach `enable_raw_mode` without a + // chance to surface the message on frame one. + // Precedence: skin status > lease-elevation note > signal-handler + // diagnostic. 
Status-bar real estate is one line; we surface the + // signal-handler issue only when nothing more operator-relevant is + // queued. + if let Some(msg) = skin_status { + app.status = msg; + } else if let Some(msg) = cfg.initial_status { + app.status = msg; + } else if let Some(msg) = signal_handler_status { + app.status = msg; + } + + let _guard = RawModeGuard::install(cfg.mouse)?; + let backend = CrosstermBackend::new(io::stdout()); + let mut terminal = Terminal::new(backend)?; + + // Opt-out for terminals that don't speak DEC mode 2026 synchronised + // output. `SOZU_TOP_SYNC=0` skips the `BeginSynchronizedUpdate` / + // `EndSynchronizedUpdate` frame wrap; the default behaviour stays + // wrapped because every modern terminal either honours the + // sequence or silently ignores it. + let sync_output = std::env::var("SOZU_TOP_SYNC").ok().as_deref() != Some("0"); + + let mut last_render = Instant::now() - RENDER_INTERVAL; + let mut frames_drawn: u32 = 0; + let snapshot_frames_target = cfg.snapshot_frames; + + loop { + if signal_quit.load(Ordering::SeqCst) || app.should_quit { + break; + } + + // Drain snapshots: keep the freshest one (the channel is + // bounded(1), so at most a handful are buffered). + while let Ok(snap) = snapshots.try_recv() { + app.ingest_snapshot(&snap); + } + while let Ok(ev) = events.try_recv() { + app.ingest_event(ev); + } + while let Ok(listeners) = listeners.try_recv() { + app.ingest_listeners(listeners); + } + while let Ok(certs) = certs.try_recv() { + app.ingest_certs(certs); + } + + // Drain any renewer-published status. The renewer thread writes + // here when its channel open fails or its send loop errors; the + // operator sees the resulting message on the F-key bar instead + // of the wiped alt-screen. + if let Some(msg) = crate::ctl::top::cardinality::take_status(&cfg.lease_status) { + app.status = msg; + app.mark_dirty(); + } + + // Poll for input or sleep until the next render tick. 
The timeout + // is whichever is sooner: the next render or 50 ms (so we drain + // channels at least 20 times per second when the user is idle). + let now = Instant::now(); + let next_render = last_render + RENDER_INTERVAL; + let timeout = next_render + .saturating_duration_since(now) + .min(Duration::from_millis(50)); + + if poll(timeout)? { + match read()? { + CtEvent::Key(key) if key.kind == KeyEventKind::Press => { + handle_key(&mut app, key); + } + CtEvent::Resize(_, _) => { + // ratatui re-queries the size on the next draw, but the + // render loop's dirty-gate (`take_dirty || pulse.has_active`) + // would otherwise skip that draw on a quiet system — the + // resize event itself does not advance the snapshot tick + // or any pulse. Mark the App dirty so the next frame + // re-flows every pane into the new terminal area. + app.mark_dirty(); + } + _ => {} + } + } + + // Frame cap: only redraw if RENDER_INTERVAL has elapsed since the + // last paint. Synchronized output wraps the draw to give tmux a + // single atomic frame. + if last_render.elapsed() >= RENDER_INTERVAL { + // Age each active pulse before the dirty check so a pulse that + // just decremented contributes to "is the frame dirty?". Calling + // tick_pulses unconditionally also keeps animations fading + // smoothly when no fresh snapshot arrived this frame. + app.tick_pulses(); + // Dirty-gate: skip the synchronized-update + draw when nothing + // visible changed AND no pulse is mid-animation. `take_dirty` + // is read-and-clear so the flag won't strand for the next + // frame; `has_active` keeps the fading tint painting until + // the pulse retires. Quiet-system CPU drops from ~2-3 % to + // near-zero between snapshots. 
+ let dirty = app.take_dirty() || app.pulse.has_active(); + if !dirty { + continue; + } + if sync_output { + execute!(io::stdout(), BeginSynchronizedUpdate)?; + } + terminal.draw(|f| draw(f, &app, &skin))?; + if sync_output { + execute!(io::stdout(), EndSynchronizedUpdate)?; + } + last_render = Instant::now(); + frames_drawn += 1; + + if cfg.tick_once && frames_drawn >= 1 { + break; + } + if let Some(target) = snapshot_frames_target { + if frames_drawn >= target { + break; + } + } + } + } + Ok(()) +} + +fn handle_key(app: &mut App, key: KeyEvent) { + // Palette mode swallows almost every key so the operator can type + // command text freely. Only Enter / Escape / Ctrl-C escape back to + // the normal handler. + if app.palette_open { + match key.code { + KeyCode::Enter => app.apply_palette(), + KeyCode::Esc => app.cancel_palette(), + KeyCode::Char('c') if key.modifiers.contains(KeyModifiers::CONTROL) => { + app.cancel_palette() + } + _ => { + // Forward editing keys (backspace, arrows, character input) + // to the tui-input widget so it maintains its own cursor. + // The widget mutates its internal buffer (palette_input) + // without setting our dirty flag; mark dirty unconditionally + // so the next frame repaints the typed text instead of + // waiting for the next snapshot tick (~1 s on a quiet + // system). + app.palette_input.handle_event(&CtEvent::Key(key)); + app.mark_dirty(); + } + } + return; + } + match key.code { + KeyCode::Char(':') => app.open_palette(), + KeyCode::Char('q') | KeyCode::Char('Q') => app.should_quit = true, + KeyCode::Char('c') if key.modifiers.contains(KeyModifiers::CONTROL) => { + app.should_quit = true + } + KeyCode::F(10) => app.should_quit = true, + KeyCode::Char('?') | KeyCode::F(1) => { + app.help_visible = !app.help_visible; + app.mark_dirty(); + } + // F2 Theme: cycle the resolved glyph mode (Block → Braille → Tty). 
+ // The skin's gradient colours don't have a CLI override yet, so the + // closest visible "theme switch" is the bar alphabet. + KeyCode::F(2) => { + app.glyphs = app.glyphs.cycle(); + app.mark_dirty(); + } + // F3 Find / F4 Filter — both open the colon palette so the + // operator can type `:cluster ` / `:backend ` / `:help`. + // No dedicated find/filter widget yet; the palette is the + // closest behaviour and matches the rest of the binding. + KeyCode::F(3) | KeyCode::F(4) => app.open_palette(), + // F5 Pause: hold the snapshot ingest in place. Transport keeps + // polling so we don't drop the lease, but the App ignores + // incoming snapshots until F5 is pressed again. + KeyCode::F(5) => { + app.paused = !app.paused; + app.mark_dirty(); + } + // F6 Sort: cycle the active pane's sort column. Mirrors `s`. + KeyCode::F(6) => match app.active_tab { + ActiveTab::Clusters => { + app.cluster_sort = app.cluster_sort.cycle(); + app.mark_dirty(); + } + ActiveTab::Backends => { + app.backend_sort = app.backend_sort.cycle(); + app.mark_dirty(); + } + _ => {} + }, + KeyCode::Tab => { + app.active_tab = app.active_tab.cycle(true); + app.mark_dirty(); + } + KeyCode::BackTab => { + app.active_tab = app.active_tab.cycle(false); + app.mark_dirty(); + } + KeyCode::Char(c @ '1'..='7') => { + if let Some(tab) = ActiveTab::from_digit(c.to_digit(10).unwrap_or(0) as u8) { + app.active_tab = tab; + app.mark_dirty(); + } + } + // CLUSTERS sort cycle / reverse; mirror procs / k9s muscle memory. 
+ KeyCode::Char('s') if app.active_tab == ActiveTab::Clusters => { + app.cluster_sort = app.cluster_sort.cycle(); + app.mark_dirty(); + } + KeyCode::Char('S') if app.active_tab == ActiveTab::Clusters => { + app.cluster_sort_reverse = !app.cluster_sort_reverse; + app.mark_dirty(); + } + KeyCode::Char('s') if app.active_tab == ActiveTab::Backends => { + app.backend_sort = app.backend_sort.cycle(); + app.mark_dirty(); + } + KeyCode::Char('S') if app.active_tab == ActiveTab::Backends => { + app.backend_sort_reverse = !app.backend_sort_reverse; + app.mark_dirty(); + } + // Pause toggle via 'p' as well, matching htop / btop muscle memory. + KeyCode::Char('p') | KeyCode::Char('P') => { + app.paused = !app.paused; + app.mark_dirty(); + } + _ => {} + } +} + +fn draw(f: &mut ratatui::Frame<'_>, app: &App, skin: &Skin) { + let area = f.area(); + let alert = app.thresholds.critical_message(&app.overview); + let constraints: Vec = match alert { + Some(_) => vec![ + Constraint::Length(3), // tabs row + Constraint::Length(5), // big-text alert banner + Constraint::Min(8), // active pane + Constraint::Length(1), // function-key bar + ], + None => vec![ + Constraint::Length(3), + Constraint::Min(8), + Constraint::Length(1), + ], + }; + let chunks = Layout::default() + .direction(Direction::Vertical) + .constraints(constraints) + .split(area); + + draw_tabs(f, chunks[0], app, skin); + if let Some(headline) = alert { + draw_alert(f, chunks[1], skin, headline); + draw_pane(f, chunks[2], app, skin); + draw_fkey_bar(f, chunks[3], app, skin); + } else { + draw_pane(f, chunks[1], app, skin); + draw_fkey_bar(f, chunks[2], app, skin); + } +} + +fn draw_alert(f: &mut ratatui::Frame<'_>, area: Rect, skin: &Skin, headline: &str) { + // Two-column layout: big-text headline on the left, narrow context + // strip on the right with the headline label so screen-readers / + // tmux-buffer scrollers still get a copyable string. 
+ let cols = Layout::default() + .direction(Direction::Horizontal) + .constraints([Constraint::Percentage(70), Constraint::Percentage(30)]) + .split(area); + let big = BigText::builder() + .pixel_size(PixelSize::Quadrant) + .style(Style::default().fg(skin.hot).add_modifier(Modifier::BOLD)) + .lines(vec![Line::from(headline.to_owned())]) + .build(); + f.render_widget(big, cols[0]); + let side = Paragraph::new(vec![ + Line::from(Span::styled( + "ALERT", + Style::default().fg(skin.hot).add_modifier(Modifier::BOLD), + )), + Line::from(Span::styled( + headline.to_owned(), + Style::default().fg(skin.primary), + )), + Line::from(Span::styled( + "see OVERVIEW for context", + Style::default().fg(skin.secondary), + )), + ]) + .alignment(Alignment::Left); + f.render_widget(side, cols[1]); +} + +fn draw_tabs(f: &mut ratatui::Frame<'_>, area: Rect, app: &App, skin: &Skin) { + let titles: Vec> = ActiveTab::ALL + .iter() + .enumerate() + .map(|(i, t)| { + let n = i + 1; + Line::from(vec![Span::styled( + format!(" {n} {} ", t.label()), + if *t == app.active_tab { + skin.tab_focused() + } else { + skin.tab_unfocused() + }, + )]) + }) + .collect(); + let selected = ActiveTab::ALL + .iter() + .position(|t| *t == app.active_tab) + .unwrap_or(0); + let title = format!( + " sōzu top · {} ", + app.last_snapshot_at + .map(|_| "live".to_owned()) + .unwrap_or_else(|| "no snapshot yet".into()) + ); + let block = Block::default() + .borders(Borders::ALL) + .border_type(BorderType::Rounded) + .title(title) + .style(Style::default().fg(skin.muted)); + let tabs = Tabs::new(titles) + .select(selected) + .block(block) + .divider(Span::raw(" ")); + f.render_widget(tabs, area); +} + +fn draw_pane(f: &mut ratatui::Frame<'_>, area: Rect, app: &App, skin: &Skin) { + match app.active_tab { + ActiveTab::Overview => panes::overview::render(f, area, app, skin), + ActiveTab::Clusters => panes::clusters::render(f, area, app, skin), + ActiveTab::Backends => panes::backends::render(f, area, app, skin), + 
ActiveTab::Listeners => panes::listeners::render(f, area, app, skin), + ActiveTab::Certs => panes::certs::render(f, area, app, skin), + ActiveTab::H2 => panes::h2::render(f, area, app, skin), + ActiveTab::Events => panes::events::render(f, area, app, skin), + } +} + +fn draw_fkey_bar(f: &mut ratatui::Frame<'_>, area: Rect, app: &App, skin: &Skin) { + // Palette mode replaces the F-key bar with a one-line input so the + // operator types `:cluster` / `:backend` / … inline. Drop back to + // the htop-style strip otherwise. + if app.palette_open { + draw_palette(f, area, app, skin); + return; + } + // htop-style F-key strip: alternating label/action so muscle memory + // works without reading the keys explicitly. Labels match the keys + // wired in `handle_key`: F1 Help, F2 Glyphs (cycle Block/Braille/Tty), + // F3/F4 Palette (open `:`), F5 Pause (also `p`), F6 Sort (per active + // pane), F10 Quit. F7/F8/F9 remain visible as reserved slots so the + // bar width stays stable across builds; they are no-ops today. 
+ let bindings: &[(&str, &str)] = &[ + ("F1", "Help"), + ("F2", "Glyphs"), + ("F3", "Find"), + ("F4", "Filter"), + ("F5", if app.paused { "Resume" } else { "Pause" }), + ("F6", "Sort"), + ("F7", "·"), + ("F8", "·"), + ("F9", "·"), + ("F10", "Quit"), + ]; + let mut spans: Vec> = Vec::new(); + for (k, a) in bindings { + spans.push(Span::styled(format!(" {k} "), skin.fkey_label())); + spans.push(Span::styled(format!(" {a} "), skin.fkey_action())); + } + spans.push(Span::raw(" ")); + if let Some(err) = app.palette_error.as_ref() { + spans.push(Span::styled( + format!(" {err} "), + Style::default().fg(skin.hot).add_modifier(Modifier::BOLD), + )); + } else { + spans.push(Span::styled( + " : palette ", + Style::default() + .fg(skin.accent) + .add_modifier(Modifier::BOLD), + )); + spans.push(Span::styled( + format!(" sort: {} ", app.cluster_sort.label()), + Style::default() + .fg(skin.accent) + .add_modifier(Modifier::BOLD), + )); + } + let para = Paragraph::new(Line::from(spans)).alignment(Alignment::Left); + f.render_widget(para, area); +} + +fn draw_palette(f: &mut ratatui::Frame<'_>, area: Rect, app: &App, skin: &Skin) { + // Single-line `:cmd_here_` input. Prefixed `:` is implicit (the + // operator presses `:` to enter palette mode, so the rendered text + // does NOT include the colon — `apply_palette` strips any leading + // colon defensively so paste-from-clipboard still works). 
+ let value = app.palette_input.value(); + let line = Line::from(vec![ + Span::styled( + " :", + Style::default() + .fg(skin.accent) + .add_modifier(Modifier::BOLD), + ), + Span::styled( + value.to_owned(), + Style::default() + .fg(skin.primary) + .add_modifier(Modifier::BOLD), + ), + Span::styled( + "_ ", // poor man's cursor; ratatui doesn't render the OS cursor + Style::default() + .fg(skin.accent) + .add_modifier(Modifier::SLOW_BLINK), + ), + Span::styled( + "Enter apply · Esc cancel · :cluster :backend :listener :cert :h2 :event :help :quit", + Style::default().fg(skin.secondary), + ), + ]); + f.render_widget(Paragraph::new(line).alignment(Alignment::Left), area); +} + +/// RAII guard that restores the terminal on drop, panic, or signal exit. +/// Combines: `enable_raw_mode`, `EnterAlternateScreen`, optional +/// `EnableMouseCapture`, and cursor hide. Drop reverses the same sequence +/// so a panic mid-render doesn't leave the user's shell in raw mode with +/// the cursor hidden. +/// +/// Install ordering matters: every step that succeeds MUST be matched +/// by a Drop branch that reverses it, even if a later step fails. The +/// guard is therefore constructed AFTER `enable_raw_mode` succeeds and +/// progressively flips `alt_entered` / `mouse_enabled` flags as the +/// follow-on `execute!` calls succeed. Any `?` after that point still +/// triggers Drop on the unwinding return — so a failure in +/// `EnableMouseCapture` cleanly leaves raw mode and the alt-screen, +/// not "raw mode on, alt-screen on, no Drop scheduled". 
+struct RawModeGuard { + mouse_enabled: bool, + alt_entered: bool, +} + +impl RawModeGuard { + fn install(mouse: bool) -> io::Result { + enable_raw_mode()?; + let mut guard = Self { + mouse_enabled: false, + alt_entered: false, + }; + let mut out = io::stdout(); + execute!(out, EnterAlternateScreen, Hide)?; + guard.alt_entered = true; + if mouse { + execute!(out, EnableMouseCapture)?; + guard.mouse_enabled = true; + } + Ok(guard) + } +} + +impl Drop for RawModeGuard { + fn drop(&mut self) { + let mut out = io::stdout(); + if self.mouse_enabled { + let _ = execute!(out, DisableMouseCapture); + } + if self.alt_entered { + let _ = execute!(out, Show, LeaveAlternateScreen); + } + let _ = disable_raw_mode(); + } +} + +type BoxedPanicHook = Box) + Send + Sync + 'static>; + +/// RAII guard around `std::panic::set_hook` so that repeated calls to +/// `render::run` in the same process (tests, embedded callers) do not +/// stack hook layers indefinitely. The installed hook chains the prior +/// hook for banner emission, and Drop restores the prior hook so the +/// next install starts from the same baseline. +/// +/// When the guard is dropped while its thread is already panicking, the +/// chained hook is left installed: `std::panic::set_hook` panics in that +/// state, and a panic inside Drop during unwinding aborts the process. +struct PanicHookGuard { + prior: std::sync::Arc>>, +} + +impl PanicHookGuard { + fn install(restore: F) -> Self + where + F: Fn() + Send + Sync + 'static, + { + let prior = std::sync::Arc::new(std::sync::Mutex::new(Some(std::panic::take_hook()))); + let prior_for_hook = std::sync::Arc::clone(&prior); + std::panic::set_hook(Box::new(move |info| { + restore(); + if let Ok(g) = prior_for_hook.lock() + && let Some(h) = g.as_ref() + { + h(info); + } + })); + Self { prior } + } +} + +impl Drop for PanicHookGuard { + fn drop(&mut self) { + // `std::panic::set_hook` panics when called from a panicking + // thread. If this guard is dropped while unwinding, restoring + // the hook here would turn the unwind into a double-panic + // abort. Keep the chained hook in place instead; it still + // forwards to the prior hook because `prior` was not taken. + if std::thread::panicking() { + return; + } + if let Ok(mut g) = self.prior.lock() + && let Some(prior) = g.take() + { + std::panic::set_hook(prior); + } + } +} diff --git a/bin/src/ctl/top/snapshot_tests.rs b/bin/src/ctl/top/snapshot_tests.rs new file mode 100644 index 000000000..36fd50614 --- /dev/null +++ b/bin/src/ctl/top/snapshot_tests.rs @@ -0,0 +1,314 @@ +//! 
Pane snapshot tests via `insta` + ratatui's `TestBackend`. +//! +//! Each pane is rendered against an `App` fixture into a `TestBackend` of a +//! fixed size, the resulting `Buffer` is converted to a plain `String`, and +//! `insta::assert_snapshot!` compares against the on-disk snapshot under +//! `bin/src/ctl/top/snapshots/`. +//! +//! Three canonical sizes cover the realistic operator-terminal envelope; +//! the tests in this file currently exercise the first two: +//! +//! - 80x24 — htop's default and the lower-bound modern terminal. +//! - 120x40 — typical full-screen SSH session. +//! - 200x60 — multi-monitor pane / projector (no snapshot test yet). +//! +//! Snapshots drop all style and colour metadata: we only assert the +//! character grid, since terminal-specific colour +//! rendering varies by emulator and would make snapshots brittle without +//! adding regression value. + +use std::collections::BTreeMap; +use std::time::{Duration, Instant}; + +use ratatui::Terminal; +use ratatui::backend::TestBackend; +use ratatui::layout::Rect; +use sozu_command_lib::proto::command::{ + AggregatedMetrics, BackendMetrics, ClusterMetrics, FilteredMetrics, ListenersList, Percentiles, + filtered_metrics, +}; +use sozu_lib::metrics::names; + +use super::app::App; +use super::panes; +use super::theme::Skin; +use super::transport::{ListenersSnapshot, Snapshot}; + +/// Render `draw` into a `TestBackend` of the given size and return the +/// character grid as a `\n`-delimited string. Style and colour are +/// dropped — the snapshot file would otherwise track ANSI escape codes +/// that vary across ratatui patch releases without saying anything +/// useful about the rendered layout. 
+fn render_to_string(width: u16, height: u16, app: &App, draw: F) -> String +where + F: FnOnce(&mut ratatui::Frame<'_>, Rect, &App, &Skin), +{ + let backend = TestBackend::new(width, height); + let mut terminal = Terminal::new(backend).expect("TestBackend Terminal"); + let skin = Skin::default_dark(); + terminal + .draw(|f| draw(f, f.area(), app, &skin)) + .expect("draw"); + let buffer = terminal.backend().buffer().clone(); + let area = *buffer.area(); + let mut out = String::with_capacity((area.width as usize + 1) * area.height as usize); + for y in 0..area.height { + for x in 0..area.width { + // `buffer[(x, y)]` is a `Cell`; `.symbol()` returns the + // grapheme cluster (default " " for an empty cell). + out.push_str(buffer[(x, y)].symbol()); + } + // Trim trailing spaces per line so the snapshot file is smaller + // and operator diffs are easier to read. + while out.ends_with(' ') { + out.pop(); + } + out.push('\n'); + } + out +} + +/// Build a synthetic `AggregatedMetrics` payload at a synthetic `tick`. +/// Three clusters, two backends each, a smattering of gauges and +/// counters that trip the threshold table without firing the alert +/// banner. Cumulative counters scale linearly with `tick` so two +/// ingestions at `tick=0` then `tick=1` (one second apart) make the +/// `RateCalculator` emit a stable delta matching the cumulative +/// scaling — the snapshot then shows realistic Mbps / req-per-s +/// values rather than a flat 0 from a first-observation `None`. 
+fn fixture_metrics_at(tick: u64) -> AggregatedMetrics { + let tick = tick as i64; + let mut proxying: BTreeMap = BTreeMap::new(); + proxying.insert(names::slab::USAGE_PERCENT.into(), gauge(45)); + proxying.insert(names::client::CONNECTIONS.into(), gauge(312)); + proxying.insert(names::http::ACTIVE_REQUESTS.into(), gauge(87)); + proxying.insert(names::h2::CONNECTION_ACTIVE_STREAMS.into(), gauge(24)); + proxying.insert(names::http::ALPN_H2.into(), count(1_000 + 100 * tick)); + proxying.insert(names::http::ALPN_HTTP11.into(), count(500 + 50 * tick)); + proxying.insert(names::h2::CONNECTION_WINDOW_BYTES.into(), gauge(65_535)); + proxying.insert( + names::h2::CONNECTION_PENDING_WINDOW_UPDATES.into(), + gauge(0), + ); + proxying.insert(names::h2::FLOW_CONTROL_STALL.into(), count(2)); + proxying.insert(names::h2::FRAMES_TX_WINDOW_UPDATE.into(), count(42)); + proxying.insert(names::h2::FRAMES_TX_RST_STREAM.into(), count(0)); + proxying.insert(names::h2::FRAMES_TX_GOAWAY.into(), count(0)); + proxying.insert(names::h2::HEADERS_REJECTED_BUDGET_OVERRUN.into(), count(0)); + proxying.insert( + names::event_loop::SERVICE_TIME.into(), + percentiles(3, 8, 12), + ); + + let mut clusters: BTreeMap = BTreeMap::new(); + for (i, id) in ["api-prod", "static-cdn", "queue-worker"] + .iter() + .enumerate() + { + let mut cluster: BTreeMap = BTreeMap::new(); + let i64_i = i as i64; + cluster.insert( + names::backend::REQUESTS.into(), + count((1_000 + i64_i * 500) * tick), + ); + cluster.insert(names::http_status::S500.into(), count(2 + i64_i)); + cluster.insert(names::http_status::S503.into(), count(1)); + cluster.insert( + names::backend::RESPONSE_TIME.into(), + percentiles(20, 80, 180 + i as u64 * 20), + ); + cluster.insert(names::cluster::TOTAL_BACKENDS.into(), gauge(2)); + cluster.insert( + names::cluster::AVAILABLE_BACKENDS.into(), + gauge(if i == 2 { 0 } else { 2 }), + ); + let tick_u64 = tick as u64; + let backends = vec![ + backend_metrics( + format!("{id}-1"), + 125_000 * (i as u64 
+ 1) * tick_u64, + 30, + 110, + ), + backend_metrics( + format!("{id}-2"), + 250_000 * (i as u64 + 1) * tick_u64, + 35, + 120, + ), + ]; + clusters.insert((*id).into(), ClusterMetrics { cluster, backends }); + } + + AggregatedMetrics { + main: BTreeMap::new(), + workers: BTreeMap::new(), + clusters, + proxying, + } +} + +fn fixture_listeners() -> ListenersList { + ListenersList { + http_listeners: BTreeMap::new(), + https_listeners: BTreeMap::new(), + tcp_listeners: BTreeMap::new(), + } +} + +fn gauge(v: u64) -> FilteredMetrics { + FilteredMetrics { + inner: Some(filtered_metrics::Inner::Gauge(v)), + } +} + +fn count(v: i64) -> FilteredMetrics { + FilteredMetrics { + inner: Some(filtered_metrics::Inner::Count(v)), + } +} + +fn percentiles(p50: u64, p90: u64, p99: u64) -> FilteredMetrics { + FilteredMetrics { + inner: Some(filtered_metrics::Inner::Percentiles(Percentiles { + samples: 1_000, + p_50: p50, + p_90: p90, + p_99: p99, + p_99_9: p99 * 2, + p_99_99: p99 * 3, + p_99_999: p99 * 4, + p_100: p99 * 5, + sum: 1_000 * p50, + })), + } +} + +fn backend_metrics(id: String, bytes: u64, p50: u64, p99: u64) -> BackendMetrics { + let mut metrics: BTreeMap = BTreeMap::new(); + // `BYTES_IN` / `BYTES_OUT` are the per-backend backend-socket + // counters published by `record_backend_metrics!`; the BACKENDS pane + // reads them as "bw down/up". `BACK_BYTES_IN` / `BACK_BYTES_OUT` are + // emitted as no-label proxy counters and never land per-backend, so + // the fixture must not pretend otherwise. 
+ metrics.insert(names::backend::BYTES_IN.into(), count(bytes as i64)); + metrics.insert(names::backend::BYTES_OUT.into(), count((bytes * 4) as i64)); + metrics.insert(names::backend::CONNECTIONS_PER_BACKEND.into(), gauge(12)); + metrics.insert( + names::backend::RESPONSE_TIME.into(), + percentiles(p50, p50 + 30, p99), + ); + metrics.insert(names::backend::REQUESTS.into(), count(bytes as i64 / 4)); + BackendMetrics { + backend_id: id, + metrics, + } +} + +/// Fresh fixture App at OVERVIEW with two synthetic snapshots already +/// ingested, one second apart. The `RateCalculator` returns `None` on +/// the first observation, so a single ingestion would show a flat 0 in +/// every rate-derived column (cluster RPS, backend bandwidth) — the +/// double ingest exercises the rate path so the snapshot reflects real +/// per-second values. +fn fixture_app() -> App { + let mut app = App::new(); + let t0 = Instant::now(); + app.ingest_snapshot(&Snapshot { + metrics: fixture_metrics_at(0), + received_at: t0, + }); + app.ingest_snapshot(&Snapshot { + metrics: fixture_metrics_at(1), + received_at: t0 + Duration::from_secs(1), + }); + app.ingest_listeners(ListenersSnapshot { + list: fixture_listeners(), + }); + app +} + +// ── OVERVIEW pane ───────────────────────────────────────────────────── + +#[test] +fn snapshot_overview_80x24() { + let app = fixture_app(); + let out = render_to_string(80, 24, &app, |f, area, app, skin| { + panes::overview::render(f, area, app, skin) + }); + insta::assert_snapshot!(out); +} + +#[test] +fn snapshot_overview_120x40() { + let app = fixture_app(); + let out = render_to_string(120, 40, &app, |f, area, app, skin| { + panes::overview::render(f, area, app, skin) + }); + insta::assert_snapshot!(out); +} + +// ── CLUSTERS pane ───────────────────────────────────────────────────── + +#[test] +fn snapshot_clusters_80x24() { + let app = fixture_app(); + let out = render_to_string(80, 24, &app, |f, area, app, skin| { + panes::clusters::render(f, area, 
app, skin) + }); + insta::assert_snapshot!(out); +} + +#[test] +fn snapshot_clusters_empty_120x40() { + // No `last_metrics` — the empty-state copy must render legibly so + // operators see why the table is blank. + let app = App::new(); + let out = render_to_string(120, 40, &app, |f, area, app, skin| { + panes::clusters::render(f, area, app, skin) + }); + insta::assert_snapshot!(out); +} + +// ── BACKENDS pane ───────────────────────────────────────────────────── + +#[test] +fn snapshot_backends_120x40() { + let app = fixture_app(); + let out = render_to_string(120, 40, &app, |f, area, app, skin| { + panes::backends::render(f, area, app, skin) + }); + insta::assert_snapshot!(out); +} + +// ── LISTENERS pane ──────────────────────────────────────────────────── + +#[test] +fn snapshot_listeners_empty_80x24() { + let app = fixture_app(); // empty `ListenersList`, exercises empty-state copy + let out = render_to_string(80, 24, &app, |f, area, app, skin| { + panes::listeners::render(f, area, app, skin) + }); + insta::assert_snapshot!(out); +} + +// ── H2 pane ─────────────────────────────────────────────────────────── + +#[test] +fn snapshot_h2_120x40() { + let app = fixture_app(); + let out = render_to_string(120, 40, &app, |f, area, app, skin| { + panes::h2::render(f, area, app, skin) + }); + insta::assert_snapshot!(out); +} + +// ── EVENTS pane (empty) ─────────────────────────────────────────────── + +#[test] +fn snapshot_events_empty_80x24() { + let app = App::new(); + let out = render_to_string(80, 24, &app, |f, area, app, skin| { + panes::events::render(f, area, app, skin) + }); + insta::assert_snapshot!(out); +} diff --git a/bin/src/ctl/top/snapshots/sozu__ctl__top__snapshot_tests__snapshot_backends_120x40.snap b/bin/src/ctl/top/snapshots/sozu__ctl__top__snapshot_tests__snapshot_backends_120x40.snap new file mode 100644 index 000000000..faa2b4cf5 --- /dev/null +++ b/bin/src/ctl/top/snapshots/sozu__ctl__top__snapshot_tests__snapshot_backends_120x40.snap @@ -0,0 
+1,45 @@ +--- +source: bin/src/ctl/top/snapshot_tests.rs +assertion_line: 277 +expression: out +--- +╭ BACKENDS · sort: bw desc · 6 backends ───────────────────────────────────────────────────────────────────────────────╮ +│cluster backend bw down/up Mbps conn p50 p99 req │ +│queue-worker queue-worker-2 6.00/24.00 12 35 120 187500 │ +│static-cdn static-cdn-2 4.00/16.00 12 35 120 125000 │ +│queue-worker queue-worker-1 3.00/12.00 12 30 110 93750 │ +│api-prod api-prod-2 2.00/8.00 12 35 120 62500 │ +│static-cdn static-cdn-1 2.00/8.00 12 30 110 62500 │ +│api-prod api-prod-1 1.00/4.00 12 30 110 31250 │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ diff --git a/bin/src/ctl/top/snapshots/sozu__ctl__top__snapshot_tests__snapshot_clusters_80x24.snap b/bin/src/ctl/top/snapshots/sozu__ctl__top__snapshot_tests__snapshot_clusters_80x24.snap new file mode 100644 index 000000000..9ab038a1e --- /dev/null +++ b/bin/src/ctl/top/snapshots/sozu__ctl__top__snapshot_tests__snapshot_clusters_80x24.snap @@ -0,0 +1,29 @@ +--- +source: bin/src/ctl/top/snapshot_tests.rs +assertion_line: 255 +expression: out +--- +╭ CLUSTERS · sort: err% desc · 3 clusters ─────────────────────────────────────╮ +│cluster_id rps err % ▼ p50 p99 backends │ +│api-prod 94.8K req/s 0.00 35 180 2/2 │ +│static-cdn 189.0K req/s 0.00 35 200 2/2 │ +│queue-worker 283.2K req/s 0.00 35 220 0/2 │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +╰──────────────────────────────────────────────────────────────────────────────╯ diff --git a/bin/src/ctl/top/snapshots/sozu__ctl__top__snapshot_tests__snapshot_clusters_empty_120x40.snap b/bin/src/ctl/top/snapshots/sozu__ctl__top__snapshot_tests__snapshot_clusters_empty_120x40.snap new file mode 100644 index 
000000000..253486c59 --- /dev/null +++ b/bin/src/ctl/top/snapshots/sozu__ctl__top__snapshot_tests__snapshot_clusters_empty_120x40.snap @@ -0,0 +1,44 @@ +--- +source: bin/src/ctl/top/snapshot_tests.rs +expression: out +--- +╭ CLUSTERS · sort: err% desc · 0 clusters ─────────────────────────────────────────────────────────────────────────────╮ +│No cluster metrics yet. The first poll lands within --refresh-ms; if the screen stays empty, ensure the worker has `me│ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ diff --git a/bin/src/ctl/top/snapshots/sozu__ctl__top__snapshot_tests__snapshot_events_empty_80x24.snap b/bin/src/ctl/top/snapshots/sozu__ctl__top__snapshot_tests__snapshot_events_empty_80x24.snap new file mode 100644 index 000000000..64fe418ef --- /dev/null +++ b/bin/src/ctl/top/snapshots/sozu__ctl__top__snapshot_tests__snapshot_events_empty_80x24.snap @@ -0,0 +1,28 @@ +--- +source: bin/src/ctl/top/snapshot_tests.rs +expression: out +--- +╭ EVENTS · 0 retained ─────────────────────────────────────────────────────────╮ +│No events yet. 
The events thread subscribes on startup; the first BACKEND_UP /│ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +╰──────────────────────────────────────────────────────────────────────────────╯ diff --git a/bin/src/ctl/top/snapshots/sozu__ctl__top__snapshot_tests__snapshot_h2_120x40.snap b/bin/src/ctl/top/snapshots/sozu__ctl__top__snapshot_tests__snapshot_h2_120x40.snap new file mode 100644 index 000000000..b4b56060f --- /dev/null +++ b/bin/src/ctl/top/snapshots/sozu__ctl__top__snapshot_tests__snapshot_h2_120x40.snap @@ -0,0 +1,45 @@ +--- +source: bin/src/ctl/top/snapshot_tests.rs +assertion_line: 299 +expression: out +--- +╭ H2 · streams · flow control · flood mitigations ─────────────────────────────────────────────────────────────────────╮ +│24 active H2 streams · 66.7 % of accepts on H2 │ +│metric value trend (60 s) │ +│active streams 24 ██ │ +│H2 connections accepted 1100 ▇█ │ +│HTTP/1.1 accepted 550 ▇█ │ +│client.connections (gauge) 312 ██ │ +│ │ +│flow control value trend (60 s) │ +│connection.window_bytes 65535 ██ │ +│pending_window_updates 0 ▁▁ │ +│flow_control_stall 2 ██ │ +│frames.tx.window_update 42 ██ │ +│frames.tx.rst_stream 0 ▁▁ │ +│frames.tx.goaway 0 ▁▁ │ +│╭ flood mitigations · CVE-2023-44487 / CVE-2024-27316 / CVE-2025-8671 ───────────────────────────────────────────────╮│ +││counter value trend (60 s) ││ +││glitch_window 0 ▁▁ ││ +││rapid_reset 0 ▁▁ ││ +││continuation_flood 0 ▁▁ ││ +││made_you_reset 0 ▁▁ ││ +││ping_flood 0 ▁▁ ││ +││settings_flood 0 ▁▁ ││ +││priority_flood 0 ▁▁ ││ +││window_update_dropped 0 ▁▁ ││ +││close_with_active_streams 0 ▁▁ ││ +││ ││ +││ ││ +││ ││ +││ ││ +││ ││ +││ ││ +││ ││ +││ ││ +││ ││ +││ ││ +││ ││ +││ ││ +│╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯│ +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ diff --git 
a/bin/src/ctl/top/snapshots/sozu__ctl__top__snapshot_tests__snapshot_listeners_empty_80x24.snap b/bin/src/ctl/top/snapshots/sozu__ctl__top__snapshot_tests__snapshot_listeners_empty_80x24.snap new file mode 100644 index 000000000..bdddbdaf3 --- /dev/null +++ b/bin/src/ctl/top/snapshots/sozu__ctl__top__snapshot_tests__snapshot_listeners_empty_80x24.snap @@ -0,0 +1,28 @@ +--- +source: bin/src/ctl/top/snapshot_tests.rs +expression: out +--- +╭ LISTENERS · refresh 5 s · live ──────────────────────────────────────────────╮ +│No listeners configured. │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +│ │ +╰──────────────────────────────────────────────────────────────────────────────╯ diff --git a/bin/src/ctl/top/snapshots/sozu__ctl__top__snapshot_tests__snapshot_overview_120x40.snap b/bin/src/ctl/top/snapshots/sozu__ctl__top__snapshot_tests__snapshot_overview_120x40.snap new file mode 100644 index 000000000..2b24e5d2c --- /dev/null +++ b/bin/src/ctl/top/snapshots/sozu__ctl__top__snapshot_tests__snapshot_overview_120x40.snap @@ -0,0 +1,45 @@ +--- +source: bin/src/ctl/top/snapshot_tests.rs +assertion_line: 244 +expression: out +--- +╭ REQUESTS / SEC ──────────────────────────────────────────╮╭ LATENCY p99 (ms) ────────────────────────────────────────╮ +│567000 req/s ││220 ms │ +│312 client conns · 87 active sessions · ▲ 60 s ││max p99 across clusters · 500 ms threshold · ● │ +│ █ ││ │ +│ █ ││ │ +│ █ ││ │ +│ █ ││ │ +│ █ ││ │ +│ █ ││ │ +│ █ ││ │ +│ █ ││ │ +│ █ ││ │ +│ █ ││ ██│ +│ █ ││ ██│ +│ █ ││ ██│ +│ █ ││ ██│ +│ █ ││ ██│ +│ █ ││ ██│ +│ █ ││ ██│ +╰──────────────────────────────────────────────────────────╯╰──────────────────────────────────────────────────────────╯ +╭ SERVICE TIME p99 (ms) ───────────────────────────────────╮╭ SATURATION (%) ──────────────────────────────────────────╮ +│12 ms ││45 % │ +│sozu request-processing p99 · ● ││slab/buffer; warn at 80 % · ● │ +│ ││ │ +│ ││ │ +│ ││ │ +│ ││ │ +│ ││ │ +│ ││ │ +│ ││ 
│ +│ ││ │ +│ ││ ▁▁│ +│ ││ ██│ +│ ││ ██│ +│ ││ ██│ +│ ││ ██│ +│ ││ ██│ +│ ││ ██│ +│ ▃▃││ ██│ +╰──────────────────────────────────────────────────────────╯╰──────────────────────────────────────────────────────────╯ diff --git a/bin/src/ctl/top/snapshots/sozu__ctl__top__snapshot_tests__snapshot_overview_80x24.snap b/bin/src/ctl/top/snapshots/sozu__ctl__top__snapshot_tests__snapshot_overview_80x24.snap new file mode 100644 index 000000000..22ddc5de1 --- /dev/null +++ b/bin/src/ctl/top/snapshots/sozu__ctl__top__snapshot_tests__snapshot_overview_80x24.snap @@ -0,0 +1,29 @@ +--- +source: bin/src/ctl/top/snapshot_tests.rs +assertion_line: 235 +expression: out +--- +╭ REQUESTS / SEC ──────────────────────╮╭ LATENCY p99 (ms) ────────────────────╮ +│567000 req/s ││220 ms │ +│312 client conns · 87 active sessions ││max p99 across clusters · 500 ms thres│ +│ █ ││ │ +│ █ ││ │ +│ █ ││ │ +│ █ ││ │ +│ █ ││ ▄▄│ +│ █ ││ ██│ +│ █ ││ ██│ +│ █ ││ ██│ +╰──────────────────────────────────────╯╰──────────────────────────────────────╯ +╭ SERVICE TIME p99 (ms) ───────────────╮╭ SATURATION (%) ──────────────────────╮ +│12 ms ││45 % │ +│sozu request-processing p99 · ● ││slab/buffer; warn at 80 % · ● │ +│ ││ │ +│ ││ │ +│ ││ │ +│ ││ │ +│ ││ ▄▄│ +│ ││ ██│ +│ ││ ██│ +│ ▁▁││ ██│ +╰──────────────────────────────────────╯╰──────────────────────────────────────╯ diff --git a/bin/src/ctl/top/theme.rs b/bin/src/ctl/top/theme.rs new file mode 100644 index 000000000..c14bb3d65 --- /dev/null +++ b/bin/src/ctl/top/theme.rs @@ -0,0 +1,610 @@ +//! Theme + glyph mode for `sozu top`. +//! +//! Defaults to a hard-coded `Skin` with Okabe-Ito categorical palette plus +//! Viridis-shaped continuous ramps and three glyph modes (Braille / Block / +//! TTY-ASCII). The `--skin ` flag (with `SOZU_TOP_SKIN` env override, +//! k9s parity) resolves to a TOML file under `$XDG_CONFIG_HOME/sozu/skins/ +//! .toml`, falling back to `/etc/sozu/skins/` for system-wide skins. +//! 
Auto-detection of terminal capabilities (`COLORTERM=truecolor`, `tput +//! colors`, `LANG`, `TERM=linux/dumb`) lands separately in the glyph +//! cascade follow-up. + +use std::env; +use std::path::{Path, PathBuf}; + +use ratatui::style::{Color, Modifier, Style}; +use serde::Deserialize; + +use crate::cli::TopGlyphs; + +/// Categorical palette + accent colours used across every pane. Defaults to +/// Okabe-Ito categorical for cluster colour assignment + a Viridis-shaped +/// continuous ramp for sparkline gradients. The colour-blind safe choice is +/// the hard-coded fallback; a `--skin` override only swaps the palette, never +/// overrides the structural rules (red/green is never the only signal — +/// glyphs `▲ ▼ ●` carry the redundant cue). +#[derive(Debug, Clone)] +pub struct Skin { + /// Primary foreground for headings, focused tab, big-text numerals. + pub primary: Color, + /// Secondary foreground for status text, function-key labels. + pub secondary: Color, + /// Accent colour for sortable column headers + selected row. + pub accent: Color, + /// Cool tint for "all is well" sparkline tails (low values). + pub cool: Color, + /// Warm tint for "elevated" sparkline tails (mid-high values, no alert). + pub warm: Color, + /// Hot tint for sparkline alert peaks. + pub hot: Color, + /// Dim grey for tab labels not in focus, rule lines. + pub muted: Color, + /// Categorical palette assigned in cluster-table-row order. Cycles when + /// the cluster count exceeds the palette length. Okabe-Ito 7-colour set + /// extended with two Viridis points at the warm end for cluster counts + /// above 7. Not yet consumed by a pane; reserved for cluster-row tinting + /// when the CLUSTERS pane gets categorical row colours. + #[allow(dead_code)] + pub categorical: Vec, +} + +impl Skin { + /// Hard-coded default skin used when no `--skin` / `SOZU_TOP_SKIN` + /// override resolves. 
Okabe-Ito categorical (colour-blind safe in + /// isolation; pairs distinguishable across the three dichromatic types + /// per Okabe & Ito 2002) plus a Viridis-shaped continuous ramp. + pub fn default_dark() -> Self { + Self { + primary: Color::Rgb(232, 232, 240), + secondary: Color::Rgb(180, 184, 192), + accent: Color::Rgb(86, 192, 240), + cool: Color::Rgb(57, 173, 152), + warm: Color::Rgb(245, 191, 79), + hot: Color::Rgb(232, 84, 90), + muted: Color::Rgb(96, 100, 112), + categorical: OKABE_ITO_PLUS.to_vec(), + } + } + + /// Resolve the operator's skin choice. Precedence: + /// + /// 1. `SOZU_TOP_SKIN` env var (k9s parity) — set to `default` or + /// `none` to keep the built-in palette regardless of `--skin`. + /// 2. `--skin ` clap argument. + /// 3. The built-in `default_dark()` palette. + /// + /// Lookup paths: `$XDG_CONFIG_HOME/sozu/skins/.toml` (defaulting + /// to `$HOME/.config/sozu/skins/.toml`), then + /// `/etc/sozu/skins/.toml`. Returns `(skin, status_message)` + /// where the status string is `None` on success (built-in or loaded + /// cleanly) or `Some(diagnostic)` when a lookup failed; the renderer + /// surfaces this in the status bar so the operator sees why their + /// override didn't take effect. + pub fn resolve(name: Option<&str>) -> (Self, Option) { + let env_choice = env::var("SOZU_TOP_SKIN").ok(); + let effective = env_choice.as_deref().or(name); + let choice = match effective { + Some("") | Some("default") | Some("none") | None => { + return (Self::default_dark(), None); + } + Some(other) => other, + }; + // Helper for the five fail-closed paths below: every diagnostic + // returns the built-in default paired with an operator-facing + // status string. Keeps the fail-closed-on-every-lookup-defect + // policy in a single spelling. 
+ let default_with = |msg: String| (Self::default_dark(), Some(msg)); + match Self::lookup_paths(choice).into_iter().find(|p| p.is_file()) { + Some(path) => { + // Defence-in-depth on top of the literal-string filter in + // `lookup_paths`: canonicalize both the chosen file and the + // anchor skins directory, then require the file to live + // under the anchor. Defeats symlink-based escapes (a + // `/sozu/skins/.toml` symlink pointing at + // `/etc/shadow`) and TOCTOU races between `is_file()` and + // `from_open_file`. Returning the default with a + // diagnostic keeps `--skin` behaviour predictable when + // the operator mis-set the lookup path or hit a + // packaging bug. + let Ok(resolved) = path.canonicalize() else { + return default_with(format!( + "skin `{choice}` canonicalize failed; using default" + )); + }; + // Fail closed when the parent anchor cannot be resolved. + // The previous shape skipped the confinement check on + // anchor failure (race-delete of the parent, weird + // /proc paths, unusual fs mounts) and parsed the bare + // resolved file — defeating the defence-in-depth check. + let Some(anchor) = Self::skins_anchor(&path) else { + return default_with(format!( + "skin `{choice}` anchor resolve failed; using default" + )); + }; + if !resolved.starts_with(&anchor) { + return default_with(format!( + "skin `{choice}` resolved outside skins dir; using default" + )); + } + // Close the TOCTOU window on the leaf: the second open + // sets `O_NOFOLLOW`, so the kernel refuses with `ELOOP` + // if the resolved file has been swapped for a symlink + // between `canonicalize` and the open. Intermediate + // path components still resolve normally, but a swap + // there cannot escape the canonicalised anchor because + // `resolved.starts_with(&anchor)` was already verified + // above, and any swap to a non-existent path simply + // errors on `File::open`. 
A bare `read_to_string(&Path)` + // would re-resolve the leaf symlink and read the wrong + // target; the explicit `OpenOptions` + `O_NOFOLLOW` + // path forbids that. + match Self::from_open_file(&resolved) { + Ok(skin) => (skin, None), + Err(e) => { + default_with(format!("skin `{choice}` parse error: {e}; using default")) + } + } + } + None => default_with(format!("skin `{choice}` not found; using default")), + } + } + + /// Canonicalize the parent skins directory of a candidate skin path so + /// the caller can confine the resolved file underneath it. Returns + /// `None` when the parent cannot be canonicalized (e.g. the candidate + /// itself disappeared between `is_file()` and here); the caller then + /// falls back to the default skin with a diagnostic. + fn skins_anchor(candidate: &Path) -> Option { + candidate.parent()?.canonicalize().ok() + } + + /// Read + parse a skin TOML file. The leaf component is opened with + /// `O_NOFOLLOW` so the kernel refuses the open with `ELOOP` if the + /// resolved file has been swapped for a symlink between the + /// `canonicalize` step in `resolve` and this open. A bare + /// `read_to_string(&Path)` (or `File::open(&Path)` without + /// `O_NOFOLLOW`) would re-resolve the leaf symlink and read the + /// wrong target — the gap that the previous shape of this helper + /// left open. Intermediate path components still resolve normally; + /// the anchor confinement check in `resolve` + /// (`resolved.starts_with(&anchor)`) keeps any intermediate-component + /// race from escaping the skins directory. 
+ pub fn from_open_file(path: &Path) -> Result { + use std::io::Read; + use std::os::unix::fs::OpenOptionsExt; + let mut file = std::fs::OpenOptions::new() + .read(true) + .custom_flags(libc::O_NOFOLLOW) + .open(path) + .map_err(SkinError::Io)?; + let mut body = String::new(); + file.read_to_string(&mut body).map_err(SkinError::Io)?; + let raw: RawSkin = toml::from_str(&body).map_err(|e| SkinError::Parse(e.to_string()))?; + raw.into_skin().map_err(SkinError::Validate) + } + + fn lookup_paths(name: &str) -> Vec { + let mut paths = Vec::new(); + // Reject `..` / path-separators to keep `--skin` from escaping the + // skins directory; treat malformed names as "not found". + if name.contains('/') || name.contains('\\') || name.contains("..") { + return paths; + } + let xdg = env::var_os("XDG_CONFIG_HOME") + .map(PathBuf::from) + .or_else(|| env::var_os("HOME").map(|h| PathBuf::from(h).join(".config"))); + if let Some(base) = xdg { + paths.push(base.join("sozu").join("skins").join(format!("{name}.toml"))); + } + paths.push(PathBuf::from("/etc/sozu/skins").join(format!("{name}.toml"))); + paths + } + + /// Style for the focused tab label in the numbered tab row. + pub fn tab_focused(&self) -> Style { + Style::default() + .fg(self.primary) + .bg(self.accent) + .add_modifier(Modifier::BOLD) + } + + /// Style for unfocused tab labels. + pub fn tab_unfocused(&self) -> Style { + Style::default().fg(self.muted) + } + + /// Style for sparkline gradient at a given normalised position + /// (`pos` in `[0.0, 1.0]`). Low → cool, mid → warm, high → hot. Pure + /// function so the renderer can call it per-bar. + pub fn spark_color(&self, pos: f32) -> Color { + if pos < 0.5 { + self.cool + } else if pos < 0.85 { + self.warm + } else { + self.hot + } + } + + /// Style applied to a cluster row when its sparkline has crossed the + /// "critical" threshold (e.g. 5xx ratio > threshold). Stronger signal + /// than `warm` and combines with the row's pulse marker. 
+ pub fn row_critical(&self) -> Style { + Style::default().fg(self.hot).add_modifier(Modifier::BOLD) + } + + /// Background tint for a row whose subject just disappeared (cluster + /// or backend went away). Hot foreground + muted background so the + /// row remains readable while still catching the eye. + pub fn pulse_hot(&self) -> Style { + Style::default() + .fg(self.hot) + .bg(self.muted) + .add_modifier(Modifier::BOLD) + } + + /// Background tint for a row whose subject just appeared (new cluster + /// rolled out). Lower-priority cue than `pulse_hot`. + pub fn pulse_cool(&self) -> Style { + Style::default() + .fg(self.cool) + .bg(self.muted) + .add_modifier(Modifier::BOLD) + } + + /// Style for the function-key bar at the bottom of the screen. + pub fn fkey_label(&self) -> Style { + Style::default() + .fg(self.primary) + .bg(self.muted) + .add_modifier(Modifier::BOLD) + } + + pub fn fkey_action(&self) -> Style { + Style::default().fg(self.secondary) + } +} + +/// Errors surfaced by `Skin::from_open_file`. Kept narrow so the renderer can +/// stringify them into a status-bar diagnostic without leaking IO details. 
+#[derive(Debug, thiserror::Error)] +pub enum SkinError { + #[error("read skin: {0}")] + Io(std::io::Error), + #[error("parse skin: {0}")] + Parse(String), + #[error("validate skin: {0}")] + Validate(String), +} + +#[derive(Debug, Deserialize)] +struct RawSkin { + primary: String, + secondary: String, + accent: String, + cool: String, + warm: String, + hot: String, + muted: String, + #[serde(default)] + categorical: Vec, +} + +impl RawSkin { + fn into_skin(self) -> Result { + let primary = parse_hex(&self.primary, "primary")?; + let secondary = parse_hex(&self.secondary, "secondary")?; + let accent = parse_hex(&self.accent, "accent")?; + let cool = parse_hex(&self.cool, "cool")?; + let warm = parse_hex(&self.warm, "warm")?; + let hot = parse_hex(&self.hot, "hot")?; + let muted = parse_hex(&self.muted, "muted")?; + let categorical: Vec = if self.categorical.is_empty() { + OKABE_ITO_PLUS.to_vec() + } else { + self.categorical + .iter() + .enumerate() + .map(|(i, s)| parse_hex(s, &format!("categorical[{i}]"))) + .collect::, _>>()? + }; + Ok(Skin { + primary, + secondary, + accent, + cool, + warm, + hot, + muted, + categorical, + }) + } +} + +fn parse_hex(s: &str, field: &str) -> Result { + let raw = s.trim_start_matches('#'); + if raw.len() != 6 { + return Err(format!( + "field `{field}`: expected #RRGGBB hex colour, got `{s}`" + )); + } + let bytes = match u32::from_str_radix(raw, 16) { + Ok(n) => n, + Err(_) => return Err(format!("field `{field}`: `{s}` is not hex")), + }; + let r = ((bytes >> 16) & 0xff) as u8; + let g = ((bytes >> 8) & 0xff) as u8; + let b = (bytes & 0xff) as u8; + Ok(Color::Rgb(r, g, b)) +} + +/// Okabe-Ito 7-colour categorical palette + 2 Viridis high-end points to +/// extend headroom for >7 clusters in the heatmap. Each `Color::Rgb` value +/// is colour-blind safe in isolation; pairs are distinguishable across the +/// three common dichromatic types per Okabe-Ito's original 2002 paper. 
+const OKABE_ITO_PLUS: &[Color] = &[ + Color::Rgb(0, 158, 115), // bluish green + Color::Rgb(86, 180, 233), // sky blue + Color::Rgb(213, 94, 0), // vermilion + Color::Rgb(204, 121, 167), // reddish purple + Color::Rgb(240, 228, 66), // yellow + Color::Rgb(0, 114, 178), // blue + Color::Rgb(230, 159, 0), // orange + // Viridis high end — gives extra differentiation when palette wraps. + Color::Rgb(247, 209, 60), + Color::Rgb(94, 201, 97), +]; + +/// Resolved glyph mode for sparklines and bar fills. `TopGlyphs` from clap +/// is the user override; `GlyphMode::resolve` collapses `None` to a default +/// (`Block`) until the auto-detect cascade lands in week 3. +#[derive(Debug, Clone, Copy)] +pub enum GlyphMode { + /// Highest-density Unicode Braille mosaics; lifts each bar with sub-cell + /// resolution. Default once auto-detect lands and the terminal reports + /// Unicode-capable locale + an adequate font. + Braille, + /// Plain Unicode block elements (`▁▂▃▄▅▆▇█`). Broadest Unicode terminal + /// compatibility; the safe v1 default. + Block, + /// 7-bit ASCII fallback for `linux`/`dumb` TERMs and serial consoles. + Tty, +} + +impl GlyphMode { + /// Bar `Set` consumed by ratatui's `Sparkline` widget. Each tier picks + /// a glyph alphabet matched to the resolved terminal capability: + /// + /// - `Block` keeps ratatui's default `▁▂▃▄▅▆▇█` ramp. + /// - `Braille` swaps to dot mosaics (`⡀⡄⡆⡇⣇⣧⣷⣿`) that look denser + /// on font-stacks that anti-alias the block ramp into a single + /// solid bar. + /// - `Tty` falls back to 7-bit ASCII so a `linux`/`dumb` console + /// renders the sparkline as `. , - = + #` instead of `?` boxes. 
+ pub fn sparkline_set(self) -> ratatui::symbols::bar::Set<'static> { + use ratatui::symbols::bar::{NINE_LEVELS, Set}; + match self { + Self::Block => NINE_LEVELS, + Self::Braille => Set { + full: "⣿", + seven_eighths: "⣷", + three_quarters: "⣧", + five_eighths: "⣇", + half: "⡇", + three_eighths: "⡆", + one_quarter: "⡄", + one_eighth: "⡀", + empty: " ", + }, + Self::Tty => Set { + full: "#", + seven_eighths: "#", + three_quarters: "+", + five_eighths: "+", + half: "=", + three_eighths: "-", + one_quarter: "-", + one_eighth: ".", + empty: " ", + }, + } + } + + /// Alphabet for inline trend strings rendered by `App::h2_trend_bars`. + /// One character per sample, ordered from lowest to highest. The + /// renderer maps each sample to an index in this slice based on the + /// ring's max sample, so a flat-zero series prints as the first + /// character on every position. + pub fn trend_alphabet(self) -> &'static [char] { + match self { + Self::Block => &['▁', '▂', '▃', '▄', '▅', '▆', '▇', '█'], + Self::Braille => &['⡀', '⡄', '⡆', '⡇', '⣇', '⣧', '⣷', '⣿'], + Self::Tty => &['.', ',', '-', '=', '+', '*', 'o', '#'], + } + } + + /// Rotate through the three modes in a fixed order. Used by F2 so + /// the operator can switch alphabets at runtime without restarting + /// `sozu top`. + pub fn cycle(self) -> Self { + match self { + Self::Block => Self::Braille, + Self::Braille => Self::Tty, + Self::Tty => Self::Block, + } + } + + /// Collapse the optional clap override to a concrete mode. When the + /// operator passed `--glyphs`, honour the explicit choice. Otherwise + /// the auto-detect cascade walks three terminal capability signals: + /// + /// 1. `TERM` reports `dumb`, `linux`, `xterm-old`, or any `*-mono*` + /// variant — fall back to 7-bit ASCII (`Tty`). These terminals + /// typically render Unicode glyphs as `?` / boxes. + /// 2. The active locale (`LC_ALL` / `LC_CTYPE` / `LANG`) ends in + /// `UTF-8` / `UTF8` AND isn't `C` / `POSIX` — Braille mosaics + /// are safe. 
+ /// 3. Otherwise default to `Block` (broadest Unicode terminal + /// compatibility — every Unicode-capable TTY ships block + /// elements `▁..▇█` even without nerd-font support). + pub fn resolve(override_: Option) -> Self { + if let Some(forced) = override_ { + return match forced { + TopGlyphs::Braille => Self::Braille, + TopGlyphs::Block => Self::Block, + TopGlyphs::Tty => Self::Tty, + }; + } + Self::autodetect() + } + + fn autodetect() -> Self { + let term = std::env::var("TERM").unwrap_or_default(); + let term_lower = term.to_ascii_lowercase(); + if term_lower.is_empty() + || term_lower == "dumb" + || term_lower == "linux" + || term_lower == "xterm-old" + || term_lower.ends_with("-mono") + || term_lower.contains("-mono-") + { + return Self::Tty; + } + let locale = std::env::var("LC_ALL") + .or_else(|_| std::env::var("LC_CTYPE")) + .or_else(|_| std::env::var("LANG")) + .unwrap_or_default(); + let locale_upper = locale.to_ascii_uppercase(); + let is_c_locale = + locale_upper == "C" || locale_upper == "POSIX" || locale_upper.starts_with("C."); + let is_utf8 = locale_upper.contains("UTF-8") || locale_upper.contains("UTF8"); + if is_utf8 && !is_c_locale { + Self::Braille + } else { + Self::Block + } + } +} + +/// Status glyphs that double the colour signal so the colour-blind cue is +/// always backed up by a shape. `▲` rising, `▼` falling, `●` steady. 
+pub const GLYPH_RISING: &str = "▲"; +pub const GLYPH_FALLING: &str = "▼"; +pub const GLYPH_STEADY: &str = "●"; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_hex_accepts_hash_prefix_and_bare() { + assert_eq!( + parse_hex("#56c0f0", "x").unwrap(), + Color::Rgb(0x56, 0xc0, 0xf0) + ); + assert_eq!( + parse_hex("56c0f0", "x").unwrap(), + Color::Rgb(0x56, 0xc0, 0xf0) + ); + } + + #[test] + fn parse_hex_rejects_wrong_length() { + assert!(parse_hex("#abc", "x").is_err()); + assert!(parse_hex("#abcdefgg", "x").is_err()); + } + + #[test] + fn skin_from_toml_round_trip() { + let toml = r##" + primary = "#e8e8f0" + secondary = "#b4b8c0" + accent = "#56c0f0" + cool = "#39ad98" + warm = "#f5bf4f" + hot = "#e8545a" + muted = "#606470" + categorical = ["#009e73", "#56b4e9"] + "##; + let raw: RawSkin = toml::from_str(toml).expect("parse"); + let skin = raw.into_skin().expect("validate"); + assert_eq!(skin.hot, Color::Rgb(0xe8, 0x54, 0x5a)); + assert_eq!(skin.categorical.len(), 2); + } + + #[test] + fn skin_from_toml_empty_categorical_uses_default() { + let toml = r##" + primary = "#e8e8f0" + secondary = "#b4b8c0" + accent = "#56c0f0" + cool = "#39ad98" + warm = "#f5bf4f" + hot = "#e8545a" + muted = "#606470" + "##; + let raw: RawSkin = toml::from_str(toml).unwrap(); + let skin = raw.into_skin().unwrap(); + assert_eq!(skin.categorical.len(), OKABE_ITO_PLUS.len()); + } + + #[test] + fn skin_lookup_rejects_traversal() { + assert!(Skin::lookup_paths("../etc/passwd").is_empty()); + assert!(Skin::lookup_paths("foo/bar").is_empty()); + } + + #[test] + fn glyph_mode_explicit_override_wins() { + assert!(matches!( + GlyphMode::resolve(Some(TopGlyphs::Tty)), + GlyphMode::Tty + )); + assert!(matches!( + GlyphMode::resolve(Some(TopGlyphs::Braille)), + GlyphMode::Braille + )); + assert!(matches!( + GlyphMode::resolve(Some(TopGlyphs::Block)), + GlyphMode::Block + )); + } + + /// `from_open_file` must refuse to follow a leaf symlink. 
The TOCTOU + /// guard in `resolve` rests on the second open failing closed when + /// the resolved path has been swapped for a symlink between + /// `canonicalize` and the actual read. We plant a symlink under a + /// temp skins dir pointing at a real file outside it (`/etc/hostname` + /// is portable on Linux/BSD/macOS dev hosts and CI runners) and + /// assert the loader returns an `Io` error — `O_NOFOLLOW` surfaces + /// as `ELOOP` from the kernel and the loader does not read the + /// symlink target. + #[test] + fn from_open_file_refuses_leaf_symlink() { + use std::os::unix::fs::symlink; + + let tmp = tempfile::tempdir().expect("create temp skins dir"); + let link = tmp.path().join("evil.toml"); + // Skip the test if the symlink target does not exist on this + // platform (e.g. minimal sandboxes without `/etc/hostname`). + let target = Path::new("/etc/hostname"); + if !target.exists() { + return; + } + symlink(target, &link).expect("plant symlink"); + + let err = Skin::from_open_file(&link).expect_err("must refuse symlink leaf"); + match err { + SkinError::Io(io) => { + // The kernel reports `ELOOP` for `O_NOFOLLOW` on a + // symlink; some libc wrappers translate it differently + // but the error kind is always `Other` / `InvalidInput` + // / `FilesystemLoop` (Rust 1.86+). All that matters for + // the regression guard is that the read did not + // succeed. + let _ = io; + } + other => panic!("expected Io(ELOOP) error, got {other:?}"), + } + } +} diff --git a/bin/src/ctl/top/transport.rs b/bin/src/ctl/top/transport.rs new file mode 100644 index 000000000..5854b2a88 --- /dev/null +++ b/bin/src/ctl/top/transport.rs @@ -0,0 +1,566 @@ +//! Transport layer for `sozu top` — synchronous threads over the existing +//! unix command socket. No async runtime in v1 by design. +//! +//! Four `Channel` connections to the master, each owned by its own thread: +//! +//! 1. **Snapshot collector** (`spawn_collector`): polls `RequestType:: +//! 
QueryMetrics` on the configurable `--refresh-ms` ticker and pushes +//! each `AggregatedMetrics` (plus a wall-clock sample anchor) into a +//! `crossbeam_channel::bounded::(1)`. +//! 2. **Events stream** (`spawn_events`): opens `RequestType:: +//! SubscribeEvents` once and forwards every inbound `Event` into a +//! `crossbeam_channel::bounded::(64)`. The unix `Channel` +//! is a single framed socket without message-id correlation, so +//! multiplexing this stream with the discrete `QueryMetrics` round-trip +//! on one socket is unsafe — we open a separate connection. +//! 3. **Listeners collector** (`spawn_listeners`): polls +//! `RequestType::ListListeners` every 5 s into a `bounded(1)` channel. +//! 4. **Certs collector** (`spawn_certs`): polls +//! `RequestType::QueryCertificatesFromTheState` every 30 s into a +//! `bounded(1)` channel. +//! +//! All snapshot threads use **publish-or-skip on backpressure**: when the +//! `bounded(1)` channel is already populated (the UI hasn't drained yet), +//! the fresh snapshot is dropped rather than the thread blocking or dying. +//! The next poll produces a fresher snapshot anyway, so dropping an +//! in-flight one is correct: it preserves "newest-wins" without needing +//! the sender to peek into the receiver's slot. The events thread uses the +//! same shape, just with a `bounded(64)` buffer for burst tolerance. +//! +//! The three poll-driven threads exit cleanly when their `crossbeam_channel` +//! peer is dropped (the UI thread owns the rx ends; tearing down the App +//! drops the senders so `try_send` returns `Disconnected`). The events +//! thread does NOT see receiver-drop — its read blocks on the unix socket +//! and dropping the crossbeam `Receiver` cannot propagate across +//! the socket. It exits on an `Arc` shutdown flag owned by +//! `run_top` and a bounded `EVENTS_READ_TIMEOUT` per read. +//! +//! Transient errors are surfaced via a shared `StatusSlot` (the same +//! 
mailbox the lease renewer uses); the render loop drains it once per +//! tick and shows the message in the status bar. The threads continue +//! running — a single transient socket error never crashes the UI. + +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::time::{Duration, Instant}; + +use crossbeam_channel::{Receiver, Sender, TrySendError, bounded}; +use sozu_command_lib::{ + channel::ChannelError, + config::Config, + proto::command::{ + AggregatedMetrics, Event, ListListeners, ListOfCertificatesByAddress, ListenersList, + QueryCertificatesFilters, QueryMetricsOptions, Request, Response, ResponseStatus, + SubscribeEvents, request::RequestType, response_content::ContentType, + }, +}; + +use crate::ctl::create_channel; + +use super::CtlError; +use super::cardinality::{StatusSlot, publish_status}; + +/// Bundle published by the collector thread on every successful poll. +/// Owns `AggregatedMetrics` outright so the UI can rebuild ring buffers +/// without holding any other lock. The `received_at` field anchors rate +/// calculation across ticks. +#[derive(Debug, Clone)] +pub struct Snapshot { + pub metrics: AggregatedMetrics, + pub received_at: Instant, +} + +/// Wrapper around an inbound `Event` so we can later attach metadata +/// (received_at, source-worker tag) without breaking the channel shape. +#[derive(Debug, Clone)] +pub struct TopEvent { + pub event: Event, + pub received_at: Instant, +} + +/// Listener inventory snapshot pushed by the listeners-collector thread. +/// Refreshed at a slower cadence than metrics (5 s default) because listener +/// state changes are operator-paced — adds, removes, activates, deactivates +/// all flow via control-plane mutations that the EVENTS pane already shows. +/// +/// Unlike `Snapshot`, there is no `received_at` anchor: the listeners pane +/// renders the absolute set, never per-tick rates, so the wall-clock would +/// have nothing to discriminate against. 
+#[derive(Debug, Clone)] +pub struct ListenersSnapshot { + pub list: ListenersList, +} + +/// Certificate inventory snapshot pushed by the certs-collector thread. +/// Polled at 30 s — even slower than listeners because cert lifecycle is +/// operator-driven (add, remove, replace via the master's state) and every +/// transition already lands as a CERTIFICATE_* event on the EVENTS pane. +/// +/// Same shape as `ListenersSnapshot`: no `received_at` because the certs +/// pane renders the absolute set without per-tick rates. +#[derive(Debug, Clone)] +pub struct CertsSnapshot { + pub list: ListOfCertificatesByAddress, +} + +/// Capacity of the events channel. 64 is generous for the operator-pace +/// event stream (BACKEND_UP/DOWN, control-plane mutations, the new +/// METRIC_DETAIL_CHANGED audit). Bursts above 64 follow the publish-or-skip +/// contract used by every other snapshot channel: `try_send` on a full +/// bounded channel drops the newest sample (the 64 oldest stay queued for +/// the UI). The bound keeps memory bounded if the UI freezes momentarily. +const EVENTS_CAP: usize = 64; + +/// Per-read deadline for the events loop. We do NOT want an unbounded +/// blocking read here: the only signal that the UI is gone is the +/// `shutdown` flag flipped by `run_top` after `render::run` returns, +/// and dropping the crossbeam `Receiver` does NOT propagate +/// across the unix socket. A 1 s deadline keeps the shutdown latency +/// bounded by a single round-trip without burning CPU on idle traffic +/// (the master is event-pace; quiet seconds are the common case). +const EVENTS_READ_TIMEOUT: Duration = Duration::from_secs(1); + +/// `Snapshot` channel capacity. 1 with publish-or-skip on backpressure is +/// intentional: while the UI is rendering a frame, a fresh snapshot is +/// dropped rather than queueing behind the stale one. 
The next poll +/// produces a newer snapshot anyway, so the cadence stays "as fresh as +/// the UI can consume" without the sender having to peek into the +/// receiver's slot. +const SNAPSHOT_CAP: usize = 1; + +/// Shared polling skeleton for the three `bounded(1)` collector threads +/// (`spawn_collector`, `spawn_listeners`, `spawn_certs`). +/// +/// The threading topology (one OS thread per pane, owning its own `Channel`, +/// `bounded(1)` publish-or-skip on backpressure) is locked by design and is +/// not abstracted here — only the per-tick polling shape is shared: +/// +/// 1. record `Instant::now()` +/// 2. call the per-thread `poll` closure +/// 3. on `Ok(v)`: `try_send(v)` — `Full` is dropped (publish-or-skip), +/// `Disconnected` exits the thread cleanly when the UI drops `rx` +/// 4. on `Err(_)`: `eprintln!` and keep going (a transient socket error +/// never kills the thread; the next tick reconnects via the shared +/// `Channel` retry path inside `poll`) +/// 5. sleep the remainder of `interval` if the round-trip was faster +/// +/// The events thread (`spawn_events`) has a different shape (single +/// `SubscribeEvents` write + open-ended drain loop) and intentionally does +/// not reuse this helper. +fn poll_loop( + label: &'static str, + interval: Duration, + tx: Sender, + status: StatusSlot, + mut channel: sozu_command_lib::channel::Channel, + mut poll: F, +) where + F: FnMut(&mut sozu_command_lib::channel::Channel) -> Result, +{ + loop { + let started = Instant::now(); + match poll(&mut channel) { + Ok(v) => match tx.try_send(v) { + Ok(()) => {} + // Publish-or-skip: if the UI hasn't drained the previous + // value, skip this one rather than killing the thread. The + // next poll produces a fresher value anyway. 
+ Err(TrySendError::Full(_)) => {} + Err(TrySendError::Disconnected(_)) => return, + }, + Err(err) => { + publish_status(&status, format!("{label} poll error: {err}")); + } + } + // Sleep the remaining slice of the configured interval so we don't + // hammer the master after a slow round-trip. If a poll took longer + // than `interval`, fire the next one immediately. + let elapsed = started.elapsed(); + if elapsed < interval { + std::thread::sleep(interval - elapsed); + } + } +} + +/// Spawn the snapshot-collector thread. Returns the `Snapshot` receiver and +/// a join handle. Thread exits when the receiver is dropped or the channel +/// returns a permanent socket error. +pub fn spawn_collector( + config: Config, + refresh_ms: u64, + status: StatusSlot, +) -> Result<(Receiver, std::thread::JoinHandle<()>), CtlError> { + // Open the dedicated polling channel up-front so a connection failure + // surfaces synchronously (operator gets `CtlError::CreateChannel`) + // rather than silently spinning behind the spawned thread. 
+ let channel = create_channel(&config)?; + let (tx, rx) = bounded::(SNAPSHOT_CAP); + let interval = Duration::from_millis(refresh_ms); + let handle = std::thread::Builder::new() + .name("sozu-top-collector".into()) + .spawn(move || { + poll_loop("snapshot", interval, tx, status, channel, |ch| { + poll_metrics(ch).map(|metrics| Snapshot { + metrics, + received_at: Instant::now(), + }) + }) + }) + .map_err(|source| CtlError::SpawnFailed { + label: "sozu-top-collector", + source, + })?; + Ok((rx, handle)) +} + +fn poll_metrics( + channel: &mut sozu_command_lib::channel::Channel, +) -> Result { + let req = Request { + request_type: Some(RequestType::QueryMetrics(QueryMetricsOptions { + list: false, + cluster_ids: vec![], + backend_ids: vec![], + metric_names: vec![], + no_clusters: false, + workers: false, + })), + }; + channel + .write_message(&req) + .map_err(|e| format!("write QueryMetrics: {e}"))?; + + // The protocol shape is `0..N Response{PROCESSING}` then exactly one + // terminal `Response{OK|FAILURE}`. We poll until the terminal arrives + // (or a per-message read timeout pushes us back). Matches the existing + // `bin/src/ctl/command.rs::get_metrics` loop. + loop { + let resp = channel + .read_message_blocking_timeout(Some(Duration::from_secs(5))) + .map_err(|e| format!("read QueryMetrics response: {e}"))?; + match resp.status() { + ResponseStatus::Processing => continue, + ResponseStatus::Failure => { + return Err(format!("QueryMetrics failed: {}", resp.message)); + } + ResponseStatus::Ok => match resp.content { + Some(content) => match content.content_type { + Some(ContentType::Metrics(m)) => return Ok(m), + other => { + return Err(format!( + "unexpected content variant for QueryMetrics: {}", + content_type_name(other.as_ref()), + )); + } + }, + None => return Err("QueryMetrics OK with no content".into()), + }, + } + } +} + +/// Cadence of the listeners poll. 
Operator-paced; 5 s matches the brief's +/// "cold subjects" tier and HAProxy hatop's documented `show stat` cadence. +const LISTENERS_INTERVAL: Duration = Duration::from_secs(5); + +/// Cadence of the certs poll. Operator-paced and lower-priority than +/// listeners; cert mutations also flow through the EVENTS pane in +/// real-time, so the 30 s refresh is enough to keep the table fresh. +const CERTS_INTERVAL: Duration = Duration::from_secs(30); + +/// Spawn the listeners-collector thread. Polls `RequestType::ListListeners` +/// every `LISTENERS_INTERVAL` over its own `Channel` and pushes a +/// `ListenersSnapshot` into a `bounded(1)` newest-wins channel. Same +/// shape as `spawn_collector`. +pub fn spawn_listeners( + config: Config, + status: StatusSlot, +) -> Result<(Receiver, std::thread::JoinHandle<()>), CtlError> { + let channel = create_channel(&config)?; + let (tx, rx) = bounded::(SNAPSHOT_CAP); + let handle = std::thread::Builder::new() + .name("sozu-top-listeners".into()) + .spawn(move || { + poll_loop("listeners", LISTENERS_INTERVAL, tx, status, channel, |ch| { + poll_listeners(ch).map(|list| ListenersSnapshot { list }) + }) + }) + .map_err(|source| CtlError::SpawnFailed { + label: "sozu-top-listeners", + source, + })?; + Ok((rx, handle)) +} + +fn poll_listeners( + channel: &mut sozu_command_lib::channel::Channel, +) -> Result { + let req = Request { + request_type: Some(RequestType::ListListeners(ListListeners {})), + }; + channel + .write_message(&req) + .map_err(|e| format!("write ListListeners: {e}"))?; + loop { + let resp = channel + .read_message_blocking_timeout(Some(Duration::from_secs(5))) + .map_err(|e| format!("read ListListeners response: {e}"))?; + match resp.status() { + ResponseStatus::Processing => continue, + ResponseStatus::Failure => { + return Err(format!("ListListeners failed: {}", resp.message)); + } + ResponseStatus::Ok => match resp.content { + Some(content) => match content.content_type { + Some(ContentType::ListenersList(l)) => 
return Ok(l), + other => { + return Err(format!( + "unexpected content variant for ListListeners: {}", + content_type_name(other.as_ref()), + )); + } + }, + None => return Err("ListListeners OK with no content".into()), + }, + } + } +} + +/// Spawn the certs-collector thread. Polls `RequestType::QueryCertificates +/// FromTheState` every `CERTS_INTERVAL` over its own `Channel` and pushes a +/// `CertsSnapshot` into a `bounded(1)` newest-wins channel. The "from the +/// state" variant (vs `QueryCertificatesFromWorkers`) reads the master's +/// `ConfigState` — the canonical cert inventory — without paying the +/// worker-fan-out cost on every poll. +pub fn spawn_certs( + config: Config, + status: StatusSlot, +) -> Result<(Receiver, std::thread::JoinHandle<()>), CtlError> { + let channel = create_channel(&config)?; + let (tx, rx) = bounded::(SNAPSHOT_CAP); + let handle = std::thread::Builder::new() + .name("sozu-top-certs".into()) + .spawn(move || { + poll_loop("certs", CERTS_INTERVAL, tx, status, channel, |ch| { + poll_certs(ch).map(|list| CertsSnapshot { list }) + }) + }) + .map_err(|source| CtlError::SpawnFailed { + label: "sozu-top-certs", + source, + })?; + Ok((rx, handle)) +} + +fn poll_certs( + channel: &mut sozu_command_lib::channel::Channel, +) -> Result { + let req = Request { + request_type: Some(RequestType::QueryCertificatesFromTheState( + QueryCertificatesFilters { + domain: None, + fingerprint: None, + }, + )), + }; + channel + .write_message(&req) + .map_err(|e| format!("write QueryCertificatesFromTheState: {e}"))?; + loop { + let resp = channel + .read_message_blocking_timeout(Some(Duration::from_secs(5))) + .map_err(|e| format!("read QueryCertificatesFromTheState response: {e}"))?; + match resp.status() { + ResponseStatus::Processing => continue, + ResponseStatus::Failure => { + return Err(format!( + "QueryCertificatesFromTheState failed: {}", + resp.message + )); + } + ResponseStatus::Ok => match resp.content { + Some(content) => match 
content.content_type { + Some(ContentType::CertificatesByAddress(l)) => return Ok(l), + Some(ContentType::CertificatesWithFingerprints(map)) => { + // `query_certificates_from_main` answers with the + // fingerprint-keyed map (the same shape `sozu + // certificate query` consumes). The CERTS pane + // wants per-address rows; synthesise them here, + // dropping the PEM + private-key fields + // immediately because the TUI only needs the + // (domain, fingerprint) pair. NEVER let the key + // material flow further than this conversion — + // an `eprintln!` / log line downstream would + // otherwise leak the operator's private key to + // the renderer's alt-screen or scrollback. + return Ok(certs_from_fingerprint_map(map)); + } + other => { + return Err(format!( + "unexpected content variant for QueryCertificatesFromTheState: {}", + content_type_name(other.as_ref()), + )); + } + }, + None => return Err("QueryCertificatesFromTheState OK with no content".into()), + }, + } + } +} + +/// Convert the fingerprint-keyed `CertificatesWithFingerprints` payload +/// (which carries cert PEM + private key) into the by-address +/// `ListOfCertificatesByAddress` shape the CERTS pane consumes (which +/// carries only the `(domain, fingerprint)` pair). Drops the key + cert +/// PEM fields IMMEDIATELY so private-key material never reaches the +/// renderer, the error log, the alt-screen scrollback, or any +/// downstream `eprintln!`. The address is synthesised because the +/// fingerprint-keyed response doesn't carry one; `0.0.0.0:0` renders +/// as `0.0.0.0:0` in the table and signals "no per-address grouping +/// available". A follow-up could plumb the actual listener address +/// from the state, but the inventory shape is correct. 
+fn certs_from_fingerprint_map( + payload: sozu_command_lib::proto::command::CertificatesWithFingerprints, +) -> ListOfCertificatesByAddress { + use sozu_command_lib::proto::command::{ + CertificateSummary, CertificatesByAddress, SocketAddress, + }; + let mut summaries: Vec = Vec::with_capacity(payload.certs.len()); + for (fingerprint, cert) in payload.certs { + // The cert's first SNI is the operator-facing identifier. If + // `names` is empty (legacy certs without an SNI override) + // fall back to a `` placeholder so the row still + // shows up rather than disappearing silently. + let domain = cert + .names + .into_iter() + .next() + .unwrap_or_else(|| "".to_owned()); + summaries.push(CertificateSummary { + fingerprint, + domain, + }); + // `cert.certificate`, `cert.certificate_chain`, `cert.key` + // drop here as `cert` goes out of scope — never copied + // forward, never logged. + } + ListOfCertificatesByAddress { + certificates: vec![CertificatesByAddress { + address: SocketAddress { + ip: sozu_command_lib::proto::command::IpAddress { + inner: Some(sozu_command_lib::proto::command::ip_address::Inner::V4(0)), + }, + port: 0, + }, + certificate_summaries: summaries, + }], + } +} + +/// Stable short name for a `ContentType` variant, used in error +/// messages to identify which variant arrived without `Debug`-printing +/// its payload (private keys, large blobs). Returns `` for +/// `None` (no content_type set in the response). 
+fn content_type_name(ct: Option<&ContentType>) -> &'static str { + match ct { + None => "", + Some(ContentType::Workers(_)) => "Workers", + Some(ContentType::Metrics(_)) => "Metrics", + Some(ContentType::WorkerResponses(_)) => "WorkerResponses", + Some(ContentType::Event(_)) => "Event", + Some(ContentType::FrontendList(_)) => "FrontendList", + Some(ContentType::ListenersList(_)) => "ListenersList", + Some(ContentType::WorkerMetrics(_)) => "WorkerMetrics", + Some(ContentType::AvailableMetrics(_)) => "AvailableMetrics", + Some(ContentType::Clusters(_)) => "Clusters", + Some(ContentType::ClusterHashes(_)) => "ClusterHashes", + Some(ContentType::CertificatesByAddress(_)) => "CertificatesByAddress", + Some(ContentType::CertificatesWithFingerprints(_)) => "CertificatesWithFingerprints", + Some(ContentType::RequestCounts(_)) => "RequestCounts", + Some(ContentType::MaxConnectionsPerIpLimit(_)) => "MaxConnectionsPerIpLimit", + Some(ContentType::HealthChecksList(_)) => "HealthChecksList", + Some(ContentType::MetricDetailStatus(_)) => "MetricDetailStatus", + Some(ContentType::WorkerMetricDetailStatus(_)) => "WorkerMetricDetailStatus", + } +} + +/// Spawn the events-stream thread. Returns the `TopEvent` receiver and +/// a join handle. Thread exits when `shutdown` is set, when the +/// SubscribeEvents stream errors out, or when the master closes the +/// subscription with a terminal Ok/Failure. +/// +/// The `shutdown` flag is the canonical wake-up: dropping the +/// `Receiver` cannot propagate across the unix socket, so +/// without an explicit flag the thread sleeps forever on the next read. +/// `run_top` owns the `Arc` and flips it after the render +/// loop returns. 
+pub fn spawn_events( + config: Config, + shutdown: Arc, + status: StatusSlot, +) -> Result<(Receiver, std::thread::JoinHandle<()>), CtlError> { + let mut channel = create_channel(&config)?; + let (tx, rx) = bounded::(EVENTS_CAP); + let handle = std::thread::Builder::new() + .name("sozu-top-events".into()) + .spawn(move || events_loop(&mut channel, tx, shutdown, status)) + .map_err(|source| CtlError::SpawnFailed { + label: "sozu-top-events", + source, + })?; + Ok((rx, handle)) +} + +fn events_loop( + channel: &mut sozu_command_lib::channel::Channel, + tx: Sender, + shutdown: Arc, + status: StatusSlot, +) { + let req = Request { + request_type: Some(RequestType::SubscribeEvents(SubscribeEvents {})), + }; + if let Err(e) = channel.write_message(&req) { + publish_status(&status, format!("events: SubscribeEvents write error: {e}")); + return; + } + while !shutdown.load(Ordering::Relaxed) { + let resp = match channel.read_message_blocking_timeout(Some(EVENTS_READ_TIMEOUT)) { + Ok(r) => r, + // Bounded read timeout fired with no payload; loop back to + // re-check the shutdown flag. This is the steady-state path + // on a quiet master, not an error. + Err(ChannelError::TimeoutReached(_)) => continue, + Err(e) => { + publish_status(&status, format!("events: read error: {e}")); + return; + } + }; + match resp.status() { + ResponseStatus::Processing => { + if let Some(ev) = unwrap_event(resp) { + let topev = TopEvent { + event: ev, + received_at: Instant::now(), + }; + // Publish-or-skip on overflow: we never block the events + // thread on a stuck UI. A `try_send` failure on a full + // bounded channel drops the newest sample (the 64 + // oldest stay queued for the UI). Documented contract + // shared with the snapshot channels above. + let _ = tx.try_send(topev); + } + } + // Some servers may close the subscription with an explicit + // terminal Ok/Failure; surface it then exit. 
+ ResponseStatus::Ok | ResponseStatus::Failure => return, + } + } +} + +fn unwrap_event(resp: Response) -> Option { + match resp.content?.content_type? { + ContentType::Event(ev) => Some(ev), + _ => None, + } +} diff --git a/bin/src/main.rs b/bin/src/main.rs index 63eaf40ae..9680c551a 100644 --- a/bin/src/main.rs +++ b/bin/src/main.rs @@ -30,6 +30,8 @@ extern crate sozu_command_lib; #[cfg(target_os = "linux")] extern crate num_cpus; +use sozu_lib::metrics::names; + // FreeBSD/NetBSD libc malloc is already jemalloc; the bundled `jemallocator` // dep is filtered out of the build graph in `bin/Cargo.toml`. The // `libc_jemalloc` cfg is emitted by `bin/build.rs` for those targets; without @@ -140,7 +142,7 @@ fn register_panic_hook() { let original_panic_hook = panic::take_hook(); panic::set_hook(Box::new(move |panic_info| { - incr!("panic"); + incr!(names::misc::PANIC); METRICS.with(|metrics| { (*metrics.borrow_mut()).send_data(); }); diff --git a/bin/tests/sozu_top_e2e.rs b/bin/tests/sozu_top_e2e.rs new file mode 100644 index 000000000..7dc9cd306 --- /dev/null +++ b/bin/tests/sozu_top_e2e.rs @@ -0,0 +1,177 @@ +//! Subprocess e2e tests for the `sozu top` TUI subcommand. +//! +//! These tests spawn the actual `sozu` binary (located via Cargo's +//! `CARGO_BIN_EXE_sozu` env var) and exercise the TUI's entry-point +//! behaviour — clap parsing, binary linkage, and (under `--ignored`) the +//! full transport-thread + render-loop lifecycle against a real Sōzu +//! master. +//! +//! The non-`#[ignore]`d tests run in CI and validate the bits that don't +//! depend on a running master: building the bin with `--features tui`, +//! invoking it without crashing, and producing the expected `--help` +//! shape. The full master-side e2e (`sozu_top_tick_once_against_real_master`) +//! is `#[ignore]`d so operators can run it manually with `cargo test +//! -p sozu --features tui --tests -- --ignored sozu_top` because it +//! spawns a daemonised master + writes to the filesystem. 
+ +#![cfg(feature = "tui")] + +use std::process::Command; +use std::time::{Duration, Instant}; + +/// Path to the freshly-compiled `sozu` binary. Cargo populates this env +/// var for every integration test under `bin/tests/` so the binary the +/// test exercises is always the one the rest of the suite just built. +fn sozu_bin() -> &'static str { + env!("CARGO_BIN_EXE_sozu") +} + +/// `sozu top --help` must parse cleanly and print every flag added by +/// week-1's clap surface. Regression guard against accidentally dropping +/// a flag from `bin/src/cli.rs` or breaking the `#[cfg(feature = "tui")]` +/// gate on the `SubCmd::Top` variant. +#[test] +fn sozu_top_help_lists_every_flag() { + let output = Command::new(sozu_bin()) + .args(["top", "--help"]) + .output() + .expect("spawn sozu top --help"); + assert!( + output.status.success(), + "sozu top --help exited non-zero: {}", + String::from_utf8_lossy(&output.stderr) + ); + let stdout = String::from_utf8_lossy(&output.stdout); + // Flag set per `bin/src/cli.rs::SubCmd::Top`. `--no-color` and + // `--log-file` were dropped (commit ddce50e4) once they ceased to + // influence behaviour; the help text mustn't regress that. If a flag + // lands or leaves the surface, this list and the operator docs in + // `doc/configure_cli.md` + `doc/sozu-top.md` move together. + for flag in [ + "--refresh-ms", + "--no-mouse", + "--skin", + "--detail", + "--lease-ttl-seconds", + "--snapshot", + "--tick-once", + "--glyphs", + ] { + assert!( + stdout.contains(flag), + "sozu top --help missing flag `{flag}`. output:\n{stdout}", + ); + } + // Flags that used to exist but were intentionally removed — assert + // they stay gone so a future regression can't quietly resurface them. + for removed in ["--no-color", "--log-file"] { + assert!( + !stdout.contains(removed), + "sozu top --help unexpectedly carries removed flag `{removed}`. 
output:\n{stdout}", + ); + } +} + +/// `sozu --version` must report `+tui` when built with the feature, and +/// `-tui` otherwise. Build matrices that ship both variants rely on the +/// banner to tell them apart at deployment time. This test only runs +/// under `--features tui`, so it asserts the `+tui` side; the `-tui` +/// banner is covered by the default-feature snapshot of the version +/// string in CI's lean-build cell. +#[test] +fn sozu_version_reports_plus_tui() { + let output = Command::new(sozu_bin()) + .arg("--version") + .output() + .expect("spawn sozu --version"); + assert!(output.status.success(), "sozu --version exited non-zero"); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!( + stdout.contains("+tui"), + "sozu --version did not list +tui under --features tui. output:\n{stdout}", + ); + assert!( + !stdout.contains("-tui"), + "sozu --version listed -tui under --features tui. output:\n{stdout}", + ); +} + +/// `sozu top --tick-once` against a real master: spawns a daemonised +/// `sozu start`, waits for the command socket to appear, runs the TUI +/// for exactly one frame, and asserts a clean exit. `#[ignore]`d by +/// default because it writes to a temp dir, binds an ephemeral port, +/// and depends on graceful master shutdown (a hung master would block +/// CI). Run manually with: +/// +/// ```bash +/// cargo test -p sozu --features tui --tests -- --ignored sozu_top +/// ``` +#[test] +#[ignore] +fn sozu_top_tick_once_against_real_master() { + let temp = tempfile::tempdir().expect("tempdir"); + let socket_path = temp.path().join("sozu.sock"); + let config_path = temp.path().join("config.toml"); + + // Minimum viable config: command socket only, no listeners, no + // clusters, one worker, automatic-restart disabled (so the master + // exits cleanly when we send SIGTERM). 
+ let config = format!( + r#" +command_socket = "{socket}" +command_buffer_size = 16384 +max_command_buffer_size = 163840 +worker_count = 1 +worker_automatic_restart = false +saved_state = "" +log_level = "warn" +log_target = "stderr" +max_connections = 100 +buffer_size = 16393 +"#, + socket = socket_path.display(), + ); + std::fs::write(&config_path, config).expect("write config"); + + let mut master = Command::new(sozu_bin()) + .args(["start", "-c", config_path.to_str().unwrap()]) + .spawn() + .expect("spawn sozu start"); + + // Wait up to 10 s for the master to create the command socket. + let deadline = Instant::now() + Duration::from_secs(10); + while !socket_path.exists() { + if Instant::now() > deadline { + let _ = master.kill(); + let _ = master.wait(); + panic!("sozu master never created {socket_path:?}"); + } + std::thread::sleep(Duration::from_millis(50)); + } + + // Now exercise the TUI for a single tick. `--no-mouse` skips SGR + // mouse capture (avoids stale escape sequences in the parent shell + // if the test harness leaks them); `--snapshot 1` renders one + // frame and exits. + let output = Command::new(sozu_bin()) + .args([ + "-c", + config_path.to_str().unwrap(), + "top", + "--snapshot", + "1", + "--no-mouse", + ]) + .output() + .expect("spawn sozu top --snapshot 1"); + + // Kill the master: `Child::kill` sends SIGKILL on Unix (no drain); `wait` reaps it. 
+ let _ = master.kill(); + let _ = master.wait(); + + assert!( + output.status.success(), + "sozu top --snapshot 1 exited non-zero: stderr=\n{}", + String::from_utf8_lossy(&output.stderr) + ); +} diff --git a/command/build.rs b/command/build.rs index 8590c23d1..a8599e60e 100644 --- a/command/build.rs +++ b/command/build.rs @@ -41,6 +41,12 @@ pub fn main() { .message_attribute("HttpsListenerConfig", "#[derive(Hash, Eq)]") .message_attribute("UpdateHttpListenerConfig", "#[derive(Hash, Eq)]") .message_attribute("UpdateHttpsListenerConfig", "#[derive(Hash, Eq)]") + // Re-attach Hash/Eq for the `sozu top` runtime cardinality lease + // status message: it carries `map<string, WorkerMetricDetailStatus>`, + // which strips prost's auto-derive. Without this the + // `Request.request_type` and `ResponseContent.content_type` oneofs + // can't derive their own Hash/Eq either. + .message_attribute("MetricDetailStatus", "#[derive(Hash, Eq)]") // JSON state-file forward compat: `SaveState`/`LoadState` files are // JSON-encoded `WorkerRequest` records. Without `#[serde(default)]`, // serde rejects records that don't carry every `Vec`/`map` field — so diff --git a/command/src/command.proto b/command/src/command.proto index a144add20..297b4f62f 100644 --- a/command/src/command.proto +++ b/command/src/command.proto @@ -136,6 +136,13 @@ message Request { string remove_health_check = 53; // list health check configurations (optional cluster id filter). QueryHealthChecks query_health_checks = 54; + // Apply, renew, or release a runtime cardinality lease on the metrics + // drain. `sozu top` (and any future TUI client) leases DETAIL_BACKEND + // for the duration of an interactive session; the worker's effective + // detail is `max(configured, max(active leases))`. Leases self-expire + // server-side after `ttl_seconds` so a crashed client never permanently + // elevates cardinality. See doc/configure.md for the full semantics. 
+ SetMetricDetail set_metric_detail = 55; } } @@ -1137,6 +1144,20 @@ message ResponseContent { // health check configurations by cluster (renumbered from PR #1191's // original `14` since post-1209 occupies that tag). HealthChecksList health_checks_list = 15; + // Aggregated outcome of a `SetMetricDetail` fan-out: per-worker + // configured/effective/previous_effective levels plus the list of + // workers that could not decode the verb (mixed-version safety). + MetricDetailStatus metric_detail_status = 16; + // Per-worker status payload returned by a single worker in + // response to `SetMetricDetail`. The master collects these + // across the fan-out and assembles them into + // `MetricDetailStatus.workers[]`. Carries the + // worker's own `(configured, effective, previous_effective, + // active_lease_count)` quartet — distinct from the master-side + // view rendered in `MetricDetailStatus.{configured,effective, + // previous_effective}` because each worker holds its own + // `Aggregator` with an independent lease table. + WorkerMetricDetailStatus worker_metric_detail_status = 17; } } @@ -1176,6 +1197,40 @@ message Event { optional string cluster_id = 2; optional string backend_id = 3; optional SocketAddress address = 4; + // Set only when `kind == METRIC_DETAIL_CHANGED` and the worker is + // surfacing a worker-local lease transition (apply, clear, or polled + // expiry). Operator-initiated transitions are audited at the master + // dispatch site and DO emit this event for the SubscribeEvents bus, + // but the audit-log line for those is generated master-side and + // duplicates of `metric_detail` should be ignored by SOC tooling. + // See the `EventKind::METRIC_DETAIL_CHANGED` doc and the + // `MetricDetailTransition` message below for the trust model. + optional MetricDetailTransition metric_detail = 5; +} + +// Worker-emitted cardinality-lease transition. 
Populates the +// `Event.metric_detail` field when a worker's `effective` level changes +// because a lease was applied, renewed, expired (TTL janitor), or +// cleared. The master folds these into the audit log alongside the +// operator-initiated transitions emitted from +// `bin/src/command/requests.rs::worker_request`, closing the gap where +// worker-local expiries previously left no audit trail. +message MetricDetailTransition { + // The worker's effective cardinality level BEFORE the transition. + required MetricDetail previous_effective = 1; + // The worker's effective cardinality level AFTER the transition. + required MetricDetail effective = 2; + // What caused the transition. Stable strings: "lease_tick_expired" + // (janitor retired one or more leases), "lease_apply" (worker arm + // applied a lease), "lease_clear" (worker arm cleared a lease). + // Operator-initiated apply/clear emit master-side; the worker still + // emits this Event so the SubscribeEvents bus has one canonical + // signal for cardinality changes regardless of origin. + required string transition_kind = 3; + // Operator-supplied lease key (`SetMetricDetail.client_id`) when the + // transition was triggered by an explicit apply/clear. Empty for + // janitor expiries, which clear many leases at once. + optional string client_id = 4; } enum EventKind { @@ -1242,6 +1297,20 @@ enum EventKind { // backend available". Pairs with `NoAvailableBackends` (tag 2) so // dashboards can plot per-cluster recovery. CLUSTER_RECOVERED = 29; + // The worker's effective `MetricDetail` changed because a runtime + // lease was applied, renewed, expired, or cleared. Pairs with + // `MetricsConfigured` (tag 17) but distinct: that one fires for + // `MetricsConfiguration` (Enabled/Disabled/Clear), this one fires + // for cardinality changes. 
+ // + // Emitter scope: operator-initiated transitions emit + // `METRIC_DETAIL_CHANGED` via the master-side audit log (see + // `bin/src/command/requests.rs` around the `SetMetricDetail` + // success path). Worker-local transitions — the polled janitor + // expiring a lease, or a worker-local clear/apply after a master + // fan-out — are not yet surfaced because the worker has no direct + // IPC path to the master's audit sink; follow-up tracked separately. + METRIC_DETAIL_CHANGED = 30; } message ClusterHashes { @@ -1265,6 +1334,8 @@ message WorkerInfo { required uint32 id = 1; required int32 pid = 2; required RunState run_state = 3; + reserved 4; + reserved "proto_version"; } // Runstate of a worker @@ -1431,6 +1502,119 @@ enum MetricDetail { DETAIL_BACKEND = 3; } +// Apply, renew, or release a runtime cardinality lease on the metrics drain. +// +// Leasing model: `sozu top` (and any future TUI client) leases a higher +// `MetricDetail` for the duration of an interactive session. The worker's +// effective detail is `max(configured, max(active leases))`, where +// `configured` is `MetricsConfig.detail` from the static configuration. +// Multiple clients can lease independently; the worker keeps a `client_id`- +// keyed table and uses the maximum across active entries. +// +// Lifecycle: +// 1. Apply: send `SetMetricDetail{ client_id, detail, ttl_seconds, reason }`. +// The worker stores `(client_id) -> (detail, expires_at = now + ttl)`. If +// a lease for `client_id` already exists, it is REPLACED (acts as a +// renewal). The renewer client is expected to re-send every `ttl/2`. +// 2. Expire: leases self-expire server-side at `expires_at`. The worker's +// janitor (5s polled tick at the top of `notify`) prunes expired leases +// and recomputes effective. Crash safety: a dead client is forgotten. +// 3. Clear: send `SetMetricDetail{ client_id, clear: true }` for explicit +// revocation. 
`client_id` must match the leased entry; mismatched IDs +// are silently ignored (other clients' leases are not affected). +// +// Audit +// ===== +// Every operator-initiated effective-level transition emits an +// `EventKind::METRIC_DETAIL_CHANGED` event on `SubscribeEvents` with the +// previous and new effective levels and the requesting `client_id` plus +// optional `reason` text. Renewal-no-op (same effective level) is NOT +// emitted. +// +// Emitter scope: operator-initiated transitions emit +// `METRIC_DETAIL_CHANGED` via the master-side audit log. Worker-local +// transitions — the polled janitor expiring a lease, or a worker-local +// clear/apply after a master fan-out — are not yet surfaced; follow-up +// tracked separately. +// +// Backwards compatibility +// ======================= +// Workers that pre-date this verb cannot decode `SetMetricDetail` and return +// `WorkerResponse::error("unknown request type")` which folds into the standard +// fan-out error tally (`extras.fanout.workers_err`); operators see "succeeded +// with errors" rather than a dedicated capability-skip list. Production +// deployments keep master + workers in sync via the `UpgradeMain` hot-upgrade +// flow, so this mixed-version state is transient. The master itself also +// leases (mirroring the symmetric `setup_metrics` path) so the audit log has a +// single canonical row when an operator flips detail across the fleet. +message SetMetricDetail { + // Stable identifier for the leasing client (`sozu top` uses + // `top::`). Required so multiple TUIs / scrapers / other + // tooling can lease independently. + required string client_id = 1; + // Target detail for the lease. Required when `clear` is false/absent. + optional MetricDetail detail = 2; + // Time-to-live for the lease in seconds. The worker rejects (FAILURE) + // values larger than 300s to bound the worst-case effect of a stuck + // renewer. 
Defaults server-side to 60s when absent (the master treats + // 0 as "use default" and emits a warning). + optional uint32 ttl_seconds = 3; + // When true, releases the lease for `client_id` instead of applying. + // `detail` and `ttl_seconds` are ignored when `clear` is true. + optional bool clear = 4; + // Optional human-readable provenance for the audit log + // (e.g. `"sozu top --detail backend"`, `"prometheus-scraper:sozu-1"`). + optional string reason = 5; + // Master-populated peer binding. These fields are NOT set by clients — + // the master fills them in `bin/src/command/requests.rs::worker_request` + // from the connecting `ClientSession` (`actor_pid` + `session_ulid`) + // before forwarding to workers. The worker stores the binding + // alongside the lease and rejects subsequent `clear` requests whose + // binding does not match the apply-time binding. Prevents one same-UID + // operator from accidentally (or deliberately) clearing another + // operator's lease by guessing the `client_id` format. A `None` value + // means "binding not available" — the worker accepts any matching + // `client_id` clear, preserving compat with pre-binding callers and + // with platforms whose unix socket peer credentials are unavailable. + optional int32 peer_pid = 6; + optional string peer_session_ulid = 7; +} + +// Per-worker outcome of a `SetMetricDetail` fan-out. Reported back to the +// requesting client so it can decide whether the elevation actually took +// effect (e.g. all workers acknowledged) or whether degraded operation +// (some workers too old) is in play. +message WorkerMetricDetailStatus { + // The worker's static `MetricsConfig.detail` (or DETAIL_CLUSTER if + // unset). Independent of leases. + required MetricDetail configured = 1; + // Effective level AFTER processing this verb: `max(configured, leases)`. + required MetricDetail effective = 2; + // Effective level BEFORE the verb. Equal to `effective` for a no-op. 
+ required MetricDetail previous_effective = 3; + // Number of active leases on this worker (post-prune). Useful to + // surface "another client is still leasing this level" in the TUI. + required uint32 active_lease_count = 4; +} + +// Aggregated `SetMetricDetail` outcome across the fleet. Returned by the +// master to the requesting client (no `WorkerResponses` indirection needed +// because the schema is symmetric per-worker). +message MetricDetailStatus { + // The master's own `configured` view (mirrors a worker's view since the + // master also runs the metrics aggregator). + required MetricDetail configured = 1; + // Master's effective level AFTER the verb. + required MetricDetail effective = 2; + // Master's effective level BEFORE the verb. + required MetricDetail previous_effective = 3; + // Per-worker status. Map keyed by worker_id (string form for parity + // with `WorkerResponses`). + map<string, WorkerMetricDetailStatus> workers = 4; + reserved 5; + reserved "unsupported_workers"; +} + // intended to workers message ServerMetricsConfig { required string address = 1; diff --git a/command/src/proto/display.rs b/command/src/proto/display.rs index 07813c293..2d5e40255 100644 --- a/command/src/proto/display.rs +++ b/command/src/proto/display.rs @@ -18,10 +18,10 @@ use crate::{ CertificatesWithFingerprints, ClusterMetrics, CustomHttpAnswers, Event, EventKind, FilteredMetrics, HealthChecksList, HttpEndpoint, HttpListenerConfig, HttpsListenerConfig, ListOfCertificatesByAddress, ListedFrontends, ListenersList, - ProtobufEndpoint, QueryCertificatesFilters, RequestCounts, Response, ResponseContent, - ResponseStatus, RunState, SocketAddress, TlsVersion, WorkerInfos, WorkerMetrics, - WorkerResponses, filtered_metrics, protobuf_endpoint, request::RequestType, - response_content::ContentType, + MetricDetailStatus, ProtobufEndpoint, QueryCertificatesFilters, RequestCounts, + Response, ResponseContent, ResponseStatus, RunState, SocketAddress, TlsVersion, + WorkerInfos, WorkerMetrics, WorkerResponses, 
filtered_metrics, protobuf_endpoint, + request::RequestType, response_content::ContentType, }, }, }; @@ -121,6 +121,7 @@ pub fn format_request_type(request_type: &RequestType) -> &str { RequestType::SetHealthCheck(_) => "SetHealthCheck", RequestType::RemoveHealthCheck(_) => "RemoveHealthCheck", RequestType::QueryHealthChecks(_) => "QueryHealthChecks", + RequestType::SetMetricDetail(_) => "SetMetricDetail", } } @@ -198,6 +199,7 @@ impl ResponseContent { } ContentType::Clusters(_) | ContentType::ClusterHashes(_) => Ok(()), // not displayed directly, see print_cluster_responses ContentType::CertificatesByAddress(certs) => print_certificates_by_address(certs), + ContentType::MetricDetailStatus(status) => print_metric_detail_status(status), ContentType::MaxConnectionsPerIpLimit(limit_info) => { if limit_info.limit == 0 { println!("Max connections per (cluster, source-IP): unlimited (0)"); @@ -211,6 +213,14 @@ impl ResponseContent { } ContentType::HealthChecksList(list) => print_health_checks(list), ContentType::Event(_event) => Ok(()), // not event displayed yet! + // Per-worker SetMetricDetail status payload. The aggregated + // MetricDetailStatus is what operators read at the + // `sozu` CLI surface; the per-worker variant flows + // master-side only (collected by SetMetricDetailTask) and + // is never printed directly. Silent OK keeps the match + // exhaustive without surfacing internal IPC payloads on + // the operator's terminal. 
+ ContentType::WorkerMetricDetailStatus(_) => Ok(()), } } } @@ -946,6 +956,36 @@ fn print_certificates_by_address(list: &ListOfCertificatesByAddress) -> Result<( Ok(()) } +fn print_metric_detail_status(status: &MetricDetailStatus) -> Result<(), DisplayError> { + let mut table = Table::new(); + table.set_format(*prettytable::format::consts::FORMAT_BOX_CHARS); + table.add_row(row![ + "scope", + "configured", + "previous effective", + "effective", + "active leases" + ]); + table.add_row(row!( + "main", + status.configured().as_str_name(), + status.previous_effective().as_str_name(), + status.effective().as_str_name(), + "—", + )); + for (worker_id, w) in &status.workers { + table.add_row(row!( + format!("worker:{worker_id}"), + w.configured().as_str_name(), + w.previous_effective().as_str_name(), + w.effective().as_str_name(), + w.active_lease_count, + )); + } + table.printstd(); + Ok(()) +} + fn print_request_counts(request_counts: &RequestCounts) -> Result<(), DisplayError> { let mut table = Table::new(); table.set_format(*prettytable::format::consts::FORMAT_BOX_CHARS); @@ -1363,6 +1403,7 @@ impl Display for Event { EventKind::HealthCheckHealthy => "health check: backend healthy", EventKind::HealthCheckUnhealthy => "health check: backend unhealthy", EventKind::ClusterRecovered => "cluster recovered", + EventKind::MetricDetailChanged => "metric detail changed", }; let address = match &self.address { Some(a) => a.to_string(), diff --git a/command/src/proto/mod.rs b/command/src/proto/mod.rs index 38635a75a..a5d2075d5 100644 --- a/command/src/proto/mod.rs +++ b/command/src/proto/mod.rs @@ -1,7 +1,7 @@ use std::collections::BTreeMap; use command::{ - AggregatedMetrics, BackendMetrics, Bucket, FilteredHistogram, FilteredMetrics, + AggregatedMetrics, BackendMetrics, Bucket, FilteredHistogram, FilteredMetrics, Percentiles, filtered_metrics::Inner, }; use prost::UnknownEnumValue; @@ -148,25 +148,116 @@ impl FilteredMetrics { })), }; } + (Some(Inner::Percentiles(a)), 
Some(Inner::Percentiles(b))) => { + // You cannot statistically merge two percentile summaries + // without the underlying samples. The companion + // `_histogram` Inner::Histogram value is the source + // of truth for accurate aggregation and merges correctly + // above. We still propagate the percentile shape so legacy + // consumers reading it observe at least the worst-case + // upper bound across workers — element-wise max preserves + // the "is anyone slow?" intent. `samples` and `sum` add so + // the totals reflect cross-worker volume. + *self = Self { + inner: Some(Inner::Percentiles(Percentiles { + samples: a.samples + b.samples, + p_50: a.p_50.max(b.p_50), + p_90: a.p_90.max(b.p_90), + p_99: a.p_99.max(b.p_99), + p_99_9: a.p_99_9.max(b.p_99_9), + p_99_99: a.p_99_99.max(b.p_99_99), + p_99_999: a.p_99_999.max(b.p_99_999), + p_100: a.p_100.max(b.p_100), + sum: a.sum + b.sum, + })), + }; + } _ => {} } } fn is_mergeable(&self) -> bool { match &self.inner { - Some(Inner::Gauge(_)) | Some(Inner::Count(_)) | Some(Inner::Histogram(_)) => true, + Some(Inner::Gauge(_)) + | Some(Inner::Count(_)) + | Some(Inner::Histogram(_)) + | Some(Inner::Percentiles(_)) => true, // Inner::Time and Inner::Timeserie are never used in Sōzu - Some(Inner::Time(_)) - | Some(Inner::Percentiles(_)) - | Some(Inner::TimeSerie(_)) - | None => false, + Some(Inner::Time(_)) | Some(Inner::TimeSerie(_)) | None => false, } } } #[cfg(test)] mod tests { - use super::command::{Bucket, FilteredHistogram, FilteredMetrics, filtered_metrics::Inner}; + use std::collections::BTreeMap; + + use super::AggregatedMetrics; + use super::command::{ + Bucket, ClusterMetrics, FilteredHistogram, FilteredMetrics, Percentiles, WorkerMetrics, + filtered_metrics::Inner, + }; + + #[test] + fn merge_relocates_single_worker_to_top_level() { + // Regression: a one-worker fleet must populate `clusters` and + // `proxying` so CLI/TUI consumers reading those maps see the + // worker's data. 
`std::mem::take(&mut self.workers)` empties the + // per-worker map after relocation, which is the documented + // contract when the caller asked for the merged shape. + let mut worker = WorkerMetrics { + proxy: BTreeMap::new(), + clusters: BTreeMap::new(), + }; + worker.proxy.insert( + "requests".to_owned(), + FilteredMetrics { + inner: Some(Inner::Count(42)), + }, + ); + let mut cluster = ClusterMetrics { + cluster: BTreeMap::new(), + backends: Vec::new(), + }; + cluster.cluster.insert( + "requests".to_owned(), + FilteredMetrics { + inner: Some(Inner::Count(7)), + }, + ); + worker.clusters.insert("cluster-a".to_owned(), cluster); + + let mut agg = AggregatedMetrics { + main: BTreeMap::new(), + workers: BTreeMap::from([("0".to_owned(), worker)]), + clusters: BTreeMap::new(), + proxying: BTreeMap::new(), + }; + + agg.merge_metrics(); + + assert!( + agg.workers.is_empty(), + "merge takes ownership of the per-worker map" + ); + assert_eq!( + agg.proxying.get("requests"), + Some(&FilteredMetrics { + inner: Some(Inner::Count(42)), + }), + "single worker's proxy counter must surface in proxying" + ); + let cluster_a = agg + .clusters + .get("cluster-a") + .expect("cluster row must surface in top-level clusters"); + assert_eq!( + cluster_a.cluster.get("requests"), + Some(&FilteredMetrics { + inner: Some(Inner::Count(7)), + }) + ); + } #[test] fn merge_counts_and_gauges() { @@ -203,6 +294,56 @@ mod tests { ); } + #[test] + fn merge_percentiles_takes_max_per_quantile() { + // Multi-worker percentile aggregation propagates the worst-case + // quantile across workers and accumulates samples + sum so the + // surfaced summary remains the "is anyone slow?" upper bound. 
+ let mut left = FilteredMetrics { + inner: Some(Inner::Percentiles(Percentiles { + samples: 100, + p_50: 5, + p_90: 20, + p_99: 100, + p_99_9: 200, + p_99_99: 250, + p_99_999: 300, + p_100: 400, + sum: 12_000, + })), + }; + let right = FilteredMetrics { + inner: Some(Inner::Percentiles(Percentiles { + samples: 50, + p_50: 7, + p_90: 15, + p_99: 80, + p_99_9: 240, + p_99_99: 245, + p_99_999: 290, + p_100: 380, + sum: 6_000, + })), + }; + left.merge(&right); + assert_eq!( + left, + FilteredMetrics { + inner: Some(Inner::Percentiles(Percentiles { + samples: 150, + p_50: 7, + p_90: 20, + p_99: 100, + p_99_9: 240, + p_99_99: 250, + p_99_999: 300, + p_100: 400, + sum: 18_000, + })), + } + ); + } + #[test] fn merge_histograms() { let mut histogram_a = FilteredMetrics { diff --git a/command/src/request.rs b/command/src/request.rs index 6be58c531..8a3edd9b1 100644 --- a/command/src/request.rs +++ b/command/src/request.rs @@ -77,6 +77,7 @@ impl Request { // handled at worker level prior to this call RequestType::ConfigureMetrics(_) + | RequestType::SetMetricDetail(_) | RequestType::QueryMetrics(_) | RequestType::Logging(_) | RequestType::QueryClustersHashes(_) diff --git a/command/src/state.rs b/command/src/state.rs index 71c02edaa..701263c60 100644 --- a/command/src/state.rs +++ b/command/src/state.rs @@ -151,6 +151,7 @@ impl ConfigState { | RequestType::QueryMetrics(_) | RequestType::QueryClustersHashes(_) | RequestType::ConfigureMetrics(_) + | RequestType::SetMetricDetail(_) | RequestType::ReturnListenSockets(_) | RequestType::SetMaxConnectionsPerIp(_) | RequestType::QueryMaxConnectionsPerIp(_) @@ -3423,4 +3424,28 @@ mod tests { "expected NotFound, got: {err}" ); } + + /// `ConfigState::dispatch` MUST treat `SetMetricDetail` as a + /// runtime-only verb (no persisted state mutation). 
A future + /// refactor that drops the variant from the no-op match arm and + /// falls through to the catch-all would silently re-break the + /// SetMetricDetail dispatch path with `UndispatchableRequest`. + #[test] + fn dispatch_passes_through_set_metric_detail() { + use crate::proto::command::{MetricDetail, SetMetricDetail}; + let mut state = ConfigState::new(); + let req: Request = RequestType::SetMetricDetail(SetMetricDetail { + client_id: "test:1".to_owned(), + detail: Some(MetricDetail::DetailBackend as i32), + ttl_seconds: Some(60), + clear: Some(false), + reason: Some("regression-guard".to_owned()), + peer_pid: None, + peer_session_ulid: None, + }) + .into(); + state + .dispatch(&req) + .expect("SetMetricDetail must traverse dispatch without UndispatchableRequest"); + } } diff --git a/doc/configure.md b/doc/configure.md index b8a29f860..afa53e0db 100644 --- a/doc/configure.md +++ b/doc/configure.md @@ -1514,6 +1514,56 @@ the SCM socket as a proto enum (`MetricDetail`); old binaries on either side fall back to `cluster` so a mixed-version rollout keeps emitting the historical metric shape. +#### Runtime cardinality lease + +Operators can elevate a worker's effective `metric_detail` for the lifetime of +an interactive session without rewriting `config.toml`. `sozu top` uses this +mechanism to enable `backend` detail while the TUI is attached, and reverts it +on exit. Other tooling (per-host scraper agents, ad-hoc debugging) can use the +same surface. + +The lease is keyed by an operator-supplied `client_id` and stored on each +worker. The effective level is `max(configured, max(active leases))`, so a +lease never *lowers* the configured detail; it only elevates. When the last +lease expires (TTL pop) or is explicitly cleared, the effective level falls +back to the configured value. 
+ +The proto verb that exposes this surface is `SetMetricDetail` (request tag +`55`); the response shape is `MetricDetailStatus` carrying the master's +`(configured, effective, previous_effective)` triple plus a per-worker +`WorkerMetricDetailStatus` map. Every apply, clear, and TTL expiry emits an +audit-log event of kind `MetricDetailChanged` (`EventKind::METRIC_DETAIL_CHANGED`, +tag `30`) on the text and JSON sinks, with `lease_id=` and +`metric_detail_reason=` as dedicated columns so operator-supplied strings +cannot smuggle a forged adjacent column. + +Server-side caps and defaults (all defined in `lib/src/metrics/mod.rs`): + +| Knob | Default | Cap | Notes | +| ----------------------------- | ------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `ttl_seconds` | `60` | `300` | Lease lifetime. The TUI renews every `ttl/2` seconds; on TUI crash the lease self-expires after at most one `ttl` window. The master rejects out-of-range TTLs before fan-out so a buggy or malicious request cannot N×amplify worker-side rejections. | +| `LEASE_TABLE_CAP` | — | `64` | Maximum number of simultaneous leases per worker. Renewals of existing entries always succeed; only new inserts are subject to this cap. Mitigates the CWE-770 vector where a misbehaving client rolls `client_id` faster than expiry. | +| `LEASE_CLIENT_ID_MAX_BYTES` | — | `64` | Maximum `client_id` length. Operator tooling should pick a stable identifier (the TUI uses `top::<8-hex>`); arbitrary user input must be capped at this length before submission. | + +Trust model: the lease records the connecting peer's PID and master session +ULID (from `SO_PEERCRED` on Linux). 
Subsequent `clear` requests are +authorised against the apply-time binding — a different operator on the same +host cannot clear another operator's lease even if they guess the +`client_id`. Pre-binding callers and platforms without `SO_PEERCRED` degrade +to "binding unknown" → accept any clear. Clients NEVER set the peer fields +themselves; the master populates them from the `ClientSession` before +fan-out. + +Reversibility: leases self-expire, so a crashed dashboard does not leave a +worker permanently emitting at elevated cardinality. The audit-log trail +records every transition; SOC tooling can reconstruct the cardinality +posture of every worker at any point in time without polling. + +Renewals are not state transitions — `SetMetricDetail` is deliberately +*not* in the systemd `RELOADING=1` / `READY=1` bracket set, so a long-lived +TUI session does not flap the unit state. The audit-log event remains the +authoritative trail for cardinality changes. + #### StatsD wire format **Untagged** (default, `tagged_metrics = false`): diff --git a/doc/configure_cli.md b/doc/configure_cli.md index ff764df5f..ab69b97a2 100644 --- a/doc/configure_cli.md +++ b/doc/configure_cli.md @@ -131,3 +131,35 @@ sozu --config /path/to/config.toml events listens to events sent by Sōzu workers whenever a backend is down, up again, or when no backend is available. + +## Live operator TUI (`sozu top`) + +The `top` subcommand is a btop/htop-style live dashboard. Build with the +optional `tui` Cargo feature (`cargo build -p sozu --features tui --release`); +`sozu --version` reports `+tui` when the subcommand is linked in. See +[`doc/sozu-top.md`](sozu-top.md) for the full operator guide (panes, key +bindings, skin format, threshold tuning). + +```bash +sozu --config /path/to/config.toml top +``` + +Common flags: + +| Flag | Effect | +|------|--------| +| `--refresh-ms <MS>` | Data poll cadence in milliseconds (default `1000`). 
| +| `--detail <LEVEL>` | Cardinality lease level (`process\|frontend\|cluster\|backend`, default `backend`). | +| `--lease-ttl-seconds <N>` | Lease TTL; auto-renewed at half-TTL (default `60`, server clamps at `300`). | +| `--skin <NAME>` | Resolve `$XDG_CONFIG_HOME/sozu/skins/<NAME>.toml` (`SOZU_TOP_SKIN` env wins). | +| `--glyphs <MODE>` | Force a glyph mode (`braille\|block\|tty`); auto-detect by default. | +| `--no-mouse` | Disable SGR mouse capture (helps with multiplexers that mis-route mouse events). | +| `--snapshot <N>`, `--tick-once` | Render N frames / one tick and exit (test affordances). | + +Key bindings (operator quick reference; see `doc/sozu-top.md` for the +full list): + +- `1`-`7` jumps to OVERVIEW · CLUSTERS · BACKENDS · LISTENERS · CERTS · H2 · EVENTS. +- `Tab` / `Shift-Tab` cycles tabs forward / backward. +- `s` / `S` cycles / reverses the sort column on CLUSTERS and BACKENDS. +- `q` / `Q` / `Ctrl-C` / `F10` quits, `?` / `F1` toggles help. diff --git a/doc/sozu-top.md b/doc/sozu-top.md new file mode 100644 index 000000000..a20dc02fb --- /dev/null +++ b/doc/sozu-top.md @@ -0,0 +1,196 @@ +# `sozu top` — live operator TUI + +`sozu top` is a btop/htop-style terminal dashboard for the Sōzu reverse +proxy. It surfaces metrics that Sōzu already emits (per-cluster, per- +backend, H2-flow, slab-saturation, recent control-plane events), in a +single screen with sparklines, sortable tables, and a colour-coded +event tail. The subcommand is gated behind the optional `tui` Cargo +feature so production binaries built without `--features tui` do not +pull `ratatui`, `crossterm`, or any other TUI dependency. 
+ +```bash +cargo build -p sozu --features tui --release +sozu --config /etc/sozu/config.toml top +sozu --version # reports `+tui` when the subcommand is linked +``` + +## Architecture at a glance + +- Four synchronous transport threads poll the master over the existing + unix command socket, each on its own dedicated connection: + - **Collector** sends `QueryMetrics` every `--refresh-ms` + (default 1 s) and pushes each `AggregatedMetrics` into a + bounded-1 channel with publish-or-skip-on-backpressure semantics. + - **Events** subscribes to the control-plane event stream (backend + up/down, cluster added/removed, certificate added/removed, …). + - **Listeners** polls `ListListeners` every 5 s. + - **Certs** polls `QueryCertificatesFromTheState` every 30 s. +- A single UI thread owns the terminal, polls crossterm input on a + 30 fps cap, and synchronises frame output via DEC mode 2026 + (`BeginSynchronizedUpdate` / `EndSynchronizedUpdate`) so tmux and + iTerm2 see one atomic paint per frame. +- The TUI auto-elevates the worker's metric cardinality to `Backend` + by leasing it through a `SetMetricDetail` runtime verb. The lease + is `client_id`-keyed, TTL-bounded (default 60 s, clamp 300 s), and + self-expires server-side if the TUI crashes — so a dead `sozu top` + cannot permanently elevate cardinality. Renewal runs at half-TTL. + +## Panes + +Numbered tabs at the top of the screen; key digits map directly. + +| Tab | Pane | Drives | +|-----|------|--------| +| `1` | OVERVIEW | Four sparklines (REQUESTS/SEC, p99 LATENCY, 5xx ERRORS, SATURATION) with big numerals + trend glyphs (`▲ ▼ ●`). | +| `2` | CLUSTERS | Sortable per-cluster table (cluster_id, rps, err%, p50, p99, backends_available/total). Default sort: error-rate desc. | +| `3` | BACKENDS | Sortable per-backend table (cluster, backend, bw down/up, connections, p50, p99, requests). Default sort: bandwidth desc. | +| `4` | LISTENERS | HTTP / HTTPS / TCP listener inventory; refreshed every 5 s. 
| +| `5` | CERTS | Certificate inventory by listener address + fingerprint + names; refreshed every 30 s. | +| `6` | H2 | Active streams, ALPN H2 share, flow-control gauges, frame counters, and CVE flood-mitigation counters (`h2.flood.violation.*`). | +| `7` | EVENTS | Colour-coded tail of `SubscribeEvents`. BACKEND_DOWN / NO_AVAILABLE_BACKENDS / WORKER_KILLED in hot, BACKEND_UP / CLUSTER_RECOVERED in cool, METRIC_DETAIL_CHANGED in accent. | + +Threshold-driven row tinting is consistent across all panes: + +- **Critical (hot, bold)**: p99 over `latency_p99_critical_ms` (500 ms + default), 5xx ratio over `error_ratio_critical_pct` (1 %), all + backends down for a cluster. +- **Pulse (background tint, 4 ticks ≈ 4 s)**: cluster appeared, cluster + disappeared, backend went down. Pulses take precedence over the + steady threshold tint so transitions catch the eye on rows that are + already red. + +A 5-line big-text alert banner overlays the active pane when the +threshold table is critical, with a copyable side strip for tmux +scrollback and screen readers. + +## Key bindings + +| Key | Action | +|-----|--------| +| `1`-`7` | Jump to the numbered tab. | +| `Tab` / `Shift-Tab` | Cycle tabs forward / backward. | +| `s` / `S` | Cycle / reverse the active sort column (CLUSTERS, BACKENDS). | +| `?` / `F1` | Toggle help overlay. | +| `q` / `Q` / `Ctrl-C` / `F10` | Quit. SIGINT / SIGTERM also restore the terminal cleanly. | + +The bottom function-key bar (`F1 Help · F2 Theme · F3 Find · F4 Filter +· F5 Pause · F6 Sort · F7 Detail- · F8 Detail+ · F9 Config · F10 Quit`) +reserves slots for actions wired in future milestones. The currently +active sort column appears on the right of the bar. + +## Cardinality (`SetMetricDetail` lease) + +The TUI auto-applies a `Backend`-level lease on startup so the +BACKENDS / per-backend rows on CLUSTERS and OVERVIEW carry real +data. The lease appears as an `EventKind::METRIC_DETAIL_CHANGED` event +in the EVENTS pane. 
Override with `--detail process|frontend|cluster| +backend` or `--lease-ttl-seconds N`. Workers that pre-date the verb +(e.g. inherited from a prior master across an `UpgradeMain`) reply +with the standard `unknown request type` error; the remaining workers +still apply the lease normally. Production deployments keep master +and workers in sync via the upgrade path, so this mixed-version state +is transient. + +## Skins + +`--skin <NAME>` (with `SOZU_TOP_SKIN` env override for k9s parity) +resolves a TOML skin file from one of: + +1. `$XDG_CONFIG_HOME/sozu/skins/<NAME>.toml` (fallback `$HOME/.config/ + sozu/skins/<NAME>.toml`). +2. `/etc/sozu/skins/<NAME>.toml` for system-wide skins. + +Schema (all `#RRGGBB` hex strings; `categorical` is optional and falls +back to the built-in Okabe-Ito + Viridis high-end palette): + +```toml +primary = "#e8e8f0" # headings, focused tab, big-text numerals +secondary = "#b4b8c0" # status text, function-key labels +accent = "#56c0f0" # active sort column, selected row +cool = "#39ad98" # "all is well" sparkline tails +warm = "#f5bf4f" # mid-high sparkline tails +hot = "#e8545a" # alert peaks + row-critical +muted = "#606470" # tab labels not in focus, rule lines +categorical = [ + "#009e73", "#56b4e9", "#d55e00", "#cc79a7", + "#f0e442", "#0072b2", "#e69f00", "#f7d13c", "#5ec961", +] +``` + +Path safety: `--skin` rejects names containing `..`, `/`, or `\` so +`--skin ../../etc/passwd` cannot escape the skins directory. On a +missing file or parse error the TUI surfaces a one-line diagnostic in +the status bar and falls back to the built-in palette. + +## Glyphs + +Three sparkline / bar ramp modes resolve via this cascade unless +`--glyphs braille|block|tty` is passed explicitly: + +1. `TERM` reports `dumb`, `linux`, `xterm-old`, or `*-mono*` → + 7-bit ASCII (`. : - = + * # @`). +2. `LC_ALL` / `LC_CTYPE` / `LANG` ends in `UTF-8`/`UTF8` AND is not + `C` / `POSIX` / `C.UTF-8` → Braille mosaics + (`⠀ ⡀ ⣀ ⣄ ⣤ ⣦ ⣶ ⣷ ⣿`). +3. 
Otherwise: Unicode block elements (`▁ ▂ ▃ ▄ ▅ ▆ ▇ █`). Broadest + Unicode-terminal compatibility without nerd-font dependency. + +## Accessibility + +The default Okabe-Ito categorical palette is colour-blind safe in +isolation and distinguishable across the three common dichromatic +types. Critical/warm/cool tiers are always backed up by a redundant +glyph (`▲ ▼ ●` on the OVERVIEW pane, `▲ ▼` on sortable column +headers) so the colour signal isn't load-bearing on its own. + +## Mouse capture + +Mouse is on by default. Some multiplexers mis-route SGR mouse events +to the underlying shell when the TUI exits; if that happens to you, +add `--no-mouse`. SIGINT / SIGTERM / panic all restore the terminal +via the `RawModeGuard::Drop` path (raw mode disabled, mouse capture +released, cursor shown, alt-screen left) so the shell stays usable +even after a hard crash. + +## Test affordances + +- `--tick-once` drives one data tick + one render tick and exits. + Useful for smoke tests and CI snapshots. +- `--snapshot N` renders `N` frames and exits. +- The `insta` snapshot tests under `bin/src/ctl/top/snapshots/` + exercise every pane at 80x24 and 120x40; run with `cargo test + -p sozu --features tui snapshot_tests`. Accept changes with + `cargo insta review`. + +## Operational footguns + +- `--features tui` materially changes the binary size and dependency + graph. Default `sozu` builds stay lean; this is by design. +- The TUI's cardinality lease elevates the worker's keyspace while + running; the `EventKind::METRIC_DETAIL_CHANGED` audit trail lets + StatsD scrapers see the change. The configured floor is restored + on lease expiry (TTL) or explicit revoke (clean exit). +- `MetricDetail = Backend` can balloon the per-backend keyspace on + high-cardinality fleets — every backend gets its own labelled time + series at the StatsD / Prometheus sink. If you see operator-side + metric pipelines saturate, override with `--detail cluster` and + lease at the lower tier. 
+- The render loop pauses redraws when state is unchanged AND no + pulse is active; on a quiet system `sozu top` consumes ~2-3 % of + one core on a Linux 6.x kernel + alacritty. +- Transport layout: four synchronous threads (snapshots, listeners, + certs, events) open six unix-socket connections to the master per + invocation. The four polling threads keep one connection each; the + cardinality renewer holds a fifth for its half-TTL apply renewals + and a sixth parked for the final clear-on-exit. Sized for a single + operator on one terminal; spawning many concurrent `sozu top` + invocations from a wrapper script saturates the master's accept + backlog (`listen(2)` default = 128). +- Privacy: whatever you type in `--reason` lands in the audit log + (text + JSON sinks) AND fans out via `SubscribeEvents` to any + same-UID subscriber. Avoid embedding PII, customer IDs, or ticket + references that should not leave the host's audit boundary. + `client_id` and `reason` are length-capped server-side (64 / 256 + bytes) and stripped of `,`/`=`/control bytes before being rendered + into the audit columns, so the values cannot smuggle adjacent KV + columns into a SIEM that ingests on `, ` / `=`. diff --git a/e2e/COVERAGE.md b/e2e/COVERAGE.md index 59d9d06aa..86736fdd4 100644 --- a/e2e/COVERAGE.md +++ b/e2e/COVERAGE.md @@ -88,9 +88,11 @@ harnesses for a single feature cell function: // Cell function: takes (frontend_h2, backend_h2), returns State. fn try_basic_auth_cell(frontend_h2: bool, backend_h2: bool) -> State { ... } -// Macro emits try_basic_auth_h1_h1 / _h1_h2 / _h2_h1 / _h2_h2 wrappers -// plus matching test_basic_auth_h1_h1 / _h1_h2 / _h2_h1 / _h2_h2 +// Macro emits `pub mod basic_auth { ... }` containing +// try_h1_h1 / try_h1_h2 / try_h2_h1 / try_h2_h2 wrappers plus +// matching test_h1_h1 / test_h1_h2 / test_h2_h1 / test_h2_h2 // `#[test]` harnesses that wrap the cell in `repeat_until_error_or`. +// `cargo test basic_auth` filters all four cells. 
protocol_pair_matrix!(basic_auth, try_basic_auth_cell, "basic auth"); ``` diff --git a/e2e/Cargo.toml b/e2e/Cargo.toml index 77164d119..68077badc 100644 --- a/e2e/Cargo.toml +++ b/e2e/Cargo.toml @@ -36,10 +36,6 @@ tempfile = { workspace = true } # request consume the failure injection that FIX-18 just armed, # producing the wrong response code for both tests on CI. serial_test = { workspace = true } -# Identifier-concatenation macro used by the `protocol_pair_matrix!` -# helper in `tests/protocol_pair_matrix.rs`. `paste` is dev-only — the -# generated wrappers are pure test scaffolding. -paste = "1.0" [features] # Forward sozu-lib's `splice` feature so the dependency-tree compiles diff --git a/e2e/src/tests/protocol_pair_matrix.rs b/e2e/src/tests/protocol_pair_matrix.rs index fa0e5b084..60a96b5f4 100644 --- a/e2e/src/tests/protocol_pair_matrix.rs +++ b/e2e/src/tests/protocol_pair_matrix.rs @@ -88,10 +88,10 @@ const H2_BACKEND_RECORD_POLL_MS: u64 = 25; // ── protocol_pair_matrix! macro ───────────────────────────────────────── // -// Emits four `pub fn try__()` wrappers and their matching -// `#[test] fn test__()` harnesses for a feature cell function -// of the shape `fn try__cell(frontend_h2: bool, backend_h2: bool) -// -> State`. +// Emits a `pub mod { ... }` carrying four `pub fn try_h{1,2}_h{1,2}()` +// wrappers and four matching `#[test] fn test_h{1,2}_h{1,2}()` harnesses +// for a feature cell function of the shape +// `fn try__cell(frontend_h2: bool, backend_h2: bool) -> State`. // // Usage: // @@ -99,63 +99,77 @@ const H2_BACKEND_RECORD_POLL_MS: u64 = 25; // fn try_basic_auth_cell(frontend_h2: bool, backend_h2: bool) -> State { ... } // protocol_pair_matrix!(basic_auth, try_basic_auth_cell, "basic auth"); // ``` - +// +// The matrix is then reachable as `protocol_pair_matrix::basic_auth::test_h1_h1`, +// `..::test_h1_h2`, `..::test_h2_h1`, `..::test_h2_h2`. `cargo test +// basic_auth` filters all four cells. 
+ +/// Emit the four `(frontend_h{1,2}, backend_h{1,2})` `try_*` wrappers +/// and the four matching `#[test]` cells. Identifiers are kept stable +/// by nesting the per-matrix functions inside a child module named +/// after `$name` — `macro_rules!` cannot concatenate identifiers +/// natively, so the module hop replaces what `paste!{ [] }` +/// used to do. The test harness still discovers each `#[test]` via the +/// `protocol_pair_matrix::$name::test_h{1,2}_h{1,2}` path; `cargo test +/// $name` filters them as before. No external code calls the generated +/// function names directly (verified by grep over `e2e/`), so the +/// nesting is invisible outside this file. macro_rules! protocol_pair_matrix { ($name:ident, $cell:ident, $description:literal) => { - paste::paste! { - pub fn []() -> $crate::tests::State { - $cell(false, false) + pub mod $name { + pub fn try_h1_h1() -> $crate::tests::State { + super::$cell(false, false) } - pub fn []() -> $crate::tests::State { - $cell(false, true) + pub fn try_h1_h2() -> $crate::tests::State { + super::$cell(false, true) } - pub fn []() -> $crate::tests::State { - $cell(true, false) + pub fn try_h2_h1() -> $crate::tests::State { + super::$cell(true, false) } - pub fn []() -> $crate::tests::State { - $cell(true, true) + pub fn try_h2_h2() -> $crate::tests::State { + super::$cell(true, true) } #[test] - fn []() { + fn test_h1_h1() { assert_eq!( $crate::tests::repeat_until_error_or( 2, concat!($description, " h1↔h1"), - [], + try_h1_h1, ), $crate::tests::State::Success, ); } #[test] - fn []() { + fn test_h1_h2() { assert_eq!( $crate::tests::repeat_until_error_or( 2, concat!($description, " h1↔h2c"), - [], + try_h1_h2, ), $crate::tests::State::Success, ); } #[test] - fn []() { + fn test_h2_h1() { assert_eq!( $crate::tests::repeat_until_error_or( 2, concat!($description, " h2↔h1"), - [], + try_h2_h1, ), $crate::tests::State::Success, ); } #[test] - fn []() { + fn test_h2_h2() { assert_eq!( $crate::tests::repeat_until_error_or( 2, 
concat!($description, " h2↔h2c"), - [], + try_h2_h2, ), $crate::tests::State::Success, ); diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock index ef9bcf6c0..d12d8918c 100644 --- a/fuzz/Cargo.lock +++ b/fuzz/Cargo.lock @@ -1171,15 +1171,6 @@ dependencies = [ "zeroize", ] -[[package]] -name = "rustls-pemfile" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50" -dependencies = [ - "rustls-pki-types", -] - [[package]] name = "rustls-pki-types" version = "1.14.1" @@ -1377,7 +1368,6 @@ dependencies = [ "rand 0.10.1", "regex", "rustls", - "rustls-pemfile", "rusty_ulid", "sha2", "slab", diff --git a/lib/Cargo.toml b/lib/Cargo.toml index d74aff2fe..14bffd6d8 100644 --- a/lib/Cargo.toml +++ b/lib/Cargo.toml @@ -47,7 +47,6 @@ rand = { workspace = true } regex = { workspace = true } rustls = { workspace = true } rustls-openssl = { workspace = true, optional = true } -rustls-pemfile = { workspace = true } rusty_ulid = { workspace = true } sha2 = { workspace = true } slab = { workspace = true } diff --git a/lib/benches/crypto_provider.rs b/lib/benches/crypto_provider.rs index 37a492f45..61d7e03b7 100644 --- a/lib/benches/crypto_provider.rs +++ b/lib/benches/crypto_provider.rs @@ -37,16 +37,9 @@ fn bench_certificate_loading(c: &mut Criterion) { fn bench_private_key_signing(c: &mut Criterion) { use rustls::pki_types::PrivateKeyDer; - use std::io::BufReader; - - let mut reader = BufReader::new(KEY_PEM.as_bytes()); - let item = rustls_pemfile::read_one(&mut reader).unwrap().unwrap(); - let private_key = match item { - rustls_pemfile::Item::Pkcs1Key(k) => PrivateKeyDer::from(k), - rustls_pemfile::Item::Pkcs8Key(k) => PrivateKeyDer::from(k), - rustls_pemfile::Item::Sec1Key(k) => PrivateKeyDer::from(k), - _ => panic!("unexpected key type"), - }; + use rustls::pki_types::pem::PemObject; + + let private_key = PrivateKeyDer::from_pem_slice(KEY_PEM.as_bytes()).unwrap(); 
c.bench_function("private_key_load", |b| { b.iter(|| any_supported_type(&private_key).unwrap()); diff --git a/lib/src/backends.rs b/lib/src/backends.rs index 44c8077fc..66edc1b4a 100644 --- a/lib/src/backends.rs +++ b/lib/src/backends.rs @@ -15,6 +15,7 @@ use sozu_command::{ state::ClusterId, }; +use crate::metrics::names; use crate::{ PeakEWMA, load_balancing::{LeastLoaded, LoadBalancingAlgorithm, PowerOfTwo, Random, RoundRobin}, @@ -265,6 +266,7 @@ impl std::ops::Drop for Backend { backend_id: Some(self.backend_id.to_owned()), address: Some(self.address.into()), cluster_id: None, + metric_detail: None, }); } } @@ -320,12 +322,17 @@ impl BackendMap { let (available, total) = list.evaluate_availability(); gauge!( - "cluster.available_backends", + names::cluster::AVAILABLE_BACKENDS, available, Some(cluster_id), None ); - gauge!("cluster.total_backends", total, Some(cluster_id), None); + gauge!( + names::cluster::TOTAL_BACKENDS, + total, + Some(cluster_id), + None + ); let new_state = if total > 0 && available == 0 { ClusterAvailability::AllDown @@ -340,12 +347,17 @@ impl BackendMap { match (prev, new_state) { (ClusterAvailability::Available, ClusterAvailability::AllDown) => { error!("cluster {}: all {} backends are down", cluster_id, total); - incr!("cluster.no_available_backends", Some(cluster_id), None); + incr!( + names::cluster::NO_AVAILABLE_BACKENDS, + Some(cluster_id), + None + ); push_event(Event { kind: EventKind::NoAvailableBackends as i32, cluster_id: Some(cluster_id.to_owned()), backend_id: None, address: None, + metric_detail: None, }); } (ClusterAvailability::AllDown, ClusterAvailability::Available) => { @@ -353,12 +365,13 @@ impl BackendMap { "cluster {}: backends recovered ({}/{} available)", cluster_id, available, total ); - incr!("cluster.available_recovered", Some(cluster_id), None); + incr!(names::cluster::AVAILABLE_RECOVERED, Some(cluster_id), None); push_event(Event { kind: EventKind::ClusterRecovered as i32, cluster_id: 
Some(cluster_id.to_owned()), backend_id: None, address: None, + metric_detail: None, }); } _ => {} @@ -396,6 +409,10 @@ impl BackendMap { backend.borrow_mut().health = HealthState::default(); } } + // Re-emit the rollup gauges so dashboards reflect the + // post-reset availability instead of holding the last + // health-check value indefinitely. + self.record_cluster_availability(cluster_id); } } } @@ -411,6 +428,15 @@ impl BackendMap { BackendList::import_configuration_state(backend_vec), ) })); + // Replay path inserts every cluster's backend list without + // touching the gauge emission sites used by add/remove/health. + // Latch `cluster.available_backends` and `.total_backends` here + // so a freshly-loaded worker reports correct values on the very + // first `QueryMetrics` instead of zero until something else + // mutates each cluster. + for cluster_id in backends.keys() { + self.record_cluster_availability(cluster_id); + } } pub fn add_backend(&mut self, cluster_id: &str, backend: Backend) { @@ -756,7 +782,7 @@ impl BackendList { ); self.fail_open_warned = true; } - count!("backends.fail_open", 1); + count!(names::backend::FAIL_OPEN, 1); self.load_balancing.next_available_backend(&mut backends) } @@ -1235,4 +1261,87 @@ mod backends_test { "helper must not insert a BackendList for an unknown cluster_id" ); } + + #[test] + fn import_configuration_state_latches_cluster_rollup_gauges() { + use crate::metrics::METRICS; + use sozu_command_lib::proto::command::QueryMetricsOptions; + // Unique cluster id so the assertion is not perturbed by gauges + // left in the thread-local METRICS aggregator by sibling tests. 
+ let cluster_id = "c-import-rollup-9701"; + let mut map = BackendMap::new(); + let mut input = HashMap::new(); + input.insert( + cluster_id.to_owned(), + vec![sozu_command_lib::response::Backend { + cluster_id: cluster_id.to_owned(), + backend_id: "b1".to_owned(), + address: "127.0.0.1:9701".parse().unwrap(), + sticky_id: None, + load_balancing_parameters: None, + backup: None, + }], + ); + map.import_configuration_state(&input); + let response = METRICS + .with(|m| { + m.borrow_mut().query(&QueryMetricsOptions { + metric_names: vec![ + names::cluster::AVAILABLE_BACKENDS.to_owned(), + names::cluster::TOTAL_BACKENDS.to_owned(), + ], + cluster_ids: vec![cluster_id.to_owned()], + backend_ids: vec![], + list: false, + no_clusters: false, + workers: false, + }) + }) + .expect("metrics query succeeds"); + let cluster_metrics = match response.content_type { + Some( + sozu_command_lib::proto::command::response_content::ContentType::WorkerMetrics(wm), + ) => wm, + other => panic!("expected WorkerMetrics, got {other:?}"), + }; + let cm = cluster_metrics + .clusters + .get(cluster_id) + .expect("imported cluster must have a ClusterMetrics entry"); + // Without the import-time `record_cluster_availability` call the + // two rollup gauges would be absent here. The fix guarantees the + // pair lands without waiting for any follow-up backend mutation. + assert!( + cm.cluster.contains_key(names::cluster::AVAILABLE_BACKENDS), + "cluster.available_backends gauge must be latched at import time" + ); + assert!( + cm.cluster.contains_key(names::cluster::TOTAL_BACKENDS), + "cluster.total_backends gauge must be latched at import time" + ); + } + + #[test] + fn set_health_check_config_none_re_emits_rollup_after_reset() { + let mut map = BackendMap::new(); + let cluster_id = "c-hc-reset"; + // Seed the cluster with an unhealthy backend so `add_backend` + // drives the `availability` cell to AllDown. 
+ map.add_backend(cluster_id, unhealthy_backend("b1", 9801)); + assert_eq!( + ClusterAvailability::AllDown, + map.backends.get(cluster_id).unwrap().availability.get(), + "test setup: unhealthy backend must register the cell at AllDown" + ); + // Disabling the health check resets backend health to the default + // pristine state AND must re-emit the rollup so the cell reflects + // the post-reset availability instead of the stale AllDown. + map.set_health_check_config(cluster_id, None); + assert_eq!( + ClusterAvailability::Available, + map.backends.get(cluster_id).unwrap().availability.get(), + "set_health_check_config(None) must re-emit the rollup after \ + resetting backend health, otherwise dashboards stay stuck at AllDown" + ); + } } diff --git a/lib/src/crypto.rs b/lib/src/crypto.rs index 1b74e1112..0beda2e3f 100644 --- a/lib/src/crypto.rs +++ b/lib/src/crypto.rs @@ -400,19 +400,11 @@ mod tests { #[test] fn can_load_rsa_private_key() { use rustls::pki_types::PrivateKeyDer; - use std::io::BufReader; + use rustls::pki_types::pem::PemObject; let key_pem = include_str!("../assets/key.pem"); - let mut reader = BufReader::new(key_pem.as_bytes()); - let item = rustls_pemfile::read_one(&mut reader) - .expect("failed to read PEM") - .expect("no PEM item found"); - let private_key = match item { - rustls_pemfile::Item::Pkcs1Key(k) => PrivateKeyDer::from(k), - rustls_pemfile::Item::Pkcs8Key(k) => PrivateKeyDer::from(k), - rustls_pemfile::Item::Sec1Key(k) => PrivateKeyDer::from(k), - _ => panic!("unexpected key type"), - }; + let private_key = + PrivateKeyDer::from_pem_slice(key_pem.as_bytes()).expect("failed to parse PEM key"); any_supported_type(&private_key).expect("provider must be able to load RSA private key"); } diff --git a/lib/src/health_check.rs b/lib/src/health_check.rs index 1fd8e862f..7add3e03d 100644 --- a/lib/src/health_check.rs +++ b/lib/src/health_check.rs @@ -22,6 +22,7 @@ use sozu_command::{ state::ClusterId, }; +use crate::metrics::names; use crate::{ 
backends::BackendMap, protocol::mux::{ @@ -469,16 +470,22 @@ impl HealthChecker { config.healthy_threshold, cluster_id ); - incr!("health_check.up"); - gauge!("backend.available", 1, Some(cluster_id), Some(backend_id)); + incr!(names::health_check::UP); + gauge!( + names::backend::AVAILABLE, + 1, + Some(cluster_id), + Some(backend_id) + ); push_event(Event { kind: EventKind::HealthCheckHealthy as i32, cluster_id: Some(cluster_id.to_owned()), backend_id: Some(backend_id.to_owned()), address: Some(address.into()), + metric_detail: None, }); } - count!("health_check.success", 1); + count!(names::health_check::SUCCESS, 1); } else { let transitioned = backend.health.record_failure(config.unhealthy_threshold); if transitioned { @@ -490,16 +497,22 @@ impl HealthChecker { config.unhealthy_threshold, cluster_id ); - incr!("health_check.down"); - gauge!("backend.available", 0, Some(cluster_id), Some(backend_id)); + incr!(names::health_check::DOWN); + gauge!( + names::backend::AVAILABLE, + 0, + Some(cluster_id), + Some(backend_id) + ); push_event(Event { kind: EventKind::HealthCheckUnhealthy as i32, cluster_id: Some(cluster_id.to_owned()), backend_id: Some(backend_id.to_owned()), address: Some(address.into()), + metric_detail: None, }); } - count!("health_check.failure", 1); + count!(names::health_check::FAILURE, 1); } // Emit the healthy-backend gauge on every result update for clusters diff --git a/lib/src/http.rs b/lib/src/http.rs index 197343bd0..b4edadf98 100644 --- a/lib/src/http.rs +++ b/lib/src/http.rs @@ -26,6 +26,7 @@ use sozu_command::{ state::{ClusterId, validate_h2_flood_knobs_http, validate_sozu_id_header}, }; +use crate::metrics::names; use crate::{ AcceptError, FrontendFromRequestError, L7ListenerHandler, L7Proxy, ListenerError, ListenerHandler, Protocol, ProxyConfiguration, ProxyError, ProxySession, SessionIsToBeClosed, @@ -136,7 +137,7 @@ impl HttpSession { let state = if expect_proxy { trace!("{} starting in expect proxy state", log_module_context!()); - 
gauge_add!("protocol.proxy.expect", 1); + gauge_add!(names::protocol::PROXY_EXPECT, 1); HttpStateMachine::Expect(ExpectProxyProtocol::new( container_frontend_timeout, @@ -145,7 +146,7 @@ impl HttpSession { request_id, )) } else { - gauge_add!("protocol.http", 1); + gauge_add!(names::protocol::HTTP, 1); let session_address = sock.peer_addr().ok(); let session_ulid = rusty_ulid::Ulid::generate(); let sock = crate::socket::SessionTcpStream::new(sock, session_ulid, session_address); @@ -266,8 +267,8 @@ impl HttpSession { }; mux.frontend.readiness_mut().event = expect.frontend_readiness.event; - gauge_add!("protocol.proxy.expect", -1); - gauge_add!("protocol.http", 1); + gauge_add!(names::protocol::PROXY_EXPECT, -1); + gauge_add!(names::protocol::HTTP, 1); Some(HttpStateMachine::Mux(mux)) } _ => { @@ -387,9 +388,9 @@ impl HttpSession { // http.active_requests was already decremented by generate_access_log() // in h1.rs when the 101 response was written (before MuxResult::Upgrade). - gauge_add!("protocol.http", -1); - gauge_add!("protocol.ws", 1); - gauge_add!("websocket.active_requests", 1); + gauge_add!(names::protocol::HTTP, -1); + gauge_add!(names::protocol::WS, 1); + gauge_add!(names::websocket::ACTIVE_REQUESTS, 1); Some(HttpStateMachine::WebSocket(pipe)) } @@ -417,19 +418,19 @@ impl ProxySession for HttpSession { // Restore gauges match self.state.marker() { - StateMarker::Expect => gauge_add!("protocol.proxy.expect", -1), - StateMarker::Mux => gauge_add!("protocol.http", -1), + StateMarker::Expect => gauge_add!(names::protocol::PROXY_EXPECT, -1), + StateMarker::Mux => gauge_add!(names::protocol::HTTP, -1), StateMarker::WebSocket => { - gauge_add!("protocol.ws", -1); - gauge_add!("websocket.active_requests", -1); + gauge_add!(names::protocol::WS, -1); + gauge_add!(names::websocket::ACTIVE_REQUESTS, -1); } } if self.state.failed() { match self.state.marker() { - StateMarker::Expect => incr!("http.upgrade.expect.failed"), - StateMarker::Mux => 
incr!("http.upgrade.mux.failed"), - StateMarker::WebSocket => incr!("http.upgrade.ws.failed"), + StateMarker::Expect => incr!(names::http::UPGRADE_EXPECT_FAILED), + StateMarker::Mux => incr!(names::http::UPGRADE_MUX_FAILED), + StateMarker::WebSocket => incr!(names::http::UPGRADE_WS_FAILED), } // FailedUpgrade means the socket was consumed by a failed upgrade // attempt, so we can only close the state (no-op) and remove the @@ -656,14 +657,18 @@ impl L7ListenerHandler for HttpListener { let host = unsafe { from_utf8_unchecked(hostname) }; let route = self.fronts.lookup(host, uri, method).map_err(|e| { - incr!("http.failed_backend_matching"); + incr!(names::http::FAILED_BACKEND_MATCHING); FrontendFromRequestError::NoClusterFound(e) })?; let now = Instant::now(); if let Some(cluster) = route.cluster_id.as_deref() { - time!("frontend_matching_time", cluster, (now - start).as_millis()); + time!( + names::event_loop::FRONTEND_MATCHING_TIME, + cluster, + (now - start).as_millis() + ); } Ok(route) @@ -959,7 +964,7 @@ impl HttpProxy { // Carrying any `hsts` field here is a misconfiguration rather // than a deliberate choice. 
if front.hsts.is_some() { - incr!("http.hsts.suppressed_plaintext"); + incr!(names::http::HSTS_SUPPRESSED_PLAINTEXT); return Err(ProxyError::HstsOnPlainHttp(front.address.into())); } diff --git a/lib/src/https.rs b/lib/src/https.rs index 9405fe08e..51aafa409 100644 --- a/lib/src/https.rs +++ b/lib/src/https.rs @@ -48,6 +48,7 @@ use sozu_command::{ }, }; +use crate::metrics::names; use crate::{ AcceptError, CachedTags, FrontendFromRequestError, L7ListenerHandler, L7Proxy, ListenerError, ListenerHandler, Protocol, ProxyConfiguration, ProxyError, ProxySession, SessionIsToBeClosed, @@ -170,13 +171,13 @@ impl HttpsSession { let state = if expect_proxy { trace!("{} starting in expect proxy state", log_module_context!()); - gauge_add!("protocol.proxy.expect", 1); + gauge_add!(names::protocol::PROXY_EXPECT, 1); HttpsStateMachine::Expect( ExpectProxyProtocol::new(container_frontend_timeout, sock, token, request_id), rustls_details, ) } else { - gauge_add!("protocol.tls.handshake", 1); + gauge_add!(names::protocol::TLS_HANDSHAKE, 1); HttpsStateMachine::Handshake(TlsHandshake::new( container_frontend_timeout, rustls_details, @@ -267,8 +268,8 @@ impl HttpsSession { handshake.frontend_readiness = readiness; handshake.frontend_readiness.event.insert(Ready::READABLE); - gauge_add!("protocol.proxy.expect", -1); - gauge_add!("protocol.tls.handshake", 1); + gauge_add!(names::protocol::PROXY_EXPECT, -1); + gauge_add!(names::protocol::TLS_HANDSHAKE, 1); return Some(HttpsStateMachine::Handshake(handshake)); } } @@ -327,7 +328,7 @@ impl HttpsSession { let (alpn, alpn_label): (AlpnProtocol, Option<&'static str>) = match alpn { Some("http/1.1") => { if disable_http11 { - incr!("https.alpn.rejected.http11_disabled"); + incr!(names::https::ALPN_REJECTED_HTTP11_DISABLED); warn!( "{} rejecting TLS connection: listener is H2-only but client negotiated http/1.1", log_context!(self) @@ -344,7 +345,7 @@ impl HttpsSession { // bleeding through some misconfiguration). 
Add a dedicated // counter so the SOC's "ALPN refusal" ratebar matches the // sum of the labelled buckets. - incr!("https.alpn.rejected.unsupported"); + incr!(names::https::ALPN_REJECTED_UNSUPPORTED); error!( "{} unsupported ALPN protocol: {}", log_context!(self), @@ -357,7 +358,7 @@ impl HttpsSession { // listener we instead drop the connection. None => { if disable_http11 { - incr!("https.alpn.rejected.http11_disabled"); + incr!(names::https::ALPN_REJECTED_HTTP11_DISABLED); warn!( "{} rejecting TLS connection: listener is H2-only but client did not negotiate ALPN", log_context!(self) @@ -387,7 +388,7 @@ impl HttpsSession { incr!(rustls_ciphersuite_str(cipher)); }; - gauge_add!("protocol.tls.handshake", -1); + gauge_add!(names::protocol::TLS_HANDSHAKE, -1); let session_ulid = rusty_ulid::Ulid::generate(); let front_stream = FrontRustls { @@ -502,7 +503,7 @@ impl HttpsSession { context.tls_alpn = alpn_label; let mut frontend = match alpn { AlpnProtocol::Http11 => { - incr!("http.alpn.http11"); + incr!(names::http::ALPN_HTTP11); context.create_stream(handshake.request_id, 1 << 16)?; mux::Connection::new_h1_server( session_ulid, @@ -511,7 +512,7 @@ impl HttpsSession { ) } AlpnProtocol::H2 => { - incr!("http.alpn.h2"); + incr!(names::http::ALPN_H2); let flood_config = self.listener.borrow().get_h2_flood_config(); let connection_config = self.listener.borrow().get_h2_connection_config(); let stream_idle_timeout = self.listener.borrow().get_h2_stream_idle_timeout(); @@ -542,7 +543,7 @@ impl HttpsSession { .event .insert(Ready::READABLE | Ready::WRITABLE); - gauge_add!("protocol.https", 1); + gauge_add!(names::protocol::HTTPS, 1); Some(HttpsStateMachine::Mux(Mux { configured_frontend_timeout: self.configured_frontend_timeout, frontend_token: self.frontend_token, @@ -668,9 +669,9 @@ impl HttpsSession { // http.active_requests was already decremented by generate_access_log() // in h1.rs when the 101 response was written (before MuxResult::Upgrade). 
- gauge_add!("protocol.https", -1); - gauge_add!("protocol.wss", 1); - gauge_add!("websocket.active_requests", 1); + gauge_add!(names::protocol::HTTPS, -1); + gauge_add!(names::protocol::WSS, 1); + gauge_add!(names::websocket::ACTIVE_REQUESTS, 1); Some(HttpsStateMachine::WebSocket(pipe)) } @@ -698,21 +699,21 @@ impl ProxySession for HttpsSession { // Restore gauges match self.state.marker() { - StateMarker::Expect => gauge_add!("protocol.proxy.expect", -1), - StateMarker::Handshake => gauge_add!("protocol.tls.handshake", -1), - StateMarker::Mux => gauge_add!("protocol.https", -1), + StateMarker::Expect => gauge_add!(names::protocol::PROXY_EXPECT, -1), + StateMarker::Handshake => gauge_add!(names::protocol::TLS_HANDSHAKE, -1), + StateMarker::Mux => gauge_add!(names::protocol::HTTPS, -1), StateMarker::WebSocket => { - gauge_add!("protocol.wss", -1); - gauge_add!("websocket.active_requests", -1); + gauge_add!(names::protocol::WSS, -1); + gauge_add!(names::websocket::ACTIVE_REQUESTS, -1); } } if self.state.failed() { match self.state.marker() { - StateMarker::Expect => incr!("https.upgrade.expect.failed"), - StateMarker::Handshake => incr!("https.upgrade.handshake.failed"), - StateMarker::Mux => incr!("https.upgrade.mux.failed"), - StateMarker::WebSocket => incr!("https.upgrade.wss.failed"), + StateMarker::Expect => incr!(names::https::UPGRADE_EXPECT_FAILED), + StateMarker::Handshake => incr!(names::https::UPGRADE_HANDSHAKE_FAILED), + StateMarker::Mux => incr!(names::https::UPGRADE_MUX_FAILED), + StateMarker::WebSocket => incr!(names::https::UPGRADE_WSS_FAILED), } // FailedUpgrade means the socket was consumed by a failed upgrade // attempt, so we can only close the state (no-op) and remove the @@ -933,14 +934,18 @@ impl L7ListenerHandler for HttpsListener { let host = unsafe { from_utf8_unchecked(hostname) }; let route = self.fronts.lookup(host, uri, method).map_err(|e| { - incr!("http.failed_backend_matching"); + incr!(names::http::FAILED_BACKEND_MATCHING); 
FrontendFromRequestError::NoClusterFound(e) })?; let now = Instant::now(); if let Some(cluster) = route.cluster_id.as_deref() { - time!("frontend_matching_time", cluster, (now - start).as_millis()); + time!( + names::event_loop::FRONTEND_MATCHING_TIME, + cluster, + (now - start).as_millis() + ); } Ok(route) @@ -1429,7 +1434,7 @@ impl HttpsListener { .fronts .refresh_inheriting_hsts(self.config.hsts.as_ref()); for _ in 0..refreshed { - crate::incr!("http.hsts.frontend_refreshed"); + crate::incr!(names::http::HSTS_FRONTEND_REFRESHED); } info!( "{} HTTPS listener {:?} HSTS default patched; refreshed {} inheriting \ @@ -1438,7 +1443,7 @@ impl HttpsListener { self.config.address, refreshed, ); - crate::incr!("http.hsts.listener_default_patched"); + crate::incr!(names::http::HSTS_LISTENER_DEFAULT_PATCHED); } Ok(()) diff --git a/lib/src/lib.rs b/lib/src/lib.rs index 9348dd5cd..6a2bf9bce 100644 --- a/lib/src/lib.rs +++ b/lib/src/lib.rs @@ -361,7 +361,7 @@ use sozu_command::{ }; use tls::CertificateResolverError; -use crate::{backends::BackendMap, router::RouteResult}; +use crate::{backends::BackendMap, metrics::names, router::RouteResult}; /// Anything that can be registered in mio (subscribe to kernel events) #[derive(Debug, Clone, Copy, PartialEq, Eq)] @@ -1295,11 +1295,19 @@ impl SessionMetrics { let service_time = self.service_time(); if let Some(cluster_id) = context.cluster_id { - time!("request_time", cluster_id, request_time.as_millis()); - time!("service_time", cluster_id, service_time.as_millis()); + time!( + names::event_loop::REQUEST_TIME, + cluster_id, + request_time.as_millis() + ); + time!( + names::event_loop::SERVICE_TIME, + cluster_id, + service_time.as_millis() + ); } - time!("request_time", request_time.as_millis()); - time!("service_time", service_time.as_millis()); + time!(names::event_loop::REQUEST_TIME, request_time.as_millis()); + time!(names::event_loop::SERVICE_TIME, service_time.as_millis()); if let Some(backend_id) = self.backend_id.as_ref() { 
if let Some(backend_response_time) = self.backend_response_time() { @@ -1314,7 +1322,11 @@ impl SessionMetrics { } } - incr!("access_logs.count", context.cluster_id, context.backend_id); + incr!( + names::access_logs::COUNT, + context.cluster_id, + context.backend_id + ); } } diff --git a/lib/src/metrics/local_drain.rs b/lib/src/metrics/local_drain.rs index 8114ecc2e..9eb714ee8 100644 --- a/lib/src/metrics/local_drain.rs +++ b/lib/src/metrics/local_drain.rs @@ -676,19 +676,20 @@ mod tests { use sozu_command::proto::command::{FilteredMetrics, filtered_metrics::Inner}; use super::*; + use crate::metrics::names; #[test] fn receive_and_yield_backend_metrics() { let mut local_drain = LocalDrain::new("prefix".to_string()); local_drain.receive_metric( - "connections_per_backend", + names::backend::CONNECTIONS_PER_BACKEND, Some("test-cluster"), Some("test-backend-1"), MetricValue::Count(1), ); local_drain.receive_metric( - "connections_per_backend", + names::backend::CONNECTIONS_PER_BACKEND, Some("test-cluster"), Some("test-backend-1"), MetricValue::Count(1), @@ -696,7 +697,7 @@ mod tests { let mut expected_metrics_1 = BTreeMap::new(); expected_metrics_1.insert( - "connections_per_backend".to_string(), + names::backend::CONNECTIONS_PER_BACKEND.to_string(), FilteredMetrics { inner: Some(Inner::Count(2)), }, @@ -712,7 +713,7 @@ mod tests { local_drain .metrics_of_one_backend( "test-backend-1", - ["connections_per_backend".to_string()].as_ref(), + [names::backend::CONNECTIONS_PER_BACKEND.to_string()].as_ref(), ) .expect("could not query metrics for this backend") ) @@ -787,7 +788,7 @@ mod tests { let mut local_drain = LocalDrain::new("prefix".to_string()); local_drain.receive_metric( - "connections_per_backend", + names::backend::CONNECTIONS_PER_BACKEND, Some("test-cluster"), Some("test-backend-1"), MetricValue::GaugeAdd(-1), @@ -796,13 +797,13 @@ mod tests { let result = local_drain .metrics_of_one_backend( "test-backend-1", - 
["connections_per_backend".to_string()].as_ref(), + [names::backend::CONNECTIONS_PER_BACKEND.to_string()].as_ref(), ) .expect("could not query metrics for this backend"); let gauge_value = match result .metrics - .get("connections_per_backend") + .get(names::backend::CONNECTIONS_PER_BACKEND) .and_then(|m| m.inner.as_ref()) { Some(Inner::Gauge(v)) => *v, @@ -822,13 +823,13 @@ mod tests { // First, create the gauge with a positive value then bring it to 0. local_drain.receive_metric( - "connections_per_backend", + names::backend::CONNECTIONS_PER_BACKEND, Some("test-cluster"), Some("test-backend-1"), MetricValue::GaugeAdd(1), ); local_drain.receive_metric( - "connections_per_backend", + names::backend::CONNECTIONS_PER_BACKEND, Some("test-cluster"), Some("test-backend-1"), MetricValue::GaugeAdd(-1), @@ -836,7 +837,7 @@ mod tests { // Now apply a second -1, which should clamp to 0 (the underflow case). local_drain.receive_metric( - "connections_per_backend", + names::backend::CONNECTIONS_PER_BACKEND, Some("test-cluster"), Some("test-backend-1"), MetricValue::GaugeAdd(-1), @@ -845,13 +846,13 @@ mod tests { let result = local_drain .metrics_of_one_backend( "test-backend-1", - ["connections_per_backend".to_string()].as_ref(), + [names::backend::CONNECTIONS_PER_BACKEND.to_string()].as_ref(), ) .expect("could not query metrics for this backend"); let gauge_value = match result .metrics - .get("connections_per_backend") + .get(names::backend::CONNECTIONS_PER_BACKEND) .and_then(|m| m.inner.as_ref()) { Some(Inner::Gauge(v)) => *v, @@ -869,7 +870,7 @@ mod tests { // Add a gauge (connections_per_backend) for a backend. 
local_drain.receive_metric( - "connections_per_backend", + names::backend::CONNECTIONS_PER_BACKEND, Some("test-cluster"), Some("test-backend-1"), MetricValue::GaugeAdd(3), @@ -890,7 +891,7 @@ mod tests { .metrics_of_one_backend( "test-backend-1", [ - "connections_per_backend".to_string(), + names::backend::CONNECTIONS_PER_BACKEND.to_string(), "backend.connections.error".to_string(), ] .as_ref(), @@ -900,7 +901,7 @@ mod tests { // Gauge should still be 3. let gauge_value = match result .metrics - .get("connections_per_backend") + .get(names::backend::CONNECTIONS_PER_BACKEND) .and_then(|m| m.inner.as_ref()) { Some(Inner::Gauge(v)) => *v, @@ -924,7 +925,7 @@ mod tests { // Connection opens: gauge goes to 1. local_drain.receive_metric( - "connections_per_backend", + names::backend::CONNECTIONS_PER_BACKEND, Some("test-cluster"), Some("test-backend-1"), MetricValue::GaugeAdd(1), @@ -935,7 +936,7 @@ mod tests { // Connection closes: gauge goes from 1 to 0 (not 0 to -1). local_drain.receive_metric( - "connections_per_backend", + names::backend::CONNECTIONS_PER_BACKEND, Some("test-cluster"), Some("test-backend-1"), MetricValue::GaugeAdd(-1), @@ -944,13 +945,13 @@ mod tests { let result = local_drain .metrics_of_one_backend( "test-backend-1", - ["connections_per_backend".to_string()].as_ref(), + [names::backend::CONNECTIONS_PER_BACKEND.to_string()].as_ref(), ) .expect("could not query metrics for this backend"); let gauge_value = match result .metrics - .get("connections_per_backend") + .get(names::backend::CONNECTIONS_PER_BACKEND) .and_then(|m| m.inner.as_ref()) { Some(Inner::Gauge(v)) => *v, @@ -974,20 +975,20 @@ mod tests { for _ in 0..2 { local_drain.receive_metric( - "http.status.2xx", + names::http::STATUS_2XX, Some("test-cluster"), Some("test-backend"), MetricValue::Count(1), ); local_drain.receive_metric( - "http.status.200", + names::http_status::S200, Some("test-cluster"), Some("test-backend"), MetricValue::Count(1), ); } local_drain.receive_metric( - 
"http.status.404", + names::http_status::S404, Some("test-cluster"), Some("test-backend"), MetricValue::Count(1), @@ -997,28 +998,28 @@ mod tests { .metrics_of_one_backend( "test-backend", [ - "http.status.2xx".to_string(), - "http.status.200".to_string(), - "http.status.404".to_string(), + names::http::STATUS_2XX.to_string(), + names::http_status::S200.to_string(), + names::http_status::S404.to_string(), ] .as_ref(), ) .expect("could not query metrics for this backend"); assert_eq!( - backend_metrics.metrics.get("http.status.2xx"), + backend_metrics.metrics.get(names::http::STATUS_2XX), Some(&FilteredMetrics { inner: Some(Inner::Count(2)), }) ); assert_eq!( - backend_metrics.metrics.get("http.status.200"), + backend_metrics.metrics.get(names::http_status::S200), Some(&FilteredMetrics { inner: Some(Inner::Count(2)), }) ); assert_eq!( - backend_metrics.metrics.get("http.status.404"), + backend_metrics.metrics.get(names::http_status::S404), Some(&FilteredMetrics { inner: Some(Inner::Count(1)), }) diff --git a/lib/src/metrics/mod.rs b/lib/src/metrics/mod.rs index f8c426173..13622e961 100644 --- a/lib/src/metrics/mod.rs +++ b/lib/src/metrics/mod.rs @@ -7,16 +7,17 @@ //! correctness bug (saturating clamp + warn log), not a rounding artefact. mod local_drain; +pub mod names; mod network_drain; mod writer; use std::{ cell::RefCell, - collections::BTreeMap, + collections::{BTreeMap, HashMap}, io::{self, Write}, net::SocketAddr, str, - time::Instant, + time::{Duration, Instant}, }; use mio::net::UdpSocket; @@ -231,6 +232,136 @@ pub trait Subscriber { ); } +/// How often `lease_tick` actually does work; cheaper than recomputing the +/// effective level on every metric emission. Polled at the top of the worker's +/// `notify` loop, so the cadence floats with traffic but is bounded by this. +const LEASE_TICK_INTERVAL: Duration = Duration::from_secs(5); + +/// Hard cap on lease TTL, mirroring the proto comment on `SetMetricDetail`. 
+/// Bounds the worst-case effect of a stuck renewer. +pub const LEASE_TTL_MAX: Duration = Duration::from_secs(300); + +/// Default lease TTL applied when the proto request omits `ttl_seconds` (or +/// passes `0`). Matches the proto comment. +pub const LEASE_TTL_DEFAULT: Duration = Duration::from_secs(60); + +/// Hard cap on the number of simultaneous leases held by the aggregator. +/// `lease_apply` rejects new entries (with [`LeaseApplyOutcome::TableFull`]) +/// once the table reaches this size. Bounds the lease table's memory and +/// neutralises the CWE-770 vector where a same-UID attacker rolls +/// `client_id` faster than expiry to grow the map unbounded. 64 is well +/// above any plausible TUI fleet (one TUI per operator + a handful of +/// metric scrapers); legitimate callers renewing the same `client_id` +/// REPLACE rather than insert and therefore don't bump the count. +pub const LEASE_TABLE_CAP: usize = 64; + +/// Hard cap on `client_id` length accepted by `lease_apply`. The TUI +/// uses `top::<8 hex chars>` ≤ 24 bytes; 64 leaves headroom for +/// other operator-side scrapers while keeping the lease table's per- +/// entry memory bounded and the audit-log lease_id column small. +pub const LEASE_CLIENT_ID_MAX_BYTES: usize = 64; + +/// Outcome of [`Aggregator::lease_apply`]. The success arm returns the +/// `(previous_effective, new_effective)` pair the caller can use to +/// decide whether to emit a `MetricDetailChanged` audit event; the +/// failure arms surface bounded-input rejections. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum LeaseApplyOutcome { + /// Lease inserted / renewed. + Applied { + previous_effective: MetricDetailLevel, + new_effective: MetricDetailLevel, + }, + /// `client_id` length exceeds [`LEASE_CLIENT_ID_MAX_BYTES`]. + ClientIdTooLong, + /// Lease table is at [`LEASE_TABLE_CAP`] and the request is a new + /// insert (not a renewal). Callers MUST surface this as a `FAILURE` + /// to the wire so the lessor can back off. 
+ TableFull, + /// The requested TTL exceeds [`LEASE_TTL_MAX`]. This arm is + /// theoretically unreachable in the normal flow because the + /// dispatch site rejects out-of-range values before reaching the + /// aggregator; surfacing it explicitly catches callers that bypass + /// dispatch (proto fuzzing, future internal use) instead of + /// silently clamping their intent. + TtlOutOfRange, + /// A renewal request was presented with a [`PeerBinding`] that + /// disagrees with the existing lease's apply-time binding. Returned + /// only when the existing binding is fully known (per + /// [`PeerBinding::is_known`]); unknown-binding leases (no + /// `SO_PEERCRED` available, pre-binding callers) continue to + /// accept any renewer. Closes the same-UID `client_id`-collision + /// takeover where an attacker re-applies a lease against a + /// victim's id and replaces the binding to lock the victim out of + /// their own clear. + Unauthorized, +} + +/// Master-populated peer binding stored alongside each lease so subsequent +/// `clear` requests can be authorised against the apply-time owner. The +/// binding pairs an OS pid (from `SO_PEERCRED` on Linux, captured by the +/// master at command-socket accept time) with the master-side connection +/// session ULID. A clear request must present BOTH values matching the +/// apply-time binding. A binding of `(None, None)` ("unknown") at apply +/// time means the connection had no peer credentials available — the +/// worker then accepts any clear for that `client_id` to preserve compat +/// with non-Linux callers and intermediate proxies. See the proto comment +/// on `SetMetricDetail.peer_pid` for the trust model. +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] +pub struct PeerBinding { + pub pid: Option, + /// Session ULID rendered as a `u128` (the master's session anchor). + /// Stored as the raw u128 rather than the original `Ulid` to avoid + /// dragging that crate into the metrics-aggregator dependencies. 
+ pub session_ulid: Option, +} + +impl PeerBinding { + /// `true` when both halves are known. A fully-known binding is the + /// only one against which `clear` may be authorised; partial bindings + /// (one half `None`) degrade to "accept any clear" per the proto + /// contract on `SetMetricDetail.peer_pid` / `peer_session_ulid`. + pub fn is_known(&self) -> bool { + self.pid.is_some() && self.session_ulid.is_some() + } + + /// True when `self` and `other` are compatible (same `pid` + same + /// `session_ulid`) AND `self.is_known()`. Used by `lease_clear` and + /// `lease_apply` (renewals) to reject mismatched bindings. + pub fn matches(&self, other: &PeerBinding) -> bool { + self.is_known() && self.pid == other.pid && self.session_ulid == other.session_ulid + } +} + +/// One lease entry kept inside [`Aggregator::leases`]. Carries the +/// requested cardinality level, the absolute expiry instant, and the +/// master-supplied [`PeerBinding`] captured at apply time. +#[derive(Clone, Copy, Debug)] +struct LeaseEntry { + level: MetricDetailLevel, + expires_at: Instant, + binding: PeerBinding, +} + +/// Outcome of [`Aggregator::lease_clear`]. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum LeaseClearOutcome { + /// The lease was found, the binding matched (or was unknown at apply + /// time), and the entry has been removed. Carries the worker's + /// previous effective level so the caller can decide whether to + /// emit a `MetricDetailChanged` audit event. + Cleared { + previous_effective: MetricDetailLevel, + }, + /// No lease was held by the requested `client_id`. Silent no-op. + NotFound, + /// A lease existed but the peer binding presented with the clear did + /// not match the apply-time binding. The lease is left intact. The + /// worker MUST surface this as a `FAILURE` response to discourage + /// guessing attacks against another operator's lease. 
+ Unauthorized, +} + pub struct Aggregator { /// appended to metric keys, usually "sozu-" prefix: String, @@ -238,21 +369,36 @@ pub struct Aggregator { network: Option, /// gather metrics locally, queried by the CLI local: LocalDrain, - /// Cardinality knob — filters `(cluster_id, backend_id)` labels at - /// emission time. Defaults to `Cluster` to preserve the historical - /// pre-knob behaviour (cluster-scoped metrics emitted, backend-scoped - /// labels dropped before this knob existed only when the macros didn't - /// pass them). - detail: MetricDetailLevel, + /// Static cardinality knob — set once at boot from `MetricsConfig.detail`. + /// Filters `(cluster_id, backend_id)` labels at emission time. Each level + /// is a SUPERSET of the previous one (`Process ⊆ Frontend ⊆ Cluster ⊆ Backend`). + configured: MetricDetailLevel, + /// Effective cardinality knob actually applied at emission. Equal to + /// `max(configured, max(active leases))`. Recomputed only when leases + /// change, so the hot path (`receive_metric`) reads a single field. + effective: MetricDetailLevel, + /// Active TTL leases keyed by `client_id` from `SetMetricDetail`. A live + /// lease holds the worker's effective level at-or-above its requested + /// detail until it expires or is explicitly cleared. Multiple clients + /// (e.g. several `sozu top` sessions) lease independently. + leases: HashMap, + /// Wall-clock anchor for the polled lease janitor. Updated on every + /// `lease_tick` call regardless of whether expiry happened, so the + /// caller's "is it time to tick?" check stays cheap. 
+ last_lease_tick: Instant, } impl Aggregator { pub fn new(prefix: String) -> Aggregator { + let default_detail = MetricDetailLevel::default(); Aggregator { prefix: prefix.clone(), network: None, local: LocalDrain::new(prefix), - detail: MetricDetailLevel::default(), + configured: default_detail, + effective: default_detail, + leases: HashMap::new(), + last_lease_tick: Instant::now(), } } @@ -276,10 +422,179 @@ impl Aggregator { } } - /// Cardinality knob — see [`MetricDetailLevel`] and - /// [`filter_labels_for_detail`] for the per-level filtering rules. + /// Set the static cardinality floor (`MetricsConfig.detail` from the TOML + /// configuration). Live leases applied via [`Self::lease_apply`] can + /// elevate the effective level at runtime; the configured floor is the + /// lower bound the worker falls back to when no lease is active. + /// + /// See [`MetricDetailLevel`] and [`filter_labels_for_detail`] for the + /// per-level filtering rules. pub fn set_up_detail(&mut self, detail: MetricDetailLevel) { - self.detail = detail; + self.configured = detail; + self.recompute_effective(); + } + + /// Returns the static (configured) cardinality floor. Independent of + /// runtime leases. + pub fn detail_configured(&self) -> MetricDetailLevel { + self.configured + } + + /// Returns the cardinality level actually applied to emissions. Equal to + /// `max(configured, max(active leases))`. + pub fn detail_effective(&self) -> MetricDetailLevel { + self.effective + } + + /// Apply or renew a runtime cardinality lease for `client_id`. If a lease + /// for the same client already exists it is REPLACED (used for renewals). + /// `ttl` values above [`LEASE_TTL_MAX`] are **rejected** with + /// [`LeaseApplyOutcome::TtlOutOfRange`] (NOT clamped); callers must + /// handle that arm or pre-validate the TTL. On success the call returns + /// `(previous_effective, new_effective)` so callers can decide whether + /// to emit a `MetricDetailChanged` audit event. 
+ /// + /// The proto contract on `SetMetricDetail.ttl_seconds` is that the worker + /// **rejects** out-of-range values with a `FAILURE` response — that + /// enforcement lives at the dispatch site in `lib/src/server.rs::notify`. + /// `lease_apply` itself returns [`LeaseApplyOutcome::TtlOutOfRange`] + /// when called with `ttl > LEASE_TTL_MAX` so callers that bypass the + /// dispatch site (proto fuzzing, future internal use) see a loud + /// rejection instead of silently capped semantics. Same shape for + /// over-long `client_id` and a full lease table — see + /// [`LeaseApplyOutcome`] for the failure arms. + pub fn lease_apply( + &mut self, + client_id: String, + level: MetricDetailLevel, + ttl: Duration, + binding: PeerBinding, + ) -> LeaseApplyOutcome { + if client_id.len() > LEASE_CLIENT_ID_MAX_BYTES { + return LeaseApplyOutcome::ClientIdTooLong; + } + if ttl > LEASE_TTL_MAX { + return LeaseApplyOutcome::TtlOutOfRange; + } + // Cap the table size BEFORE the insert, but only when the caller + // is inserting a fresh entry. Renewals (same `client_id` already + // present) REPLACE the existing entry and therefore keep the + // count stable — they must always succeed so an active operator + // never loses their lease just because the table is full. + let is_renewal = self.leases.contains_key(&client_id); + if !is_renewal && self.leases.len() >= LEASE_TABLE_CAP { + return LeaseApplyOutcome::TableFull; + } + // Renewal-binding gate: when a lease already exists for this + // `client_id` and its apply-time binding is fully known, the + // renewer's presented binding MUST match. Without this check + // any same-UID caller that learns the `client_id` (PID from + // /proc, suffix from audit log) could re-apply against it, + // overwriting the binding to point at the attacker's session + // and then driving the victim's Drop-time `clear` into + // `Unauthorized`. 
Unknown apply-time bindings continue to + // accept any renewer per the proto contract on + // `SetMetricDetail.peer_pid` / `peer_session_ulid`. + if is_renewal + && let Some(entry) = self.leases.get(&client_id) + && entry.binding.is_known() + && !entry.binding.matches(&binding) + { + return LeaseApplyOutcome::Unauthorized; + } + let expires_at = Instant::now() + ttl; + self.leases.insert( + client_id, + LeaseEntry { + level, + expires_at, + binding, + }, + ); + let previous_effective = self.effective; + self.recompute_effective(); + LeaseApplyOutcome::Applied { + previous_effective, + new_effective: self.effective, + } + } + + /// Explicitly release a lease keyed by `client_id`. The clear is + /// authorised against the apply-time [`PeerBinding`] when one was + /// recorded — see [`LeaseClearOutcome`] for the three result states. + /// A clear request with `presented = PeerBinding::default()` matches + /// only leases whose apply-time binding was also unknown, preserving + /// compat with pre-binding callers and platforms without + /// `SO_PEERCRED`. + pub fn lease_clear(&mut self, client_id: &str, presented: PeerBinding) -> LeaseClearOutcome { + let Some(entry) = self.leases.get(client_id) else { + return LeaseClearOutcome::NotFound; + }; + // If the apply-time binding is fully known we MUST authorise the + // clear against it; presenting `default()` (an empty binding) is + // a mismatch. If the apply-time binding is unknown (a pre-binding + // caller, or a non-Linux peer with no credentials), we permit + // any clear — there is nothing to authorise against. + if entry.binding.is_known() && !entry.binding.matches(&presented) { + return LeaseClearOutcome::Unauthorized; + } + self.leases.remove(client_id); + let previous = self.effective; + self.recompute_effective(); + LeaseClearOutcome::Cleared { + previous_effective: previous, + } + } + + /// Returns the number of active (non-expired-as-of-last-tick) leases. 
+ /// Surfaced in `WorkerMetricDetailStatus` so the TUI can show + /// "another client is still leasing this level". + pub fn lease_count(&self) -> u32 { + self.leases.len() as u32 + } + + /// Polled lease-expiry janitor. Called from the worker's `notify` loop + /// (and from periodic timers); cheap when nothing has expired. Returns + /// `Some(previous_effective)` when at least one lease expired AND that + /// expiry actually moved the effective level (so the caller can emit a + /// `MetricDetailChanged` audit event), or `None` for the no-change path. + /// + /// `now` is parameterised so unit tests can advance the clock + /// deterministically without sleeping. + pub fn lease_tick(&mut self, now: Instant) -> Option { + self.last_lease_tick = now; + let before = self.leases.len(); + self.leases.retain(|_, entry| entry.expires_at > now); + if self.leases.len() == before { + return None; + } + let previous = self.effective; + self.recompute_effective(); + if previous != self.effective { + Some(previous) + } else { + None + } + } + + /// True when at least [`LEASE_TICK_INTERVAL`] has passed since the last + /// `lease_tick`. Use to gate the polled janitor at the top of `notify` + /// without paying a HashMap walk on every event-loop iteration. + pub fn lease_tick_due(&self, now: Instant) -> bool { + now.duration_since(self.last_lease_tick) >= LEASE_TICK_INTERVAL + } + + /// Recompute `effective = max(configured, max(active leases))`. Cheap (one + /// linear pass over the lease table) and only called on apply/clear/tick, + /// never on the metric-emission hot path. + fn recompute_effective(&mut self) { + let mut max_lease = self.configured; + for entry in self.leases.values() { + if entry.level > max_lease { + max_lease = entry.level; + } + } + self.effective = max_lease; } pub fn socket(&self) -> Option<&UdpSocket> { @@ -343,9 +658,11 @@ impl Subscriber for Aggregator { ) { // Apply the cardinality knob BEFORE handing the metric to either // drain. 
Both drains see the same filtered labels, keeping the local - // CLI view consistent with what statsd receives on the wire. + // CLI view consistent with what statsd receives on the wire. Reads + // `effective` (max of the configured floor and any active leases), + // which is recomputed off the hot path. let (cluster_id, backend_id) = - filter_labels_for_detail(self.detail, cluster_id, backend_id); + filter_labels_for_detail(self.effective, cluster_id, backend_id); if let Some(ref mut net) = self.network.as_mut() { net.receive_metric(label, cluster_id, backend_id, metric.to_owned()); } @@ -489,14 +806,14 @@ macro_rules! record_backend_metrics ( let cluster_id: &str = $cluster_id; let backend_id: &str = $backend_id; - m.receive_metric("bytes_in", Some(cluster_id), Some(backend_id), MetricValue::Count($bin as i64)); - m.receive_metric("bytes_out", Some(cluster_id), Some(backend_id), MetricValue::Count($bout as i64)); - m.receive_metric("backend_response_time", Some(cluster_id), Some(backend_id), MetricValue::Time($response_time as usize)); + m.receive_metric($crate::metrics::names::backend::BYTES_IN, Some(cluster_id), Some(backend_id), MetricValue::Count($bin as i64)); + m.receive_metric($crate::metrics::names::backend::BYTES_OUT, Some(cluster_id), Some(backend_id), MetricValue::Count($bout as i64)); + m.receive_metric($crate::metrics::names::backend::RESPONSE_TIME, Some(cluster_id), Some(backend_id), MetricValue::Time($response_time as usize)); if let Some(t) = $backend_connection_time { - m.receive_metric("backend_connection_time", Some(cluster_id), Some(backend_id), MetricValue::Time(t.as_millis() as usize)); + m.receive_metric($crate::metrics::names::backend::CONNECTION_TIME, Some(cluster_id), Some(backend_id), MetricValue::Time(t.as_millis() as usize)); } - m.receive_metric("requests", Some(cluster_id), Some(backend_id), MetricValue::Count(1)); + m.receive_metric($crate::metrics::names::backend::REQUESTS, Some(cluster_id), Some(backend_id), 
MetricValue::Count(1)); }); } ); @@ -557,6 +874,417 @@ mod tests { // Pre-knob behaviour preserved: if a worker / process never calls // `set_up_detail`, cluster-scoped metrics still reach the drains. let agg = Aggregator::new("sozu".to_owned()); - assert_eq!(agg.detail, MetricDetailLevel::Cluster); + assert_eq!(agg.detail_configured(), MetricDetailLevel::Cluster); + assert_eq!(agg.detail_effective(), MetricDetailLevel::Cluster); + assert_eq!(agg.lease_count(), 0); + } + + /// Fully-known binding used by tests that don't otherwise care about the + /// peer-binding mechanic. Two distinct values (`OWNER_*` / `OTHER_*`) + /// let `lease_clear` tests assert authorised vs unauthorised paths. + fn owner_binding() -> PeerBinding { + PeerBinding { + pid: Some(1234), + session_ulid: Some(0x0123_4567_89ab_cdef_0123_4567_89ab_cdefu128), + } + } + + fn other_binding() -> PeerBinding { + PeerBinding { + pid: Some(5678), + session_ulid: Some(0xfedc_ba98_7654_3210_fedc_ba98_7654_3210u128), + } + } + + /// Extract `(previous_effective, new_effective)` from a successful + /// `lease_apply`; panics on any failure arm so tests that don't care + /// about the failure paths stay compact. + fn unwrap_applied(outcome: LeaseApplyOutcome) -> (MetricDetailLevel, MetricDetailLevel) { + match outcome { + LeaseApplyOutcome::Applied { + previous_effective, + new_effective, + } => (previous_effective, new_effective), + other => panic!("expected LeaseApplyOutcome::Applied, got {other:?}"), + } + } + + #[test] + fn lease_apply_elevates_effective_above_configured() { + // Configured floor stays at Cluster; a lease for Backend lifts the + // effective level until the lease expires or is cleared. 
+ let mut agg = Aggregator::new("sozu".to_owned()); + agg.set_up_detail(MetricDetailLevel::Cluster); + let (prev, new) = unwrap_applied(agg.lease_apply( + "test:1".to_owned(), + MetricDetailLevel::Backend, + Duration::from_secs(60), + PeerBinding::default(), + )); + assert_eq!(prev, MetricDetailLevel::Cluster); + assert_eq!(new, MetricDetailLevel::Backend); + assert_eq!(agg.detail_configured(), MetricDetailLevel::Cluster); + assert_eq!(agg.detail_effective(), MetricDetailLevel::Backend); + assert_eq!(agg.lease_count(), 1); + } + + #[test] + fn lease_apply_below_configured_does_not_lower_effective() { + // A lease can only ELEVATE the floor, never push below `configured`. + let mut agg = Aggregator::new("sozu".to_owned()); + agg.set_up_detail(MetricDetailLevel::Backend); + let (prev, new) = unwrap_applied(agg.lease_apply( + "test:1".to_owned(), + MetricDetailLevel::Cluster, + Duration::from_secs(60), + PeerBinding::default(), + )); + assert_eq!(prev, MetricDetailLevel::Backend); + assert_eq!(new, MetricDetailLevel::Backend); + } + + #[test] + fn lease_apply_rejects_client_id_over_cap() { + // Defence-in-depth: even if dispatch lets a too-long id through, + // the aggregator refuses to store it. + let mut agg = Aggregator::new("sozu".to_owned()); + let too_long = "x".repeat(LEASE_CLIENT_ID_MAX_BYTES + 1); + assert_eq!( + agg.lease_apply( + too_long, + MetricDetailLevel::Backend, + Duration::from_secs(60), + PeerBinding::default(), + ), + LeaseApplyOutcome::ClientIdTooLong + ); + assert_eq!(agg.lease_count(), 0); + } + + #[test] + fn lease_apply_rejects_when_table_is_full() { + // Fill the table to capacity with distinct client_ids; one more + // insert is refused. A RENEWAL of an existing entry must still + // succeed (replaces in place, count unchanged). 
+ let mut agg = Aggregator::new("sozu".to_owned()); + for i in 0..LEASE_TABLE_CAP { + assert!(matches!( + agg.lease_apply( + format!("client:{i:02}"), + MetricDetailLevel::Backend, + Duration::from_secs(60), + PeerBinding::default(), + ), + LeaseApplyOutcome::Applied { .. } + )); + } + assert_eq!(agg.lease_count() as usize, LEASE_TABLE_CAP); + // New distinct client → rejected. + assert_eq!( + agg.lease_apply( + "newcomer".to_owned(), + MetricDetailLevel::Backend, + Duration::from_secs(60), + PeerBinding::default(), + ), + LeaseApplyOutcome::TableFull, + ); + assert_eq!(agg.lease_count() as usize, LEASE_TABLE_CAP); + // Renewal of an existing entry → still accepted. + assert!(matches!( + agg.lease_apply( + "client:00".to_owned(), + MetricDetailLevel::Backend, + Duration::from_secs(120), + PeerBinding::default(), + ), + LeaseApplyOutcome::Applied { .. } + )); + assert_eq!(agg.lease_count() as usize, LEASE_TABLE_CAP); + } + + #[test] + fn lease_apply_rejects_ttl_over_max() { + // The aggregator no longer silently clamps oversize TTLs. + let mut agg = Aggregator::new("sozu".to_owned()); + assert_eq!( + agg.lease_apply( + "client:0".to_owned(), + MetricDetailLevel::Backend, + LEASE_TTL_MAX + Duration::from_secs(1), + PeerBinding::default(), + ), + LeaseApplyOutcome::TtlOutOfRange, + ); + assert_eq!(agg.lease_count(), 0); + } + + #[test] + fn lease_apply_renewal_replaces_previous_for_same_client() { + // The renewer client re-sends with the same `client_id`; the entry + // is REPLACED (not duplicated). Lease count stays at 1. + // Unknown bindings on both sides skip the renewal-binding gate. 
+ let mut agg = Aggregator::new("sozu".to_owned()); + let _ = agg.lease_apply( + "renewer".to_owned(), + MetricDetailLevel::Backend, + Duration::from_secs(30), + PeerBinding::default(), + ); + let _ = agg.lease_apply( + "renewer".to_owned(), + MetricDetailLevel::Backend, + Duration::from_secs(60), + PeerBinding::default(), + ); + assert_eq!(agg.lease_count(), 1); + } + + #[test] + fn lease_apply_renewal_rejects_foreign_binding() { + // Same-UID `client_id` collision attack: the victim applies with + // a known binding; an attacker that learns the `client_id` + // attempts to renew under a different (pid, session_ulid). The + // renewal must be refused so the victim's apply-time binding + // remains the sole authoritative owner — both for subsequent + // renewals AND for the victim's own Drop-time `clear`. + let mut agg = Aggregator::new("sozu".to_owned()); + let victim = PeerBinding { + pid: Some(4242), + session_ulid: Some(0x0123_4567_89AB_CDEF_FEDC_BA98_7654_3210), + }; + let outcome = agg.lease_apply( + "topcli".to_owned(), + MetricDetailLevel::Backend, + Duration::from_secs(60), + victim, + ); + assert!( + matches!(outcome, LeaseApplyOutcome::Applied { .. }), + "victim's initial apply must succeed" + ); + let attacker = PeerBinding { + pid: Some(9999), + session_ulid: Some(0xDEAD_BEEF_DEAD_BEEF_DEAD_BEEF_DEAD_BEEF), + }; + let outcome = agg.lease_apply( + "topcli".to_owned(), + MetricDetailLevel::Backend, + Duration::from_secs(60), + attacker, + ); + assert_eq!( + outcome, + LeaseApplyOutcome::Unauthorized, + "renewal with a mismatched known binding must be refused" + ); + // The victim can still clear their own lease — proof the + // refused attempt did not corrupt the stored binding. + let clear = agg.lease_clear("topcli", victim); + assert!( + matches!(clear, LeaseClearOutcome::Cleared { .. 
}), + "victim's original binding must still clear cleanly after \ + the foreign-binding renewal was refused" + ); + } + + #[test] + fn lease_apply_renewal_with_matching_binding_succeeds() { + // Symmetry case: the legitimate owner re-applies with the same + // (pid, session_ulid). The renewal must succeed so the TUI's + // own renewer thread keeps the lease alive across its TTL. + let mut agg = Aggregator::new("sozu".to_owned()); + let owner = PeerBinding { + pid: Some(1234), + session_ulid: Some(0xAAAA_BBBB_CCCC_DDDD_EEEE_FFFF_0000_1111), + }; + let _ = agg.lease_apply( + "topcli".to_owned(), + MetricDetailLevel::Backend, + Duration::from_secs(30), + owner, + ); + let outcome = agg.lease_apply( + "topcli".to_owned(), + MetricDetailLevel::Backend, + Duration::from_secs(60), + owner, + ); + assert!( + matches!(outcome, LeaseApplyOutcome::Applied { .. }), + "renewal with matching binding must succeed (otherwise the \ + TUI's own renewer thread would be locked out)" + ); + } + + #[test] + fn lease_apply_max_merge_two_clients() { + // Two clients, two levels: effective = max(both leases, configured). + // Use `Process` as the floor so the Frontend lease is observable + // after the Backend lease is cleared (otherwise the configured + // Cluster floor would mask the Frontend lease). + let mut agg = Aggregator::new("sozu".to_owned()); + agg.set_up_detail(MetricDetailLevel::Process); + let _ = agg.lease_apply( + "scraper".to_owned(), + MetricDetailLevel::Frontend, + Duration::from_secs(60), + PeerBinding::default(), + ); + let _ = agg.lease_apply( + "topcli".to_owned(), + MetricDetailLevel::Backend, + Duration::from_secs(60), + PeerBinding::default(), + ); + assert_eq!(agg.detail_effective(), MetricDetailLevel::Backend); + assert_eq!(agg.lease_count(), 2); + // Clearing the higher lease drops effective back to the lower one. 
+ let outcome = agg.lease_clear("topcli", PeerBinding::default()); + assert_eq!( + outcome, + LeaseClearOutcome::Cleared { + previous_effective: MetricDetailLevel::Backend, + } + ); + assert_eq!(agg.detail_effective(), MetricDetailLevel::Frontend); + assert_eq!(agg.lease_count(), 1); + } + + #[test] + fn lease_clear_unknown_id_is_silent_noop() { + // Mismatched IDs are silently ignored (other clients' leases unaffected). + let mut agg = Aggregator::new("sozu".to_owned()); + let _ = agg.lease_apply( + "real".to_owned(), + MetricDetailLevel::Backend, + Duration::from_secs(60), + PeerBinding::default(), + ); + assert_eq!( + agg.lease_clear("ghost", PeerBinding::default()), + LeaseClearOutcome::NotFound + ); + assert_eq!(agg.detail_effective(), MetricDetailLevel::Backend); + assert_eq!(agg.lease_count(), 1); + } + + #[test] + fn lease_clear_with_matching_binding_authorised() { + // Apply with a known binding, clear with the same binding -> Cleared. + let mut agg = Aggregator::new("sozu".to_owned()); + let _ = agg.lease_apply( + "owner-lease".to_owned(), + MetricDetailLevel::Backend, + Duration::from_secs(60), + owner_binding(), + ); + let outcome = agg.lease_clear("owner-lease", owner_binding()); + assert!(matches!(outcome, LeaseClearOutcome::Cleared { .. })); + assert_eq!(agg.lease_count(), 0); + } + + #[test] + fn lease_clear_with_mismatched_binding_is_unauthorized() { + // Apply with one binding, attempt clear with a different binding -> + // Unauthorized; lease left intact. + let mut agg = Aggregator::new("sozu".to_owned()); + let _ = agg.lease_apply( + "owner-lease".to_owned(), + MetricDetailLevel::Backend, + Duration::from_secs(60), + owner_binding(), + ); + let outcome = agg.lease_clear("owner-lease", other_binding()); + assert_eq!(outcome, LeaseClearOutcome::Unauthorized); + assert_eq!(agg.lease_count(), 1); + } + + #[test] + fn lease_clear_unknown_apply_binding_accepts_any_clear() { + // Pre-binding / non-Linux apply -> any clear authorised. 
+ let mut agg = Aggregator::new("sozu".to_owned()); + let _ = agg.lease_apply( + "legacy".to_owned(), + MetricDetailLevel::Backend, + Duration::from_secs(60), + PeerBinding::default(), + ); + let outcome = agg.lease_clear("legacy", owner_binding()); + assert!(matches!(outcome, LeaseClearOutcome::Cleared { .. })); + assert_eq!(agg.lease_count(), 0); + } + + #[test] + fn lease_clear_known_apply_rejects_default_clear() { + // Known apply binding -> a default ("unknown") clear is rejected. + let mut agg = Aggregator::new("sozu".to_owned()); + let _ = agg.lease_apply( + "owner-lease".to_owned(), + MetricDetailLevel::Backend, + Duration::from_secs(60), + owner_binding(), + ); + let outcome = agg.lease_clear("owner-lease", PeerBinding::default()); + assert_eq!(outcome, LeaseClearOutcome::Unauthorized); + } + + #[test] + fn lease_tick_expires_only_past_due_leases() { + // `lease_tick(now)` parameterises the clock so we can test expiry + // without sleeping. Setup: one lease past due, one still active. + // Use `Process` as the floor so the surviving Frontend lease drives + // the effective level after the Backend lease expires (otherwise + // the Cluster floor would mask it). + let mut agg = Aggregator::new("sozu".to_owned()); + agg.set_up_detail(MetricDetailLevel::Process); + let now = Instant::now(); + // Inject directly into the table to control expires_at deterministically. 
+ agg.leases.insert( + "expired".to_owned(), + LeaseEntry { + level: MetricDetailLevel::Backend, + expires_at: now - Duration::from_secs(1), + binding: PeerBinding::default(), + }, + ); + agg.leases.insert( + "live".to_owned(), + LeaseEntry { + level: MetricDetailLevel::Frontend, + expires_at: now + Duration::from_secs(60), + binding: PeerBinding::default(), + }, + ); + agg.recompute_effective(); + assert_eq!(agg.detail_effective(), MetricDetailLevel::Backend); + let prev = agg.lease_tick(now); + assert_eq!(prev, Some(MetricDetailLevel::Backend)); + assert_eq!(agg.detail_effective(), MetricDetailLevel::Frontend); + assert_eq!(agg.lease_count(), 1); + } + + #[test] + fn lease_tick_no_change_returns_none() { + // No leases -> no-op tick, no audit signal. + let mut agg = Aggregator::new("sozu".to_owned()); + assert!(agg.lease_tick(Instant::now()).is_none()); + } + + #[test] + fn lease_apply_at_max_ttl_succeeds() { + // Boundary: exactly LEASE_TTL_MAX is allowed; LEASE_TTL_MAX + 1ns is + // rejected (covered by lease_apply_rejects_ttl_over_max above). + let mut agg = Aggregator::new("sozu".to_owned()); + let now = Instant::now(); + let outcome = agg.lease_apply( + "max".to_owned(), + MetricDetailLevel::Backend, + LEASE_TTL_MAX, + PeerBinding::default(), + ); + assert!(matches!(outcome, LeaseApplyOutcome::Applied { .. })); + let entry = agg.leases.get("max").unwrap(); + assert!(entry.expires_at <= now + LEASE_TTL_MAX + Duration::from_millis(50)); } } diff --git a/lib/src/metrics/names.rs b/lib/src/metrics/names.rs new file mode 100644 index 000000000..57017f337 --- /dev/null +++ b/lib/src/metrics/names.rs @@ -0,0 +1,323 @@ +//! Canonical metric-name constants. +//! +//! Every metric string emitted by Sōzu and consumed by the StatsD/Prometheus/ +//! TUI surface should reference a constant declared here rather than being +//! repeated as a literal. The constants intentionally preserve the dotted +//! string values byte-for-byte so dashboards and scrapers stay valid. +//! 
+//! Layout: one submodule per metric family (`http`, `h2`, `tcp`, `tls`, …), +//! constants inside are `UPPER_SNAKE_CASE` derived from the dotted suffix. +//! E.g. `h2.frames.tx.data` lives at [`h2::FRAMES_TX_DATA`]. +//! +//! When adding a new metric: +//! 1. Add the constant in the matching submodule (create one if needed). +//! 2. Reference the constant from the emission site +//! (`incr!(names::h2::FRAMES_TX_DATA)`) instead of repeating the literal. +//! 3. If the TUI or any scraper reads this metric, reference the same +//! constant on the read side too. + +/// Accept-queue counters and gauges, fed by the worker accept loop in +/// `lib/src/server.rs`. +pub mod accept_queue { + pub const BACKPRESSURE: &str = "accept_queue.backpressure"; + pub const CONNECTIONS: &str = "accept_queue.connections"; + pub const SATURATED_SECONDS: &str = "accept_queue.saturated_seconds"; + pub const TIMEOUT: &str = "accept_queue.timeout"; + pub const WAIT_TIME: &str = "accept_queue.wait_time"; +} + +/// Access-log infrastructure counters. +pub mod access_logs { + pub const COUNT: &str = "access_logs.count"; + pub const UNSENT: &str = "unsent-access-logs"; +} + +/// Per-backend bandwidth + connection counters emitted by both H1 and the +/// H2 mux. Front-bytes and back-bytes are accounted separately so dashboards +/// can compare upstream vs downstream traffic. 
+pub mod backend { + pub const BYTES_IN: &str = "bytes_in"; + pub const BYTES_OUT: &str = "bytes_out"; + pub const BACK_BYTES_IN: &str = "back_bytes_in"; + pub const BACK_BYTES_OUT: &str = "back_bytes_out"; + pub const AVAILABLE: &str = "backend.available"; + pub const CONNECTIONS: &str = "backend.connections"; + pub const FLOW_CONTROL_PAUSED: &str = "backend.flow_control.paused"; + pub const POOL_HIT: &str = "backend.pool.hit"; + pub const POOL_MISS: &str = "backend.pool.miss"; + pub const POOL_SIZE: &str = "backend.pool.size"; + pub const CONNECTIONS_PER_BACKEND: &str = "connections_per_backend"; + pub const CONNECTION_TIME: &str = "backend_connection_time"; + pub const RESPONSE_TIME: &str = "backend_response_time"; + pub const REQUESTS: &str = "requests"; + pub const FAIL_OPEN: &str = "backends.fail_open"; +} + +/// Buffer-pool gauges and counters. +pub mod buffer { + pub const CAPACITY: &str = "buffer.capacity"; + pub const IN_USE: &str = "buffer.in_use"; + pub const USAGE_PERCENT: &str = "buffer.usage_percent"; +} + +/// Client-side connection gauges, populated by the worker accept loop. +pub mod client { + pub const CONNECTIONS: &str = "client.connections"; + pub const CONNECTIONS_MAX: &str = "client.connections_max"; +} + +/// Per-cluster aggregate gauges. +pub mod cluster { + pub const AVAILABLE_BACKENDS: &str = "cluster.available_backends"; + pub const AVAILABLE_RECOVERED: &str = "cluster.available_recovered"; + pub const NO_AVAILABLE_BACKENDS: &str = "cluster.no_available_backends"; + pub const TOTAL_BACKENDS: &str = "cluster.total_backends"; +} + +/// Configuration-state inventory gauges, refreshed when the master +/// fans out a state change. +pub mod configuration { + pub const BACKENDS: &str = "configuration.backends"; + pub const CLUSTERS: &str = "configuration.clusters"; + pub const FRONTENDS: &str = "configuration.frontends"; +} + +/// Event-loop timing counters. 
+pub mod event_loop { + pub const EPOLL_TIME: &str = "epoll_time"; + pub const EVENT_LOOP_TIME: &str = "event_loop_time"; + pub const FRONTEND_MATCHING_TIME: &str = "frontend_matching_time"; + pub const REGEX_MATCHING_TIME: &str = "regex_matching_time"; + pub const REQUEST_TIME: &str = "request_time"; + pub const SERVICE_TIME: &str = "service_time"; +} + +/// Health-check transition counters. +pub mod health_check { + pub const UP: &str = "health_check.up"; + pub const DOWN: &str = "health_check.down"; + pub const SUCCESS: &str = "health_check.success"; + pub const FAILURE: &str = "health_check.failure"; +} + +/// H1 protocol counters. +pub mod h1 { + pub const BACKEND_EOF_BEFORE_MESSAGE_COMPLETE: &str = "h1.backend_eof_before_message_complete"; +} + +/// H2 multiplexer counters — frame TX, flood mitigations, header policy +/// rejections, signal-writable rearm sites, and other H2-specific buckets. +pub mod h2 { + pub const CLOSE_WITH_ACTIVE_STREAMS: &str = "h2.close_with_active_streams"; + pub const COALESCING_ACCEPTED: &str = "h2.coalescing.accepted"; + pub const CONNECTION_ACTIVE_STREAMS: &str = "h2.connection.active_streams"; + pub const CONNECTION_PENDING_WINDOW_UPDATES: &str = "h2.connection.pending_window_updates"; + pub const CONNECTION_WINDOW_BYTES: &str = "h2.connection.window_bytes"; + pub const FLOW_CONTROL_STALL: &str = "h2.flow_control_stall"; + + // Frame-TX counters (frame type fanout). 
+ pub const FRAMES_TX_CONTINUATION: &str = "h2.frames.tx.continuation"; + pub const FRAMES_TX_DATA: &str = "h2.frames.tx.data"; + pub const FRAMES_TX_GOAWAY: &str = "h2.frames.tx.goaway"; + pub const FRAMES_TX_HEADERS: &str = "h2.frames.tx.headers"; + pub const FRAMES_TX_PING_ACK: &str = "h2.frames.tx.ping_ack"; + pub const FRAMES_TX_RST_STREAM: &str = "h2.frames.tx.rst_stream"; + pub const FRAMES_TX_SETTINGS: &str = "h2.frames.tx.settings"; + pub const FRAMES_TX_WINDOW_UPDATE: &str = "h2.frames.tx.window_update"; + + // Header-related counters: `h2.headers_no_stream.error` plus the `h2.headers.rejected.*` family. + pub const HEADERS_NO_STREAM_ERROR: &str = "h2.headers_no_stream.error"; + pub const HEADERS_REJECTED_BUDGET_OVERRUN: &str = "h2.headers.rejected.budget_overrun"; + pub const HEADERS_REJECTED_TOTAL: &str = "h2.headers.rejected.total"; + + pub const RST_STREAM_RECEIVED_PRE_RESPONSE_START: &str = + "h2.rst_stream.received.pre_response_start"; + + // Writable-rearm signal counters — one per rearm reason. + pub const SIGNAL_WRITABLE_REARMED_CONTROL_QUEUE: &str = + "h2.signal.writable.rearmed.control_queue"; + pub const SIGNAL_WRITABLE_REARMED_DEFAULT_ANSWER: &str = + "h2.signal.writable.rearmed.default_answer"; + pub const SIGNAL_WRITABLE_REARMED_FORCEFULLY_TERMINATE_ANSWER: &str = + "h2.signal.writable.rearmed.forcefully_terminate_answer"; + pub const SIGNAL_WRITABLE_REARMED_PEER_DATA: &str = "h2.signal.writable.rearmed.peer_data"; + pub const SIGNAL_WRITABLE_REARMED_PEER_HEADERS: &str = + "h2.signal.writable.rearmed.peer_headers"; + pub const SIGNAL_WRITABLE_REARMED_PRIORITY_UPDATE: &str = + "h2.signal.writable.rearmed.priority_update"; + + pub const TRAILERS_DROPPED_CONTENT_LENGTH: &str = "h2.trailers_dropped_content_length"; + pub const TRAILER_SPOOF_VECTOR_ELIDED: &str = "h2.trailer.spoof_vector_elided"; + pub const WINDOW_UPDATE_DROPPED: &str = "h2.window_update_dropped"; + + // Flood-mitigation violation counters — one per flood class the H2 + // mux's `H2FloodDetector` recognises.
Surfaced in the TUI's H2 pane + // so operators can spot a flood-pattern before it pages. + pub const FLOOD_VIOLATION_CONTINUATION: &str = "h2.flood.violation.continuation"; + pub const FLOOD_VIOLATION_GLITCH_WINDOW: &str = "h2.flood.violation.glitch_window"; + pub const FLOOD_VIOLATION_MADE_YOU_RESET: &str = "h2.flood.violation.made_you_reset"; + pub const FLOOD_VIOLATION_PING: &str = "h2.flood.violation.ping"; + pub const FLOOD_VIOLATION_PRIORITY: &str = "h2.flood.violation.priority"; + pub const FLOOD_VIOLATION_RAPID_RESET: &str = "h2.flood.violation.rapid_reset"; + pub const FLOOD_VIOLATION_SETTINGS: &str = "h2.flood.violation.settings"; +} + +/// HTTP counters (H1 + H2 share these); see `https` for the HTTPS-specific +/// variants and `h2` for H2-frame-level counters. +pub mod http { + pub const ERR_400: &str = "http.400.errors"; + pub const ERR_404: &str = "http.404.errors"; + pub const ACTIVE_REQUESTS: &str = "http.active_requests"; + pub const ALPN_H2: &str = "http.alpn.h2"; + pub const ALPN_HTTP11: &str = "http.alpn.http11"; + pub const BACKEND_PARSE_ERRORS: &str = "http.backend_parse_errors"; + pub const E2E_H2: &str = "http.e2e.h2"; + pub const E2E_HTTP11: &str = "http.e2e.http11"; + pub const EARLY_RESPONSE_CLOSE: &str = "http.early_response_close"; + pub const FAILED_BACKEND_MATCHING: &str = "http.failed_backend_matching"; + pub const FRONTEND_PARSE_ERRORS: &str = "http.frontend_parse_errors"; + pub const HSTS_FRONTEND_ADDED: &str = "http.hsts.frontend_added"; + pub const HSTS_FRONTEND_REFRESHED: &str = "http.hsts.frontend_refreshed"; + pub const HSTS_LISTENER_DEFAULT_PATCHED: &str = "http.hsts.listener_default_patched"; + pub const HSTS_SUPPRESSED_PLAINTEXT: &str = "http.hsts.suppressed_plaintext"; + pub const HSTS_UNRENDERED: &str = "http.hsts.unrendered"; + pub const INFINITE_LOOP_ERROR: &str = "http.infinite_loop.error"; + pub const REDIRECT_TEMPLATE_COMPILE_ERROR: &str = "http.redirect_template.compile_error"; + pub const REQUESTS: &str = 
"http.requests"; + pub const SNI_AUTHORITY_MISMATCH: &str = "http.sni_authority_mismatch"; + pub const STATUS_1XX: &str = "http.status.1xx"; + pub const STATUS_2XX: &str = "http.status.2xx"; + pub const STATUS_3XX: &str = "http.status.3xx"; + pub const STATUS_4XX: &str = "http.status.4xx"; + pub const STATUS_5XX: &str = "http.status.5xx"; + pub const STATUS_OTHER: &str = "http.status.other"; + pub const TRUSTING_X_PORT: &str = "http.trusting.x_port"; + pub const TRUSTING_X_PORT_DIFF: &str = "http.trusting.x_port.diff"; + pub const TRUSTING_X_PROTO: &str = "http.trusting.x_proto"; + pub const TRUSTING_X_PROTO_DIFF: &str = "http.trusting.x_proto.diff"; + pub const UPGRADE_EXPECT_FAILED: &str = "http.upgrade.expect.failed"; + pub const UPGRADE_MUX_FAILED: &str = "http.upgrade.mux.failed"; + pub const UPGRADE_WS_FAILED: &str = "http.upgrade.ws.failed"; + pub const X_REQUEST_ID_GENERATED: &str = "http.x_request_id.generated"; + pub const X_REQUEST_ID_PROPAGATED: &str = "http.x_request_id.propagated"; +} + +/// Per-status-code HTTP counters. Only the codes Sōzu either generates as +/// a default answer or that operators routinely chart get a dedicated bucket; +/// the rest fold into the `http::STATUS_*XX` parent buckets. 
+pub mod http_status { + pub const S200: &str = "http.status.200"; + pub const S201: &str = "http.status.201"; + pub const S204: &str = "http.status.204"; + pub const S301: &str = "http.status.301"; + pub const S302: &str = "http.status.302"; + pub const S304: &str = "http.status.304"; + pub const S400: &str = "http.status.400"; + pub const S401: &str = "http.status.401"; + pub const S403: &str = "http.status.403"; + pub const S404: &str = "http.status.404"; + pub const S408: &str = "http.status.408"; + pub const S413: &str = "http.status.413"; + pub const S429: &str = "http.status.429"; + pub const S500: &str = "http.status.500"; + pub const S502: &str = "http.status.502"; + pub const S503: &str = "http.status.503"; + pub const S504: &str = "http.status.504"; + pub const S507: &str = "http.status.507"; +} + +/// HTTPS-specific counters; see `http` for the HTTP family these complement. +pub mod https { + pub const ALPN_REJECTED_HTTP11_DISABLED: &str = "https.alpn.rejected.http11_disabled"; + pub const ALPN_REJECTED_UNSUPPORTED: &str = "https.alpn.rejected.unsupported"; + pub const UPGRADE_EXPECT_FAILED: &str = "https.upgrade.expect.failed"; + pub const UPGRADE_HANDSHAKE_FAILED: &str = "https.upgrade.handshake.failed"; + pub const UPGRADE_MUX_FAILED: &str = "https.upgrade.mux.failed"; + pub const UPGRADE_WSS_FAILED: &str = "https.upgrade.wss.failed"; +} + +/// Per-listener counters. +pub mod listener { + pub const ACCEPTED_TOTAL: &str = "listener.accepted.total"; + pub const CONNECTION_CAPPED: &str = "listener.connection_capped"; +} + +/// Pipe-protocol counters. +pub mod pipe { + pub const ERRORS: &str = "pipe.errors"; +} + +/// Protocol-type counters that increment once per session and track which +/// protocol carried it end-to-end. 
+pub mod protocol { + pub const HTTP: &str = "protocol.http"; + pub const HTTPS: &str = "protocol.https"; + pub const PROXY_EXPECT: &str = "protocol.proxy.expect"; + pub const PROXY_RELAY: &str = "protocol.proxy.relay"; + pub const PROXY_SEND: &str = "protocol.proxy.send"; + pub const TCP: &str = "protocol.tcp"; + pub const TLS_HANDSHAKE: &str = "protocol.tls.handshake"; + pub const WS: &str = "protocol.ws"; + pub const WSS: &str = "protocol.wss"; +} + +/// PROXY-protocol counters. +pub mod proxy_protocol { + pub const ERRORS: &str = "proxy_protocol.errors"; +} + +/// `rustls`-specific counters: read/write errors and the read/write infinite-loop guards. +pub mod rustls { + pub const READ_ERROR: &str = "rustls.read.error"; + pub const READ_INFINITE_LOOP_ERROR: &str = "rustls.read.infinite_loop.error"; + pub const WRITE_ERROR: &str = "rustls.write.error"; + pub const WRITE_INFINITE_LOOP_ERROR: &str = "rustls.write.infinite_loop.error"; +} + +/// Generic session-level counters. +pub mod sessions { + pub const EVICTED: &str = "sessions.evicted"; +} + +/// Slab-allocator gauges. +pub mod slab { + pub const CAPACITY: &str = "slab.capacity"; + pub const ENTRIES: &str = "slab.entries"; + pub const USAGE_PERCENT: &str = "slab.usage_percent"; +} + +/// Raw-socket counters. +pub mod socket { + pub const READ_INFINITE_LOOP_ERROR: &str = "socket.read.infinite_loop.error"; + pub const WRITE_INFINITE_LOOP_ERROR: &str = "socket.write.infinite_loop.error"; +} + +/// TCP protocol counters.
+pub mod tcp { + pub const INFINITE_LOOP_ERROR: &str = "tcp.infinite_loop.error"; + pub const READ_ERROR: &str = "tcp.read.error"; + pub const REQUESTS: &str = "tcp.requests"; + pub const UPGRADE_EXPECT_FAILED: &str = "tcp.upgrade.expect.failed"; + pub const UPGRADE_PIPE_FAILED: &str = "tcp.upgrade.pipe.failed"; + pub const UPGRADE_RELAY_FAILED: &str = "tcp.upgrade.relay.failed"; + pub const UPGRADE_SEND_FAILED: &str = "tcp.upgrade.send.failed"; + pub const WRITE_ERROR: &str = "tcp.write.error"; +} + +/// TLS counters (certificate inventory, handshake timing). +pub mod tls { + pub const CERT_MIN_EXPIRES_AT_SECONDS: &str = "tls.cert.min_expires_at_seconds"; + pub const DEFAULT_CERT_USED: &str = "tls.default_cert_used"; + pub const HANDSHAKE_MS: &str = "tls.handshake_ms"; +} + +/// WebSocket counters. +pub mod websocket { + pub const ACTIVE_REQUESTS: &str = "websocket.active_requests"; +} + +/// Miscellaneous counters that don't fit a richer family. +pub mod misc { + pub const PANIC: &str = "panic"; + pub const ZOMBIES: &str = "zombies"; +} diff --git a/lib/src/pool.rs b/lib/src/pool.rs index f2140fe32..0f1ffa8cb 100644 --- a/lib/src/pool.rs +++ b/lib/src/pool.rs @@ -13,6 +13,8 @@ use std::{ sync::atomic::{AtomicUsize, Ordering}, }; +use crate::metrics::names; + static BUFFER_COUNT: AtomicUsize = AtomicUsize::new(0); pub struct Pool { @@ -49,7 +51,7 @@ impl Pool { }) .map(|c| { let old_buffer_count = BUFFER_COUNT.fetch_add(1, Ordering::SeqCst); - gauge!("buffer.in_use", old_buffer_count + 1); + gauge!(names::buffer::IN_USE, old_buffer_count + 1); Checkout { inner: c } }) } @@ -113,7 +115,7 @@ impl ops::DerefMut for Checkout { impl Drop for Checkout { fn drop(&mut self) { let old_buffer_count = BUFFER_COUNT.fetch_sub(1, Ordering::SeqCst); - gauge!("buffer.in_use", old_buffer_count - 1); + gauge!(names::buffer::IN_USE, old_buffer_count - 1); } } diff --git a/lib/src/protocol/kawa_h1/editor.rs b/lib/src/protocol/kawa_h1/editor.rs index 70a5c5a5e..a565fd6cd 100644 --- 
a/lib/src/protocol/kawa_h1/editor.rs +++ b/lib/src/protocol/kawa_h1/editor.rs @@ -16,6 +16,7 @@ use std::{ use rusty_ulid::Ulid; use sozu_command_lib::logging::LogContext; +use crate::metrics::names; use crate::{ Protocol, RetrieveClusterError, pool::Checkout, @@ -523,10 +524,10 @@ impl HttpContext { } else if compare_no_case(key, b"X-Forwarded-Proto") { has_x_proto = true; // header.val = kawa::Store::Static(proto.as_bytes()); - incr!("http.trusting.x_proto"); + incr!(names::http::TRUSTING_X_PROTO); let val = header.val.data(buf); if !compare_no_case(val, proto.as_bytes()) { - incr!("http.trusting.x_proto.diff"); + incr!(names::http::TRUSTING_X_PROTO_DIFF); debug!( "{} Trusting X-Forwarded-Proto for {:?} even though {:?} != {}", self.log_context(), @@ -538,12 +539,12 @@ impl HttpContext { } else if compare_no_case(key, b"X-Forwarded-Port") { has_x_port = true; // header.val = kawa::Store::from_string(public_port.to_string()); - incr!("http.trusting.x_port"); + incr!(names::http::TRUSTING_X_PORT); let val = header.val.data(buf); let mut port_buf = itoa::Buffer::new(); let expected = port_buf.format(public_port); if !compare_no_case(val, expected.as_bytes()) { - incr!("http.trusting.x_port.diff"); + incr!(names::http::TRUSTING_X_PORT_DIFF); debug!( "{} Trusting X-Forwarded-Port for {:?} even though {:?} != {}", self.log_context(), @@ -733,7 +734,7 @@ impl HttpContext { // Either way, `self.x_request_id` is populated so the access log // records the exact value forwarded to the backend. 
if has_x_request_id { - incr!("http.x_request_id.propagated"); + incr!(names::http::X_REQUEST_ID_PROPAGATED); } else { let value = self.id.to_string(); request.push_block(kawa::Block::Header(kawa::Pair { @@ -741,7 +742,7 @@ impl HttpContext { val: kawa::Store::from_string(value.clone()), })); self.x_request_id = Some(value); - incr!("http.x_request_id.generated"); + incr!(names::http::X_REQUEST_ID_GENERATED); } // Create a custom correlation header (defaults to "Sozu-Id", can be diff --git a/lib/src/protocol/kawa_h1/mod.rs b/lib/src/protocol/kawa_h1/mod.rs index d3030737f..cca0ef412 100644 --- a/lib/src/protocol/kawa_h1/mod.rs +++ b/lib/src/protocol/kawa_h1/mod.rs @@ -28,6 +28,7 @@ use sozu_command::{ }; // use time::{Duration, Instant}; +use crate::metrics::names; use crate::{ AcceptError, BackendConnectAction, BackendConnectionError, BackendConnectionStatus, L7ListenerHandler, L7Proxy, ListenerHandler, Protocol, ProxySession, Readiness, @@ -339,7 +340,7 @@ impl Http Http 0 { self.request_stream.storage.fill(size); - count!("bytes_in", size as i64); + count!(names::backend::BYTES_IN, size as i64); metrics.bin += size; // if self.kawa_request.storage.is_full() { // self.frontend_readiness.interest.remove(Ready::READABLE); @@ -487,12 +488,12 @@ impl Http Http 0 { response_stream.consume(size); - count!("bytes_out", size as i64); + count!(names::backend::BYTES_OUT, size as i64); metrics.bout += size; self.backend_readiness.interest.insert(Ready::READABLE); } @@ -644,7 +645,7 @@ impl Http Http Http 0 { self.request_stream.consume(size); - count!("back_bytes_out", size as i64); + count!(names::backend::BACK_BYTES_OUT, size as i64); metrics.backend_bout += size; self.frontend_readiness.interest.insert(Ready::READABLE); self.backend_readiness.interest.insert(Ready::READABLE); @@ -859,7 +860,7 @@ impl Http 0 { response_stream.storage.fill(size); - count!("back_bytes_in", size as i64); + count!(names::backend::BACK_BYTES_IN, size as i64); metrics.backend_bin += size; // if 
self.kawa_response.storage.is_full() { // self.backend_readiness.interest.remove(Ready::READABLE); @@ -904,7 +905,7 @@ impl Http Http Http incr!("http.400.errors"), + DefaultAnswer::Answer400 { .. } => incr!(names::http::ERR_400), DefaultAnswer::Answer401 { .. } => incr!( "http.401.errors", self.context.cluster_id.as_deref(), self.context.backend_id.as_deref() ), - DefaultAnswer::Answer404 { .. } => incr!("http.404.errors"), + DefaultAnswer::Answer404 { .. } => incr!(names::http::ERR_404), DefaultAnswer::Answer408 { .. } => incr!( "http.408.errors", self.context.cluster_id.as_deref(), @@ -1310,9 +1311,9 @@ impl Http Http Http Http Http Http Http SessionState //if the state was initial, the connection was already reset if !self.request_stream.is_initial() { - gauge_add!("http.active_requests", -1); + gauge_add!(names::http::ACTIVE_REQUESTS, -1); if let Some(b) = self.backend.as_mut() { let mut backend = b.borrow_mut(); @@ -2268,23 +2271,43 @@ fn save_http_status_metric(status: Option, context: LogContext) { if let Some(status) = status { match status { 100..=199 => { - incr!("http.status.1xx", context.cluster_id, context.backend_id); + incr!( + names::http::STATUS_1XX, + context.cluster_id, + context.backend_id + ); } 200..=299 => { - incr!("http.status.2xx", context.cluster_id, context.backend_id); + incr!( + names::http::STATUS_2XX, + context.cluster_id, + context.backend_id + ); } 300..=399 => { - incr!("http.status.3xx", context.cluster_id, context.backend_id); + incr!( + names::http::STATUS_3XX, + context.cluster_id, + context.backend_id + ); } 400..=499 => { - incr!("http.status.4xx", context.cluster_id, context.backend_id); + incr!( + names::http::STATUS_4XX, + context.cluster_id, + context.backend_id + ); } 500..=599 => { - incr!("http.status.5xx", context.cluster_id, context.backend_id); + incr!( + names::http::STATUS_5XX, + context.cluster_id, + context.backend_id + ); } _ => { // http responses with other codes (protocol error) - 
incr!("http.status.other"); + incr!(names::http::STATUS_OTHER); } } diff --git a/lib/src/protocol/mux/answers.rs b/lib/src/protocol/mux/answers.rs index f854156a3..d4b633c30 100644 --- a/lib/src/protocol/mux/answers.rs +++ b/lib/src/protocol/mux/answers.rs @@ -7,6 +7,7 @@ use sozu_command::logging::ansi_palette; use super::{GenericHttpStream, H2Error, Readiness, Stream, StreamState}; +use crate::metrics::names; use crate::protocol::http::{DefaultAnswer, answers::HttpAnswers}; /// Module-level prefix used on every log line emitted from the mux @@ -219,9 +220,9 @@ pub(crate) fn set_default_answer_with_retry_after( 301 => "http.301.redirection", 302 => "http.302.redirection", 308 => "http.308.redirection", - 400 => "http.400.errors", + 400 => names::http::ERR_400, 401 => "http.401.errors", - 404 => "http.404.errors", + 404 => names::http::ERR_404, 408 => "http.408.errors", 413 => "http.413.errors", 421 => "http.421.errors", @@ -298,7 +299,7 @@ pub(crate) fn set_default_answer_with_retry_after( log_module_context!(), code ); - incr!("http.redirect_template.compile_error"); + incr!(names::http::REDIRECT_TEMPLATE_COMPILE_ERROR); } result }) @@ -323,7 +324,7 @@ pub(crate) fn set_default_answer_with_retry_after( context.status = Some(resolved_status); stream.state = StreamState::Unlinked; readiness.arm_writable(); - incr!("h2.signal.writable.rearmed.default_answer"); + incr!(names::h2::SIGNAL_WRITABLE_REARMED_DEFAULT_ANSWER); } /// Forcefully terminates a kawa message by setting the "end_stream" flag and setting the parsing_phase to Error. 
@@ -339,7 +340,7 @@ pub(crate) fn forcefully_terminate_answer( kawa.parsing_phase.error(error.as_str().into()); stream.state = StreamState::Unlinked; readiness.arm_writable(); - incr!("h2.signal.writable.rearmed.forcefully_terminate_answer"); + incr!(names::h2::SIGNAL_WRITABLE_REARMED_FORCEFULLY_TERMINATE_ANSWER); } #[cfg(test)] diff --git a/lib/src/protocol/mux/connection.rs b/lib/src/protocol/mux/connection.rs index c1416ee92..f8e9c08ae 100644 --- a/lib/src/protocol/mux/connection.rs +++ b/lib/src/protocol/mux/connection.rs @@ -31,6 +31,7 @@ use super::{ Position, Router, h2::{self, H2StreamId}, }; +use crate::metrics::names; use crate::{ L7ListenerHandler, ListenerHandler, Readiness, backends::Backend, pool::Pool, socket::SocketHandler, timer::TimeoutContainer, @@ -371,14 +372,14 @@ impl Connection { if let Position::Client(cluster_id, backend, _) = self.position() { let mut backend_borrow = backend.borrow_mut(); backend_borrow.dec_connections(); - gauge_add!("backend.connections", -1); + gauge_add!(names::backend::CONNECTIONS, -1); // Pair with the `+1` at `router.rs::connect` (new-dial path). // This is the graceful-close decrement, used both by the dead // backend path in `mod.rs::back_readable` (which routes through // `client.close()`) and by any explicit Connection::close. 
- gauge_add!("backend.pool.size", -1); + gauge_add!(names::backend::POOL_SIZE, -1); gauge_add!( - "connections_per_backend", + names::backend::CONNECTIONS_PER_BACKEND, -1, Some(cluster_id), Some(&backend_borrow.backend_id) diff --git a/lib/src/protocol/mux/converter.rs b/lib/src/protocol/mux/converter.rs index 9682e7684..5891c046a 100644 --- a/lib/src/protocol/mux/converter.rs +++ b/lib/src/protocol/mux/converter.rs @@ -14,6 +14,7 @@ use kawa::{ }; use sozu_command::logging::ansi_palette; +use crate::metrics::names; use crate::protocol::{ http::parser::compare_no_case, mux::{ @@ -443,7 +444,7 @@ impl BlockConverter for H2BlockConverter<'_> { self.window, data.len() ); - incr!("h2.flow_control_stall"); + incr!(names::h2::FLOW_CONTROL_STALL); if self.position_is_client { // Direction-scoped counterpart: the proxy → backend // write was paused because the backend's HTTP/2 @@ -453,7 +454,7 @@ impl BlockConverter for H2BlockConverter<'_> { // updates, so the stall metric is the only signal // available without plumbing an explicit boundary // through `flush_stream_out`. 
- incr!("backend.flow_control.paused"); + incr!(names::backend::FLOW_CONTROL_PAUSED); } kawa.blocks.push_front(Block::Chunk(Chunk { data })); return false; @@ -477,7 +478,7 @@ impl BlockConverter for H2BlockConverter<'_> { } kawa.push_out(Store::from_slice(&header)); kawa.push_out(data); - incr!("h2.frames.tx.data"); + incr!(names::h2::FRAMES_TX_DATA); // kawa.push_delimiter(); // RFC 9218 §4: incremental streams yield to the scheduler // after every DATA frame so same-urgency incremental peers @@ -547,7 +548,7 @@ impl BlockConverter for H2BlockConverter<'_> { } kawa.push_out(Store::from_slice(&header)); kawa.push_out(Store::from_vec(payload)); - incr!("h2.frames.tx.headers"); + incr!(names::h2::FRAMES_TX_HEADERS); true } else { let chunks = payload.chunks(self.max_frame_size); @@ -577,7 +578,7 @@ impl BlockConverter for H2BlockConverter<'_> { ); return false; } - incr!("h2.frames.tx.headers"); + incr!(names::h2::FRAMES_TX_HEADERS); } else if let Err(e) = gen_frame_header( &mut header, &FrameHeader { @@ -594,7 +595,7 @@ impl BlockConverter for H2BlockConverter<'_> { ); return false; } else { - incr!("h2.frames.tx.continuation"); + incr!(names::h2::FRAMES_TX_CONTINUATION); } kawa.push_out(Store::from_slice(&header)); kawa.push_out(Store::from_slice(chunk)); @@ -623,7 +624,7 @@ impl BlockConverter for H2BlockConverter<'_> { return false; } kawa.push_out(Store::from_slice(&header)); - incr!("h2.frames.tx.data"); + incr!(names::h2::FRAMES_TX_DATA); } } } @@ -671,7 +672,7 @@ impl BlockConverter for H2BlockConverter<'_> { // path) would still try to encode headers/data after our RST. 
kawa.blocks.clear(); self.out.clear(); - incr!("h2.headers.rejected.budget_overrun"); + incr!(names::h2::HEADERS_REJECTED_BUDGET_OVERRUN); return; } if !self.out.is_empty() { diff --git a/lib/src/protocol/mux/h1.rs b/lib/src/protocol/mux/h1.rs index 817644610..49ada2f39 100644 --- a/lib/src/protocol/mux/h1.rs +++ b/lib/src/protocol/mux/h1.rs @@ -10,6 +10,7 @@ use std::{io::IoSlice, time::Instant}; use rusty_ulid::Ulid; use sozu_command::{logging::ansi_palette, ready::Ready}; +use crate::metrics::names; use crate::{ L7ListenerHandler, ListenerHandler, Readiness, protocol::mux::{ @@ -171,7 +172,7 @@ impl ConnectionH1 { log_module_context!(), stream_id ); - incr!("h1.backend_eof_before_message_complete"); + incr!(names::h1::BACKEND_EOF_BEFORE_MESSAGE_COMPLETE); kawa.parsing_phase .error(kawa::ParsingErrorKind::Processing { message: "INTERNAL_ERROR", @@ -269,7 +270,7 @@ impl ConnectionH1 { if kawa.is_error() { match self.position { Position::Client(..) => { - incr!("http.backend_parse_errors"); + incr!(names::http::BACKEND_PARSE_ERRORS); let StreamState::Linked(token) = stream.state else { error!( "{} client stream in error is not in Linked state", @@ -282,7 +283,7 @@ impl ConnectionH1 { endpoint.end_stream(token, global_stream_id, context); } Position::Server => { - incr!("http.frontend_parse_errors"); + incr!(names::http::FRONTEND_PARSE_ERRORS); let answers = answers_rc.borrow(); set_default_answer(stream, &mut self.readiness, 400, &answers); } @@ -356,8 +357,8 @@ impl ConnectionH1 { } self.requests += 1; trace!("{} REQUESTS: {}", log_context!(self), self.requests); - incr!("http.requests"); - gauge_add!("http.active_requests", 1); + incr!(names::http::REQUESTS); + gauge_add!(names::http::ACTIVE_REQUESTS, 1); parts.metrics.service_start(); // Set request_counted after the last use of `parts` to satisfy the borrow checker stream.request_counted = true; @@ -552,7 +553,7 @@ impl ConnectionH1 { } _ => {} } - incr!("http.e2e.http11"); + incr!(names::http::E2E_HTTP11); 
stream.metrics.backend_stop(); let client_rtt = socket_rtt(self.socket.socket_ref()); let server_rtt = stream @@ -606,8 +607,8 @@ impl ConnectionH1 { set_default_answer(stream, &mut self.readiness, 400, &answers); } else if is_main { self.requests += 1; - incr!("http.requests"); - gauge_add!("http.active_requests", 1); + incr!(names::http::REQUESTS); + gauge_add!(names::http::ACTIVE_REQUESTS, 1); stream.metrics.service_start(); stream.request_counted = true; stream.state = StreamState::Link; diff --git a/lib/src/protocol/mux/h2.rs b/lib/src/protocol/mux/h2.rs index deb9a62c5..a7c7cb222 100644 --- a/lib/src/protocol/mux/h2.rs +++ b/lib/src/protocol/mux/h2.rs @@ -27,6 +27,7 @@ const _: () = assert!( use rusty_ulid::Ulid; use sozu_command::{logging::ansi_palette, ready::Ready}; +use crate::metrics::names; use crate::{ L7ListenerHandler, ListenerHandler, Protocol, Readiness, SessionMetrics, protocol::mux::{ @@ -2094,7 +2095,7 @@ impl ConnectionH2 { match serializer::gen_settings(kawa.storage.space(), &self.local_settings) { Ok((_, size)) => { kawa.storage.fill(size); - incr!("h2.frames.tx.settings"); + incr!(names::h2::FRAMES_TX_SETTINGS); // RFC 9113 §6.5: start tracking SETTINGS ACK timeout self.settings_sent_at = Some(Instant::now()); } @@ -2210,13 +2211,13 @@ impl ConnectionH2 { let ds = snapshot.1 as i64 - prev.1 as i64; let du = snapshot.2 as i64 - prev.2 as i64; if dw != 0 { - gauge_add!("h2.connection.window_bytes", dw); + gauge_add!(names::h2::CONNECTION_WINDOW_BYTES, dw); } if ds != 0 { - gauge_add!("h2.connection.active_streams", ds); + gauge_add!(names::h2::CONNECTION_ACTIVE_STREAMS, ds); } if du != 0 { - gauge_add!("h2.connection.pending_window_updates", du); + gauge_add!(names::h2::CONNECTION_PENDING_WINDOW_UPDATES, du); } self.last_gauge_snapshot = Some(snapshot); } @@ -2231,13 +2232,13 @@ impl ConnectionH2 { fn release_connection_gauges(&mut self) { if let Some((w, s, u)) = self.last_gauge_snapshot.take() { if w != 0 { - 
gauge_add!("h2.connection.window_bytes", -(w as i64)); + gauge_add!(names::h2::CONNECTION_WINDOW_BYTES, -(w as i64)); } if s != 0 { - gauge_add!("h2.connection.active_streams", -(s as i64)); + gauge_add!(names::h2::CONNECTION_ACTIVE_STREAMS, -(s as i64)); } if u != 0 { - gauge_add!("h2.connection.pending_window_updates", -(u as i64)); + gauge_add!(names::h2::CONNECTION_PENDING_WINDOW_UPDATES, -(u as i64)); } } } @@ -3014,7 +3015,7 @@ impl ConnectionH2 { "finalize_write: retained WRITABLE (control queue non-empty)".to_owned(), )); self.readiness.arm_writable(); - incr!("h2.signal.writable.rearmed.control_queue"); + incr!(names::h2::SIGNAL_WRITABLE_REARMED_CONTROL_QUEUE); } else { // We wrote everything #[cfg(debug_assertions)] @@ -3087,7 +3088,7 @@ impl ConnectionH2 { Ok((_, size)) => { offset += size; written_ids.push(stream_id); - incr!("h2.frames.tx.window_update"); + incr!(names::h2::FRAMES_TX_WINDOW_UPDATE); } Err(_) => { // Buffer full — stop here, remaining entries stay in the map @@ -3246,7 +3247,7 @@ impl ConnectionH2 { match serializer::gen_settings(kawa.storage.space(), &self.local_settings) { Ok((_, size)) => { kawa.storage.fill(size); - incr!("h2.frames.tx.settings"); + incr!(names::h2::FRAMES_TX_SETTINGS); // RFC 9113 §6.5: start tracking SETTINGS ACK timeout self.settings_sent_at = Some(Instant::now()); } @@ -3434,7 +3435,7 @@ impl ConnectionH2 { where L: ListenerHandler + L7ListenerHandler, { - incr!("http.e2e.h2"); + incr!(names::http::E2E_H2); stream.metrics.backend_stop(); stream.generate_access_log( false, @@ -3540,7 +3541,7 @@ impl ConnectionH2 { stream_id, increment ); - incr!("h2.window_update_dropped"); + incr!(names::h2::WINDOW_UPDATE_DROPPED); } self.readiness.arm_writable(); } @@ -3805,7 +3806,7 @@ impl ConnectionH2 { /// with that result — the flood detector tripped its lifetime cap /// and converted to a connection-wide GOAWAY. 
fn account_emitted_rst(&mut self, error: H2Error) -> Option { - incr!("h2.frames.tx.rst_stream"); + incr!(names::h2::FRAMES_TX_RST_STREAM); count!(metric_for_rst_stream_sent(error), 1); if !matches!(error, H2Error::NoError) { if let Some(violation) = self.flood_detector.record_rst_emitted() { @@ -4038,7 +4039,7 @@ impl ConnectionH2 { match serializer::gen_goaway(kawa.storage.space(), self.highest_peer_stream_id, error) { Ok((_, size)) => { kawa.storage.fill(size); - incr!("h2.frames.tx.goaway"); + incr!(names::h2::FRAMES_TX_GOAWAY); self.state = H2State::GoAway; self.expect_write = Some(H2StreamId::Zero); self.readiness.interest = Ready::WRITABLE | Ready::HUP | Ready::ERROR; @@ -4098,7 +4099,7 @@ impl ConnectionH2 { match serializer::gen_goaway(kawa.storage.space(), STREAM_ID_MAX, H2Error::NoError) { Ok((_, size)) => { kawa.storage.fill(size); - incr!("h2.frames.tx.goaway"); + incr!(names::h2::FRAMES_TX_GOAWAY); // Stay in the current state so the connection can continue processing // existing streams. The second GOAWAY will transition to GoAway state. // Keep READABLE so in-flight request bodies can still be received @@ -4560,7 +4561,7 @@ impl ConnectionH2 { // just pushed into stream.back; the synthetic event is the // only wake path. LIFECYCLE invariant 15. endpoint.readiness_mut(token).arm_writable(); - incr!("h2.signal.writable.rearmed.peer_data"); + incr!(names::h2::SIGNAL_WRITABLE_REARMED_PEER_DATA); } } MuxResult::Continue @@ -4606,7 +4607,7 @@ impl ConnectionH2 { log_context!(self), self ); - incr!("h2.headers_no_stream.error"); + incr!(names::h2::HEADERS_NO_STREAM_ERROR); self.attribute_bytes_to_overhead(); return self.force_disconnect(); }; @@ -4654,8 +4655,8 @@ impl ConnectionH2 { kawa.storage.clear(); if let Err((error, global)) = status { match self.position { - Position::Client(..) => incr!("http.backend_parse_errors"), - Position::Server => incr!("http.frontend_parse_errors"), + Position::Client(..) 
=> incr!(names::http::BACKEND_PARSE_ERRORS), + Position::Server => incr!(names::http::FRONTEND_PARSE_ERRORS), } if global { error!( @@ -4701,12 +4702,12 @@ impl ConnectionH2 { if let StreamState::Linked(token) = stream.state { // Mirror of handle_data_frame's rearm. LIFECYCLE invariant 15. endpoint.readiness_mut(token).arm_writable(); - incr!("h2.signal.writable.rearmed.peer_headers"); + incr!(names::h2::SIGNAL_WRITABLE_REARMED_PEER_HEADERS); } // was_initial prevents trailers from triggering connection if was_initial && self.position.is_server() { - incr!("http.requests"); - gauge_add!("http.active_requests", 1); + incr!(names::http::REQUESTS); + gauge_add!(names::http::ACTIVE_REQUESTS, 1); stream.metrics.service_start(); stream.request_counted = true; stream.state = StreamState::Link; @@ -4826,7 +4827,7 @@ impl ConnectionH2 { // stripped WRITABLE, the scheduler won't re-run without a synthetic // wake — pair the interest insert with signal_pending_write. self.readiness.arm_writable(); - incr!("h2.signal.writable.rearmed.priority_update"); + incr!(names::h2::SIGNAL_WRITABLE_REARMED_PRIORITY_UPDATE); MuxResult::Continue } @@ -4882,7 +4883,7 @@ impl ConnectionH2 { // so the SOC can alert on the rate of pre-response RSTs without // having to differentiate by error code. 
if !response_started { - count!("h2.rst_stream.received.pre_response_start", 1); + count!(names::h2::RST_STREAM_RECEIVED_PRE_RESPONSE_START, 1); } debug!( "{} RstStream({} -> {})", @@ -5076,7 +5077,7 @@ impl ConnectionH2 { match serializer::gen_ping_acknowledgement(kawa.storage.space(), &ping.payload) { Ok((_, size)) => { kawa.storage.fill(size); - incr!("h2.frames.tx.ping_ack"); + incr!(names::h2::FRAMES_TX_PING_ACK); } Err(error) => { error!( @@ -5662,7 +5663,7 @@ impl ConnectionH2 { if buf.len() >= frame.len() { buf[..frame.len()].copy_from_slice(&frame); kawa.storage.fill(frame.len()); - incr!("h2.frames.tx.rst_stream"); + incr!(names::h2::FRAMES_TX_RST_STREAM); count!(metric_for_rst_stream_sent(H2Error::Cancel), 1); self.readiness.arm_writable(); self.rst_sent.insert(id); @@ -6361,7 +6362,7 @@ mod tests { keys.push(metric_for_goaway_received(unknown_code)); keys.push(metric_for_rst_stream_received(unknown_code)); // …and the dedicated Rapid Reset signature counter. - keys.push("h2.rst_stream.received.pre_response_start"); + keys.push(names::h2::RST_STREAM_RECEIVED_PRE_RESPONSE_START); for key in &keys { assert!( diff --git a/lib/src/protocol/mux/mod.rs b/lib/src/protocol/mux/mod.rs index 4138b2212..9c3c9e84a 100644 --- a/lib/src/protocol/mux/mod.rs +++ b/lib/src/protocol/mux/mod.rs @@ -144,6 +144,7 @@ pub(crate) mod serializer; mod shared; pub mod stream; +use crate::metrics::names; use crate::{ BackendConnectionError, L7ListenerHandler, L7Proxy, ListenerHandler, ProxySession, Readiness, RetrieveClusterError, SessionIsToBeClosed, SessionMetrics, SessionResult, StateResult, @@ -223,16 +224,16 @@ impl Position { /// Increment the global `count!()` counter for bytes read on this side. pub fn count_bytes_in_counter(&self, size: usize) { match self { - Position::Client(..) => count!("back_bytes_in", size as i64), - Position::Server => count!("bytes_in", size as i64), + Position::Client(..) 
=> count!(names::backend::BACK_BYTES_IN, size as i64), + Position::Server => count!(names::backend::BYTES_IN, size as i64), } } /// Increment the global `count!()` counter for bytes written on this side. pub fn count_bytes_out_counter(&self, size: usize) { match self { - Position::Client(..) => count!("back_bytes_out", size as i64), - Position::Server => count!("bytes_out", size as i64), + Position::Client(..) => count!(names::backend::BACK_BYTES_OUT, size as i64), + Position::Server => count!(names::backend::BYTES_OUT, size as i64), } } @@ -846,7 +847,7 @@ impl= MAX_LOOP_ITERATIONS { - incr!("http.infinite_loop.error"); + incr!(names::http::INFINITE_LOOP_ERROR); if self.frontend.has_pending_write() { debug!( "{} Mux loop budget exhausted while frontend flush pending: {:?}", @@ -1663,7 +1666,7 @@ impl { let mut backend_borrow = backend.borrow_mut(); backend_borrow.dec_connections(); - gauge_add!("backend.connections", -1); + gauge_add!(names::backend::CONNECTIONS, -1); // Second `-1` site for `backend.pool.size` (the first is // in `connection.rs::pre_close_client_bookkeeping`). This // path runs during session teardown when the frontend @@ -1761,9 +1764,9 @@ impl = match context.tls_cert_names.as_deref() { - Some(names) => authority_matched_cert_name(host, names), + Some(cert_names) => authority_matched_cert_name(host, cert_names), None => { if authority_matches_sni(host, sni) { Some(sni) @@ -618,7 +620,7 @@ impl Router { // same predicate doesn't silently double-count // sequential `Host:` reuse as "coalescing". 
if !authority_matches_sni(host, sni) && context.tls_alpn == Some("h2") { - incr!("h2.coalescing.accepted"); + incr!(names::h2::COALESCING_ACCEPTED); debug!( "{} accepted coalesced authority {:?} (SNI {:?}, matched SAN {:?})", log_module_context!(context), @@ -629,7 +631,7 @@ impl Router { } } None => { - incr!("http.sni_authority_mismatch"); + incr!(names::http::SNI_AUTHORITY_MISMATCH); warn!( "{} rejecting request: TLS cert SANs do not cover :authority {:?} (SNI {:?})", log_module_context!(context), diff --git a/lib/src/protocol/mux/stream.rs b/lib/src/protocol/mux/stream.rs index da6f3de28..1e0a17506 100644 --- a/lib/src/protocol/mux/stream.rs +++ b/lib/src/protocol/mux/stream.rs @@ -16,6 +16,7 @@ use mio::Token; use sozu_command::logging::ansi_palette; use super::{GenericHttpStream, Position}; +use crate::metrics::names; use crate::{ L7ListenerHandler, ListenerHandler, Protocol, SessionMetrics, pool::Pool, protocol::http::editor::HttpContext, @@ -64,7 +65,7 @@ pub struct Stream { pub front_data_received: usize, /// Tracks total DATA payload bytes received on the backend for content-length validation (RFC 9113 §8.1.1) pub back_data_received: usize, - /// True when `gauge_add!("http.active_requests", 1)` was emitted for this stream. + /// True when `gauge_add!(names::http::ACTIVE_REQUESTS, 1)` was emitted for this stream. /// Prevents underflow when `generate_access_log` is called for streams that never /// had their request fully parsed (idle timeouts, malformed requests). pub request_counted: bool, @@ -231,7 +232,7 @@ impl Stream { // errors) takes precedence when both are present. let message = message.or(context.access_log_message); if self.request_counted { - gauge_add!("http.active_requests", -1); + gauge_add!(names::http::ACTIVE_REQUESTS, -1); self.request_counted = false; } if error { @@ -263,12 +264,12 @@ impl Stream { // the short-list shared with the H1 path (`save_http_status_metric`). 
let bucket_key = if let Some(status) = context.status { match status { - 100..=199 => "http.status.1xx", - 200..=299 => "http.status.2xx", - 300..=399 => "http.status.3xx", - 400..=499 => "http.status.4xx", - 500..=599 => "http.status.5xx", - _ => "http.status.other", + 100..=199 => names::http::STATUS_1XX, + 200..=299 => names::http::STATUS_2XX, + 300..=399 => names::http::STATUS_3XX, + 400..=499 => names::http::STATUS_4XX, + 500..=599 => names::http::STATUS_5XX, + _ => names::http::STATUS_OTHER, } } else { "http.status.none" @@ -308,7 +309,7 @@ impl Stream { log_access! { error, - on_failure: { incr!("unsent-access-logs") }, + on_failure: { incr!(names::access_logs::UNSENT) }, message, context: context.log_context(), session_address: context.session_address, diff --git a/lib/src/protocol/pipe.rs b/lib/src/protocol/pipe.rs index ecd0b79f9..e57777478 100644 --- a/lib/src/protocol/pipe.rs +++ b/lib/src/protocol/pipe.rs @@ -15,6 +15,7 @@ use sozu_command::{ logging::{EndpointRecord, LogContext, ansi_palette}, }; +use crate::metrics::names; use crate::{ L7Proxy, ListenerHandler, Protocol, Readiness, SessionMetrics, SessionResult, StateResult, backends::Backend, @@ -311,7 +312,7 @@ impl Pipe { metrics.register_end_of_session(&context); log_access!( error, - on_failure: { incr!("unsent-access-logs") }, + on_failure: { incr!(names::access_logs::UNSENT) }, message, context, session_address: self.get_session_address(), @@ -346,7 +347,7 @@ impl Pipe { } pub fn log_request_error(&self, metrics: &SessionMetrics, message: &str) { - incr!("pipe.errors"); + incr!(names::pipe::ERRORS); error!( "{} Could not process request properly got: {}", log_context!(self), @@ -507,7 +508,7 @@ impl Pipe { //FIXME: replace with copy() self.frontend_buffer.fill(sz); - count!("bytes_in", sz as i64); + count!(names::backend::BYTES_IN, sz as i64); metrics.bin += sz; if self.frontend_buffer.available_space() == 0 { @@ -575,7 +576,7 @@ impl Pipe { while res == SocketResult::Continue { // no more 
data in buffer, stop here if self.backend_buffer.available_data() == 0 { - count!("bytes_out", sz as i64); + count!(names::backend::BYTES_OUT, sz as i64); metrics.bout += sz; self.backend_readiness.interest.insert(Ready::READABLE); self.frontend_readiness.interest.remove(Ready::WRITABLE); @@ -596,7 +597,7 @@ impl Pipe { if !self.check_connections() { metrics.bout += sz; - count!("bytes_out", sz as i64); + count!(names::backend::BYTES_OUT, sz as i64); self.frontend_readiness.reset(); self.backend_readiness.reset(); self.log_request_success(metrics); @@ -605,7 +606,7 @@ impl Pipe { } if sz > 0 { - count!("bytes_out", sz as i64); + count!(names::backend::BYTES_OUT, sz as i64); self.backend_readiness.interest.insert(Ready::READABLE); metrics.bout += sz; } @@ -665,7 +666,7 @@ impl Pipe { if self.frontend_buffer.available_data() == 0 { self.frontend_readiness.interest.insert(Ready::READABLE); self.backend_readiness.interest.remove(Ready::WRITABLE); - count!("back_bytes_out", sz as i64); + count!(names::backend::BACK_BYTES_OUT, sz as i64); metrics.backend_bout += sz; return SessionResult::Continue; } @@ -685,7 +686,7 @@ impl Pipe { } } - count!("back_bytes_out", sz as i64); + count!(names::backend::BACK_BYTES_OUT, sz as i64); metrics.backend_bout += sz; if !self.check_connections() { @@ -749,7 +750,7 @@ impl Pipe { } if size > 0 { self.frontend_readiness.interest.insert(Ready::WRITABLE); - count!("back_bytes_in", size as i64); + count!(names::backend::BACK_BYTES_IN, size as i64); metrics.backend_bin += size; } @@ -819,7 +820,7 @@ impl Pipe { if sz > 0 { self.splice_pipe.as_mut().unwrap().in_pipe_pending += sz; - count!("bytes_in", sz as i64); + count!(names::backend::BYTES_IN, sz as i64); metrics.bin += sz; self.backend_readiness.interest.insert(Ready::WRITABLE); } else { @@ -883,7 +884,7 @@ impl Pipe { let pending = self.splice_out_pending(); // no more data in pipe, stop here if pending == 0 { - count!("bytes_out", sz as i64); + count!(names::backend::BYTES_OUT, sz as 
i64); metrics.bout += sz; self.backend_readiness.interest.insert(Ready::READABLE); self.frontend_readiness.interest.remove(Ready::WRITABLE); @@ -909,7 +910,7 @@ impl Pipe { if !self.check_connections() { metrics.bout += sz; - count!("bytes_out", sz as i64); + count!(names::backend::BYTES_OUT, sz as i64); self.frontend_readiness.reset(); self.backend_readiness.reset(); self.log_request_success(metrics); @@ -918,7 +919,7 @@ impl Pipe { } if sz > 0 { - count!("bytes_out", sz as i64); + count!(names::backend::BYTES_OUT, sz as i64); self.backend_readiness.interest.insert(Ready::READABLE); metrics.bout += sz; } @@ -976,7 +977,7 @@ impl Pipe { if pending == 0 { self.frontend_readiness.interest.insert(Ready::READABLE); self.backend_readiness.interest.remove(Ready::WRITABLE); - count!("back_bytes_out", sz as i64); + count!(names::backend::BACK_BYTES_OUT, sz as i64); metrics.backend_bout += sz; return SessionResult::Continue; } @@ -1001,7 +1002,7 @@ impl Pipe { } } - count!("back_bytes_out", sz as i64); + count!(names::backend::BACK_BYTES_OUT, sz as i64); metrics.backend_bout += sz; if !self.check_connections() { @@ -1071,7 +1072,7 @@ impl Pipe { if size > 0 { self.splice_pipe.as_mut().unwrap().out_pipe_pending += size; self.frontend_readiness.interest.insert(Ready::WRITABLE); - count!("back_bytes_in", size as i64); + count!(names::backend::BACK_BYTES_IN, size as i64); metrics.backend_bin += size; } @@ -1231,7 +1232,7 @@ impl SessionState for Pipe { MAX_LOOP_ITERATIONS ); - incr!("http.infinite_loop.error"); + incr!(names::http::INFINITE_LOOP_ERROR); self.print_state(self.protocol_string()); return SessionResult::Close; diff --git a/lib/src/protocol/proxy_protocol/expect.rs b/lib/src/protocol/proxy_protocol/expect.rs index 3e8c96023..2f04e5115 100644 --- a/lib/src/protocol/proxy_protocol/expect.rs +++ b/lib/src/protocol/proxy_protocol/expect.rs @@ -17,6 +17,7 @@ use sozu_command::{ }; use super::{header::ProxyAddr, parser::parse_v2_header}; +use crate::metrics::names; use 
crate::{ Protocol, Readiness, SessionMetrics, StateResult, pool::Checkout, @@ -135,7 +136,7 @@ impl ExpectProxyProtocol { if sz > 0 { self.index += sz; - count!("bytes_in", sz as i64); + count!(names::backend::BYTES_IN, sz as i64); metrics.bin += sz; if self.index == self.frontend_buffer.len() { @@ -153,7 +154,7 @@ impl ExpectProxyProtocol { metrics.bin, metrics.bout ); - incr!("proxy_protocol.errors"); + incr!(names::proxy_protocol::ERRORS); self.frontend_readiness.reset(); return SessionResult::Close; } @@ -206,7 +207,7 @@ impl ExpectProxyProtocol { "{} proxy protocol header exceeds maximum size (232 bytes), closing", log_context!(self) ); - incr!("proxy_protocol.errors"); + incr!(names::proxy_protocol::ERRORS); self.frontend_readiness.reset(); return SessionResult::Close; } @@ -220,7 +221,7 @@ impl ExpectProxyProtocol { log_context!(self), e.input.to_hex(16) ); - incr!("proxy_protocol.errors"); + incr!(names::proxy_protocol::ERRORS); self.frontend_readiness.reset(); SessionResult::Close } @@ -339,7 +340,7 @@ impl SessionState for ExpectProxyProtocol { log_context!(self), MAX_LOOP_ITERATIONS ); - incr!("http.infinite_loop.error"); + incr!(names::http::INFINITE_LOOP_ERROR); self.print_state(""); diff --git a/lib/src/protocol/proxy_protocol/relay.rs b/lib/src/protocol/proxy_protocol/relay.rs index 104f8654f..524d1f1ee 100644 --- a/lib/src/protocol/proxy_protocol/relay.rs +++ b/lib/src/protocol/proxy_protocol/relay.rs @@ -12,6 +12,7 @@ use nom::{Err, Offset}; use rusty_ulid::Ulid; use sozu_command::logging::ansi_palette; +use crate::metrics::names; use crate::{ Protocol, Readiness, SessionMetrics, SessionResult, pool::Checkout, @@ -117,7 +118,7 @@ impl RelayProxyProtocol { if sz > 0 { self.frontend_buffer.fill(sz); - count!("bytes_in", sz as i64); + count!(names::backend::BYTES_IN, sz as i64); metrics.bin += sz; if res == SocketResult::Error { @@ -125,7 +126,7 @@ impl RelayProxyProtocol { "{} front socket error, closing the connection", log_context!(self) ); - 
incr!("proxy_protocol.errors"); + incr!(names::proxy_protocol::ERRORS); self.frontend_readiness.reset(); self.backend_readiness.reset(); return SessionResult::Close; @@ -178,7 +179,7 @@ impl RelayProxyProtocol { Ok(sz) => { self.cursor_header += sz; - count!("back_bytes_out", sz as i64); + count!(names::backend::BACK_BYTES_OUT, sz as i64); metrics.backend_bout += sz; self.frontend_buffer.consume(sz); @@ -188,7 +189,7 @@ impl RelayProxyProtocol { } } Err(e) => { - incr!("proxy_protocol.errors"); + incr!(names::proxy_protocol::ERRORS); self.frontend_readiness.reset(); self.backend_readiness.reset(); debug!("{} write error: {}", log_context!(self), e); diff --git a/lib/src/protocol/proxy_protocol/send.rs b/lib/src/protocol/proxy_protocol/send.rs index 64c52a0dd..0d82f7190 100644 --- a/lib/src/protocol/proxy_protocol/send.rs +++ b/lib/src/protocol/proxy_protocol/send.rs @@ -15,6 +15,7 @@ use mio::{Token, net::TcpStream}; use rusty_ulid::Ulid; use sozu_command::logging::ansi_palette; +use crate::metrics::names; use crate::{ BackendConnectionStatus, Protocol, Readiness, SessionMetrics, SessionResult, pool::Checkout, @@ -136,7 +137,7 @@ impl SendProxyProtocol { match socket.write(&header[self.cursor_header..]) { Ok(sz) => { self.cursor_header += sz; - count!("back_bytes_out", sz as i64); + count!(names::backend::BACK_BYTES_OUT, sz as i64); metrics.backend_bout += sz; if self.cursor_header == header.len() { @@ -150,7 +151,7 @@ impl SendProxyProtocol { return SessionResult::Continue; } e => { - incr!("proxy_protocol.errors"); + incr!(names::proxy_protocol::ERRORS); debug!("{} write error: {:?}", log_context!(self), e); return SessionResult::Close; } diff --git a/lib/src/protocol/rustls.rs b/lib/src/protocol/rustls.rs index cba1c8dec..727a8df39 100644 --- a/lib/src/protocol/rustls.rs +++ b/lib/src/protocol/rustls.rs @@ -16,6 +16,7 @@ use sozu_command::{ logging::{LogContext, ansi_palette}, }; +use crate::metrics::names; use crate::{ Readiness, Ready, SessionMetrics, 
SessionResult, StateResult, protocol::SessionState, timer::TimeoutContainer, @@ -190,7 +191,7 @@ impl TlsHandshake { self.frontend_readiness.event.insert(Ready::READABLE); self.frontend_readiness.interest.insert(Ready::WRITABLE); if let Some(elapsed_ms) = self.record_handshake_duration_ms() { - time!("tls.handshake_ms", elapsed_ms); + time!(names::tls::HANDSHAKE_MS, elapsed_ms); } SessionResult::Upgrade } @@ -251,14 +252,14 @@ impl TlsHandshake { } else if self.session.wants_read() { self.frontend_readiness.interest.insert(Ready::READABLE); if let Some(elapsed_ms) = self.record_handshake_duration_ms() { - time!("tls.handshake_ms", elapsed_ms); + time!(names::tls::HANDSHAKE_MS, elapsed_ms); } SessionResult::Upgrade } else { self.frontend_readiness.interest.insert(Ready::WRITABLE); self.frontend_readiness.interest.insert(Ready::READABLE); if let Some(elapsed_ms) = self.record_handshake_duration_ms() { - time!("tls.handshake_ms", elapsed_ms); + time!(names::tls::HANDSHAKE_MS, elapsed_ms); } SessionResult::Upgrade } @@ -400,7 +401,7 @@ impl SessionState for TlsHandshake { MAX_LOOP_ITERATIONS ); - incr!("http.infinite_loop.error"); + incr!(names::http::INFINITE_LOOP_ERROR); self.print_state("HTTPS"); return SessionResult::Close; diff --git a/lib/src/router/mod.rs b/lib/src/router/mod.rs index 49b5bfe50..511e433d7 100644 --- a/lib/src/router/mod.rs +++ b/lib/src/router/mod.rs @@ -18,6 +18,7 @@ use sozu_command::{ state::ClusterId, }; +use crate::metrics::names; use crate::{ protocol::{http::editor::HeaderEditMode, http::parser::Method}, router::pattern_trie::{TrieMatches, TrieNode, TrieSubMatch}, @@ -700,7 +701,10 @@ impl DomainRule { let start = Instant::now(); let is_a_match = r.is_match(hostname); let now = Instant::now(); - time!("regex_matching_time", (now - start).as_millis()); + time!( + names::event_loop::REGEX_MATCHING_TIME, + (now - start).as_millis() + ); is_a_match } } @@ -780,7 +784,10 @@ impl PathRule { let start = Instant::now(); let is_a_match = 
regex.is_match(path); let now = Instant::now(); - time!("regex_matching_time", (now - start).as_millis()); + time!( + names::event_loop::REGEX_MATCHING_TIME, + (now - start).as_millis() + ); if is_a_match { PathRuleResult::Regex @@ -1363,7 +1370,7 @@ impl Frontend { val: rendered.into_bytes().into(), mode, }); - crate::incr!("http.hsts.frontend_added"); + crate::incr!(names::http::HSTS_FRONTEND_ADDED); } return Ok(Self { @@ -1495,7 +1502,7 @@ impl Frontend { val: rendered.into_bytes().into(), mode, }); - crate::incr!("http.hsts.frontend_added"); + crate::incr!(names::http::HSTS_FRONTEND_ADDED); } else { // Both upstream config layers (FileHstsConfig::to_proto and // build_hsts_from_cli) substitute DEFAULT_HSTS_MAX_AGE when @@ -1512,7 +1519,7 @@ impl Frontend { log_module_context!(), cluster_id, ); - crate::incr!("http.hsts.unrendered"); + crate::incr!(names::http::HSTS_UNRENDERED); } } diff --git a/lib/src/server.rs b/lib/src/server.rs index 710d94fc0..4e86a078c 100644 --- a/lib/src/server.rs +++ b/lib/src/server.rs @@ -7,6 +7,7 @@ use std::{ net::{IpAddr, Ipv4Addr, Ipv6Addr, SocketAddr}, os::unix::io::{AsRawFd, FromRawFd}, rc::Rc, + str::FromStr, sync::LazyLock, time::{Duration, Instant}, }; @@ -18,21 +19,23 @@ use mio::{ use slab::Slab; use sozu_command::{ channel::Channel, + config::MetricDetailLevel, logging, proto::command::{ ActivateListener, AddBackend, CertificatesWithFingerprints, Cluster, ClusterHashes, - ClusterInformations, DeactivateListener, Event, HttpListenerConfig, HttpsListenerConfig, - InitialState, ListenerType, LoadBalancingAlgorithms, LoadMetric, MetricsConfiguration, - RemoveBackend, Request, ResponseStatus, ServerConfig, - TcpListenerConfig as CommandTcpListener, UpdateHttpListenerConfig, - UpdateHttpsListenerConfig, UpdateTcpListenerConfig, WorkerRequest, WorkerResponse, - request::RequestType, response_content::ContentType, + ClusterInformations, DeactivateListener, Event, EventKind, HttpListenerConfig, + HttpsListenerConfig, InitialState, 
ListenerType, LoadBalancingAlgorithms, LoadMetric, + MetricDetail, MetricsConfiguration, RemoveBackend, Request, ResponseContent, + ResponseStatus, ServerConfig, TcpListenerConfig as CommandTcpListener, + UpdateHttpListenerConfig, UpdateHttpsListenerConfig, UpdateTcpListenerConfig, + WorkerRequest, WorkerResponse, request::RequestType, response_content::ContentType, }, ready::Ready, scm_socket::{Listeners, ScmSocket, ScmSocketError}, state::ConfigState, }; +use crate::metrics::names; use crate::{ AcceptError, Protocol, ProxyConfiguration, ProxySession, SessionIsToBeClosed, backends::{Backend, BackendMap}, @@ -135,6 +138,61 @@ pub fn push_event(event: Event) { }); } +/// Build the `WorkerMetricDetailStatus` content payload returned in +/// every successful `SetMetricDetail` worker response. The master +/// collects these across the fan-out and assembles them into +/// `MetricDetailStatus.workers[]` so the TUI sees each +/// worker's actual aggregator state instead of the master's view. +fn worker_metric_detail_status_content( + configured: MetricDetailLevel, + effective: MetricDetailLevel, + previous_effective: MetricDetailLevel, + active_lease_count: u32, +) -> ResponseContent { + use sozu_command::proto::command::WorkerMetricDetailStatus; + ContentType::WorkerMetricDetailStatus(WorkerMetricDetailStatus { + configured: MetricDetail::from(configured) as i32, + effective: MetricDetail::from(effective) as i32, + previous_effective: MetricDetail::from(previous_effective) as i32, + active_lease_count, + }) + .into() +} + +/// Build a `METRIC_DETAIL_CHANGED` event carrying the worker-local +/// transition payload (previous/effective levels + transition kind). +/// `client_id` is `Some(_)` for explicit apply/clear, `None` for the +/// polled janitor's bulk expiry. The master folds this Event into the +/// audit log alongside operator-initiated transitions emitted at the +/// dispatch site in `bin/src/command/requests.rs::worker_request`. 
+fn push_metric_detail_transition( + previous: MetricDetailLevel, + effective: MetricDetailLevel, + transition_kind: &'static str, + client_id: Option<String>, +) { + use sozu_command::proto::command::MetricDetailTransition; + // No-op when nothing actually changed. The `SetMetricDetail` + // apply/clear arms in `notify` pass their levels straight through + // without comparing them, so this guard is what stops them (and + // any future call site) from emitting a "ghost" transition. + if previous == effective { + return; + } + push_event(Event { + kind: EventKind::MetricDetailChanged as i32, + cluster_id: None, + backend_id: None, + address: None, + metric_detail: Some(MetricDetailTransition { + previous_effective: MetricDetail::from(previous) as i32, + effective: MetricDetail::from(effective) as i32, + transition_kind: transition_kind.to_owned(), + client_id, + }), + }); +} + #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct ListenToken(pub usize); #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] @@ -371,7 +429,7 @@ impl SessionManager { pub fn check_limits(&mut self) -> bool { if self.nb_connections >= self.max_connections { error!("max number of session connection reached, flushing the accept queue"); - gauge!("accept_queue.backpressure", 1); + gauge!(names::accept_queue::BACKPRESSURE, 1); self.can_accept = false; return false; } @@ -382,7 +440,7 @@ impl SessionManager { "nb_connections: {}, max_connections: {}", self.nb_connections, self.max_connections ); - gauge!("accept_queue.backpressure", 1); + gauge!(names::accept_queue::BACKPRESSURE, 1); self.can_accept = false; return false; @@ -403,7 +461,7 @@ impl SessionManager { // `server.live` so all proxy gauges advance in lock-step. Keeping // `client.connections` per-event preserves the high-resolution // signal scrapers expect.
- gauge!("client.connections", self.nb_connections); + gauge!(names::client::CONNECTIONS, self.nb_connections); } /// Decrements the number of sessions, start accepting new connections @@ -411,7 +469,7 @@ impl SessionManager { pub fn decr(&mut self) { assert!(self.nb_connections != 0); self.nb_connections -= 1; - gauge!("client.connections", self.nb_connections); + gauge!(names::client::CONNECTIONS, self.nb_connections); // do not be ready to accept right away, wait until we get back to 10% capacity if !self.can_accept && self.nb_connections < self.max_connections * 90 / 100 { @@ -419,7 +477,7 @@ impl SessionManager { "nb_connections = {}, max_connections = {}, starting to accept again", self.nb_connections, self.max_connections ); - gauge!("accept_queue.backpressure", 0); + gauge!(names::accept_queue::BACKPRESSURE, 0); self.can_accept = true; } } @@ -805,7 +863,10 @@ impl Server { } let after_epoll = Instant::now(); - time!("epoll_time", (after_epoll - self.loop_start).as_millis()); + time!( + names::event_loop::EPOLL_TIME, + (after_epoll - self.loop_start).as_millis() + ); self.loop_start = after_epoll; self.send_queue(); @@ -918,8 +979,8 @@ impl Server { let slab_capacity = sessions.slab.capacity(); let accept_threshold = sessions.accept_slab_threshold(); - gauge!("client.connections", nb_connections); - gauge!("client.connections_max", max_connections); + gauge!(names::client::CONNECTIONS, nb_connections); + gauge!(names::client::CONNECTIONS_MAX, max_connections); if max_connections > 0 { gauge!( "client.connections_percent", @@ -927,10 +988,10 @@ impl Server { ); } - gauge!("slab.entries", slab_len); - gauge!("slab.capacity", slab_capacity); + gauge!(names::slab::ENTRIES, slab_len); + gauge!(names::slab::CAPACITY, slab_capacity); if slab_capacity > 0 { - gauge!("slab.usage_percent", slab_len * 100 / slab_capacity); + gauge!(names::slab::USAGE_PERCENT, slab_len * 100 / slab_capacity); } if accept_threshold > 0 { gauge!( @@ -948,10 +1009,10 @@ impl Server { let 
pool = self.pool.borrow(); let used = pool.inner.used(); let capacity = pool.inner.capacity(); - gauge!("buffer.in_use", used); - gauge!("buffer.capacity", capacity); + gauge!(names::buffer::IN_USE, used); + gauge!(names::buffer::CAPACITY, capacity); if capacity > 0 { - gauge!("buffer.usage_percent", used * 100 / capacity); + gauge!(names::buffer::USAGE_PERCENT, used * 100 / capacity); } } // 1Hz tick for `accept_queue.saturated_seconds`. Increments once @@ -964,7 +1025,7 @@ impl Server { let now = Instant::now(); if now.duration_since(self.last_saturation_tick) >= ACCEPT_SATURATION_TICK { if !self.sessions.borrow().can_accept { - incr!("accept_queue.saturated_seconds"); + incr!(names::accept_queue::SATURATED_SECONDS); } self.last_saturation_tick = now; } @@ -1008,7 +1069,10 @@ impl Server { fn reset_loop_time_and_get_timeout(&mut self) -> Option { let now = Instant::now(); - time!("event_loop_time", (now - self.loop_start).as_millis()); + time!( + names::event_loop::EVENT_LOOP_TIME, + (now - self.loop_start).as_millis() + ); let mut timeout = match self.should_poll_at.as_ref() { None => self.poll_timeout, @@ -1129,7 +1193,7 @@ impl Server { } let zombie_count = zombie_tokens.len() as i64; - count!("zombies", zombie_count); + count!(names::misc::ZOMBIES, zombie_count); let remaining_count = self.shut_down_sessions_by_frontend_tokens(zombie_tokens); info!( @@ -1281,6 +1345,36 @@ impl Server { } fn notify(&mut self, message: WorkerRequest) { + // Polled lease-expiry janitor: SetMetricDetail leases self-expire after + // their TTL so a crashed `sozu top` cannot permanently elevate metrics + // cardinality. The janitor runs at most every LEASE_TICK_INTERVAL, + // gated by `lease_tick_due` so the hot path of `notify` doesn't pay + // the HashMap walk on every iteration. Single-threaded worker, so + // `borrow_mut` is safe here. + let now = std::time::Instant::now(); + // Capture (previous, effective) before releasing the borrow so we + // can emit an Event afterwards. 
Holding `METRICS.borrow_mut` + // across `push_event` would re-enter the same thread-local from + // inside `QUEUE.with` (safe but conceptually noisy); the + // two-step split keeps the borrow scopes minimal. + let lease_tick_transition = METRICS.with(|metrics| { + let mut m = metrics.borrow_mut(); + if !m.lease_tick_due(now) { + return None; + } + let previous = m.lease_tick(now)?; + let effective = m.detail_effective(); + Some((previous, effective)) + }); + if let Some((previous, effective)) = lease_tick_transition { + // The janitor retired one or more leases AND the effective + // level moved. Surface the worker-local transition as an + // Event so the master folds it into the audit log (closes + // the gap where TUI-crashed lease expiry was previously + // silent). `client_id` is `None` because the janitor may + // have retired multiple leases at once. + push_metric_detail_transition(previous, effective, "lease_tick_expired", None); + } match &message.content.request_type { Some(RequestType::ConfigureMetrics(configuration)) => { match MetricsConfiguration::try_from(*configuration) { @@ -1309,6 +1403,263 @@ impl Server { }); return; } + // Runtime cardinality lease verb — apply, renew, or clear a lease + // on this worker's `Aggregator`. The lease bumps `effective` to + // `max(configured, max(active leases))`; expiry runs on the polled + // janitor at the top of `notify`. Every successful reply carries a + // `WorkerMetricDetailStatus` payload that the master collects + // across the `worker_request` fan-out. + Some(RequestType::SetMetricDetail(req)) => { + // Master populates the peer binding from the connecting + // `ClientSession` before fan-out (`bin/src/command/ + // requests.rs::worker_request`). A pre-binding caller or a + // platform without `SO_PEERCRED` yields `PeerBinding::default()` + // — clears against that lease are accepted from anyone, per + // the proto contract on `SetMetricDetail.peer_pid`.
+ let presented_binding = crate::metrics::PeerBinding { + pid: req.peer_pid, + // Master sends Crockford-base32 ULIDs (`Ulid::to_string`); + // accept those, with a fallback hex parse for callers that + // happen to send `0x…` form. A failed parse degrades to + // `None` — the lease store treats that as "binding + // unknown" per the proto contract. + session_ulid: req.peer_session_ulid.as_deref().and_then(|s| { + rusty_ulid::Ulid::from_str(s) + .map(u128::from) + .ok() + .or_else(|| u128::from_str_radix(s.trim_start_matches("0x"), 16).ok()) + }), + }; + if req.clear.unwrap_or(false) { + // Defense-in-depth: the master pre-validates `client_id` + // length at the dispatch site, but worker IPC is not + // master-only — fuzz harnesses, serial_test-flagged + // integration tests, and future internal callers can + // issue an oversized clear directly. Mirror the apply + // path's `ClientIdTooLong` arm so an unbounded HashMap + // lookup is never driven by an operator-supplied string + // here either. The reason string echoes the byte length + // but not the operator bytes themselves (symmetric with + // the audit-column-smuggling guard on the apply path). + if req.client_id.len() > crate::metrics::LEASE_CLIENT_ID_MAX_BYTES { + let msg = format!( + "SetMetricDetail: clear client_id length {} exceeds {} bytes", + req.client_id.len(), + crate::metrics::LEASE_CLIENT_ID_MAX_BYTES, + ); + error!("{}", msg); + push_queue(WorkerResponse::error(message.id.clone(), msg)); + return; + } + // Capture transition fields + post-clear snapshot + // before releasing the borrow so we can emit an + // Event after AND build the WorkerMetricDetailStatus + // payload that the master folds into + // `MetricDetailStatus.workers[]`. Without + // this payload the master used its own view as a + // stand-in for the worker's per-aggregator state. 
+ let (outcome, effective_after, configured_after, lease_count_after) = METRICS + .with(|metrics| { + let mut m = metrics.borrow_mut(); + let outcome = m.lease_clear(&req.client_id, presented_binding); + ( + outcome, + m.detail_effective(), + m.detail_configured(), + m.lease_count(), + ) + }); + match outcome { + crate::metrics::LeaseClearOutcome::Cleared { previous_effective } => { + push_metric_detail_transition( + previous_effective, + effective_after, + "lease_clear", + Some(req.client_id.clone()), + ); + push_queue(WorkerResponse::ok_with_content( + message.id.clone(), + worker_metric_detail_status_content( + configured_after, + effective_after, + previous_effective, + lease_count_after, + ), + )); + } + crate::metrics::LeaseClearOutcome::NotFound => { + // Silent no-op: no lease existed for that + // id. The worker's state is unchanged so + // previous_effective == effective. + push_queue(WorkerResponse::ok_with_content( + message.id.clone(), + worker_metric_detail_status_content( + configured_after, + effective_after, + effective_after, + lease_count_after, + ), + )); + } + crate::metrics::LeaseClearOutcome::Unauthorized => { + // Do NOT echo `req.client_id` here: the operator- + // supplied bytes flow back through the master's + // worker→reason aggregation into the audit line's + // `reason=` column, which is sanitised for control + // bytes only. The dedicated `lease_id=` audit + // column already carries the operator string + // through `sanitize_for_audit_kv`, so re-embedding + // it here would let a value containing `,` or `=` + // forge a sibling KV pair against SIEM consumers + // that split on `, key=value`. 
+ let msg = "SetMetricDetail: clear refused (peer \ + binding does not match the apply-time owner)" + .to_owned(); + error!("{}", msg); + push_queue(WorkerResponse::error(message.id.clone(), msg)); + } + } + return; + } + let detail_proto = match req.detail { + Some(d) => d, + None => { + // Operator-supplied `client_id` is intentionally + // omitted from the reason string: the dedicated + // `lease_id=` audit column carries it through the + // strict KV sanitiser. See the matching comment on + // the `Unauthorized` arm above for the column- + // smuggling rationale. + let msg = "SetMetricDetail without `detail` and without `clear`".to_owned(); + error!("{}", msg); + push_queue(WorkerResponse::error(message.id.clone(), msg)); + return; + } + }; + let detail_enum = match MetricDetail::try_from(detail_proto) { + Ok(d) => d, + Err(e) => { + let msg = + format!("SetMetricDetail: invalid MetricDetail variant {detail_proto}"); + error!("{}: {}", msg, e); + push_queue(WorkerResponse::error(message.id.clone(), msg)); + return; + } + }; + let level = MetricDetailLevel::from(detail_enum); + // Bound the worst case BEFORE we touch the aggregator: the + // proto contract on `SetMetricDetail.ttl_seconds` says the + // worker rejects values larger than `LEASE_TTL_MAX` so a + // stuck operator-side renewer (or a buggy third-party client) + // cannot lock the worker into elevated cardinality. The + // `Aggregator::lease_apply` clamp is still in place as a + // defence-in-depth net for code paths that bypass this + // dispatch (proto fuzzing, future internal callers). 
+ if let Some(t) = req.ttl_seconds + && u64::from(t) > crate::metrics::LEASE_TTL_MAX.as_secs() + { + let msg = format!( + "SetMetricDetail: ttl_seconds={t} exceeds LEASE_TTL_MAX={}", + crate::metrics::LEASE_TTL_MAX.as_secs() + ); + error!("{}", msg); + push_queue(WorkerResponse::error(message.id.clone(), msg)); + return; + } + let ttl_seconds = req.ttl_seconds.filter(|&t| t > 0).unwrap_or_else(|| { + // The default fits in a u32 by construction + // (LEASE_TTL_DEFAULT = 60 s); the lossy `as u32` cast + // is replaced with a checked conversion so any + // future tweak past `u32::MAX` seconds (≈ 136 years) + // can't silently truncate. Falls through to 60 s on + // the theoretical overflow path. + u32::try_from(crate::metrics::LEASE_TTL_DEFAULT.as_secs()).unwrap_or(60) + }); + let ttl = std::time::Duration::from_secs(ttl_seconds.into()); + let (outcome, configured_after, lease_count_after) = METRICS.with(|metrics| { + let mut m = metrics.borrow_mut(); + let outcome = + m.lease_apply(req.client_id.clone(), level, ttl, presented_binding); + (outcome, m.detail_configured(), m.lease_count()) + }); + match outcome { + crate::metrics::LeaseApplyOutcome::Applied { + previous_effective, + new_effective, + } => { + push_metric_detail_transition( + previous_effective, + new_effective, + "lease_apply", + Some(req.client_id.clone()), + ); + push_queue(WorkerResponse::ok_with_content( + message.id.clone(), + worker_metric_detail_status_content( + configured_after, + new_effective, + previous_effective, + lease_count_after, + ), + )); + } + crate::metrics::LeaseApplyOutcome::ClientIdTooLong => { + let msg = format!( + "SetMetricDetail: client_id length {} exceeds {} bytes", + req.client_id.len(), + crate::metrics::LEASE_CLIENT_ID_MAX_BYTES, + ); + error!("{}", msg); + push_queue(WorkerResponse::error(message.id.clone(), msg)); + } + crate::metrics::LeaseApplyOutcome::TableFull => { + // Same audit-column-smuggling guard as the + // `Unauthorized` and missing-detail arms: the + // 
operator-supplied `client_id` is already rendered + // safely through the strict KV sanitiser in the + // audit envelope's `lease_id=` column, so we keep + // it out of the reason string. + let msg = format!( + "SetMetricDetail: lease table at capacity ({} entries); reject new \ + apply — operators must retry after an active lease expires or is \ + cleared", + crate::metrics::LEASE_TABLE_CAP, + ); + error!("{}", msg); + push_queue(WorkerResponse::error(message.id.clone(), msg)); + } + crate::metrics::LeaseApplyOutcome::TtlOutOfRange => { + // Unreachable in the normal flow: the dispatch-time + // gate above already rejected ttl > LEASE_TTL_MAX. + // Surface explicitly so any future bypass (proto + // fuzzing, internal callers) fails loud rather + // than silently capping the lessor's intent. + let msg = format!( + "SetMetricDetail: ttl exceeds LEASE_TTL_MAX={} (internal contract \ + violation: dispatch gate should have rejected)", + crate::metrics::LEASE_TTL_MAX.as_secs(), + ); + error!("{}", msg); + push_queue(WorkerResponse::error(message.id.clone(), msg)); + } + crate::metrics::LeaseApplyOutcome::Unauthorized => { + // A renewal arrived against an existing lease whose + // apply-time peer binding does not match the + // presented one. The `client_id` is intentionally + // omitted from the error string — the audit-log + // row already carries it in the dedicated + // `lease_id` column, and echoing it here would + // route operator-controlled bytes through the + // freeform reason field. + let msg = "SetMetricDetail: renewal refused (peer binding does not \ + match the apply-time owner)" + .to_owned(); + error!("{}", msg); + push_queue(WorkerResponse::error(message.id.clone(), msg)); + } + } + return; + } Some(RequestType::Logging(logging_filter)) => { info!( "{} changing logging filter to {}", @@ -2059,7 +2410,7 @@ impl Server { // peer raced to close — recorded as `None` and silently // skipped for the per-source counter. 
let peer = sock.peer_addr().ok(); - incr!("listener.accepted.total"); + incr!(names::listener::ACCEPTED_TOTAL); incr!(proto_key); if let Some(peer_addr) = peer.as_ref() { incr!(per_source_bucket(peer_addr)); @@ -2087,15 +2438,15 @@ impl Server { } } - gauge!("accept_queue.connections", self.accept_queue.len()); + gauge!(names::accept_queue::CONNECTIONS, self.accept_queue.len()); } pub fn create_sessions(&mut self) { while let Some((sock, token, protocol, timestamp, _peer)) = self.accept_queue.pop_back() { let wait_time = Instant::now() - timestamp; - time!("accept_queue.wait_time", wait_time.as_millis()); + time!(names::accept_queue::WAIT_TIME, wait_time.as_millis()); if wait_time > self.accept_queue_timeout { - incr!("accept_queue.timeout"); + incr!(names::accept_queue::TIMEOUT); continue; } @@ -2105,7 +2456,7 @@ impl Server { // `listener.connection_capped` counts the popped socket so // the counter aligns with `check_limits` invocations rather // than with queue depth at the time of refusal. 
- incr!("listener.connection_capped"); + incr!(names::listener::CONNECTION_CAPPED); if !self.evict_on_queue_full { break; @@ -2138,7 +2489,7 @@ impl Server { break; } - count!("sessions.evicted", evicted as i64); + count!(names::sessions::EVICTED, evicted as i64); warn!( "evicted {} least recently active sessions to make room", evicted @@ -2190,7 +2541,7 @@ impl Server { self.sessions.borrow_mut().incr(); } - gauge!("accept_queue.connections", self.accept_queue.len()); + gauge!(names::accept_queue::CONNECTIONS, self.accept_queue.len()); } pub fn ready(&mut self, token: Token, events: Ready) { diff --git a/lib/src/socket.rs b/lib/src/socket.rs index c7e7ce560..9846836b0 100644 --- a/lib/src/socket.rs +++ b/lib/src/socket.rs @@ -18,6 +18,8 @@ use rusty_ulid::Ulid; use socket2::{Domain, Protocol, Socket, Type}; use sozu_command::{config::MAX_LOOP_ITERATIONS, logging::ansi_palette}; +use crate::metrics::names; + #[derive(thiserror::Error, Debug)] pub enum ServerBindError { #[error("could not set bind to socket: {0}")] @@ -181,7 +183,7 @@ fn tcp_socket_read( "{} MAX_LOOP_ITERATION reached in TcpStream::socket_read", log_socket_module_prefix(stream, session_ulid, configured_peer) ); - incr!("socket.read.infinite_loop.error"); + incr!(names::socket::READ_INFINITE_LOOP_ERROR); return (size, SocketResult::Error); } if size == buf.len() { @@ -248,7 +250,7 @@ fn tcp_socket_write( "{} MAX_LOOP_ITERATION reached in TcpStream::socket_write", log_socket_module_prefix(stream, session_ulid, configured_peer) ); - incr!("socket.write.infinite_loop.error"); + incr!(names::socket::WRITE_INFINITE_LOOP_ERROR); return (size, SocketResult::Error); } if size == buf.len() { @@ -263,7 +265,7 @@ fn tcp_socket_write( | ErrorKind::ConnectionAborted | ErrorKind::BrokenPipe | ErrorKind::ConnectionRefused => { - incr!("tcp.write.error"); + incr!(names::tcp::WRITE_ERROR); return (size, SocketResult::Closed); } // Noisy-expected transport failures (see `tcp_socket_read` @@ -279,7 +281,7 @@ fn 
tcp_socket_write( log_socket_module_prefix(stream, session_ulid, configured_peer), e ); - incr!("tcp.write.error"); + incr!(names::tcp::WRITE_ERROR); return (size, SocketResult::Error); } _ => { @@ -289,7 +291,7 @@ fn tcp_socket_write( log_socket_module_prefix(stream, session_ulid, configured_peer), e ); - incr!("tcp.write.error"); + incr!(names::tcp::WRITE_ERROR); return (size, SocketResult::Error); } }, @@ -311,7 +313,7 @@ fn tcp_socket_write_vectored( | ErrorKind::ConnectionAborted | ErrorKind::BrokenPipe | ErrorKind::ConnectionRefused => { - incr!("tcp.write.error"); + incr!(names::tcp::WRITE_ERROR); (0, SocketResult::Closed) } // Noisy-expected transport failures (see `tcp_socket_read` for @@ -325,7 +327,7 @@ fn tcp_socket_write_vectored( log_socket_module_prefix(stream, session_ulid, configured_peer), e ); - incr!("tcp.write.error"); + incr!(names::tcp::WRITE_ERROR); (0, SocketResult::Error) } _ => { @@ -335,7 +337,7 @@ fn tcp_socket_write_vectored( log_socket_module_prefix(stream, session_ulid, configured_peer), e ); - incr!("tcp.write.error"); + incr!(names::tcp::WRITE_ERROR); (0, SocketResult::Error) } }, @@ -368,11 +370,11 @@ impl SocketHandler for TcpStream { } fn read_error(&self) { - incr!("tcp.read.error"); + incr!(names::tcp::READ_ERROR); } fn write_error(&self) { - incr!("tcp.write.error"); + incr!(names::tcp::WRITE_ERROR); } } @@ -452,11 +454,11 @@ impl SocketHandler for SessionTcpStream { } fn read_error(&self) { - incr!("tcp.read.error"); + incr!(names::tcp::READ_ERROR); } fn write_error(&self) { - incr!("tcp.write.error"); + incr!(names::tcp::WRITE_ERROR); } fn session_ulid(&self) -> Option { @@ -502,7 +504,7 @@ impl SocketHandler for FrontRustls { "{} MAX_LOOP_ITERATION reached in FrontRustls::socket_read", log_socket_context!(self) ); - incr!("rustls.read.infinite_loop.error"); + incr!(names::rustls::READ_INFINITE_LOOP_ERROR); is_error = true; break; } @@ -639,7 +641,7 @@ impl SocketHandler for FrontRustls { "{} MAX_LOOP_ITERATION reached in 
FrontRustls::socket_write", log_socket_context!(self) ); - incr!("rustls.write.infinite_loop.error"); + incr!(names::rustls::WRITE_INFINITE_LOOP_ERROR); is_error = true; break; } @@ -665,7 +667,7 @@ impl SocketHandler for FrontRustls { | ErrorKind::ConnectionAborted | ErrorKind::BrokenPipe => { //FIXME: this should probably not happen here - incr!("rustls.write.error"); + incr!(names::rustls::WRITE_ERROR); is_closed = true; self.peer_reset = true; break; @@ -676,7 +678,7 @@ impl SocketHandler for FrontRustls { log_socket_context!(self), e ); - incr!("rustls.write.error"); + incr!(names::rustls::WRITE_ERROR); is_error = true; break; } @@ -698,7 +700,7 @@ impl SocketHandler for FrontRustls { ErrorKind::ConnectionReset | ErrorKind::ConnectionAborted | ErrorKind::BrokenPipe => { - incr!("rustls.write.error"); + incr!(names::rustls::WRITE_ERROR); is_closed = true; self.peer_reset = true; break; @@ -709,7 +711,7 @@ impl SocketHandler for FrontRustls { log_socket_context!(self), e ); - incr!("rustls.write.error"); + incr!(names::rustls::WRITE_ERROR); is_error = true; break; } @@ -736,7 +738,7 @@ impl SocketHandler for FrontRustls { ErrorKind::ConnectionReset | ErrorKind::ConnectionAborted | ErrorKind::BrokenPipe => { - incr!("rustls.write.error"); + incr!(names::rustls::WRITE_ERROR); is_closed = true; self.peer_reset = true; break; @@ -747,7 +749,7 @@ impl SocketHandler for FrontRustls { log_socket_context!(self), e ); - incr!("rustls.write.error"); + incr!(names::rustls::WRITE_ERROR); is_error = true; break; } @@ -802,7 +804,7 @@ impl SocketHandler for FrontRustls { "{} MAX_LOOP_ITERATION reached in FrontRustls::socket_write_vectored", log_socket_context!(self) ); - incr!("rustls.write.infinite_loop.error"); + incr!(names::rustls::WRITE_INFINITE_LOOP_ERROR); is_error = true; break; } @@ -829,7 +831,7 @@ impl SocketHandler for FrontRustls { ErrorKind::ConnectionReset | ErrorKind::ConnectionAborted | ErrorKind::BrokenPipe => { - incr!("rustls.write.error"); + 
incr!(names::rustls::WRITE_ERROR); is_closed = true; self.peer_reset = true; break; @@ -840,7 +842,7 @@ impl SocketHandler for FrontRustls { log_socket_context!(self), e ); - incr!("rustls.write.error"); + incr!(names::rustls::WRITE_ERROR); is_error = true; break; } @@ -865,7 +867,7 @@ impl SocketHandler for FrontRustls { ErrorKind::ConnectionReset | ErrorKind::ConnectionAborted | ErrorKind::BrokenPipe => { - incr!("rustls.write.error"); + incr!(names::rustls::WRITE_ERROR); is_closed = true; self.peer_reset = true; break; @@ -876,7 +878,7 @@ impl SocketHandler for FrontRustls { log_socket_context!(self), e ); - incr!("rustls.write.error"); + incr!(names::rustls::WRITE_ERROR); is_error = true; break; } @@ -900,7 +902,7 @@ impl SocketHandler for FrontRustls { ErrorKind::ConnectionReset | ErrorKind::ConnectionAborted | ErrorKind::BrokenPipe => { - incr!("rustls.write.error"); + incr!(names::rustls::WRITE_ERROR); is_closed = true; self.peer_reset = true; break; @@ -911,7 +913,7 @@ impl SocketHandler for FrontRustls { log_socket_context!(self), e ); - incr!("rustls.write.error"); + incr!(names::rustls::WRITE_ERROR); is_error = true; break; } @@ -933,7 +935,7 @@ impl SocketHandler for FrontRustls { ErrorKind::ConnectionReset | ErrorKind::ConnectionAborted | ErrorKind::BrokenPipe => { - incr!("rustls.write.error"); + incr!(names::rustls::WRITE_ERROR); is_closed = true; self.peer_reset = true; break; @@ -944,7 +946,7 @@ impl SocketHandler for FrontRustls { log_socket_context!(self), e ); - incr!("rustls.write.error"); + incr!(names::rustls::WRITE_ERROR); is_error = true; break; } @@ -998,11 +1000,11 @@ impl SocketHandler for FrontRustls { } fn read_error(&self) { - incr!("rustls.read.error"); + incr!(names::rustls::READ_ERROR); } fn write_error(&self) { - incr!("rustls.write.error"); + incr!(names::rustls::WRITE_ERROR); } fn session_ulid(&self) -> Option { diff --git a/lib/src/tcp.rs b/lib/src/tcp.rs index 9e783c9ae..26b79b677 100644 --- a/lib/src/tcp.rs +++ 
b/lib/src/tcp.rs @@ -21,6 +21,7 @@ use sozu_command::{ proto::command::request::RequestType, }; +use crate::metrics::names; use crate::{ AcceptError, BackendConnectAction, BackendConnectionError, BackendConnectionStatus, CachedTags, ListenerError, ListenerHandler, Protocol, ProxyConfiguration, ProxyError, ProxySession, @@ -161,7 +162,7 @@ impl TcpSession { let state = match proxy_protocol { Some(ProxyProtocolConfig::RelayHeader) => { backend_buffer_session = Some(backend_buffer); - gauge_add!("protocol.proxy.relay", 1); + gauge_add!(names::protocol::PROXY_RELAY, 1); TcpStateMachine::RelayProxyProtocol(RelayProxyProtocol::new( socket, frontend_token, @@ -173,7 +174,7 @@ impl TcpSession { Some(ProxyProtocolConfig::ExpectHeader) => { frontend_buffer_session = Some(frontend_buffer); backend_buffer_session = Some(backend_buffer); - gauge_add!("protocol.proxy.expect", 1); + gauge_add!(names::protocol::PROXY_EXPECT, 1); TcpStateMachine::ExpectProxyProtocol(ExpectProxyProtocol::new( container_frontend_timeout.clone(), socket, @@ -184,7 +185,7 @@ impl TcpSession { Some(ProxyProtocolConfig::SendHeader) => { frontend_buffer_session = Some(frontend_buffer); backend_buffer_session = Some(backend_buffer); - gauge_add!("protocol.proxy.send", 1); + gauge_add!(names::protocol::PROXY_SEND, 1); TcpStateMachine::SendProxyProtocol(SendProxyProtocol::new( socket, frontend_token, @@ -193,7 +194,7 @@ impl TcpSession { )) } None => { - gauge_add!("protocol.tcp", 1); + gauge_add!(names::protocol::TCP, 1); let mut pipe = Pipe::new( backend_buffer, backend_id.clone(), @@ -271,7 +272,7 @@ impl TcpSession { let context = self.log_context(); self.metrics.register_end_of_session(&context); info_access!( - on_failure: { incr!("unsent-access-logs") }, + on_failure: { incr!(names::access_logs::UNSENT) }, message: None, context, session_address: self.frontend_address, @@ -433,8 +434,8 @@ impl TcpSession { ); pipe.set_cluster_id(self.cluster_id.clone()); - gauge_add!("protocol.proxy.send", -1); - 
gauge_add!("protocol.tcp", 1); + gauge_add!(names::protocol::PROXY_SEND, -1); + gauge_add!(names::protocol::TCP, 1); return Some(TcpStateMachine::Pipe(pipe)); } @@ -450,8 +451,8 @@ impl TcpSession { let mut pipe = rpp.into_pipe(self.backend_buffer.take().unwrap(), self.listener.clone()); pipe.set_cluster_id(self.cluster_id.clone()); - gauge_add!("protocol.proxy.relay", -1); - gauge_add!("protocol.tcp", 1); + gauge_add!(names::protocol::PROXY_RELAY, -1); + gauge_add!(names::protocol::TCP, 1); return Some(TcpStateMachine::Pipe(pipe)); } @@ -476,8 +477,8 @@ impl TcpSession { ); pipe.set_cluster_id(self.cluster_id.clone()); - gauge_add!("protocol.proxy.expect", -1); - gauge_add!("protocol.tcp", 1); + gauge_add!(names::protocol::PROXY_EXPECT, -1); + gauge_add!(names::protocol::TCP, 1); return Some(TcpStateMachine::Pipe(pipe)); } @@ -555,9 +556,9 @@ impl TcpSession { self.backend_connected = status; if status == BackendConnectionStatus::Connected { - gauge_add!("backend.connections", 1); + gauge_add!(names::backend::CONNECTIONS, 1); gauge_add!( - "connections_per_backend", + names::backend::CONNECTIONS_PER_BACKEND, 1, self.cluster_id.as_deref(), self.metrics.backend_id.as_deref() @@ -583,7 +584,7 @@ impl TcpSession { self.metrics.backend_id.as_deref() ); gauge!( - "backend.available", + names::backend::AVAILABLE, 1, self.cluster_id.as_deref(), self.metrics.backend_id.as_deref() @@ -599,6 +600,7 @@ impl TcpSession { backend_id: Some(backend.backend_id.to_owned()), address: Some(backend.address.into()), cluster_id: None, + metric_detail: None, }); } @@ -646,7 +648,7 @@ impl TcpSession { self.metrics.backend_id.as_deref() ); gauge!( - "backend.available", + names::backend::AVAILABLE, 0, self.cluster_id.as_deref(), self.metrics.backend_id.as_deref() @@ -657,6 +659,7 @@ impl TcpSession { backend_id: Some(backend.backend_id.to_owned()), address: Some(backend.address.into()), cluster_id: None, + metric_detail: None, }); } } @@ -851,7 +854,7 @@ impl TcpSession { 
MAX_LOOP_ITERATIONS ); - incr!("tcp.infinite_loop.error"); + incr!(names::tcp::INFINITE_LOOP_ERROR); let front_interest = self.front_readiness().interest & self.front_readiness().event; let back_interest = self @@ -922,9 +925,9 @@ impl TcpSession { } if back_connected == BackendConnectionStatus::Connected { - gauge_add!("backend.connections", -1); + gauge_add!(names::backend::CONNECTIONS, -1); gauge_add!( - "connections_per_backend", + names::backend::CONNECTIONS_PER_BACKEND, -1, self.cluster_id.as_deref(), self.metrics.backend_id.as_deref() @@ -1083,18 +1086,18 @@ impl ProxySession for TcpSession { // Restore gauges match self.state.marker() { - StateMarker::Pipe => gauge_add!("protocol.tcp", -1), - StateMarker::SendProxyProtocol => gauge_add!("protocol.proxy.send", -1), - StateMarker::RelayProxyProtocol => gauge_add!("protocol.proxy.relay", -1), - StateMarker::ExpectProxyProtocol => gauge_add!("protocol.proxy.expect", -1), + StateMarker::Pipe => gauge_add!(names::protocol::TCP, -1), + StateMarker::SendProxyProtocol => gauge_add!(names::protocol::PROXY_SEND, -1), + StateMarker::RelayProxyProtocol => gauge_add!(names::protocol::PROXY_RELAY, -1), + StateMarker::ExpectProxyProtocol => gauge_add!(names::protocol::PROXY_EXPECT, -1), } if self.state.failed() { match self.state.marker() { - StateMarker::Pipe => incr!("tcp.upgrade.pipe.failed"), - StateMarker::SendProxyProtocol => incr!("tcp.upgrade.send.failed"), - StateMarker::RelayProxyProtocol => incr!("tcp.upgrade.relay.failed"), - StateMarker::ExpectProxyProtocol => incr!("tcp.upgrade.expect.failed"), + StateMarker::Pipe => incr!(names::tcp::UPGRADE_PIPE_FAILED), + StateMarker::SendProxyProtocol => incr!(names::tcp::UPGRADE_SEND_FAILED), + StateMarker::RelayProxyProtocol => incr!(names::tcp::UPGRADE_RELAY_FAILED), + StateMarker::ExpectProxyProtocol => incr!(names::tcp::UPGRADE_EXPECT_FAILED), } return; } @@ -1680,7 +1683,7 @@ impl ProxyConfiguration for TcpProxy { "{} Buffer capacity has been reached, stopping to 
accept new connections for now", log_module_context!() ); - gauge!("accept_queue.backpressure", 1); + gauge!(names::accept_queue::BACKPRESSURE, 1); self.sessions.borrow_mut().can_accept = false; return Err(AcceptError::BufferCapacityReached); @@ -1743,7 +1746,7 @@ impl ProxyConfiguration for TcpProxy { frontend_sock, wait_time, ); - incr!("tcp.requests"); + incr!(names::tcp::REQUESTS); let session = Rc::new(RefCell::new(session)); entry.insert(session); diff --git a/lib/src/tls.rs b/lib/src/tls.rs index 64f66a2fd..2fe84e1b8 100644 --- a/lib/src/tls.rs +++ b/lib/src/tls.rs @@ -8,13 +8,12 @@ use std::collections::HashSet; use std::{ collections::HashMap, fmt, - io::BufReader, str::FromStr, sync::{Arc, LazyLock, Mutex}, }; use rustls::{ - pki_types::{CertificateDer, PrivateKeyDer}, + pki_types::{CertificateDer, PrivateKeyDer, pem::PemObject}, server::{ClientHello, ResolvesServerCert}, sign::CertifiedKey, }; @@ -30,6 +29,7 @@ use sozu_command::{ proto::command::{AddCertificate, CertificateAndKey, ReplaceCertificate, SocketAddress}, }; +use crate::metrics::names; use crate::router::pattern_trie::{Key, KeyValue, TrieNode}; /// Module-level prefix used on every log line emitted from this module. @@ -155,21 +155,16 @@ impl TryFrom<&AddCertificate> for CertifiedKeyWrapper { ); } - let mut key_reader = BufReader::new(cert.key.as_bytes()); - - let item = match rustls_pemfile::read_one(&mut key_reader) - .map_err(|_| CertificateResolverError::EmptyKeys)? - { - Some(item) => item, - None => return Err(CertificateResolverError::EmptyKeys), - }; - - let private_key = match item { - rustls_pemfile::Item::Pkcs1Key(rsa_key) => PrivateKeyDer::from(rsa_key), - rustls_pemfile::Item::Pkcs8Key(pkcs8_key) => PrivateKeyDer::from(pkcs8_key), - rustls_pemfile::Item::Sec1Key(ec_key) => PrivateKeyDer::from(ec_key), - _ => return Err(CertificateResolverError::EmptyKeys), - }; + // Parse the PEM-encoded private key into a `PrivateKeyDer` via + // `rustls-pki-types`'s `PemObject` trait. 
`from_pem_slice` accepts + // PKCS1 / PKCS8 / SEC1 key formats the same way the old + // `rustls-pemfile::read_one` + per-variant `From::from` chain did, + // and folds the empty-input / no-PEM-object / unsupported-format + // cases into a single `Err` we surface as `EmptyKeys` (the + // existing variant covers any failure to extract a key from the + // supplied PEM blob). + let private_key = PrivateKeyDer::from_pem_slice(cert.key.as_bytes()) + .map_err(|_| CertificateResolverError::EmptyKeys)?; match any_supported_type(&private_key) { Ok(signing_key) => { @@ -237,7 +232,7 @@ impl CertificateResolver { return; }; let clamped = min_expiration.max(0) as usize; - gauge!("tls.cert.min_expires_at_seconds", clamped); + gauge!(names::tls::CERT_MIN_EXPIRES_AT_SECONDS, clamped); } /// persist a certificate, after ensuring validity, and checking if it can replace another certificate. @@ -529,7 +524,7 @@ impl ResolvesServerCert for MutexCertificateResolver { log_module_context!(), name ); - incr!("tls.default_cert_used"); + incr!(names::tls::DEFAULT_CERT_USED); DEFAULT_CERTIFICATE.clone() } }