From 936a327b5495ebb796f3041db7ae2c2568e488ba Mon Sep 17 00:00:00 2001 From: Zeeshan Lakhani Date: Thu, 26 Mar 2026 14:08:05 +0000 Subject: [PATCH 1/8] [multicast] Bump maghemite and OPTE dependencies + packaging for non-main builds Bump maghemite and OPTE to versions with the latest multicast support. OPTE now has the option to be installed via p5p package override from buildomat rather than directly downloading xde/opteadm binaries. The override mechanism (tools/opte_version_override) is sourced and packaged for use with install_opte.sh, deploy.sh, releng, and CI to install the unpublished OPTE build until it lands in the helios pkg repo. Note: CI check added to reject OPTE_COMMIT override on PRs targeting main. --- .github/buildomat/jobs/deploy.sh | 31 ++++---- .github/buildomat/jobs/package.sh | 2 + .github/workflows/check-opte-ver.yml | 15 ++++ Cargo.lock | 99 ++++++++++++----------- Cargo.toml | 10 +-- dev-tools/releng/src/main.rs | 113 +++++++++++++++++++++++++-- package-manifest.toml | 12 +-- tools/install_opte.sh | 60 +++++++++----- tools/maghemite_ddm_openapi_version | 2 +- tools/maghemite_mg_openapi_version | 2 +- tools/maghemite_mgd_checksums | 4 +- tools/opte_version | 2 +- tools/opte_version_override | 19 ++++- workspace-hack/Cargo.toml | 4 +- 14 files changed, 266 insertions(+), 109 deletions(-) diff --git a/.github/buildomat/jobs/deploy.sh b/.github/buildomat/jobs/deploy.sh index d8065e058c3..a9c42b7abc2 100755 --- a/.github/buildomat/jobs/deploy.sh +++ b/.github/buildomat/jobs/deploy.sh @@ -33,9 +33,6 @@ _exit_trap() { local status=$? 
set +o errexit - if [[ "x$OPTE_COMMIT" != "x" ]]; then - pfexec cp /tmp/opteadm /opt/oxide/opte/bin/opteadm - fi # # Stop cron in all zones (to stop logadm log rotation) @@ -134,19 +131,6 @@ z_swadm () { pfexec zlogin oxz_switch /opt/oxide/dendrite/bin/swadm $@ } -# only set this if you want to override the version of opte/xde installed by the -# install_opte.sh script -OPTE_COMMIT="" -if [[ "x$OPTE_COMMIT" != "x" ]]; then - curl -sSfOL https://buildomat.eng.oxide.computer/public/file/oxidecomputer/opte/module/$OPTE_COMMIT/xde - pfexec rem_drv xde || true - pfexec mv xde /kernel/drv/amd64/xde - pfexec add_drv xde || true - curl -sSfOL https://buildomat.eng.oxide.computer/public/file/oxidecomputer/opte/release/$OPTE_COMMIT/opteadm - chmod +x opteadm - cp opteadm /tmp/opteadm - pfexec mv opteadm /opt/oxide/opte/bin/opteadm -fi # # XXX work around 14537 (UFS should not allow directories to be unlinked) which @@ -197,6 +181,21 @@ ptime -m tar xvzf /input/package/work/package.tar.gz # shellcheck source=/dev/null source .github/buildomat/ci-env.sh +# Source the OPTE override (if any) from the canonical location and apply it. +# When set, install the override p5p from buildomat instead of using the +# version baked into the ramdisk image. The version must be pinned explicitly +# because IPS version ordering does not match semver. +# shellcheck source=/dev/null +source tools/opte_version_override +if [[ "x$OPTE_COMMIT" != "x" ]]; then + OPTE_VERSION="$(cat tools/opte_version)" + P5P_URL="https://buildomat.eng.oxide.computer/public/file/oxidecomputer/opte/repo/$OPTE_COMMIT/opte.p5p" + P5P_PATH="/tmp/opte-override.p5p" + curl -sSfL -o "$P5P_PATH" "$P5P_URL" + pfexec pkg install -g "$P5P_PATH" "driver/network/opte@$OPTE_VERSION" + rm -f "$P5P_PATH" +fi + # Ask buildomat for the range of extra addresses that we're allowed to use, and # break them up into the ranges we need. 
diff --git a/.github/buildomat/jobs/package.sh b/.github/buildomat/jobs/package.sh index b43b91e9ec4..78df41dc5f6 100755 --- a/.github/buildomat/jobs/package.sh +++ b/.github/buildomat/jobs/package.sh @@ -60,5 +60,7 @@ files=( target/release/xtask target/debug/bootstrap tests/* + tools/opte_version + tools/opte_version_override ) ptime -m tar cvzf $WORK/package.tar.gz "${files[@]}" "${packages[@]}" diff --git a/.github/workflows/check-opte-ver.yml b/.github/workflows/check-opte-ver.yml index e516eeacbe6..4cad1f04ef8 100644 --- a/.github/workflows/check-opte-ver.yml +++ b/.github/workflows/check-opte-ver.yml @@ -5,6 +5,7 @@ on: - '.github/workflows/check-opte-ver.yml' - 'Cargo.toml' - 'tools/opte_version' + - 'tools/opte_version_override' jobs: check-opte-ver: runs-on: ubuntu-22.04 @@ -18,3 +19,17 @@ jobs: run: cargo install toml-cli@0.2.3 - name: Check OPTE version and rev match run: ./tools/ci_check_opte_ver.sh + check-opte-override: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + ref: ${{ github.event.pull_request.head.sha }} # see omicron#4461 + - name: Reject OPTE override on main + run: | + source tools/opte_version_override + if [[ "x$OPTE_COMMIT" != "x" ]]; then + echo "::error::OPTE_COMMIT is set in tools/opte_version_override." + echo "::error::The OPTE override must be cleared before merging to main." 
+ exit 1 + fi diff --git a/Cargo.lock b/Cargo.lock index 3e902cfe04b..a5aeb263438 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1696,7 +1696,7 @@ version = "3.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "faf9468729b8cbcea668e36183cb69d317348c2e08e994829fb56ebfdfbaac34" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.48.0", ] [[package]] @@ -2513,7 +2513,7 @@ dependencies = [ [[package]] name = "ddm-admin-client" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/maghemite?rev=b603c9f3dccefcf1d3e941c04505ff6bdd1826b6#b603c9f3dccefcf1d3e941c04505ff6bdd1826b6" +source = "git+https://github.com/oxidecomputer/maghemite?branch=multicast-e2e#11786c0d91beca4ae6f51aedfc7079fac71b3461" dependencies = [ "oxnet", "progenitor 0.13.0", @@ -2925,11 +2925,11 @@ dependencies = [ [[package]] name = "dlpi" version = "0.2.0" -source = "git+https://github.com/oxidecomputer/dlpi-sys#d9645f8d61187e76384474b1100f6537fb644993" +source = "git+https://github.com/oxidecomputer/dlpi-sys#7cce2d3ab9dcac909642e1d1060f27bb2549cfdc" dependencies = [ "libc", "libdlpi-sys", - "num_enum 0.7.5", + "num_enum 0.7.6", "pretty-hex", "thiserror 2.0.18", ] @@ -3534,7 +3534,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -4993,7 +4993,7 @@ dependencies = [ "libc", "percent-encoding", "pin-project-lite", - "socket2 0.6.2", + "socket2 0.5.10", "system-configuration", "tokio", "tower-layer", @@ -5188,7 +5188,7 @@ dependencies = [ [[package]] name = "illumos-sys-hdrs" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/opte?rev=e547d07b08c3f3d6c821c9eb7a958adcffce6e56#e547d07b08c3f3d6c821c9eb7a958adcffce6e56" +source = "git+https://github.com/oxidecomputer/opte?rev=c570ac2126dbbebbd8e98e73b580c5be6b7e460e#c570ac2126dbbebbd8e98e73b580c5be6b7e460e" 
dependencies = [ "bitflags 2.11.0", ] @@ -5658,7 +5658,7 @@ checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46" dependencies = [ "hermit-abi 0.5.2", "libc", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -5735,7 +5735,7 @@ dependencies = [ "portable-atomic", "portable-atomic-util", "serde_core", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -5870,7 +5870,7 @@ dependencies = [ [[package]] name = "kstat-macro" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/opte?rev=e547d07b08c3f3d6c821c9eb7a958adcffce6e56#e547d07b08c3f3d6c821c9eb7a958adcffce6e56" +source = "git+https://github.com/oxidecomputer/opte?rev=c570ac2126dbbebbd8e98e73b580c5be6b7e460e#c570ac2126dbbebbd8e98e73b580c5be6b7e460e" dependencies = [ "quote", "syn 2.0.117", @@ -5944,7 +5944,7 @@ checksum = "bcc35a38544a891a5f7c865aca548a982ccb3b8650a5b06d0fd33a10283c56fc" [[package]] name = "libdlpi-sys" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/dlpi-sys#d9645f8d61187e76384474b1100f6537fb644993" +source = "git+https://github.com/oxidecomputer/dlpi-sys#7cce2d3ab9dcac909642e1d1060f27bb2549cfdc" [[package]] name = "libefi-illumos" @@ -6066,14 +6066,14 @@ checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" [[package]] name = "libnet" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/netadm-sys?branch=main#1e40efd8497973ef3b7d6f0285510424f53f43c5" +source = "git+https://github.com/oxidecomputer/netadm-sys?branch=main#e9bf1e519ce304bf9cc6e87b50b9c64a77c8b0c1" dependencies = [ "anyhow", "cfg-if", "colored 3.1.1", "dlpi", "libc", - "num_enum 0.7.5", + "num_enum 0.7.6", "nvpair", "nvpair-sys", "oxnet", @@ -6082,7 +6082,7 @@ dependencies = [ "socket2 0.6.2", "thiserror 2.0.18", "tracing", - "winnow 0.7.14", + "winnow 1.0.0", ] [[package]] @@ -6460,7 +6460,7 @@ dependencies = [ [[package]] name = "mg-admin-client" version = "0.1.0" -source = 
"git+https://github.com/oxidecomputer/maghemite?rev=b603c9f3dccefcf1d3e941c04505ff6bdd1826b6#b603c9f3dccefcf1d3e941c04505ff6bdd1826b6" +source = "git+https://github.com/oxidecomputer/maghemite?branch=multicast-e2e#11786c0d91beca4ae6f51aedfc7079fac71b3461" dependencies = [ "chrono", "colored 3.1.1", @@ -7901,7 +7901,7 @@ version = "0.50.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -8033,11 +8033,11 @@ dependencies = [ [[package]] name = "num_enum" -version = "0.7.5" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1207a7e20ad57b847bbddc6776b968420d38292bbfe2089accff5e19e82454c" +checksum = "5d0bca838442ec211fa11de3a8b0e0e8f3a4522575b5c4c06ed722e005036f26" dependencies = [ - "num_enum_derive 0.7.5", + "num_enum_derive 0.7.6", "rustversion", ] @@ -8055,11 +8055,11 @@ dependencies = [ [[package]] name = "num_enum_derive" -version = "0.7.5" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff32365de1b6743cb203b710788263c44a03de03802daf96092f2da4fe6ba4d7" +checksum = "680998035259dcfcafe653688bf2aa6d3e2dc05e98be6ab46afb089dc84f1df8" dependencies = [ - "proc-macro-crate 3.4.0", + "proc-macro-crate 1.3.1", "proc-macro2", "quote", "syn 2.0.117", @@ -9376,7 +9376,7 @@ dependencies = [ "uuid", "vergen", "vergen-lib", - "winnow 0.7.14", + "winnow 0.7.15", "x509-cert", "zerocopy 0.8.40", "zeroize", @@ -9521,7 +9521,7 @@ dependencies = [ [[package]] name = "opte" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/opte?rev=e547d07b08c3f3d6c821c9eb7a958adcffce6e56#e547d07b08c3f3d6c821c9eb7a958adcffce6e56" +source = "git+https://github.com/oxidecomputer/opte?rev=c570ac2126dbbebbd8e98e73b580c5be6b7e460e#c570ac2126dbbebbd8e98e73b580c5be6b7e460e" dependencies = [ "bitflags 2.11.0", "dyn-clone", 
@@ -9540,7 +9540,7 @@ dependencies = [ [[package]] name = "opte-api" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/opte?rev=e547d07b08c3f3d6c821c9eb7a958adcffce6e56#e547d07b08c3f3d6c821c9eb7a958adcffce6e56" +source = "git+https://github.com/oxidecomputer/opte?rev=c570ac2126dbbebbd8e98e73b580c5be6b7e460e#c570ac2126dbbebbd8e98e73b580c5be6b7e460e" dependencies = [ "illumos-sys-hdrs", "ingot", @@ -9553,7 +9553,7 @@ dependencies = [ [[package]] name = "opte-ioctl" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/opte?rev=e547d07b08c3f3d6c821c9eb7a958adcffce6e56#e547d07b08c3f3d6c821c9eb7a958adcffce6e56" +source = "git+https://github.com/oxidecomputer/opte?rev=c570ac2126dbbebbd8e98e73b580c5be6b7e460e#c570ac2126dbbebbd8e98e73b580c5be6b7e460e" dependencies = [ "libc", "libnet", @@ -9650,7 +9650,7 @@ dependencies = [ [[package]] name = "oxide-vpc" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/opte?rev=e547d07b08c3f3d6c821c9eb7a958adcffce6e56#e547d07b08c3f3d6c821c9eb7a958adcffce6e56" +source = "git+https://github.com/oxidecomputer/opte?rev=c570ac2126dbbebbd8e98e73b580c5be6b7e460e#c570ac2126dbbebbd8e98e73b580c5be6b7e460e" dependencies = [ "cfg-if", "illumos-sys-hdrs", @@ -10507,7 +10507,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9cd31dcfdbbd7431a807ef4df6edd6473228e94d5c805e8cf671227a21bad068" dependencies = [ "anyhow", - "itertools 0.14.0", + "itertools 0.12.1", "proc-macro2", "quote", "rand 0.8.5", @@ -11435,7 +11435,7 @@ dependencies = [ "quinn-udp", "rustc-hash", "rustls 0.23.37", - "socket2 0.6.2", + "socket2 0.5.10", "thiserror 2.0.18", "tokio", "tracing", @@ -11473,9 +11473,9 @@ dependencies = [ "cfg_aliases 0.2.1", "libc", "once_cell", - "socket2 0.6.2", + "socket2 0.5.10", "tracing", - "windows-sys 0.60.2", + "windows-sys 0.52.0", ] [[package]] @@ -11737,7 +11737,7 @@ dependencies = [ [[package]] name = "rdb-types" version = "0.1.0" -source = 
"git+https://github.com/oxidecomputer/maghemite?rev=b603c9f3dccefcf1d3e941c04505ff6bdd1826b6#b603c9f3dccefcf1d3e941c04505ff6bdd1826b6" +source = "git+https://github.com/oxidecomputer/maghemite?branch=multicast-e2e#11786c0d91beca4ae6f51aedfc7079fac71b3461" dependencies = [ "oxnet", "schemars 0.8.22", @@ -12390,7 +12390,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys 0.11.0", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -12493,7 +12493,7 @@ dependencies = [ "security-framework", "security-framework-sys", "webpki-root-certs", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -13933,7 +13933,7 @@ version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c1c97747dbf44bb1ca44a561ece23508e99cb592e862f22222dcf42f51d1e451" dependencies = [ - "heck 0.5.0", + "heck 0.4.1", "proc-macro2", "quote", "syn 2.0.117", @@ -14570,7 +14570,7 @@ dependencies = [ "getrandom 0.4.1", "once_cell", "rustix 1.1.3", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -14590,7 +14590,7 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d8c27177b12a6399ffc08b98f76f7c9a1f4fe9fc967c784c5a071fa8d93cf7e1" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -15103,7 +15103,7 @@ dependencies = [ "toml_datetime 0.7.5+spec-1.1.0", "toml_parser", "toml_writer", - "winnow 0.7.14", + "winnow 0.7.15", ] [[package]] @@ -15118,7 +15118,7 @@ dependencies = [ "toml_datetime 1.0.0+spec-1.1.0", "toml_parser", "toml_writer", - "winnow 0.7.14", + "winnow 0.7.15", ] [[package]] @@ -15172,7 +15172,7 @@ dependencies = [ "serde_spanned 0.6.9", "toml_datetime 0.6.11", "toml_write", - "winnow 0.7.14", + "winnow 0.7.15", ] [[package]] @@ -15184,7 +15184,7 @@ dependencies = [ "indexmap 2.13.0", "toml_datetime 0.7.5+spec-1.1.0", "toml_parser", - "winnow 0.7.14", + "winnow 0.7.15", ] [[package]] @@ -15193,7 +15193,7 @@ version = "1.0.9+spec-1.1.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "702d4415e08923e7e1ef96cd5727c0dfed80b4d2fa25db9647fe5eb6f7c5a4c4" dependencies = [ - "winnow 0.7.14", + "winnow 0.7.15", ] [[package]] @@ -16908,7 +16908,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.48.0", ] [[package]] @@ -17304,9 +17304,18 @@ dependencies = [ [[package]] name = "winnow" -version = "0.7.14" +version = "0.7.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945" +dependencies = [ + "memchr", +] + +[[package]] +name = "winnow" +version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a5364e9d77fcdeeaa6062ced926ee3381faa2ee02d3eb83a5c27a8825540829" +checksum = "a90e88e4667264a994d34e6d1ab2d26d398dcdca8b7f52bec8668957517fc7d8" dependencies = [ "memchr", ] diff --git a/Cargo.toml b/Cargo.toml index 0e2fbafb1fd..a9c0b3b8302 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -589,8 +589,8 @@ ntp-admin-api = { path = "ntp-admin/api" } ntp-admin-client = { path = "clients/ntp-admin-client" } ntp-admin-types = { path = "ntp-admin/types" } ntp-admin-types-versions = { path = "ntp-admin/types/versions" } -mg-admin-client = { git = "https://github.com/oxidecomputer/maghemite", rev = "b603c9f3dccefcf1d3e941c04505ff6bdd1826b6" } -ddm-admin-client = { git = "https://github.com/oxidecomputer/maghemite", rev = "b603c9f3dccefcf1d3e941c04505ff6bdd1826b6" } +mg-admin-client = { git = "https://github.com/oxidecomputer/maghemite", branch = "multicast-e2e" } +ddm-admin-client = { git = "https://github.com/oxidecomputer/maghemite", branch = "multicast-e2e" } multimap = "0.10.1" nexus-auth = { path = "nexus/auth" } nexus-background-task-interface = { path = "nexus/background-task-interface" } @@ -652,7 
+652,7 @@ omicron-workspace-hack = "0.1.0" omicron-zone-package = "0.12.2" oxide-client = { path = "clients/oxide-client" } oxide-tokio-rt = "0.1.2" -oxide-vpc = { git = "https://github.com/oxidecomputer/opte", rev = "e547d07b08c3f3d6c821c9eb7a958adcffce6e56", features = [ "api", "std" ] } +oxide-vpc = { git = "https://github.com/oxidecomputer/opte", rev = "c570ac2126dbbebbd8e98e73b580c5be6b7e460e", features = [ "api", "std" ] } oxlog = { path = "dev-tools/oxlog" } oxnet = "0.1.4" once_cell = "1.21.3" @@ -661,7 +661,7 @@ openapiv3 = "2.2.0" # must match samael's crate! openssl = "0.10" openssl-sys = "0.9" -opte-ioctl = { git = "https://github.com/oxidecomputer/opte", rev = "e547d07b08c3f3d6c821c9eb7a958adcffce6e56" } +opte-ioctl = { git = "https://github.com/oxidecomputer/opte", rev = "c570ac2126dbbebbd8e98e73b580c5be6b7e460e" } oso = "0.27" owo-colors = "4.2.2" oximeter = { path = "oximeter/oximeter" } @@ -726,7 +726,7 @@ rats-corim = { git = "https://github.com/oxidecomputer/rats-corim.git", rev = "f raw-cpuid = { git = "https://github.com/oxidecomputer/rust-cpuid.git", rev = "a4cf01df76f35430ff5d39dc2fe470bcb953503b" } rayon = "1.10" rcgen = "0.12.1" -rdb-types = { git = "https://github.com/oxidecomputer/maghemite", rev = "b603c9f3dccefcf1d3e941c04505ff6bdd1826b6" } +rdb-types = { git = "https://github.com/oxidecomputer/maghemite", branch = "multicast-e2e" } reconfigurator-cli = { path = "dev-tools/reconfigurator-cli" } reedline = "0.40.0" ref-cast = "1.0" diff --git a/dev-tools/releng/src/main.rs b/dev-tools/releng/src/main.rs index b84c3d7c11c..7149f5198ac 100644 --- a/dev-tools/releng/src/main.rs +++ b/dev-tools/releng/src/main.rs @@ -270,6 +270,18 @@ async fn main() -> Result<()> { let opte_version = fs::read_to_string(WORKSPACE_DIR.join("tools/opte_version")).await?; + // Parse tools/opte_version_override for OPTE_COMMIT. 
When set, we + // download the override p5p from buildomat and use it as a package + // source during image build instead of the helios pkg repo version. + let opte_override = parse_opte_version_override( + &WORKSPACE_DIR.join("tools/opte_version_override"), + ) + .await?; + if let Some(ov) = &opte_override { + info!(logger, "OPTE override active: commit={}", ov.commit); + } + let opte_version = opte_version.trim(); + let client = reqwest::ClientBuilder::new() .connect_timeout(Duration::from_secs(15)) .timeout(Duration::from_secs(120)) @@ -617,7 +629,7 @@ async fn main() -> Result<()> { .arg("-o") // output directory for image .arg(args.output_dir.join(format!("os-{}", target))) .arg("-F") // pass extra image builder features - .arg(format!("optever={}", opte_version.trim())) + .arg(format!("optever={}", opte_version)) .arg("-P") // include all files from extra proto area .arg(proto_dir.join("root")) .arg("-N") // image name @@ -675,11 +687,33 @@ async fn main() -> Result<()> { .arg(format!("helios-dev={HELIOS_PKGREPO}")) } - // helios-build experiment-image - jobs.push_command(format!("{}-image", target), image_cmd) - .after("helios-setup") - .after("helios-incorp") - .after(format!("{}-proto", target)); + // When OPTE_COMMIT is set, download the override p5p from buildomat + // and add it as a package source for the image build. 
+ if let Some(ov) = &opte_override { + let p5p_path = tempdir.path().join(format!("opte-{}.p5p", target)); + let commit = ov.commit.clone(); + let dest = p5p_path.clone(); + let cl = client.clone(); + let log = logger.clone(); + jobs.push( + format!("{target}-opte-p5p"), + download_opte_p5p(log, cl, commit, dest), + ); + + image_cmd = image_cmd + .arg("-p") + .arg(format!("helios-dev=file://{}", p5p_path,)); + + jobs.push_command(format!("{target}-image"), image_cmd) + .after("helios-setup") + .after("helios-incorp") + .after(format!("{target}-opte-p5p")); + } else { + jobs.push_command(format!("{target}-image"), image_cmd) + .after("helios-setup") + .after("helios-incorp") + .after(format!("{target}-proto")); + } } // Build the recovery target after we build the host target. Only one // of these will build at a time since Cargo locks its target directory; @@ -887,6 +921,73 @@ async fn build_proto_area( Ok(()) } +/// Parsed contents of `tools/opte_version_override` when an override is active. +struct OpteOverride { + commit: String, +} + +/// Parse `tools/opte_version_override` for `OPTE_COMMIT`. Returns `None` if +/// `OPTE_COMMIT` is unset or empty. +async fn parse_opte_version_override( + path: &Utf8PathBuf, +) -> Result> { + let contents = fs::read_to_string(path) + .await + .context("failed to read tools/opte_version_override")?; + + for line in contents.lines() { + let line = line.trim(); + if let Some(val) = line.strip_prefix("OPTE_COMMIT=") { + let val = val.trim_matches('"'); + if !val.is_empty() { + return Ok(Some(OpteOverride { commit: val.to_string() })); + } + } + } + + Ok(None) +} + +const OPTE_BUILDOMAT_BASE: &str = + "https://buildomat.eng.oxide.computer/public/file/oxidecomputer/opte"; + +/// Download the OPTE override p5p archive from buildomat. 
+async fn download_opte_p5p( + logger: Logger, + client: reqwest::Client, + commit: String, + dest: Utf8PathBuf, +) -> Result<()> { + let url = format!("{OPTE_BUILDOMAT_BASE}/repo/{commit}/opte.p5p"); + info!(logger, "downloading OPTE override p5p from {url}"); + for attempt in 1..=RETRY_ATTEMPTS { + let result = async { + let response = client.get(&url).send().await?.error_for_status()?; + let bytes = response.bytes().await?; + fs::write(&dest, &bytes).await?; + Ok::<_, anyhow::Error>(()) + } + .await; + + match result { + Ok(()) => { + info!(logger, "downloaded OPTE p5p to {dest}"); + return Ok(()); + } + Err(err) => { + if attempt == RETRY_ATTEMPTS { + return Err(err).with_context(|| { + format!("failed to download OPTE p5p from {url}") + }); + } + info!(logger, "retrying OPTE p5p download (attempt {attempt})"); + } + } + } + + bail!("failed to download OPTE p5p after {RETRY_ATTEMPTS} attempts") +} + async fn host_add_root_profile(host_proto_root: Utf8PathBuf) -> Result<()> { fs::create_dir_all(&host_proto_root).await?; fs::write( diff --git a/package-manifest.toml b/package-manifest.toml index d828b721a2f..16da409ad8b 100644 --- a/package-manifest.toml +++ b/package-manifest.toml @@ -666,10 +666,10 @@ source.repo = "maghemite" # `tools/maghemite_openapi_version`. Failing to do so will cause a failure when # building `ddm-admin-client` (which will instruct you to update # `tools/maghemite_openapi_version`). -source.commit = "b603c9f3dccefcf1d3e941c04505ff6bdd1826b6" +source.commit = "11786c0d91beca4ae6f51aedfc7079fac71b3461" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image//mg-ddm-gz.sha256.txt -source.sha256 = "506862636920f5e9ebf0b931e77baafcec7460faa77770e94fcb18fd1e1ca194" +source.sha256 = "4cbcaa331df7830a70d631b3c0b28e83113154a17e6bc01743c3dce85ac9efa6" output.type = "tarball" [package.mg-ddm] @@ -682,10 +682,10 @@ source.repo = "maghemite" # `tools/maghemite_openapi_version`. 
Failing to do so will cause a failure when # building `ddm-admin-client` (which will instruct you to update # `tools/maghemite_openapi_version`). -source.commit = "b603c9f3dccefcf1d3e941c04505ff6bdd1826b6" +source.commit = "11786c0d91beca4ae6f51aedfc7079fac71b3461" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image//mg-ddm.sha256.txt -source.sha256 = "01cf23b6fee8643263ff7e435c8f58f47976f22a48343ab0b04b417eff6a40cd" +source.sha256 = "97a5c77e8f705027cb17a901f3d60bb8500091c4f346aba609aa51e8aa1d215f" output.type = "zone" output.intermediate_only = true @@ -697,10 +697,10 @@ source.repo = "maghemite" # `tools/maghemite_openapi_version`. Failing to do so will cause a failure when # building `ddm-admin-client` (which will instruct you to update # `tools/maghemite_openapi_version`). -source.commit = "b603c9f3dccefcf1d3e941c04505ff6bdd1826b6" +source.commit = "11786c0d91beca4ae6f51aedfc7079fac71b3461" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image//mgd.sha256.txt -source.sha256 = "c5a642abf324cc3a4c3539996825b21ce6b3ffa193ce646dbc9185754a20f599" +source.sha256 = "78c9748efe41efc8e4fdda0d0ce3fc2ecd6567bc16b657c73b118cde2ac87da1" output.type = "zone" output.intermediate_only = true diff --git a/tools/install_opte.sh b/tools/install_opte.sh index d56523764d9..35e2999cc76 100755 --- a/tools/install_opte.sh +++ b/tools/install_opte.sh @@ -51,6 +51,14 @@ fi # Grab the version of the opte package to install OPTE_VERSION="$(cat "$OMICRON_TOP/tools/opte_version")" +# Check for an OPTE override. When set, the desired OPTE version isn't +# published to the helios pkg repo yet, so we download and install directly +# from the override p5p built by OPTE CI on buildomat. 
+source "$OMICRON_TOP/tools/opte_version_override" +if [[ "x$OPTE_COMMIT" != "x" ]]; then + echo "OPTE override active: installing from p5p for commit $OPTE_COMMIT" +fi + OMICRON_FROZEN_PKG_COMMENT="OMICRON-PINNED-PACKAGE" # Once we install, we mark the package as frozen at that particular version. @@ -71,22 +79,42 @@ if PKG_FROZEN=$(pkg freeze | grep driver/network/opte); then pfexec pkg unfreeze driver/network/opte fi -# Actually install the xde kernel module and opteadm tool -RC=0 -pfexec pkg install -v pkg://helios-dev/driver/network/opte@"$OPTE_VERSION" || RC=$? -if [[ "$RC" -eq 0 ]]; then - echo "xde driver installed successfully" -elif [[ "$RC" -eq 4 ]]; then - echo "Correct xde driver already installed" +if [[ "x$OPTE_COMMIT" != "x" ]]; then + # Install from the override p5p archive built by OPTE CI. + P5P_URL="https://buildomat.eng.oxide.computer/public/file/oxidecomputer/opte/repo/$OPTE_COMMIT/opte.p5p" + P5P_PATH="/tmp/opte-override.p5p" + echo "Downloading override p5p from $P5P_URL" + curl -fL -o "$P5P_PATH" "$P5P_URL" + + RC=0 + pfexec pkg install -g "$P5P_PATH" "driver/network/opte@$OPTE_VERSION" || RC=$? + if [[ "$RC" -eq 0 ]]; then + echo "xde driver installed from override p5p" + elif [[ "$RC" -eq 4 ]]; then + echo "Correct xde driver already installed" + else + echo "Installing xde driver from override p5p failed" + exit "$RC" + fi + rm -f "$P5P_PATH" else - echo "Installing xde driver failed" - exit "$RC" + # Install the published version from the helios pkg repo. + RC=0 + pfexec pkg install -v pkg://helios-dev/driver/network/opte@"$OPTE_VERSION" || RC=$? + if [[ "$RC" -eq 0 ]]; then + echo "xde driver installed successfully" + elif [[ "$RC" -eq 4 ]]; then + echo "Correct xde driver already installed" + else + echo "Installing xde driver failed" + exit "$RC" + fi fi RC=0 -pfexec pkg freeze -c "$OMICRON_FROZEN_PKG_COMMENT" driver/network/opte@"$OPTE_VERSION" || RC=$? +pfexec pkg freeze -c "$OMICRON_FROZEN_PKG_COMMENT" driver/network/opte || RC=$? 
if [[ "$RC" -ne 0 ]]; then - echo "Failed to pin opte package to $OPTE_VERSION" + echo "Failed to pin opte package" exit $RC fi @@ -97,13 +125,3 @@ if [[ "$RC" -ne 0 ]]; then echo "The \`opteadm\` administration tool is not on your path." echo "You may add \"/opt/oxide/opte/bin\" to your path to access it." fi - -source $OMICRON_TOP/tools/opte_version_override - -if [[ "x$OPTE_COMMIT" != "x" ]]; then - set +x - curl -fOL https://buildomat.eng.oxide.computer/public/file/oxidecomputer/opte/module/$OPTE_COMMIT/xde - pfexec rem_drv xde || true - pfexec mv xde /kernel/drv/amd64/xde - pfexec add_drv xde || true -fi diff --git a/tools/maghemite_ddm_openapi_version b/tools/maghemite_ddm_openapi_version index cac9570a113..060aa2a57bb 100644 --- a/tools/maghemite_ddm_openapi_version +++ b/tools/maghemite_ddm_openapi_version @@ -1 +1 @@ -COMMIT="b603c9f3dccefcf1d3e941c04505ff6bdd1826b6" +COMMIT="11786c0d91beca4ae6f51aedfc7079fac71b3461" diff --git a/tools/maghemite_mg_openapi_version b/tools/maghemite_mg_openapi_version index cac9570a113..060aa2a57bb 100644 --- a/tools/maghemite_mg_openapi_version +++ b/tools/maghemite_mg_openapi_version @@ -1 +1 @@ -COMMIT="b603c9f3dccefcf1d3e941c04505ff6bdd1826b6" +COMMIT="11786c0d91beca4ae6f51aedfc7079fac71b3461" diff --git a/tools/maghemite_mgd_checksums b/tools/maghemite_mgd_checksums index 87e1781ba0b..5fa0f39dd69 100644 --- a/tools/maghemite_mgd_checksums +++ b/tools/maghemite_mgd_checksums @@ -1,2 +1,2 @@ -CIDL_SHA256="c5a642abf324cc3a4c3539996825b21ce6b3ffa193ce646dbc9185754a20f599" -MGD_LINUX_SHA256="83d9bac919524341a845b7e376349fababbe47fa3335f953fda6bbf85e6030ee" \ No newline at end of file +CIDL_SHA256="78c9748efe41efc8e4fdda0d0ce3fc2ecd6567bc16b657c73b118cde2ac87da1" +MGD_LINUX_SHA256="e55f841a879e58dba4c29b6aae35742f05b8e91047926bbb61bc78c61dec7e7d" \ No newline at end of file diff --git a/tools/opte_version b/tools/opte_version index 1742d1585d6..2c4e90d76cd 100644 --- a/tools/opte_version +++ b/tools/opte_version @@ -1 +1 @@ 
-0.39.455 +0.40.463 diff --git a/tools/opte_version_override b/tools/opte_version_override index 8d57f7ae9f4..66995188d0d 100644 --- a/tools/opte_version_override +++ b/tools/opte_version_override @@ -1,5 +1,18 @@ #!/usr/bin/env bash -# only set this if you want to override the version of opte/xde installed by the -# install_opte.sh script -OPTE_COMMIT="" +# Override for using an unpublished OPTE version. When OPTE_COMMIT is set, +# the override p5p package is downloaded from buildomat and used instead of +# the version published in the helios pkg repo. The p5p is built by the +# opte-p5p buildomat job and published at: +# https://buildomat.eng.oxide.computer/public/file/oxidecomputer/opte/repo/{commit}/opte.p5p +# +# Consumers: +# - install_opte.sh installs directly from the override p5p +# - releng image builds use extra_packages with the p5p as a pkg source +# - deploy.sh installs from the override p5p on the running system +# - ci_check_opte_ver.sh skips version consistency checks +# +# To activate: set OPTE_COMMIT to the git commit hash of the OPTE build. +# +# To deactivate (once the new version is published): set OPTE_COMMIT to "". 
+OPTE_COMMIT="c570ac2126dbbebbd8e98e73b580c5be6b7e460e" diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 6288b30b730..ffdb9a61cb2 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -155,7 +155,7 @@ usdt = { version = "0.6.0" } usdt-impl-3b31131e45eafb45 = { package = "usdt-impl", version = "0.6.0", default-features = false, features = ["des"] } usdt-impl-d8f496e17d97b5cb = { package = "usdt-impl", version = "0.5.0", default-features = false, features = ["asm", "des"] } uuid = { version = "1.21.0", features = ["serde", "v4"] } -winnow = { version = "0.7.14" } +winnow = { version = "0.7.15" } x509-cert = { version = "0.2.5" } zerocopy = { version = "0.8.40", default-features = false, features = ["derive", "simd"] } zeroize = { version = "1.8.2", features = ["std", "zeroize_derive"] } @@ -307,7 +307,7 @@ usdt-impl-d8f496e17d97b5cb = { package = "usdt-impl", version = "0.5.0", default uuid = { version = "1.21.0", features = ["serde", "v4"] } vergen = { version = "9.0.6", features = ["cargo", "rustc"] } vergen-lib = { version = "0.1.6", features = ["cargo", "git", "rustc"] } -winnow = { version = "0.7.14" } +winnow = { version = "0.7.15" } x509-cert = { version = "0.2.5" } zerocopy = { version = "0.8.40", default-features = false, features = ["derive", "simd"] } zeroize = { version = "1.8.2", features = ["std", "zeroize_derive"] } From 18943d03915aa2827ea65fa63567047f2145028c Mon Sep 17 00:00:00 2001 From: Zeeshan Lakhani Date: Thu, 26 Mar 2026 14:08:05 +0000 Subject: [PATCH 2/8] [multicast] M2P forwarding, OPTE port subscription, and sled-agent propagation This completes the multicast data path by adding per-sled M2P (multicast-to- physical) mapping, forwarding entry management, and OPTE port subscription for multicast group members. 
## Sled-agent + API update(s) - Add multicast endpoints at API v33 (MCAST_M2P_FORWARDING) for M2P, forwarding, and per-VMM subscribe/unsubscribe - Version v7 join/leave endpoints to v7..v33 with shim conversion - Move multicast types from omicron-common to sled-agent-types-versions v33 module (mcast_m2p_forwarding) with re-exports through sled-agent-types - OPTE port_manager gains set/clear operations for M2P and forwarding - Port subscription cleanup on PortTicket release - Consolidate per-port mutable state (eip_gateways, mcast) into PortState - Seed eip_gateways from global map on port creation to prevent stale gateway state on newly created ports - Lock ordering documented for ports, routes, eip_gateways ## Nexus - New `sled.rs` (MulticastSledClient) encapsulating all sled-agent multicast interactions: subscribe/unsubscribe, M2P/forwarding propagation and teardown - Groups RPW propagates M2P and forwarding entries to all member sleds after DPD configuration, with convergent retry on failure - Members RPW uses MemberReconcileCtx to thread shared reconciliation state. 
Handles subscribe on join, unsubscribe on leave, and re-subscribe on migration - `subscribe_vmm` gracefully handles missing propolis (mirrors unsubscribe) - `lookup_propolis_id` returns Ok(None) for missing instance - `lookup_and_update_member_sled_id` surfaces DB errors instead of swallowing them - Order-independent forwarding comparison to avoid spurious dataplane churn; always create forwarding entries for active groups even with empty next-hops - Dataplane client updated for bifurcated replication groups ## illumos-utils - Remove CIDR allow rules for multicast (handled by OPTE gateway layer) - Reject Reserved replication mode in `list_mcast_fwd` with InvalidMcastForwardingState error - Consolidate error variants into InvalidMcastUnderlay ## Tests - Integration tests for M2P/forwarding/subscribe lifecycle - Instance migration multicast re-convergence --- .github/buildomat/jobs/deploy.sh | 25 +- .github/workflows/check-opte-ver.yml | 11 +- dev-tools/releng/src/main.rs | 24 +- illumos-utils/src/opte/illumos.rs | 6 + illumos-utils/src/opte/mod.rs | 2 +- illumos-utils/src/opte/non_illumos.rs | 112 +- illumos-utils/src/opte/port_manager.rs | 863 ++++++++++++-- .../src/test_util/host_phase_2_test_state.rs | 83 +- .../app/background/tasks/multicast/groups.rs | 228 +++- .../app/background/tasks/multicast/members.rs | 874 ++++++++------ .../src/app/background/tasks/multicast/mod.rs | 26 +- .../tasks/sync_switch_configuration.rs | 4 +- nexus/src/app/bgp.rs | 8 +- nexus/src/app/multicast/dataplane.rs | 95 +- nexus/src/app/multicast/mod.rs | 29 + nexus/src/app/multicast/sled.rs | 582 ++++++++++ .../integration_tests/multicast/instances.rs | 152 ++- .../tests/integration_tests/multicast/mod.rs | 49 +- .../multicast/networking_integration.rs | 1003 ++++++++++++++++- .../sled-agent-32.0.0-d78e46.json.gitstub | 1 + ...e46.json => sled-agent-33.0.0-c33810.json} | 370 +++++- openapi/sled-agent/sled-agent-latest.json | 2 +- sled-agent/api/src/lib.rs | 131 ++- 
sled-agent/src/bootstrap/early_networking.rs | 4 +- sled-agent/src/http_entrypoints.rs | 98 +- sled-agent/src/instance.rs | 143 +-- sled-agent/src/instance_manager.rs | 48 +- sled-agent/src/probe_manager.rs | 1 + sled-agent/src/services.rs | 1 + sled-agent/src/sim/collection.rs | 11 +- sled-agent/src/sim/http_entrypoints.rs | 106 +- sled-agent/src/sim/sled_agent.rs | 60 + sled-agent/src/sled_agent.rs | 63 +- sled-agent/types/src/lib.rs | 1 + sled-agent/types/src/multicast.rs | 7 + sled-agent/types/versions/src/latest.rs | 12 + sled-agent/types/versions/src/lib.rs | 2 + .../versions/src/mcast_m2p_forwarding/mod.rs | 10 + .../src/mcast_m2p_forwarding/multicast.rs | 132 +++ tools/install_opte.sh | 4 +- 40 files changed, 4441 insertions(+), 942 deletions(-) create mode 100644 nexus/src/app/multicast/sled.rs create mode 100644 openapi/sled-agent/sled-agent-32.0.0-d78e46.json.gitstub rename openapi/sled-agent/{sled-agent-32.0.0-d78e46.json => sled-agent-33.0.0-c33810.json} (96%) create mode 100644 sled-agent/types/src/multicast.rs create mode 100644 sled-agent/types/versions/src/mcast_m2p_forwarding/mod.rs create mode 100644 sled-agent/types/versions/src/mcast_m2p_forwarding/multicast.rs diff --git a/.github/buildomat/jobs/deploy.sh b/.github/buildomat/jobs/deploy.sh index a9c42b7abc2..dd07bcc6757 100755 --- a/.github/buildomat/jobs/deploy.sh +++ b/.github/buildomat/jobs/deploy.sh @@ -33,6 +33,9 @@ _exit_trap() { local status=$? set +o errexit + if [[ "x$OPTE_COMMIT" != "x" ]]; then + pfexec cp /tmp/opteadm /opt/oxide/opte/bin/opteadm + fi # # Stop cron in all zones (to stop logadm log rotation) @@ -182,18 +185,22 @@ ptime -m tar xvzf /input/package/work/package.tar.gz source .github/buildomat/ci-env.sh # Source the OPTE override (if any) from the canonical location and apply it. -# When set, install the override p5p from buildomat instead of using the -# version baked into the ramdisk image. 
The version must be pinned explicitly -# because IPS version ordering does not match semver. +# +# When set, download the xde driver and opteadm directly from buildomat and +# swap them in. The deploy target is a ramdisk image without pkg(5), so we +# use rem_drv/add_drv instead of the p5p approach used by install_opte.sh +# and releng. # shellcheck source=/dev/null source tools/opte_version_override if [[ "x$OPTE_COMMIT" != "x" ]]; then - OPTE_VERSION="$(cat tools/opte_version)" - P5P_URL="https://buildomat.eng.oxide.computer/public/file/oxidecomputer/opte/repo/$OPTE_COMMIT/opte.p5p" - P5P_PATH="/tmp/opte-override.p5p" - curl -sSfL -o "$P5P_PATH" "$P5P_URL" - pfexec pkg install -g "$P5P_PATH" "driver/network/opte@$OPTE_VERSION" - rm -f "$P5P_PATH" + curl -sSfOL "https://buildomat.eng.oxide.computer/public/file/oxidecomputer/opte/module/$OPTE_COMMIT/xde" + pfexec rem_drv xde || true + pfexec mv xde /kernel/drv/amd64/xde + pfexec add_drv xde || true + curl -sSfOL "https://buildomat.eng.oxide.computer/public/file/oxidecomputer/opte/release/$OPTE_COMMIT/opteadm" + chmod +x opteadm + cp opteadm /tmp/opteadm + pfexec mv opteadm /opt/oxide/opte/bin/opteadm fi # Ask buildomat for the range of extra addresses that we're allowed to use, and diff --git a/.github/workflows/check-opte-ver.yml b/.github/workflows/check-opte-ver.yml index 4cad1f04ef8..65a3b23c121 100644 --- a/.github/workflows/check-opte-ver.yml +++ b/.github/workflows/check-opte-ver.yml @@ -1,11 +1,7 @@ name: check-opte-ver on: pull_request: - paths: - - '.github/workflows/check-opte-ver.yml' - - 'Cargo.toml' - - 'tools/opte_version' - - 'tools/opte_version_override' + branches: [main] jobs: check-opte-ver: runs-on: ubuntu-22.04 @@ -19,7 +15,12 @@ jobs: run: cargo install toml-cli@0.2.3 - name: Check OPTE version and rev match run: ./tools/ci_check_opte_ver.sh + + # Runs on every PR regardless of paths changed, since the override + # file could have been set in an earlier commit and slip through on + # an 
unrelated PR otherwise. check-opte-override: + if: github.base_ref == 'main' runs-on: ubuntu-22.04 steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/dev-tools/releng/src/main.rs b/dev-tools/releng/src/main.rs index 7149f5198ac..22a30007c5f 100644 --- a/dev-tools/releng/src/main.rs +++ b/dev-tools/releng/src/main.rs @@ -629,7 +629,7 @@ async fn main() -> Result<()> { .arg("-o") // output directory for image .arg(args.output_dir.join(format!("os-{}", target))) .arg("-F") // pass extra image builder features - .arg(format!("optever={}", opte_version)) + .arg(format!("optever={opte_version}")) .arg("-P") // include all files from extra proto area .arg(proto_dir.join("root")) .arg("-N") // image name @@ -690,7 +690,7 @@ async fn main() -> Result<()> { // When OPTE_COMMIT is set, download the override p5p from buildomat // and add it as a package source for the image build. if let Some(ov) = &opte_override { - let p5p_path = tempdir.path().join(format!("opte-{}.p5p", target)); + let p5p_path = tempdir.path().join(format!("opte-{target}.p5p")); let commit = ov.commit.clone(); let dest = p5p_path.clone(); let cl = client.clone(); @@ -702,17 +702,17 @@ async fn main() -> Result<()> { image_cmd = image_cmd .arg("-p") - .arg(format!("helios-dev=file://{}", p5p_path,)); + .arg(format!("helios-dev=file://{p5p_path}")); + } - jobs.push_command(format!("{target}-image"), image_cmd) - .after("helios-setup") - .after("helios-incorp") - .after(format!("{target}-opte-p5p")); - } else { - jobs.push_command(format!("{target}-image"), image_cmd) - .after("helios-setup") - .after("helios-incorp") - .after(format!("{target}-proto")); + let image_job = jobs + .push_command(format!("{target}-image"), image_cmd) + .after("helios-setup") + .after("helios-incorp") + .after(format!("{target}-proto")); + + if opte_override.is_some() { + image_job.after(format!("{target}-opte-p5p")); } } // Build the recovery target after we build the host target. 
Only one diff --git a/illumos-utils/src/opte/illumos.rs b/illumos-utils/src/opte/illumos.rs index c6d457c8460..d4ff0b51764 100644 --- a/illumos-utils/src/opte/illumos.rs +++ b/illumos-utils/src/opte/illumos.rs @@ -13,6 +13,7 @@ use opte_ioctl::OpteHdl; use slog::Logger; use slog::info; use std::net::IpAddr; +use std::net::Ipv6Addr; #[derive(thiserror::Error, Debug)] pub enum Error { @@ -70,6 +71,11 @@ pub enum Error { "Tried to update attached subnets on non-existent port ({0}, {1:?})" )] AttachedSubnetUpdateMissingPort(uuid::Uuid, NetworkInterfaceKind), + + #[error( + "address {0} is not within the underlay multicast subnet (ff04::/64)" + )] + InvalidMcastUnderlay(Ipv6Addr), } /// Delete all xde devices on the system. diff --git a/illumos-utils/src/opte/mod.rs b/illumos-utils/src/opte/mod.rs index e9e2546cb0a..4903e61db8b 100644 --- a/illumos-utils/src/opte/mod.rs +++ b/illumos-utils/src/opte/mod.rs @@ -33,10 +33,10 @@ use oxnet::IpNet; use oxnet::Ipv4Net; use oxnet::Ipv6Net; pub use port::Port; -pub use port_manager::MulticastGroupCfg; pub use port_manager::PortCreateParams; pub use port_manager::PortManager; pub use port_manager::PortTicket; +pub use sled_agent_types::multicast::MulticastGroupCfg; use std::net::IpAddr; use std::net::Ipv4Addr; use std::net::Ipv6Addr; diff --git a/illumos-utils/src/opte/non_illumos.rs b/illumos-utils/src/opte/non_illumos.rs index ded56ac8945..21a4cd55129 100644 --- a/illumos-utils/src/opte/non_illumos.rs +++ b/illumos-utils/src/opte/non_illumos.rs @@ -2,26 +2,38 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! Mock / dummy versions of the OPTE module, for non-illumos platforms +//! Mock / dummy versions of the OPTE module, for non-illumos platforms. +//! +//! Most methods are either `unimplemented!()` or silent no-ops. +//! Multicast subscribe/unsubscribe is an exception, as it maintains real +//! 
in-memory state because port manager tests assert on subscription contents. use crate::addrobj::AddrObject; use omicron_common::api::internal::shared::NetworkInterfaceKind; use oxide_vpc::api::AddRouterEntryReq; +use oxide_vpc::api::ClearMcast2PhysReq; +use oxide_vpc::api::ClearMcastForwardingReq; use oxide_vpc::api::ClearVirt2PhysReq; use oxide_vpc::api::DelRouterEntryReq; use oxide_vpc::api::DetachSubnetResp; -use oxide_vpc::api::Direction; +use oxide_vpc::api::DumpMcast2PhysResp; +use oxide_vpc::api::DumpMcastForwardingResp; use oxide_vpc::api::DumpVirt2PhysResp; use oxide_vpc::api::IpCfg; use oxide_vpc::api::IpCidr; use oxide_vpc::api::ListPortsResp; +use oxide_vpc::api::McastSubscribeReq; +use oxide_vpc::api::McastUnsubscribeReq; use oxide_vpc::api::NoResp; use oxide_vpc::api::PortInfo; use oxide_vpc::api::RouterClass; use oxide_vpc::api::RouterTarget; use oxide_vpc::api::SetExternalIpsReq; use oxide_vpc::api::SetFwRulesReq; +use oxide_vpc::api::SetMcast2PhysReq; +use oxide_vpc::api::SetMcastForwardingReq; use oxide_vpc::api::SetVirt2PhysReq; +use oxide_vpc::api::SourceFilter; use oxide_vpc::api::VpcCfg; use slog::Logger; use std::collections::HashMap; @@ -76,6 +88,11 @@ pub enum Error { "Tried to update attached subnets on non-existent port ({0}, {1:?})" )] AttachedSubnetUpdateMissingPort(uuid::Uuid, NetworkInterfaceKind), + + #[error( + "address {0} is not within the underlay multicast subnet (ff04::/64)" + )] + InvalidMcastUnderlay(std::net::Ipv6Addr), } pub fn initialize_xde_driver( @@ -172,6 +189,8 @@ pub(crate) struct PortData { pub port: PortInfo, /// The routes for this port. This simulates the router layer. pub routes: Vec, + /// Multicast group subscriptions: group IP → source filter. 
+ pub mcast_subscriptions: HashMap, } #[derive(Debug)] @@ -237,7 +256,11 @@ impl Handle { return Err(OpteError::DuplicatePort(entry.key().to_string())); } Entry::Vacant(entry) => { - entry.insert(PortData { port, routes: Vec::new() }); + entry.insert(PortData { + port, + routes: Vec::new(), + mcast_subscriptions: HashMap::new(), + }); } } Ok(NO_RESPONSE) @@ -270,14 +293,46 @@ impl Handle { Ok(NO_RESPONSE) } - /// Allow traffic to / from a CIDR block on a port. - pub fn allow_cidr( + /// Subscribe a port to a multicast group. + pub fn mcast_subscribe( &self, - _: &str, - _: IpCidr, - _: Direction, + req: &McastSubscribeReq, ) -> Result { - unimplemented!("Not yet used in tests") + let mut inner = opte_state().lock().unwrap(); + let Some(port_data) = inner.ports.get_mut(&req.port_name) else { + return Err(OpteError::NoPort(req.port_name.clone())); + }; + let group_ip: IpAddr = match req.group { + oxide_vpc::api::IpAddr::Ip4(v4) => { + std::net::Ipv4Addr::from(v4).into() + } + oxide_vpc::api::IpAddr::Ip6(v6) => { + std::net::Ipv6Addr::from(v6).into() + } + }; + port_data.mcast_subscriptions.insert(group_ip, req.filter.clone()); + Ok(NO_RESPONSE) + } + + /// Unsubscribe a port from a multicast group. + pub fn mcast_unsubscribe( + &self, + req: &McastUnsubscribeReq, + ) -> Result { + let mut inner = opte_state().lock().unwrap(); + let Some(port_data) = inner.ports.get_mut(&req.port_name) else { + return Err(OpteError::NoPort(req.port_name.clone())); + }; + let group_ip: IpAddr = match req.group { + oxide_vpc::api::IpAddr::Ip4(v4) => { + std::net::Ipv4Addr::from(v4).into() + } + oxide_vpc::api::IpAddr::Ip6(v6) => { + std::net::Ipv6Addr::from(v6).into() + } + }; + port_data.mcast_subscriptions.remove(&group_ip); + Ok(NO_RESPONSE) } /// Delete a router entry from a port. @@ -323,6 +378,45 @@ impl Handle { unimplemented!("Not yet used in tests") } + /// Set a multicast-to-physical mapping. 
+ pub fn set_m2p(&self, _: &SetMcast2PhysReq) -> Result { + Ok(NO_RESPONSE) + } + + /// Clear a multicast-to-physical mapping. + pub fn clear_m2p( + &self, + _: &ClearMcast2PhysReq, + ) -> Result { + Ok(NO_RESPONSE) + } + + /// Set multicast forwarding for a port. + pub fn set_mcast_fwd( + &self, + _: &SetMcastForwardingReq, + ) -> Result { + Ok(NO_RESPONSE) + } + + /// Clear multicast forwarding for a port. + pub fn clear_mcast_fwd( + &self, + _: &ClearMcastForwardingReq, + ) -> Result { + Ok(NO_RESPONSE) + } + + /// Dump all multicast-to-physical mappings. + pub fn dump_m2p(&self) -> Result { + Ok(DumpMcast2PhysResp { ip4: Vec::new(), ip6: Vec::new() }) + } + + /// Dump all multicast forwarding entries. + pub fn dump_mcast_fwd(&self) -> Result { + Ok(DumpMcastForwardingResp { entries: Vec::new() }) + } + /// List ports on the current system. #[allow(dead_code)] pub(crate) fn list_ports(&self) -> Result { diff --git a/illumos-utils/src/opte/port_manager.rs b/illumos-utils/src/opte/port_manager.rs index 2a2031ebcb9..30ecae011d8 100644 --- a/illumos-utils/src/opte/port_manager.rs +++ b/illumos-utils/src/opte/port_manager.rs @@ -17,8 +17,6 @@ use crate::opte::port::PortData; use ipnetwork::Ipv4Network; use ipnetwork::Ipv6Network; use macaddr::MacAddr6; -use omicron_common::address::IPV4_MULTICAST_RANGE; -use omicron_common::address::IPV6_MULTICAST_RANGE; use omicron_common::api::external; use omicron_common::api::internal::shared::ExternalIpConfig; use omicron_common::api::internal::shared::ExternalIpGatewayMap; @@ -41,10 +39,13 @@ use omicron_common::api::internal::shared::RouterVersion; use omicron_common::api::internal::shared::VirtualNetworkInterfaceHost; use oxide_vpc::api::AddRouterEntryReq; use oxide_vpc::api::AttachedSubnetConfig; +use oxide_vpc::api::ClearMcast2PhysReq; +use oxide_vpc::api::ClearMcastForwardingReq; use oxide_vpc::api::DelRouterEntryReq; use oxide_vpc::api::DetachSubnetResp; use oxide_vpc::api::DhcpCfg; use oxide_vpc::api::ExternalIpCfg; +use 
oxide_vpc::api::FilterMode; use oxide_vpc::api::IpCfg; use oxide_vpc::api::IpCidr; use oxide_vpc::api::Ipv4Cfg; @@ -52,15 +53,30 @@ use oxide_vpc::api::Ipv4Cidr; use oxide_vpc::api::Ipv6Cfg; use oxide_vpc::api::Ipv6Cidr; use oxide_vpc::api::MacAddr; +use oxide_vpc::api::McastSubscribeReq; +use oxide_vpc::api::McastUnsubscribeReq; +use oxide_vpc::api::MulticastUnderlay; use oxide_vpc::api::RouterClass; use oxide_vpc::api::SNat4Cfg; use oxide_vpc::api::SNat6Cfg; use oxide_vpc::api::SetExternalIpsReq; +use oxide_vpc::api::SetMcast2PhysReq; +use oxide_vpc::api::SetMcastForwardingReq; +use oxide_vpc::api::SourceFilter; use oxide_vpc::api::TransitIpConfig; use oxide_vpc::api::VpcCfg; use oxnet::IpNet; use oxnet::Ipv4Net; use oxnet::Ipv6Net; +use sled_agent_types::multicast::ClearMcast2Phys; +use sled_agent_types::multicast::ClearMcastForwarding; +use sled_agent_types::multicast::Mcast2PhysMapping; +use sled_agent_types::multicast::McastFilterMode; +use sled_agent_types::multicast::McastForwardingEntry; +use sled_agent_types::multicast::McastForwardingNextHop; +use sled_agent_types::multicast::McastReplication; +use sled_agent_types::multicast::McastSourceFilter; +use sled_agent_types::multicast::MulticastGroupCfg; use slog::Logger; use slog::debug; use slog::error; @@ -89,22 +105,21 @@ struct RouteSet { active_ports: usize, } -/// Configuration for multicast groups on an OPTE port. +/// Lock ordering for `PortManagerInner` fields: /// -/// TODO: This type should be moved to [oxide_vpc::api] when OPTE dependencies -/// are updated, following the same pattern as other VPC configuration types -/// like [ExternalIpCfg], [IpCfg], etc. +/// The only lock nesting is: +/// - `routes` then `ports` in `vpc_routes_ensure` +/// - `ports` then `eip_gateways` in `create_port` /// -/// TODO: Eventually remove. -#[derive(Debug, Clone, PartialEq)] -pub struct MulticastGroupCfg { - /// The multicast group IP address (IPv4 or IPv6). 
- pub group_ip: IpAddr, - /// Source addresses for source-filtered multicast (optional for ASM, - /// required for SSM). - pub sources: Vec, -} - +/// `set_eip_gateways` acquires each lock separately (global map first, +/// then ports), so there is no nesting. A concurrent `create_port` +/// between the two reads the already-updated global map. The subsequent +/// port iteration is redundant but idempotent. Neither path is hot: +/// `set_eip_gateways` runs once per background task pass and +/// `create_port` runs at instance boot. +/// +/// Note: `release_inner` acquires `ports` then `routes` sequentially +/// (dropping each before acquiring the next). #[derive(Debug)] struct PortManagerInner { log: Logger, @@ -115,20 +130,58 @@ struct PortManagerInner { /// IP address of the hosting sled on the underlay. underlay_ip: Ipv6Addr, - /// Map of all ports, keyed on the interface Uuid and its kind - /// (which includes the Uuid of the parent instance or service) - ports: Mutex>, + /// Map of all ports and their mutable state, keyed on the interface + /// Uuid and its kind (which includes the Uuid of the parent instance + /// or service). + ports: Mutex>, /// Map of all current resolved routes. routes: Mutex>, - /// Mappings of associated Internet Gateways for all External IPs - /// attached to each NIC. - /// - /// IGW IDs are specific to the VPC of each NIC. + /// Most recent EIP gateway mappings, keyed by NIC ID. We store this here so + /// that ports created after `set_eip_gateways` can seed their initial + /// gateway state. eip_gateways: Mutex>>>, } +/// Mutable per-port state tracked alongside the immutable `Port`. +#[derive(Debug)] +struct PortState { + port: Port, + /// Mappings of associated Internet Gateways for External IPs on this NIC. + eip_gateways: HashMap>, + /// Active multicast subscriptions, mapping group IP → source filter. 
+ mcast_subscriptions: HashMap, +} + +impl PortState { + fn new(port: Port) -> Self { + Self { + port, + eip_gateways: HashMap::new(), + mcast_subscriptions: HashMap::new(), + } + } +} + +/// Convert a `MulticastGroupCfg` into OPTE's `SourceFilter`. +/// +/// Empty sources maps to ASM (EXCLUDE with no entries, accepting all +/// sources). Non-empty sources maps to SSM (INCLUDE with the listed +/// sources). +fn multicast_cfg_to_source_filter(cfg: &MulticastGroupCfg) -> SourceFilter { + if cfg.sources.is_empty() { + SourceFilter::default() + } else { + SourceFilter::Include( + cfg.sources + .iter() + .map(|s| oxide_vpc::api::IpAddr::from(*s)) + .collect(), + ) + } +} + impl PortManagerInner { fn next_port_name(&self) -> String { format!( @@ -139,14 +192,15 @@ impl PortManagerInner { } } -#[derive(Debug)] /// Parameters needed to create and configure an OPTE port. +#[derive(Debug)] pub struct PortCreateParams<'a> { pub nic: &'a NetworkInterface, pub external_ips: &'a ExternalIpConfig, pub firewall_rules: &'a [ResolvedVpcFirewallRule], pub dhcp_config: DhcpCfg, pub attached_subnets: Vec, + pub multicast_groups: &'a [MulticastGroupCfg], } impl<'a> TryFrom<&PortCreateParams<'a>> for IpCfg { @@ -349,7 +403,7 @@ impl PortManager { underlay_ip, ports: Mutex::new(BTreeMap::new()), routes: Mutex::new(Default::default()), - eip_gateways: Mutex::new(Default::default()), + eip_gateways: Mutex::new(HashMap::new()), }); Self { inner } @@ -371,6 +425,7 @@ impl PortManager { firewall_rules, dhcp_config, attached_subnets: _, + multicast_groups, } = params; let is_service = matches!(nic.kind, NetworkInterfaceKind::Service { .. }); @@ -424,7 +479,18 @@ impl PortManager { vni, gateway, }); - let old = ports.insert((nic.id, nic.kind), port.clone()); + let mut new_port_state = PortState::new(port.clone()); + + // Seed gateway mappings from the global map so that a port + // created after set_eip_gateways has the correct state + // immediately. Lock order: ports then eip_gateways. 
+ if let Some(gw) = + self.inner.eip_gateways.lock().unwrap().get(&nic.id).cloned() + { + new_port_state.eip_gateways = gw; + } + + let old = ports.insert((nic.id, nic.kind), new_port_state); assert!( old.is_none(), "Duplicate OPTE port detected: interface_id = {}, kind = {:?}", @@ -443,7 +509,8 @@ impl PortManager { // `Instance::refresh_external_ips_inner`), and to prevent updates // racing with nexus before an instance/port are reachable from their // respective managers. - self.external_ips_ensure_port(&port, nic.id, external_ips)?; + let port_state = ports.get(&(nic.id, nic.kind)).unwrap(); + self.external_ips_ensure_port(port_state, external_ips)?; } (port, ticket) }; @@ -543,6 +610,12 @@ impl PortManager { } drop(route_map); + // Configure multicast group subscriptions if any were + // provided at instance start. + if !multicast_groups.is_empty() { + self.multicast_groups_ensure(nic.id, nic.kind, multicast_groups)?; + } + info!( self.inner.log, "Created OPTE port"; @@ -610,13 +683,14 @@ impl PortManager { } // Note: We're deliberately holding both locks here - // to prevent several nexuses computng and applying deltas + // to prevent several nexuses computing and applying deltas // out of order. let ports = self.inner.ports.lock().unwrap(); let hdl = Handle::new()?; // Propagate deltas out to all ports. - for port in ports.values() { + for port_state in ports.values() { + let port = &port_state.port; // Fetch deltas for all router keys: system, IPv4 subnet, and IPv6 // subnet. let system_delta = deltas.get(&port.system_router_key()); @@ -687,11 +761,22 @@ impl PortManager { /// /// Returns whether the internal mappings were changed. pub fn set_eip_gateways(&self, mappings: ExternalIpGatewayMap) -> bool { - let mut gateways = self.inner.eip_gateways.lock().unwrap(); - - let changed = &*gateways != &mappings.mappings; - - *gateways = mappings.mappings; + // Update global map (single lock). 
A concurrent create_port + // between these two locks will read the updated global map and + // seed correctly; the port iteration below is then a redundant + // but idempotent overwrite. + let mut global_gw = self.inner.eip_gateways.lock().unwrap(); + let changed = &*global_gw != &mappings.mappings; + *global_gw = mappings.mappings.clone(); + drop(global_gw); + + // Push into existing ports. + let mut ports = self.inner.ports.lock().unwrap(); + for ((nic_id, _), port_state) in ports.iter_mut() { + let new_gw = + mappings.mappings.get(nic_id).cloned().unwrap_or_default(); + port_state.eip_gateways = new_gw; + } changed } @@ -704,23 +789,24 @@ impl PortManager { external_ips: &ExternalIpConfig, ) -> Result<(), Error> { let ports = self.inner.ports.lock().unwrap(); - let port = ports.get(&(nic_id, nic_kind)).ok_or_else(|| { + let port_state = ports.get(&(nic_id, nic_kind)).ok_or_else(|| { Error::ExternalIpUpdateMissingPort(nic_id, nic_kind) })?; - self.external_ips_ensure_port(port, nic_id, external_ips) + self.external_ips_ensure_port(port_state, external_ips) } /// Ensure external IPs for an OPTE port are up to date. - pub fn external_ips_ensure_port( + fn external_ips_ensure_port( &self, - port: &Port, - nic_id: Uuid, + port_state: &PortState, external_ips: &ExternalIpConfig, ) -> Result<(), Error> { - let egw_lock = self.inner.eip_gateways.lock().unwrap(); - let inet_gw_map = egw_lock.get(&nic_id).cloned(); - drop(egw_lock); + let inet_gw_map = if port_state.eip_gateways.is_empty() { + None + } else { + Some(port_state.eip_gateways.clone()) + }; // NOTE: The Option::map() call here is a bit confusing. 
// @@ -740,18 +826,14 @@ impl PortManager { .v6 .as_ref() .map(|v6| build_external_ipv6_config(Some(v6))); - let inet_gw_map = if let Some(map) = inet_gw_map { - Some( - map.into_iter() - .map(|(k, v)| (k.into(), v.into_iter().collect())) - .collect(), - ) - } else { - None - }; + let inet_gw_map = inet_gw_map.map(|map| { + map.into_iter() + .map(|(k, v)| (k.into(), v.into_iter().collect())) + .collect() + }); let req = SetExternalIpsReq { - port_name: port.name().into(), + port_name: port_state.port.name().into(), external_ips_v4, external_ips_v6, inet_gw_map, @@ -762,69 +844,114 @@ impl PortManager { Ok(()) } - /// Validate multicast group memberships for an OPTE port. - /// - /// This method validates multicast group configurations but does not yet - /// configure OPTE port-level multicast group membership. The actual - /// multicast forwarding is currently handled by the reconciler + DPD - /// at the dataplane switch level. - /// - /// TODO: Once OPTE kernel module supports multicast group APIs, this - /// method should be updated to configure OPTE port-level multicast - /// group membership. Note: multicast groups are fleet-scoped and can span - /// across VPCs. + /// Ensure multicast group subscriptions for an OPTE port match the + /// requested set. This diffs current vs new state and issues + /// subscribe/unsubscribe ioctls as needed. pub fn multicast_groups_ensure( &self, nic_id: Uuid, nic_kind: NetworkInterfaceKind, multicast_groups: &[MulticastGroupCfg], ) -> Result<(), Error> { - let ports = self.inner.ports.lock().unwrap(); - let port = ports.get(&(nic_id, nic_kind)).ok_or_else(|| { - Error::MulticastUpdateMissingPort(nic_id, nic_kind) - })?; - - debug!( - self.inner.log, - "Validating multicast group configuration for OPTE port"; - "port_name" => port.name(), - "nic_id" => ?nic_id, - "groups" => ?multicast_groups, - ); - - // Validate multicast group configurations + // Validate and build the new subscription set before acquiring locks. 
+ let mut new_subs: HashMap = HashMap::new(); for group in multicast_groups { if !group.group_ip.is_multicast() { - error!( - self.inner.log, - "Invalid multicast IP address"; - "group_ip" => %group.group_ip, - "port_name" => port.name(), - ); - return Err(Error::InvalidPortIpConfig(String::from( - "invalid multicast IP address", + return Err(Error::InvalidPortIpConfig(format!( + "not a multicast address: {}", + group.group_ip, ))); } + new_subs + .insert(group.group_ip, multicast_cfg_to_source_filter(group)); } - // TODO: Configure firewall rules to allow multicast traffic. - // Add exceptions in source/dest MAC/L3 addr checking for multicast - // addresses matching known groups, only doing cidr-checking on the - // multicasst destination side. + let hdl = Handle::new()?; - info!( - self.inner.log, - "OPTE port configured for multicast traffic"; - "port_name" => port.name(), - "ipv4_range" => %IPV4_MULTICAST_RANGE, - "ipv6_range" => %IPV6_MULTICAST_RANGE, - "multicast_groups" => multicast_groups.len(), - ); + let mut ports = self.inner.ports.lock().unwrap(); + let port_state = + ports.get_mut(&(nic_id, nic_kind)).ok_or_else(|| { + Error::MulticastUpdateMissingPort(nic_id, nic_kind) + })?; + let port_name = port_state.port.name().to_string(); + + // Unsubscribe groups that are no longer requested. + let to_remove: Vec = port_state + .mcast_subscriptions + .keys() + .filter(|g| !new_subs.contains_key(g)) + .copied() + .collect(); + + let removed = to_remove.len(); + for group_ip in &to_remove { + debug!( + self.inner.log, + "unsubscribing from multicast group"; + "port" => &port_name, + "group" => %group_ip, + ); - // TODO: Configure OPTE port for specific multicast group membership - // once OPTE kernel module APIs are available. This is distinct from - // zone vNIC underlay configuration (see instance.rs - // `join_multicast_group_inner`). + // Effectively infallible, as the IPs are verified as multicast, + // the operation is idempotent, and the port exists. 
+ hdl.mcast_unsubscribe(&McastUnsubscribeReq { + port_name: port_name.clone(), + group: (*group_ip).into(), + })?; + + port_state.mcast_subscriptions.remove(group_ip); + } + + // Subscribe to new groups or update changed filters. + let mut added = 0usize; + for (group_ip, filter) in &new_subs { + let needs_subscribe = + match port_state.mcast_subscriptions.get(group_ip) { + None => true, + Some(current) => current != filter, + }; + + if needs_subscribe { + added += 1; + debug!( + self.inner.log, + "subscribing to multicast group"; + "port" => &port_name, + "group" => %group_ip, + "filter" => ?filter, + ); + + // Effectively infallible as the IPs are verified as multicast, + // the operation is idempotent, and the port exists. + hdl.mcast_subscribe(&McastSubscribeReq { + port_name: port_name.clone(), + group: (*group_ip).into(), + filter: filter.clone(), + })?; + + port_state + .mcast_subscriptions + .insert(*group_ip, filter.clone()); + } + } + + if added > 0 || removed > 0 { + info!( + self.inner.log, + "multicast subscriptions updated"; + "port" => &port_name, + "added" => added, + "removed" => removed, + "active_groups" => port_state.mcast_subscriptions.len(), + ); + } else { + debug!( + self.inner.log, + "multicast subscriptions reconciled, no change"; + "port" => &port_name, + "active_groups" => port_state.mcast_subscriptions.len(), + ); + } Ok(()) } @@ -846,10 +973,11 @@ impl PortManager { // We update VPC rules as a set so grab only // the relevant ports using the VPC's VNI. 
- let vpc_ports = ports - .iter() - .filter(|((_, _), port)| u32::from(vni) == u32::from(*port.vni())); - for ((_, _), port) in vpc_ports { + let vpc_ports = ports.iter().filter(|((_, _), port_state)| { + u32::from(vni) == u32::from(*port_state.port.vni()) + }); + for ((_, _), port_state) in vpc_ports { + let port = &port_state.port; let rules = opte_firewall_rules(rules, port.vni(), port.mac()); let port_name = port.name().to_string(); info!( @@ -951,6 +1079,220 @@ impl PortManager { Ok(()) } + /// Install a multicast overlay-to-underlay (M2P) mapping in OPTE. + pub fn set_mcast_m2p(&self, req: &Mcast2PhysMapping) -> Result<(), Error> { + let addr: Ipv6Addr = req.underlay; + + info!( + self.inner.log, + "Setting multicast overlay-to-underlay mapping"; + "group" => %req.group, + "underlay" => %addr, + ); + + let underlay = MulticastUnderlay::new(addr.into()) + .map_err(|_| Error::InvalidMcastUnderlay(addr))?; + let hdl = Handle::new()?; + hdl.set_m2p(&SetMcast2PhysReq { group: req.group.into(), underlay })?; + Ok(()) + } + + /// Remove a multicast overlay-to-underlay (M2P) mapping from OPTE. + pub fn clear_mcast_m2p(&self, req: &ClearMcast2Phys) -> Result<(), Error> { + let addr: Ipv6Addr = req.underlay; + + info!( + self.inner.log, + "Clearing multicast overlay-to-underlay mapping"; + "group" => %req.group, + "underlay" => %addr, + ); + + let underlay = MulticastUnderlay::new(addr.into()) + .map_err(|_| Error::InvalidMcastUnderlay(addr))?; + let hdl = Handle::new()?; + hdl.clear_m2p(&ClearMcast2PhysReq { + group: req.group.into(), + underlay, + })?; + Ok(()) + } + + /// Set multicast forwarding next hops for an underlay group address. + pub fn set_mcast_fwd( + &self, + req: &McastForwardingEntry, + ) -> Result<(), Error> { + // Safe to unwrap: 77 is well within the 24-bit VNI range. 
+ let mcast_vni = + Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI).unwrap(); + let addr: Ipv6Addr = req.underlay; + + info!( + self.inner.log, + "Setting multicast forwarding"; + "underlay" => %addr, + "next_hops" => req.next_hops.len(), + ); + + let underlay = MulticastUnderlay::new(addr.into()) + .map_err(|_| Error::InvalidMcastUnderlay(addr))?; + let next_hops = req + .next_hops + .iter() + .map(|nexthop| oxide_vpc::api::McastForwardingNextHop { + next_hop: oxide_vpc::api::NextHopV6 { + addr: nexthop.next_hop.into(), + vni: mcast_vni, + }, + replication: match nexthop.replication { + McastReplication::External => { + oxide_vpc::api::Replication::External + } + McastReplication::Underlay => { + oxide_vpc::api::Replication::Underlay + } + McastReplication::Both => oxide_vpc::api::Replication::Both, + }, + source_filter: match nexthop.filter.mode { + McastFilterMode::Include => SourceFilter::Include( + nexthop + .filter + .sources + .iter() + .copied() + .map(Into::into) + .collect(), + ), + McastFilterMode::Exclude => SourceFilter::Exclude( + nexthop + .filter + .sources + .iter() + .copied() + .map(Into::into) + .collect(), + ), + }, + }) + .collect(); + let hdl = Handle::new()?; + hdl.set_mcast_fwd(&SetMcastForwardingReq { underlay, next_hops })?; + Ok(()) + } + + /// Remove all multicast forwarding entries for an underlay group address. + pub fn clear_mcast_fwd( + &self, + req: &ClearMcastForwarding, + ) -> Result<(), Error> { + let addr: Ipv6Addr = req.underlay; + + info!( + self.inner.log, + "Clearing multicast forwarding"; + "underlay" => %addr, + ); + + let underlay = MulticastUnderlay::new(addr.into()) + .map_err(|_| Error::InvalidMcastUnderlay(addr))?; + let hdl = Handle::new()?; + hdl.clear_mcast_fwd(&ClearMcastForwardingReq { underlay })?; + Ok(()) + } + + /// Dump all multicast overlay-to-underlay (M2P) mappings from OPTE. 
+ pub fn list_mcast_m2p(&self) -> Result, Error> { + let hdl = Handle::new()?; + let resp = hdl.dump_m2p()?; + let mappings = resp + .ip4 + .into_iter() + .map(|(group, underlay)| Mcast2PhysMapping { + group: IpAddr::V4(group.into()), + underlay: Ipv6Addr::from(underlay.addr()), + }) + .chain(resp.ip6.into_iter().map(|(group, underlay)| { + Mcast2PhysMapping { + group: IpAddr::V6(group.into()), + underlay: Ipv6Addr::from(underlay.addr()), + } + })) + .collect(); + Ok(mappings) + } + + /// Dump all multicast forwarding entries from OPTE. + pub fn list_mcast_fwd(&self) -> Result, Error> { + let hdl = Handle::new()?; + let resp = hdl.dump_mcast_fwd()?; + resp.entries + .into_iter() + .map(|entry| { + let next_hops = entry + .next_hops + .into_iter() + .filter_map(|nexthop| { + let replication = match nexthop.replication { + oxide_vpc::api::Replication::External => { + McastReplication::External + } + oxide_vpc::api::Replication::Underlay => { + McastReplication::Underlay + } + oxide_vpc::api::Replication::Both => { + McastReplication::Both + } + oxide_vpc::api::Replication::Reserved => { + // Reserved is a 2-bit padding value with + // no valid semantic meaning. Its presence + // in the forwarding table indicates a bug + // or manual opteadm intervention. Skip + // this hop rather than failing the entire + // list so the reconciler can still program + // valid next-hops. 
+ warn!( + self.inner.log, + "skipping next hop with Reserved \ + replication mode"; + "next_hop" => %nexthop.next_hop.addr + ); + return None; + } + }; + + Some(McastForwardingNextHop { + next_hop: nexthop.next_hop.addr.into(), + replication, + filter: McastSourceFilter { + mode: match nexthop.source_filter.mode() { + FilterMode::Include => { + McastFilterMode::Include + } + FilterMode::Exclude => { + McastFilterMode::Exclude + } + }, + sources: nexthop + .source_filter + .sources() + .iter() + .copied() + .map(Into::into) + .collect(), + }, + }) + }) + .collect(); + + Ok(McastForwardingEntry { + underlay: Ipv6Addr::from(entry.underlay.addr()), + next_hops, + }) + }) + .collect() + } + pub fn attached_subnets_ensure( &self, nic_id: Uuid, @@ -959,7 +1301,7 @@ impl PortManager { ensure_added: Vec, ) -> EnsureAttachedSubnetResult { let ports = self.inner.ports.lock().unwrap(); - let Some(port) = ports.get(&(nic_id, nic_kind)) else { + let Some(port_state) = ports.get(&(nic_id, nic_kind)) else { return EnsureAttachedSubnetResult { diff: Default::default(), error: Some(Error::AttachedSubnetUpdateMissingPort( @@ -967,7 +1309,11 @@ impl PortManager { )), }; }; - self.attached_subnets_ensure_port(port, ensure_removed, ensure_added) + self.attached_subnets_ensure_port( + &port_state.port, + ensure_removed, + ensure_added, + ) } fn attached_subnets_ensure_port( @@ -1019,10 +1365,10 @@ impl PortManager { subnet: AttachedSubnet, ) -> Result<(), Error> { let ports = self.inner.ports.lock().unwrap(); - let port = ports.get(&(nic_id, nic_kind)).ok_or_else(|| { + let port_state = ports.get(&(nic_id, nic_kind)).ok_or_else(|| { Error::AttachedSubnetUpdateMissingPort(nic_id, nic_kind) })?; - self.attach_subnet_port(port, subnet) + self.attach_subnet_port(&port_state.port, subnet) } fn attach_subnet_port( @@ -1068,10 +1414,10 @@ impl PortManager { subnet: IpCidr, ) -> Result<(), Error> { let ports = self.inner.ports.lock().unwrap(); - let port = ports.get(&(nic_id, 
nic_kind)).ok_or_else(|| { + let port_state = ports.get(&(nic_id, nic_kind)).ok_or_else(|| { Error::AttachedSubnetUpdateMissingPort(nic_id, nic_kind) })?; - self.detach_subnet_port(port, subnet) + self.detach_subnet_port(&port_state.port, subnet) } fn detach_subnet_port( @@ -1144,7 +1490,7 @@ impl PortTicket { fn release_inner(&mut self) -> Result<(), Error> { let mut ports = self.manager.ports.lock().unwrap(); - let Some(port) = ports.remove(&(self.id, self.kind)) else { + let Some(port_state) = ports.remove(&(self.id, self.kind)) else { error!( self.manager.log, "Tried to release non-existent port"; @@ -1175,6 +1521,7 @@ impl PortTicket { ); } }; + let port = &port_state.port; let mut routes = self.manager.routes.lock().unwrap(); remove_key(&mut routes, port.system_router_key()); if let Some(key) = port.custom_ipv4_router_key() { @@ -1184,12 +1531,13 @@ impl PortTicket { remove_key(&mut routes, key); } drop(routes); + debug!( self.manager.log, "Removed OPTE port from manager"; "id" => ?&self.id, "kind" => ?&self.kind, - "port" => ?&port, + "port" => ?&port_state, ); Ok(()) } @@ -1219,6 +1567,7 @@ impl Drop for PortTicket { mod tests { use super::PortCreateParams; use super::PortManager; + use crate::opte::Error; use crate::opte::Handle; use macaddr::MacAddr6; use omicron_common::api::external::{MacAddr, Vni}; @@ -1239,17 +1588,24 @@ mod tests { use omicron_common::api::internal::shared::SourceNatConfigV6; use omicron_test_utils::dev::test_setup_log; use oxide_vpc::api::DhcpCfg; + use oxide_vpc::api::FilterMode; use oxide_vpc::api::IpCfg; use oxide_vpc::api::Ipv4Cidr; use oxide_vpc::api::Ipv6Cidr; + use oxide_vpc::api::SourceFilter; use oxnet::IpNet; use oxnet::Ipv4Net; use oxnet::Ipv6Net; + use sled_agent_types::multicast::MulticastGroupCfg; use std::collections::HashSet; + use std::net::IpAddr; use std::net::Ipv4Addr; use std::net::Ipv6Addr; use uuid::Uuid; + // Maximum ephemeral port number for source NAT (14-bit range). 
+ const MAX_PORT: u16 = (1 << 14) - 1; + // Regression for https://github.com/oxidecomputer/omicron/issues/7541. #[test] fn multiple_ports_does_not_destroy_default_route() { @@ -1271,7 +1627,7 @@ mod tests { const SERVICES_VPC_VNI: Vni = Vni::SERVICES_VNI; let handle = Handle::new().unwrap(); - handle.set_xde_underlay("foo0", "foo1").unwrap(); + handle.set_xde_underlay("underlay0", "underlay1").unwrap(); // First, create a port for a service. // @@ -1325,6 +1681,7 @@ mod tests { dns6_servers: Vec::new(), }, attached_subnets: vec![], + multicast_groups: &[], }) .unwrap(); @@ -1504,6 +1861,7 @@ mod tests { dns6_servers: Vec::new(), }, attached_subnets: vec![], + multicast_groups: &[], }) .unwrap(); @@ -1675,6 +2033,7 @@ mod tests { dns6_servers: vec![], }, attached_subnets: vec![], + multicast_groups: &[], }; let IpCfg::Ipv4(oxide_vpc::api::Ipv4Cfg { vpc_subnet, @@ -1748,6 +2107,7 @@ mod tests { dns6_servers: vec![], }, attached_subnets: vec![], + multicast_groups: &[], }; let IpCfg::Ipv6(oxide_vpc::api::Ipv6Cfg { vpc_subnet, @@ -1755,12 +2115,14 @@ mod tests { gateway_ip, external_ips: oxide_vpc::api::ExternalIpCfg { snat, ephemeral_ip, floating_ips }, - attached_subnets: _, - transit_ips: _, + attached_subnets, + transit_ips, }) = IpCfg::try_from(&prs).unwrap() else { - panic!("Expected IPv4 config") + panic!("Expected IPv6 config") }; + assert!(attached_subnets.is_empty()); + assert!(transit_ips.is_empty()); assert_eq!(private_ip, priv_ip.into()); assert_eq!( @@ -1832,6 +2194,7 @@ mod tests { dns6_servers: vec![], }, attached_subnets: vec![], + multicast_groups: &[], }; let IpCfg::DualStack { ipv4, ipv6 } = IpCfg::try_from(&prs).unwrap() else { @@ -1922,6 +2285,7 @@ mod tests { dns6_servers: vec![], }, attached_subnets: vec![], + multicast_groups: &[], }; let _ = IpCfg::try_from(&prs).expect_err( "Should fail to convert with public IPv6 and private IPv4", @@ -1968,9 +2332,274 @@ mod tests { dns6_servers: vec![], }, attached_subnets: vec![], + multicast_groups: 
&[], }; let _ = IpCfg::try_from(&prs).expect_err( "Should fail to convert with public IPv4 and private IPv6", ); } + + #[test] + fn multicast_groups_ensure_diffing() { + let logctx = test_setup_log("multicast_groups_ensure_diffing"); + let manager = PortManager::new(logctx.log.clone(), Ipv6Addr::LOCALHOST); + + let handle = Handle::new().unwrap(); + handle.set_xde_underlay("underlay0", "underlay1").unwrap(); + + let nic_id = Uuid::new_v4(); + let nic_kind = NetworkInterfaceKind::Service { id: Uuid::new_v4() }; + + let private_subnet = + Ipv4Net::new(Ipv4Addr::new(172, 20, 0, 0), 24).unwrap(); + let private_ip = Ipv4Addr::new(172, 20, 0, 4); + let ip_config = + PrivateIpConfig::new_ipv4(private_ip, private_subnet).unwrap(); + let public_ip = Ipv4Addr::new(10, 0, 0, 4); + + let external_ips = ExternalIpConfig { + v4: Some(ExternalIpv4Config { + source_nat: Some( + SourceNatConfigV4::new(public_ip, 0, MAX_PORT).unwrap(), + ), + ..Default::default() + }), + v6: None, + }; + + // Bindings keep the port registered in the manager for this scope. + let (_port, _ticket) = manager + .create_port(PortCreateParams { + nic: &NetworkInterface { + id: nic_id, + kind: nic_kind, + name: "opte0".parse().unwrap(), + ip_config, + mac: MacAddr(MacAddr6::new( + 0xa8, 0x40, 0x25, 0x00, 0x00, 0x01, + )), + vni: Vni::SERVICES_VNI, + primary: true, + slot: 0, + }, + external_ips: &external_ips, + firewall_rules: &[], + dhcp_config: DhcpCfg { + hostname: None, + host_domain: None, + domain_search_list: Vec::new(), + dns4_servers: Vec::new(), + dns6_servers: Vec::new(), + }, + attached_subnets: vec![], + multicast_groups: &[], + }) + .unwrap(); + + let group1: IpAddr = "239.1.1.1".parse().unwrap(); + let group2: IpAddr = "239.1.1.2".parse().unwrap(); + let source_a: IpAddr = "10.0.0.1".parse().unwrap(); + + // Subscribe to two groups: one ASM, one SSM. 
+ manager + .multicast_groups_ensure( + nic_id, + nic_kind, + &[ + MulticastGroupCfg { group_ip: group1, sources: vec![] }, + MulticastGroupCfg { + group_ip: group2, + sources: vec![source_a], + }, + ], + ) + .unwrap(); + + // Verify port manager tracking. + { + let ports = manager.inner.ports.lock().unwrap(); + let port_state = ports.get(&(nic_id, nic_kind)).unwrap(); + assert_eq!(port_state.mcast_subscriptions.len(), 2); + assert_eq!( + *port_state.mcast_subscriptions.get(&group1).unwrap(), + SourceFilter::default(), + ); + assert_eq!( + port_state.mcast_subscriptions.get(&group2).unwrap().mode(), + FilterMode::Include, + ); + } + + // Verify mock OPTE state matches. + { + let opte = handle.state().lock().unwrap(); + let port = opte.ports.get("opte0").unwrap(); + assert_eq!(port.mcast_subscriptions.len(), 2); + assert!(port.mcast_subscriptions.contains_key(&group1)); + assert!(port.mcast_subscriptions.contains_key(&group2)); + } + + // Remove group2, keep group1. + manager + .multicast_groups_ensure( + nic_id, + nic_kind, + &[MulticastGroupCfg { group_ip: group1, sources: vec![] }], + ) + .unwrap(); + + { + let ports = manager.inner.ports.lock().unwrap(); + let port_state = ports.get(&(nic_id, nic_kind)).unwrap(); + assert_eq!(port_state.mcast_subscriptions.len(), 1); + assert!(port_state.mcast_subscriptions.contains_key(&group1)); + assert!(!port_state.mcast_subscriptions.contains_key(&group2)); + } + + { + let opte = handle.state().lock().unwrap(); + let port = opte.ports.get("opte0").unwrap(); + assert_eq!(port.mcast_subscriptions.len(), 1); + assert!(!port.mcast_subscriptions.contains_key(&group2)); + } + + // Remove all groups. 
+ manager.multicast_groups_ensure(nic_id, nic_kind, &[]).unwrap(); + + { + let ports = manager.inner.ports.lock().unwrap(); + let port_state = ports.get(&(nic_id, nic_kind)).unwrap(); + assert!(port_state.mcast_subscriptions.is_empty()); + } + + { + let opte = handle.state().lock().unwrap(); + let port = opte.ports.get("opte0").unwrap(); + assert!(port.mcast_subscriptions.is_empty()); + } + + logctx.cleanup_successful(); + } + + #[test] + fn multicast_port_deletion_cleanup() { + let logctx = test_setup_log("multicast_port_deletion_cleanup"); + let manager = PortManager::new(logctx.log.clone(), Ipv6Addr::LOCALHOST); + + let handle = Handle::new().unwrap(); + handle.set_xde_underlay("underlay0", "underlay1").unwrap(); + + let nic_id = Uuid::new_v4(); + let nic_kind = NetworkInterfaceKind::Service { id: Uuid::new_v4() }; + + let private_subnet = + Ipv4Net::new(Ipv4Addr::new(172, 20, 0, 0), 24).unwrap(); + let private_ip = Ipv4Addr::new(172, 20, 0, 4); + let ip_config = + PrivateIpConfig::new_ipv4(private_ip, private_subnet).unwrap(); + let public_ip = Ipv4Addr::new(10, 0, 0, 4); + + let external_ips = ExternalIpConfig { + v4: Some(ExternalIpv4Config { + source_nat: Some( + SourceNatConfigV4::new(public_ip, 0, MAX_PORT).unwrap(), + ), + ..Default::default() + }), + v6: None, + }; + + let (_port, ticket) = manager + .create_port(PortCreateParams { + nic: &NetworkInterface { + id: nic_id, + kind: nic_kind, + name: "opte0".parse().unwrap(), + ip_config, + mac: MacAddr(MacAddr6::new( + 0xa8, 0x40, 0x25, 0x00, 0x00, 0x01, + )), + vni: Vni::SERVICES_VNI, + primary: true, + slot: 0, + }, + external_ips: &external_ips, + firewall_rules: &[], + dhcp_config: DhcpCfg { + hostname: None, + host_domain: None, + domain_search_list: Vec::new(), + dns4_servers: Vec::new(), + dns6_servers: Vec::new(), + }, + attached_subnets: vec![], + multicast_groups: &[], + }) + .unwrap(); + + let group1: IpAddr = "239.2.2.1".parse().unwrap(); + + // Subscribe to a multicast group. 
+ manager + .multicast_groups_ensure( + nic_id, + nic_kind, + &[MulticastGroupCfg { group_ip: group1, sources: vec![] }], + ) + .unwrap(); + + // Verify subscription tracking exists. + { + let ports = manager.inner.ports.lock().unwrap(); + let port_state = ports.get(&(nic_id, nic_kind)).unwrap(); + assert_eq!( + port_state.mcast_subscriptions.len(), + 1, + "subscription tracking should exist before release" + ); + } + + // Release the port ticket, which should clean up the port + // and its subscription tracking. + ticket.release(); + + // Verify port is removed entirely. + { + let ports = manager.inner.ports.lock().unwrap(); + assert!( + !ports.contains_key(&(nic_id, nic_kind)), + "port should be removed after release" + ); + } + + logctx.cleanup_successful(); + } + + #[test] + fn multicast_ensure_missing_port_error() { + let logctx = test_setup_log("multicast_ensure_missing_port_error"); + let manager = PortManager::new(logctx.log.clone(), Ipv6Addr::LOCALHOST); + + let nic_id = Uuid::new_v4(); + let nic_kind = NetworkInterfaceKind::Instance { id: Uuid::new_v4() }; + let group: IpAddr = "239.3.3.1".parse().unwrap(); + + let res = manager.multicast_groups_ensure( + nic_id, + nic_kind, + &[MulticastGroupCfg { group_ip: group, sources: vec![] }], + ); + + match res { + Err(Error::MulticastUpdateMissingPort(id, kind)) => { + assert_eq!(id, nic_id); + assert_eq!(kind, nic_kind); + } + other => { + panic!("expected MulticastUpdateMissingPort, got {other:?}") + } + } + + logctx.cleanup_successful(); + } } diff --git a/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs b/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs index aa513da8d4e..f259a69d5f4 100644 --- a/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs +++ b/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs @@ -225,7 +225,7 @@ mod api_impl { use sled_agent_types::firewall_rules::VpcFirewallRulesEnsureBody; use sled_agent_types::instance::InstanceEnsureBody; use 
sled_agent_types::instance::InstanceExternalIpBody; - use sled_agent_types::instance::InstanceMulticastBody; + use sled_agent_types::instance::InstanceMulticastMembership; use sled_agent_types::instance::VmmIssueDiskSnapshotRequestBody; use sled_agent_types::instance::VmmIssueDiskSnapshotRequestPathParam; use sled_agent_types::instance::VmmIssueDiskSnapshotRequestResponse; @@ -249,6 +249,10 @@ mod api_impl { use sled_agent_types::inventory::SledCpuFamily; use sled_agent_types::inventory::SledRole; use sled_agent_types::inventory::SvcsEnabledNotOnlineResult; + use sled_agent_types::multicast::{ + ClearMcast2Phys, ClearMcastForwarding, Mcast2PhysMapping, + McastForwardingEntry, + }; use sled_agent_types::probes::ProbeSet; use sled_agent_types::sled::AddSledRequest; use sled_agent_types::support_bundle::RangeRequestHeaders; @@ -627,45 +631,17 @@ mod api_impl { async fn vmm_join_multicast_group( _rqctx: RequestContext, _path_params: Path, - body: TypedBody, + _body: TypedBody, ) -> Result { - let body_args = body.into_inner(); - match body_args { - InstanceMulticastBody::Join(_) => { - // MGS test utility - just return success for test compatibility - Ok(HttpResponseUpdatedNoContent()) - } - InstanceMulticastBody::Leave(_) => { - // This endpoint is for joining - reject leave operations - Err(HttpError::for_bad_request( - None, - "Join endpoint cannot process Leave operations" - .to_string(), - )) - } - } + unimplemented!() } async fn vmm_leave_multicast_group( _rqctx: RequestContext, _path_params: Path, - body: TypedBody, + _body: TypedBody, ) -> Result { - let body_args = body.into_inner(); - match body_args { - InstanceMulticastBody::Leave(_) => { - // MGS test utility - just return success for test compatibility - Ok(HttpResponseUpdatedNoContent()) - } - InstanceMulticastBody::Join(_) => { - // This endpoint is for leaving - reject join operations - Err(HttpError::for_bad_request( - None, - "Leave endpoint cannot process Join operations" - .to_string(), - )) - } - 
} + unimplemented!() } async fn disk_put( @@ -756,6 +732,47 @@ mod api_impl { unimplemented!() } + async fn set_mcast_m2p( + _rqctx: RequestContext, + _body: TypedBody, + ) -> Result { + unimplemented!() + } + + async fn clear_mcast_m2p( + _rqctx: RequestContext, + _body: TypedBody, + ) -> Result { + unimplemented!() + } + + async fn set_mcast_fwd( + _rqctx: RequestContext, + _body: TypedBody, + ) -> Result { + unimplemented!() + } + + async fn clear_mcast_fwd( + _rqctx: RequestContext, + _body: TypedBody, + ) -> Result { + unimplemented!() + } + + async fn list_mcast_m2p( + _rqctx: RequestContext, + ) -> Result>, HttpError> { + unimplemented!() + } + + async fn list_mcast_fwd( + _rqctx: RequestContext, + ) -> Result>, HttpError> + { + unimplemented!() + } + async fn uplink_ensure( _rqctx: RequestContext, _body: TypedBody, diff --git a/nexus/src/app/background/tasks/multicast/groups.rs b/nexus/src/app/background/tasks/multicast/groups.rs index db2c51938a5..b05fd90697f 100644 --- a/nexus/src/app/background/tasks/multicast/groups.rs +++ b/nexus/src/app/background/tasks/multicast/groups.rs @@ -20,6 +20,9 @@ //! - **"Creating" state**: Initiate DPD "ensure" to apply configuration //! - **"Active" state**: Detect DPD drift and sync directly //! - **"Deleting" state**: Switch cleanup and database removal +//! - **M2P/forwarding propagation**: Convergent per-sled propagation of +//! M2P mappings and forwarding entries via sled-agent after member +//! state changes //! - **Extensible processing**: Support for different group types //! //! 
# Group State Transition Matrix @@ -93,6 +96,7 @@ use super::{ use crate::app::multicast::dataplane::{ GroupUpdateParams, MulticastDataplaneClient, }; +use crate::app::multicast::sled::MulticastSledClient; use crate::app::saga::create_saga_dag; use crate::app::sagas; @@ -100,7 +104,7 @@ use crate::app::sagas; /// /// This grace period avoids racing with in-progress member attachment operations /// that occur immediately after group creation. -const ORPHAN_GROUP_MIN_AGE: chrono::Duration = chrono::Duration::seconds(10); +const ORPHAN_GROUP_MIN_AGE: chrono::TimeDelta = chrono::TimeDelta::seconds(10); /// Check if DPD tag matches the database group's tag. /// @@ -130,35 +134,48 @@ fn dpd_state_matches_sources( let dpd_sources = dpd_group.sources.clone(); let group_ip = group.multicast_ip.ip(); - // Expected DPD state based on source filter logic (RFC 4607) - let expected_sources = if is_ssm_address(group_ip) { - Some(&source_filter.specific_sources) + if is_ssm_address(group_ip) { + // SSM: always expect specific sources + match dpd_sources { + None => false, + Some(dpd_srcs) => { + let mut dpd_ips: Vec<_> = dpd_srcs + .into_iter() + .filter_map(|src| match src { + dpd_client::types::IpSrc::Exact(ip) => Some(ip), + _ => None, + }) + .collect(); + dpd_ips.sort(); + + let mut expected: Vec<_> = + source_filter.specific_sources.iter().copied().collect(); + expected.sort(); + + dpd_ips == expected + } + } } else if source_filter.has_any_source_member { - None + dpd_sources.is_none() } else { - Some(&source_filter.specific_sources) - }; - - match (dpd_sources, expected_sources) { - (None, None) => true, - (Some(_), None) => false, // DPD has sources but shouldn't - (None, Some(_)) => false, // DPD missing sources - (Some(dpd_srcs), Some(expected)) => { - // Extract exact IPs from DPD sources - let mut dpd_ips: Vec<_> = dpd_srcs - .into_iter() - .filter_map(|src| match src { - dpd_client::types::IpSrc::Exact(ip) => Some(ip), - _ => None, - }) - .collect(); - dpd_ips.sort(); 
- - let mut expected_sorted: Vec<_> = - expected.iter().copied().collect(); - expected_sorted.sort(); - - dpd_ips == expected_sorted + match dpd_sources { + None => source_filter.specific_sources.is_empty(), + Some(dpd_srcs) => { + let mut dpd_ips: Vec<_> = dpd_srcs + .into_iter() + .filter_map(|src| match src { + dpd_client::types::IpSrc::Exact(ip) => Some(ip), + _ => None, + }) + .collect(); + dpd_ips.sort(); + + let mut expected: Vec<_> = + source_filter.specific_sources.iter().copied().collect(); + expected.sort(); + + dpd_ips == expected + } } } } @@ -180,6 +197,7 @@ trait GroupStateProcessor { opctx: &OpContext, group: &MulticastGroup, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result; /// Process a group in "Active" state (check DPD sync status). @@ -189,6 +207,7 @@ trait GroupStateProcessor { opctx: &OpContext, group: &MulticastGroup, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result; } @@ -213,9 +232,15 @@ impl GroupStateProcessor for ExternalGroupProcessor { opctx: &OpContext, group: &MulticastGroup, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result { reconciler - .handle_deleting_external_group(opctx, group, dataplane_client) + .handle_deleting_external_group( + opctx, + group, + dataplane_client, + sled_client, + ) .await } @@ -226,9 +251,15 @@ impl GroupStateProcessor for ExternalGroupProcessor { opctx: &OpContext, group: &MulticastGroup, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result { reconciler - .handle_active_external_group(opctx, group, dataplane_client) + .handle_active_external_group( + opctx, + group, + dataplane_client, + sled_client, + ) .await } } @@ -336,6 +367,7 @@ impl MulticastGroupReconciler { opctx: &OpContext, state: MulticastGroupState, dataplane_client: Option<&MulticastDataplaneClient>, + sled_client: Option<&MulticastSledClient>, ) -> Result { 
trace!(opctx.log, "searching for multicast groups"; "state" => %state); @@ -363,7 +395,12 @@ impl MulticastGroupReconciler { let results = stream::iter(groups) .map(|group| async move { let result = self - .process_group_state(opctx, &group, dataplane_client) + .process_group_state( + opctx, + &group, + dataplane_client, + sled_client, + ) .await; (group, result) }) @@ -404,7 +441,7 @@ impl MulticastGroupReconciler { processed += 1; } - debug!( + trace!( opctx.log, "processed multicast group"; "state" => %state, @@ -446,6 +483,7 @@ impl MulticastGroupReconciler { opctx, MulticastGroupState::Creating, None, + None, ) .await } @@ -455,11 +493,13 @@ impl MulticastGroupReconciler { &self, opctx: &OpContext, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result { self.reconcile_groups_by_state( opctx, MulticastGroupState::Deleting, Some(dataplane_client), + Some(sled_client), ) .await } @@ -469,11 +509,13 @@ impl MulticastGroupReconciler { &self, opctx: &OpContext, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result { self.reconcile_groups_by_state( opctx, MulticastGroupState::Active, Some(dataplane_client), + Some(sled_client), ) .await } @@ -485,6 +527,7 @@ impl MulticastGroupReconciler { opctx: &OpContext, group: &MulticastGroup, dataplane_client: Option<&MulticastDataplaneClient>, + sled_client: Option<&MulticastSledClient>, ) -> Result { // Future: Match on group type to select different processors if // we add more nuanced group types @@ -497,15 +540,31 @@ impl MulticastGroupReconciler { MulticastGroupState::Deleting => { let dataplane_client = dataplane_client .context("dataplane client required for deleting state")?; + let sled_client = sled_client + .context("sled client required for deleting state")?; processor - .process_deleting(self, opctx, group, dataplane_client) + .process_deleting( + self, + opctx, + group, + dataplane_client, + sled_client, + ) .await } 
MulticastGroupState::Active => { let dataplane_client = dataplane_client .context("dataplane client required for active state")?; + let sled_client = sled_client + .context("sled client required for active state")?; processor - .process_active(self, opctx, group, dataplane_client) + .process_active( + self, + opctx, + group, + dataplane_client, + sled_client, + ) .await } MulticastGroupState::Deleted => { @@ -623,6 +682,7 @@ impl MulticastGroupReconciler { opctx: &OpContext, group: &MulticastGroup, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result { debug!( opctx.log, @@ -635,8 +695,13 @@ impl MulticastGroupReconciler { "dpd_cleanup_required" => true ); - self.process_deleting_group_inner(opctx, group, dataplane_client) - .await?; + self.process_deleting_group_inner( + opctx, + group, + dataplane_client, + sled_client, + ) + .await?; Ok(StateTransition::StateChanged) } @@ -649,6 +714,7 @@ impl MulticastGroupReconciler { opctx: &OpContext, group: &MulticastGroup, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result { let underlay_group_id = group .underlay_group_id @@ -747,6 +813,22 @@ impl MulticastGroupReconciler { "group_id" => %group.id(), "multicast_ip" => %group.multicast_ip ); + + // Propagate M2P/forwarding to member sleds after DPD + // sync to ensure OPTE state is also consistent. + if let Err(e) = sled_client + .propagate_m2p_and_forwarding(opctx, group) + .await + { + warn!( + opctx.log, + "failed to propagate M2P/forwarding after \ + drift correction (will retry)"; + "group_id" => %group.id(), + "error" => %e + ); + } + Ok(StateTransition::StateChanged) } Err(e) => { @@ -761,6 +843,19 @@ impl MulticastGroupReconciler { } } } else { + // Even when DPD is in sync, propagate M2P/forwarding to + // member sleds to correct any sled-level drift. 
+ if let Err(e) = + sled_client.propagate_m2p_and_forwarding(opctx, group).await + { + warn!( + opctx.log, + "failed to propagate M2P/forwarding (will retry)"; + "group_id" => %group.id(), + "error" => %e + ); + } + Ok(StateTransition::NoChange) } } @@ -772,7 +867,7 @@ impl MulticastGroupReconciler { opctx: &OpContext, group: &MulticastGroup, ) -> Result { - debug!( + trace!( opctx.log, "processing creating multicast group"; "group" => ?group @@ -789,7 +884,7 @@ impl MulticastGroupReconciler { format!("failed to fetch linked underlay group {underlay_id}") })?; - debug!( + trace!( opctx.log, "found linked underlay group"; "group" => ?group, @@ -798,7 +893,7 @@ impl MulticastGroupReconciler { underlay } None => { - debug!( + trace!( opctx.log, "creating new underlay group"; "group" => ?group @@ -860,6 +955,7 @@ impl MulticastGroupReconciler { opctx: &OpContext, group: &MulticastGroup, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result<(), anyhow::Error> { let tag = Self::get_multicast_tag(group) .context("multicast group missing tag")?; @@ -875,6 +971,15 @@ impl MulticastGroupReconciler { "cleanup_includes" => "[external_group, underlay_group, forwarding_rules, member_ports]" ); + // Clear M2P/forwarding from all sleds before DPD cleanup. + // This must succeed before deleting DB records, otherwise + // stale OPTE state would persist on failed sleds with no + // source of truth to drive a later cleanup pass. + sled_client + .clear_m2p_and_forwarding(opctx, group) + .await + .context("failed to clear M2P/forwarding from sleds")?; + // Use dataplane client from reconciliation pass to cleanup switch(es) // state by tag dataplane_client @@ -1034,9 +1139,8 @@ mod tests { } #[test] - fn test_dpd_state_matches_sources_asm_address() { - // ASM address with all members specifying sources: expect those - // sources in DPD. 
+ fn test_dpd_state_matches_sources_asm_with_specific_sources() { + // ASM address with specific sources only (no any-source members) let source_filter = SourceFilterState { specific_sources: BTreeSet::from(["10.0.0.1" .parse::() @@ -1044,23 +1148,29 @@ mod tests { has_any_source_member: false, }; - let group = create_group("224.1.1.1"); // ASM address (not 232.x.x.x) + let group = create_group("224.1.1.1"); // ASM address - // DPD has matching sources (correct) + // DPD has matching specific sources let dpd_group = create_dpd_group(Some(vec![dpd_client::types::IpSrc::Exact( "10.0.0.1".parse().unwrap(), )])); assert!(dpd_state_matches_sources(&dpd_group, &source_filter, &group)); - // DPD has None (mismatch: ASM with all-specific should have sources) + // DPD has None (mismatch: should have specific sources) let dpd_group = create_dpd_group(None); assert!(!dpd_state_matches_sources(&dpd_group, &source_filter, &group)); + + // DPD has IpSrc::Any (mismatch: should have specific sources) + let dpd_group = + create_dpd_group(Some(vec![dpd_client::types::IpSrc::Any])); + assert!(!dpd_state_matches_sources(&dpd_group, &source_filter, &group)); } #[test] fn test_dpd_state_matches_sources_asm_with_any_source_member() { - // ASM address with has_any_source_member=true - expects None from DPD + // ASM address with has_any_source_member=true: we send None to DPD, + // and DPD canonicalizes any-source representations to None. 
let source_filter = SourceFilterState { specific_sources: BTreeSet::new(), has_any_source_member: true, @@ -1068,11 +1178,33 @@ mod tests { let group = create_group("224.1.1.1"); // ASM address - // DPD has None (correct for ASM with any-source members) + // DPD has None (correct: any-source canonicalizes to None) + let dpd_group = create_dpd_group(None); + assert!(dpd_state_matches_sources(&dpd_group, &source_filter, &group)); + + // DPD has specific sources (mismatch) + let dpd_group = + create_dpd_group(Some(vec![dpd_client::types::IpSrc::Exact( + "10.0.0.1".parse().unwrap(), + )])); + assert!(!dpd_state_matches_sources(&dpd_group, &source_filter, &group)); + } + + #[test] + fn test_dpd_state_matches_sources_asm_no_sources() { + // ASM with no source filters at all expects None + let source_filter = SourceFilterState { + specific_sources: BTreeSet::new(), + has_any_source_member: false, + }; + + let group = create_group("224.1.1.1"); // ASM address + + // DPD has None (correct: no sources configured) let dpd_group = create_dpd_group(None); assert!(dpd_state_matches_sources(&dpd_group, &source_filter, &group)); - // DPD has sources (mismatch: should be none) + // DPD has sources (mismatch) let dpd_group = create_dpd_group(Some(vec![dpd_client::types::IpSrc::Exact( "10.0.0.1".parse().unwrap(), diff --git a/nexus/src/app/background/tasks/multicast/members.rs b/nexus/src/app/background/tasks/multicast/members.rs index 1b7f81c6ab3..14175619611 100644 --- a/nexus/src/app/background/tasks/multicast/members.rs +++ b/nexus/src/app/background/tasks/multicast/members.rs @@ -42,6 +42,11 @@ //! - **State transitions**: "Joining" → "Joined" → "Left" with reactivation //! - **Dataplane updates**: Applying and removing configuration via DPD //! client(s) on switches +//! - **M2P/forwarding propagation**: After join, leave, or migration, M2P +//! mappings and forwarding entries are propagated to all sleds via +//! 
sled-agent inline (not deferred to the next reconciliation pass) +//! - **OPTE subscriptions**: Per-VMM multicast group filters managed via +//! sled-agent on the hosting sled //! - **Sled migration**: Detecting moves and updating dataplane configuration //! (no transition to "Left") //! - **Cleanup**: Removing orphaned switch state for deleted members @@ -124,10 +129,31 @@ use omicron_uuid_kinds::{ use super::{MulticastGroupReconciler, StateTransition, SwitchBackplanePort}; use crate::app::multicast::dataplane::MulticastDataplaneClient; +use crate::app::multicast::sled::MulticastSledClient; + +/// Pre-fetched instance state for multicast reconciliation. +#[derive(Clone, Copy, Debug, Default)] +struct InstanceMulticastState { + /// Whether the instance is in a state that can receive multicast traffic. + valid: bool, + /// Current sled hosting the VMM, if any. + sled_id: Option, + /// Current propolis VMM identifier, if any. + propolis_id: Option, +} + +/// Context shared across member reconciliation operations. +struct MemberReconcileCtx<'a> { + opctx: &'a OpContext, + group: &'a MulticastGroup, + member: &'a MulticastGroupMember, + instance_states: &'a InstanceStateMap, + dataplane_client: &'a MulticastDataplaneClient, + sled_client: &'a MulticastSledClient, +} -/// Pre-fetched instance state data for batch processing. -/// Maps instance_id -> (is_valid_for_multicast, current_sled_id). -type InstanceStateMap = HashMap)>; +/// Maps instance_id to pre-fetched multicast-relevant state. +type InstanceStateMap = HashMap; /// Backplane port mapping from DPD-client. /// Maps switch port ID to backplane link configuration. 
@@ -168,33 +194,21 @@ trait MemberStateProcessor { async fn process_joining( &self, reconciler: &MulticastGroupReconciler, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_states: &InstanceStateMap, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result; /// Process a member in "Joined" state. async fn process_joined( &self, reconciler: &MulticastGroupReconciler, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_states: &InstanceStateMap, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result; /// Process a member in "Left" state. async fn process_left( &self, reconciler: &MulticastGroupReconciler, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_states: &InstanceStateMap, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result; } @@ -205,61 +219,25 @@ impl MemberStateProcessor for InstanceMemberProcessor { async fn process_joining( &self, reconciler: &MulticastGroupReconciler, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_states: &InstanceStateMap, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result { - reconciler - .handle_instance_joining( - opctx, - group, - member, - instance_states, - dataplane_client, - ) - .await + reconciler.handle_instance_joining(ctx).await } async fn process_joined( &self, reconciler: &MulticastGroupReconciler, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_states: &InstanceStateMap, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result { - reconciler - .handle_instance_joined( - opctx, - group, - member, - instance_states, - dataplane_client, - ) - .await + reconciler.handle_instance_joined(ctx).await } async fn process_left( &self, reconciler: 
&MulticastGroupReconciler, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_states: &InstanceStateMap, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result { - reconciler - .handle_instance_left( - opctx, - group, - member, - instance_states, - dataplane_client, - ) - .await + reconciler.handle_instance_left(ctx).await } } @@ -276,6 +254,7 @@ impl MulticastGroupReconciler { &self, opctx: &OpContext, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result { trace!(opctx.log, "reconciling member state changes"); @@ -286,7 +265,12 @@ impl MulticastGroupReconciler { for group in groups { match self - .process_group_member_states(opctx, &group, dataplane_client) + .process_group_member_states( + opctx, + &group, + dataplane_client, + sled_client, + ) .await { Ok(count) => { @@ -326,6 +310,7 @@ impl MulticastGroupReconciler { opctx: &OpContext, group: &MulticastGroup, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result { let mut processed = 0; @@ -348,6 +333,7 @@ impl MulticastGroupReconciler { &member, &instance_states, dataplane_client, + sled_client, ) .await; (member, res) @@ -364,7 +350,7 @@ impl MulticastGroupReconciler { StateTransition::StateChanged | StateTransition::NoChange => { processed += 1; - debug!( + trace!( opctx.log, "processed member state change"; "member" => ?member, @@ -374,7 +360,7 @@ impl MulticastGroupReconciler { } StateTransition::NeedsCleanup => { processed += 1; - debug!( + trace!( opctx.log, "member marked for cleanup"; "member" => ?member, @@ -382,7 +368,7 @@ impl MulticastGroupReconciler { ); } StateTransition::EntityGone => { - debug!( + trace!( opctx.log, "member deleted during processing"; "member" => ?member, @@ -407,7 +393,7 @@ impl MulticastGroupReconciler { /// Main dispatch function for processing member state changes. 
/// - /// Routes to appropriate node based on member type. + /// Routes to the appropriate handler based on member state. async fn process_member_state( &self, opctx: &OpContext, @@ -415,6 +401,7 @@ impl MulticastGroupReconciler { member: &MulticastGroupMember, instance_states: &InstanceStateMap, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result { // Check if the parent group has been deleted or is being deleted. // If so, delete the member so cleanup can proceed. @@ -444,43 +431,24 @@ impl MulticastGroupReconciler { // For now, all members are instance-based, but this is where we'd // dispatch to different processors for different member types let processor = InstanceMemberProcessor; + let ctx = MemberReconcileCtx { + opctx, + group, + member, + instance_states, + dataplane_client, + sled_client, + }; match member.state { MulticastGroupMemberState::Joining => { - processor - .process_joining( - self, - opctx, - group, - member, - instance_states, - dataplane_client, - ) - .await + processor.process_joining(self, &ctx).await } MulticastGroupMemberState::Joined => { - processor - .process_joined( - self, - opctx, - group, - member, - instance_states, - dataplane_client, - ) - .await + processor.process_joined(self, &ctx).await } MulticastGroupMemberState::Left => { - processor - .process_left( - self, - opctx, - group, - member, - instance_states, - dataplane_client, - ) - .await + processor.process_left(self, &ctx).await } } } @@ -495,7 +463,7 @@ impl MulticastGroupReconciler { ) -> Result { // Skip if member is already deleted if member.time_deleted.is_some() { - debug!( + trace!( opctx.log, "member already deleted, no action needed"; "member_id" => %member.id, @@ -532,35 +500,25 @@ impl MulticastGroupReconciler { /// when ready. Uses CAS operations for concurrent-safe state updates. 
async fn handle_instance_joining( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_states: &InstanceStateMap, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result { - // Extract pre-fetched instance state - let (instance_valid, current_sled_id) = - self.get_instance_state_from_cache(instance_states, member); + let instance_state = + self.get_instance_state_from_cache(ctx.instance_states, ctx.member); - // Execute reconciliation CAS operation let reconcile_res = self .execute_joining_reconciliation( - opctx, - group, - member, - instance_valid, - current_sled_id, + ctx.opctx, + ctx.group, + ctx.member, + instance_state.valid, + instance_state.sled_id, ) .await?; - // Process reconciliation result self.process_joining_reconcile_result( - opctx, - group, - member, - instance_valid, + ctx, + instance_state, reconcile_res, - dataplane_client, ) .await } @@ -570,8 +528,8 @@ impl MulticastGroupReconciler { &self, instance_states: &InstanceStateMap, member: &MulticastGroupMember, - ) -> (bool, Option) { - instance_states.get(&member.parent_id).copied().unwrap_or((false, None)) + ) -> InstanceMulticastState { + instance_states.get(&member.parent_id).copied().unwrap_or_default() } /// Execute the reconciliation CAS operation for a member in "Joining" state. @@ -600,39 +558,29 @@ impl MulticastGroupReconciler { /// Process the result of a "Joining" state reconciliation operation. 
async fn process_joining_reconcile_result( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_valid: bool, + ctx: &MemberReconcileCtx<'_>, + instance_state: InstanceMulticastState, reconcile_result: ReconcileJoiningResult, - dataplane_client: &MulticastDataplaneClient, ) -> Result { match reconcile_result.action { ReconcileAction::TransitionedToLeft => { - self.handle_transitioned_to_left(opctx, group, member).await + self.handle_transitioned_to_left( + ctx.opctx, ctx.group, ctx.member, + ) + .await } ReconcileAction::UpdatedSledId { old, new } => { self.handle_sled_id_updated( - opctx, - group, - member, - instance_valid, + ctx, + instance_state, SledIdUpdate { old, new }, - dataplane_client, ) .await } ReconcileAction::NotFound | ReconcileAction::NoChange => { - self.handle_no_change_or_not_found( - opctx, - group, - member, - instance_valid, - dataplane_client, - ) - .await + self.handle_no_change_or_not_found(ctx, instance_state).await } } } @@ -660,63 +608,43 @@ impl MulticastGroupReconciler { /// Handle the case where a member's sled_id was updated. 
async fn handle_sled_id_updated( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_valid: bool, + ctx: &MemberReconcileCtx<'_>, + instance_state: InstanceMulticastState, sled_id_update: SledIdUpdate, - dataplane_client: &MulticastDataplaneClient, ) -> Result { - debug!( - opctx.log, + trace!( + ctx.opctx.log, "updated member sled_id, checking if ready to join"; - "member_id" => %member.id, + "member_id" => %ctx.member.id, "old_sled_id" => ?sled_id_update.old, "new_sled_id" => ?sled_id_update.new, - "group_state" => ?group.state, - "instance_valid" => instance_valid + "group_state" => ?ctx.group.state, + "instance_valid" => instance_state.valid ); - self.try_complete_join_if_ready( - opctx, - group, - member, - instance_valid, - dataplane_client, - ) - .await + self.try_complete_join_if_ready(ctx, instance_state).await } /// Handle the case where no changes were made or member was not found. async fn handle_no_change_or_not_found( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_valid: bool, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, + instance_state: InstanceMulticastState, ) -> Result { // Check if member is already in Joined state - if member.state == MulticastGroupMemberState::Joined { - debug!( - opctx.log, + if ctx.member.state == MulticastGroupMemberState::Joined { + trace!( + ctx.opctx.log, "member already in 'Joined' state, no action needed"; - "member_id" => %member.id, - "group_id" => %group.id(), - "group_name" => group.name().as_str() + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id(), + "group_name" => ctx.group.name().as_str() ); return Ok(StateTransition::NoChange); } // Try to complete the join if conditions are met - self.try_complete_join_if_ready( - opctx, - group, - member, - instance_valid, - dataplane_client, - ) - .await + self.try_complete_join_if_ready(ctx, instance_state).await } fn 
is_ready_to_join( @@ -729,30 +657,31 @@ impl MulticastGroupReconciler { async fn try_complete_join_if_ready( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_valid: bool, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, + instance_state: InstanceMulticastState, ) -> Result { - if self.is_ready_to_join(group, instance_valid) { - self.complete_instance_member_join( - opctx, - group, - member, - dataplane_client, - ) - .await?; - Ok(StateTransition::StateChanged) + if self.is_ready_to_join(ctx.group, instance_state.valid) { + let joined = self + .complete_instance_member_join( + ctx, + None, + instance_state.propolis_id, + ) + .await?; + if joined { + Ok(StateTransition::StateChanged) + } else { + Ok(StateTransition::NoChange) + } } else { - debug!( - opctx.log, + trace!( + ctx.opctx.log, "member not ready to join: waiting for next run"; - "member_id" => %member.id, - "group_id" => %group.id(), - "group_name" => group.name().as_str(), - "instance_valid" => instance_valid, - "group_state" => ?group.state + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id(), + "group_name" => ctx.group.name().as_str(), + "instance_valid" => instance_state.valid, + "group_state" => ?ctx.group.state ); Ok(StateTransition::NoChange) } @@ -761,62 +690,61 @@ impl MulticastGroupReconciler { /// Instance-specific handler for members in "Joined" state. 
async fn handle_instance_joined( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_states: &InstanceStateMap, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result { - // Get pre-fetched instance state and sled_id - let (instance_valid, current_sled_id) = instance_states - .get(&member.parent_id) + let instance_state = ctx + .instance_states + .get(&ctx.member.parent_id) .copied() - .unwrap_or((false, None)); + .unwrap_or_default(); - match (instance_valid, current_sled_id) { - // Invalid instance -> remove from dataplane and transition to "Left" + match (instance_state.valid, instance_state.sled_id) { (false, _) => { self.handle_invalid_instance( - opctx, - group, - member, - dataplane_client, + ctx.opctx, + ctx.group, + ctx.member, + ctx.dataplane_client, + ctx.sled_client, ) .await } - // Valid instance with sled, but sled changed (migration) - (true, Some(sled_id)) if member.sled_id != Some(sled_id.into()) => { + (true, Some(sled_id)) + if ctx.member.sled_id != Some(sled_id.into()) => + { self.handle_sled_migration( - opctx, - group, - member, + ctx, sled_id, - dataplane_client, + instance_state.propolis_id, ) .await } - // Valid instance with sled, sled unchanged -> verify configuration (true, Some(_)) => { - self.verify_members(opctx, group, member, dataplane_client) - .await?; + self.verify_members( + ctx.opctx, + ctx.group, + ctx.member, + ctx.dataplane_client, + ctx.sled_client, + ) + .await?; trace!( - opctx.log, + ctx.opctx.log, "member configuration verified, no changes needed"; - "member_id" => %member.id, - "group_id" => %group.id() + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id() ); Ok(StateTransition::NoChange) } - // Valid instance but no sled_id (shouldn't typically happen in "Joined" state) (true, None) => { self.handle_joined_without_sled( - opctx, - group, - member, - dataplane_client, + ctx.opctx, + ctx.group, + ctx.member, + 
ctx.dataplane_client, ) .await } @@ -830,13 +758,14 @@ impl MulticastGroupReconciler { group: &MulticastGroup, member: &MulticastGroupMember, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result { // Remove from dataplane first if let Err(e) = self .remove_member_from_dataplane(opctx, member, dataplane_client) .await { - debug!( + warn!( opctx.log, "failed to remove member from dataplane, will retry"; "member_id" => %member.id, @@ -845,6 +774,24 @@ impl MulticastGroupReconciler { return Err(e); } + // Unsubscribe the VMM from the multicast group before the CAS + // clears the sled ID. Best-effort since the VMM may already be torn + // down. + if let Some(sled_id) = member.sled_id { + if let Err(e) = sled_client + .unsubscribe_vmm(opctx, group, member, sled_id.into(), None) + .await + { + warn!( + opctx.log, + "failed to unsubscribe VMM during instance invalidation"; + "member_id" => %member.id, + "sled_id" => %sled_id, + "error" => %e + ); + } + } + // Update database state (atomically set "Left" and clear `sled_id`) let updated = self .datastore @@ -870,6 +817,21 @@ impl MulticastGroupReconciler { return Ok(StateTransition::NoChange); } + // Propagate updated M2P/forwarding to all sleds so the + // dataplane reflects the member's departure. Best-effort since + // group reconciliation will converge if this fails. 
+ if let Err(e) = + sled_client.propagate_m2p_and_forwarding(opctx, group).await + { + warn!( + opctx.log, + "failed to propagate M2P/forwarding after member leave"; + "member_id" => %member.id, + "group_id" => %group.id(), + "error" => %e + ); + } + info!( opctx.log, "multicast member lifecycle transition: 'Joined' → 'Left' (instance invalid)"; @@ -877,7 +839,6 @@ impl MulticastGroupReconciler { "instance_id" => %member.parent_id, "group_id" => %group.id(), "group_multicast_ip" => %group.multicast_ip, - "dpd_operation" => "remove_member_from_underlay_group", "reason" => "instance_no_longer_valid_for_multicast_traffic" ); Ok(StateTransition::StateChanged) @@ -886,46 +847,58 @@ impl MulticastGroupReconciler { /// Handle sled migration for a "Joined" member. async fn handle_sled_migration( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, + ctx: &MemberReconcileCtx<'_>, new_sled_id: SledUuid, - dataplane_client: &MulticastDataplaneClient, + cached_propolis_id: Option, ) -> Result { info!( - opctx.log, + ctx.opctx.log, "detected sled migration for 'Joined' member: re-applying configuration"; - "member_id" => %member.id, - "instance_id" => %member.parent_id, - "group_id" => %group.id(), - "group_name" => group.name().as_str(), - "group_multicast_ip" => %group.multicast_ip, - "old_sled_id" => ?member.sled_id, + "member_id" => %ctx.member.id, + "instance_id" => %ctx.member.parent_id, + "group_id" => %ctx.group.id(), + "group_name" => ctx.group.name().as_str(), + "group_multicast_ip" => %ctx.group.multicast_ip, + "old_sled_id" => ?ctx.member.sled_id, "new_sled_id" => %new_sled_id ); // Remove from old sled's dataplane first if let Err(e) = self - .remove_member_from_dataplane(opctx, member, dataplane_client) + .remove_member_from_dataplane( + ctx.opctx, + ctx.member, + ctx.dataplane_client, + ) .await { - debug!( - opctx.log, + warn!( + ctx.opctx.log, "failed to remove member from old sled, will retry"; - "member_id" => %member.id, - 
"old_sled_id" => ?member.sled_id, + "member_id" => %ctx.member.id, + "old_sled_id" => ?ctx.member.sled_id, "error" => ?e ); return Err(e); } - // Update sled_id in database using CAS + // Source-sled OPTE cleanup (M2P, forwarding, port subscription) + // is handled by VMM teardown: remove_propolis_zone -> + // release_opte_ports -> PortTicket::release_inner, which + // clears multicast subscriptions along with V2P and firewall + // rules. + // + // This is consistent with all other OPTE state. Nexus + // never explicitly calls sled-agent for source-sled cleanup + // after migration. + + // Update `sled_id` in database using CAS let updated = self .datastore .multicast_group_member_update_sled_id_if_current( - opctx, - InstanceUuid::from_untyped_uuid(member.parent_id), - member.sled_id, + ctx.opctx, + InstanceUuid::from_untyped_uuid(ctx.member.parent_id), + ctx.member.sled_id, Some(new_sled_id.into()), ) .await @@ -935,49 +908,53 @@ impl MulticastGroupReconciler { if !updated { debug!( - opctx.log, + ctx.opctx.log, "skipping sled_id update after migration due to concurrent change"; - "member_id" => %member.id, - "group_id" => %group.id(), - "old_sled_id" => ?member.sled_id, + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id(), + "old_sled_id" => ?ctx.member.sled_id, "new_sled_id" => %new_sled_id ); return Ok(StateTransition::NoChange); } - // Re-apply configuration on new sled - // If this fails (e.g., sled not yet in inventory), transition to "Joining" for retry + // Re-apply configuration on new sled. Pass `new_sled_id` explicitly + // because the in-memory member struct still has the old sled_id. 
match self .complete_instance_member_join( - opctx, - group, - member, - dataplane_client, + ctx, + Some(new_sled_id), + cached_propolis_id, ) .await { - Ok(()) => { + Ok(joined) => { info!( - opctx.log, + ctx.opctx.log, "member configuration re-applied after sled migration"; - "member_id" => %member.id, - "instance_id" => %member.parent_id, - "group_id" => %group.id(), - "group_name" => group.name().as_str(), - "group_multicast_ip" => %group.multicast_ip, + "member_id" => %ctx.member.id, + "instance_id" => %ctx.member.parent_id, + "group_id" => %ctx.group.id(), + "group_name" => ctx.group.name().as_str(), + "group_multicast_ip" => %ctx.group.multicast_ip, "new_sled_id" => %new_sled_id, - "dpd_operation" => "re_add_member_to_underlay_multicast_group" + "action" => "re_add_member_to_underlay_multicast_group", + "joined" => joined ); - Ok(StateTransition::StateChanged) + if joined { + Ok(StateTransition::StateChanged) + } else { + Ok(StateTransition::NoChange) + } } Err(e) => { // Failed to join on new sled. We transition to "Joining" and // retry next cycle/run. 
warn!( - opctx.log, + ctx.opctx.log, "failed to complete join on new sled after migration: transitioning to 'Joining' for retry"; - "member_id" => %member.id, - "group_id" => %group.id(), + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id(), "new_sled_id" => %new_sled_id, "error" => %e ); @@ -1005,9 +982,9 @@ impl MulticastGroupReconciler { let updated = self .datastore .multicast_group_member_set_state_if_current( - opctx, - MulticastGroupUuid::from_untyped_uuid(group.id()), - InstanceUuid::from_untyped_uuid(member.parent_id), + ctx.opctx, + MulticastGroupUuid::from_untyped_uuid(ctx.group.id()), + InstanceUuid::from_untyped_uuid(ctx.member.parent_id), MulticastGroupMemberState::Joined, MulticastGroupMemberState::Joining, ) @@ -1018,10 +995,10 @@ impl MulticastGroupReconciler { if updated { info!( - opctx.log, + ctx.opctx.log, "member transitioned to 'Joining': will retry on next reconciliation run"; - "member_id" => %member.id, - "group_id" => %group.id(), + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id(), "new_sled_id" => %new_sled_id ); Ok(StateTransition::StateChanged) @@ -1094,7 +1071,7 @@ impl MulticastGroupReconciler { "instance_id" => %member.parent_id, "group_id" => %group.id(), "group_multicast_ip" => %group.multicast_ip, - "dpd_operation" => "remove_member_from_underlay_group", + "action" => "transition_to_left", "reason" => "inconsistent_state_sled_id_missing_in_joined_state" ); Ok(StateTransition::StateChanged) @@ -1103,22 +1080,27 @@ impl MulticastGroupReconciler { /// Instance-specific handler for members in "Left" state. 
async fn handle_instance_left( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_states: &InstanceStateMap, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result { - // Get pre-fetched instance state and sled_id - let (instance_valid, current_sled_id) = instance_states - .get(&member.parent_id) + let InstanceMulticastState { + valid: instance_valid, + sled_id: current_sled_id, + .. + } = ctx + .instance_states + .get(&ctx.member.parent_id) .copied() - .unwrap_or((false, None)); - - // Handle permanent deletion first - if member.time_deleted.is_some() { - self.cleanup_deleted_member(opctx, group, member, dataplane_client) - .await?; + .unwrap_or_default(); + + if ctx.member.time_deleted.is_some() { + self.cleanup_deleted_member( + ctx.opctx, + ctx.group, + ctx.member, + ctx.dataplane_client, + ctx.sled_client, + ) + .await?; return Ok(StateTransition::NeedsCleanup); } @@ -1129,27 +1111,57 @@ impl MulticastGroupReconciler { // - sled_id is None (uses fallback path) // - member was already removed from DPD if let Err(e) = self - .remove_member_from_dataplane(opctx, member, dataplane_client) + .remove_member_from_dataplane( + ctx.opctx, + ctx.member, + ctx.dataplane_client, + ) .await { - debug!( - opctx.log, + warn!( + ctx.opctx.log, "failed to clean up DPD state for 'Left' member (will retry)"; - "member_id" => %member.id, + "member_id" => %ctx.member.id, "error" => ?e ); - // Continue to reactivation even on cleanup failure because - // the add operation may succeed if the port was already removed } - // Handle reactivation: instance valid and group active -> transition to "Joining" - if instance_valid && group.state == MulticastGroupState::Active { + // Unsubscribe the VMM's OPTE port from this multicast group. + // Best-effort since if the VMM is already gone, there's nothing to + // unsubscribe (the OPTE port was destroyed with the VMM). 
+ if let Some(sled_id) = ctx.member.sled_id { + if let Err(e) = ctx + .sled_client + .unsubscribe_vmm( + ctx.opctx, + ctx.group, + ctx.member, + sled_id.into(), + None, + ) + .await + { + warn!( + ctx.opctx.log, + "failed to unsubscribe VMM from multicast group"; + "member_id" => %ctx.member.id, + "sled_id" => %sled_id, + "error" => %e + ); + } + } + + if instance_valid && ctx.group.state == MulticastGroupState::Active { return self - .reactivate_left_member(opctx, group, member, current_sled_id) + .reactivate_left_member( + ctx.opctx, + ctx.group, + ctx.member, + current_sled_id, + ) .await; } - // Stay in "Left" state Ok(StateTransition::NoChange) } @@ -1250,10 +1262,10 @@ impl MulticastGroupReconciler { // Build the state map from the fetched data state_map.extend(members.iter().map(|member| { - let (is_valid, sled_id) = if let Some((instance, vmm_opt)) = + let state = if let Some((instance, vmm_opt)) = instance_vmm_data.get(&member.parent_id) { - let is_valid = matches!( + let valid = matches!( instance.nexus_state.state(), InstanceState::Creating | InstanceState::Starting @@ -1267,13 +1279,16 @@ impl MulticastGroupReconciler { SledUuid::from_untyped_uuid(vmm.sled_id.into_untyped_uuid()) }); - (is_valid, sled_id) + let propolis_id = vmm_opt + .as_ref() + .map(|vmm| PropolisUuid::from_untyped_uuid(vmm.id)); + + InstanceMulticastState { valid, sled_id, propolis_id } } else { - // Instance not found (mark as invalid) - (false, None) + InstanceMulticastState::default() }; - (member.parent_id, (is_valid, sled_id)) + (member.parent_id, state) })); debug!( @@ -1319,13 +1334,13 @@ impl MulticastGroupReconciler { return Ok(None); } Err(e) => { - debug!( + warn!( opctx.log, "failed to look up instance state"; "member" => ?member, "error" => ?e ); - return Ok(None); + return Err(e.into()); } }; @@ -1381,76 +1396,145 @@ impl MulticastGroupReconciler { } } - /// Complete a member join operation ("Joining" -> "Joined") for an instance. 
+ /// Complete a member join by configuring the dataplane and subscribing + /// the VMM. + /// + /// When `sled_id_override` is provided (e.g., during migration), it + /// is used instead of the potentially stale `member.sled_id`. + /// + /// # Returns + /// + /// `Ok(true)` when the join completed successfully. `Ok(false)` when no + /// sled was available and the operation was a no-op. async fn complete_instance_member_join( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - dataplane_client: &MulticastDataplaneClient, - ) -> Result<(), anyhow::Error> { + ctx: &MemberReconcileCtx<'_>, + sled_id_override: Option, + cached_propolis_id: Option, + ) -> Result { debug!( - opctx.log, + ctx.opctx.log, "completing member join"; - "member" => ?member, - "group" => ?group + "member" => ?ctx.member, + "group" => ?ctx.group ); - // Get sled_id from member record, or look it up and update if missing - let sled_id = match member.sled_id { - Some(id) => id, - None => { - match self - .lookup_and_update_member_sled_id(opctx, member) - .await? - { - Some(id) => id, - None => return Ok(()), // No sled available, cannot join - } - } + // Use the override if provided, then the member's cached sled_id, + // then look it up from the instance as a last resort. + let sled_id: SledUuid = if let Some(id) = + sled_id_override.or(ctx.member.sled_id.map(Into::into)) + { + id + } else if let Some(id) = + self.lookup_and_update_member_sled_id(ctx.opctx, ctx.member).await? 
+ { + id.into() + } else { + return Ok(false); }; self.add_member_to_dataplane( - opctx, - group, - member, - sled_id.into(), - dataplane_client, + ctx.opctx, + ctx.group, + ctx.member, + sled_id, + ctx.dataplane_client, ) .await?; - // Transition to "Joined" state (only if still in "Joining") - let updated = self - .datastore - .multicast_group_member_set_state_if_current( - opctx, - MulticastGroupUuid::from_untyped_uuid(group.id()), - InstanceUuid::from_untyped_uuid(member.parent_id), - MulticastGroupMemberState::Joining, - MulticastGroupMemberState::Joined, + // If the member is already in a "Joined" state (migration path), skip + // the state transition but still propagate and subscribe. During + // migration the caller updates the sled ID without changing state, + // so we must not gate propagation on this CAS. + if ctx.member.state != MulticastGroupMemberState::Joined { + let updated = self + .datastore + .multicast_group_member_set_state_if_current( + ctx.opctx, + MulticastGroupUuid::from_untyped_uuid(ctx.group.id()), + InstanceUuid::from_untyped_uuid(ctx.member.parent_id), + MulticastGroupMemberState::Joining, + MulticastGroupMemberState::Joined, + ) + .await + .context( + "failed to conditionally transition member to 'Joined' state", + )?; + + if !updated { + debug!( + ctx.opctx.log, + "skipping Joining→Joined transition due to concurrent update"; + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id() + ); + // Concurrent update moved the member away from the "Joining" + // state, so skip propagation and subscribe. + return Ok(false); + } + } + + // Propagate M2P mappings and forwarding entries to all sleds. + // + // At this point, the member is now "Joined" in the database, so propagate + // includes this sled in forwarding next-hops. If propagation or + // subscribe fails below, the member remains "Joined" with incomplete + // sled state. The reconciler's next pass converges via + // `handle_instance_joined` -> `verify_members`.
+ // + // Propagation failures are best-effort since the reconciler will + // re-converge all sleds on the next cycle. Subscribe failures + // below are treated as hard errors because the VMM cannot + // receive traffic without an OPTE port subscription. + if let Err(e) = ctx + .sled_client + .propagate_m2p_and_forwarding(ctx.opctx, ctx.group) + .await + { + warn!( + ctx.opctx.log, + "failed to propagate M2P/forwarding after member join"; + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id(), + "error" => %e + ); + } + + // Subscribe the VMM's OPTE port last. Propagation above is + // best-effort, and any sleds that failed will be converged by the + // reconciler on the next cycle. + if let Err(e) = ctx + .sled_client + .subscribe_vmm( + ctx.opctx, + ctx.group, + ctx.member, + sled_id, + cached_propolis_id, ) .await - .context( - "failed to conditionally transition member to 'Joined' state", - )?; - if !updated { - debug!( - opctx.log, - "skipping Joining→Joined transition due to concurrent update"; - "member_id" => %member.id, - "group_id" => %group.id() + { + warn!( + ctx.opctx.log, + "failed to subscribe VMM to multicast group via sled-agent \ + (will retry next cycle)"; + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id(), + "sled_id" => %sled_id, + "error" => %e ); + return Err(e); } info!( - opctx.log, + ctx.opctx.log, "member join completed"; - "member_id" => %member.id, - "group_id" => %group.id(), + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id(), "sled_id" => %sled_id ); - Ok(()) + Ok(true) } /// Apply member dataplane configuration (via DPD-client). @@ -1870,6 +1954,10 @@ impl MulticastGroupReconciler { /// - Removing the member from any unexpected/stale rear ports /// - Adding the member to expected ports /// + /// If the sled cannot be resolved (e.g., decommissioned), the member + /// is transitioned to "Left" and M2P/forwarding is propagated inline + /// to remove stale entries. 
+ /// /// This handles cases like `sp_slot` changes where the sled's physical /// location changed but the `sled_id` stayed the same. async fn verify_members( @@ -1878,6 +1966,7 @@ impl MulticastGroupReconciler { group: &MulticastGroup, member: &MulticastGroupMember, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result<(), anyhow::Error> { debug!( opctx.log, @@ -1940,6 +2029,23 @@ impl MulticastGroupReconciler { ) .await; + // Unsubscribe the VMM before the CAS clears sled_id; + // otherwise, the OPTE subscription is stranded with no + // way to identify the sled on later passes. Best-effort + // since the VMM may already be torn down. + if let Err(e) = sled_client + .unsubscribe_vmm(opctx, group, member, sled_id.into(), None) + .await + { + warn!( + opctx.log, + "failed to unsubscribe VMM during port resolution failure"; + "member_id" => %member.id, + "sled_id" => %sled_id, + "error" => %e + ); + } + let updated = self .datastore .multicast_group_member_to_left_if_current( @@ -1952,6 +2058,21 @@ impl MulticastGroupReconciler { .context("failed to transition member to 'Left' after port resolution failure")?; if updated { + // Propagate updated M2P/forwarding to remove + // stale entries for this now-Left member. + if let Err(e) = sled_client + .propagate_m2p_and_forwarding(opctx, group) + .await + { + warn!( + opctx.log, + "failed to propagate M2P/forwarding after \ + member left due to unresolvable sled"; + "member_id" => %member.id, + "group_id" => %group.id(), + "error" => %e + ); + } info!( opctx.log, "member transitioned to 'Left': sled no longer resolvable"; @@ -2105,6 +2226,23 @@ impl MulticastGroupReconciler { } } + // Ensure the VMM subscription is in place for the current propolis_id. + // This is idempotent and covers cases where the propolis_id changed + // (e.g., after live migration) but the sled_id stayed the same. 
+ if let Err(e) = sled_client + .subscribe_vmm(opctx, group, member, sled_id.into(), None) + .await + { + warn!( + opctx.log, + "failed to verify VMM subscription during member verification"; + "member_id" => %member.id, + "sled_id" => %sled_id, + "error" => %e + ); + return Err(e); + } + info!( opctx.log, "member verification completed"; @@ -2607,13 +2745,33 @@ impl MulticastGroupReconciler { } /// Cleanup a member that is marked for deletion (time_deleted set). + /// + /// This includes unsubscribing a member from its VMM, removing + /// it from the dataplane, and hard-deleting the DB row. async fn cleanup_deleted_member( &self, opctx: &OpContext, group: &MulticastGroup, member: &MulticastGroupMember, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result<(), anyhow::Error> { + // Unsubscribe from sled-agent (best-effort, VMM may be gone). + if let Some(sled_id) = member.sled_id { + if let Err(e) = sled_client + .unsubscribe_vmm(opctx, group, member, sled_id.into(), None) + .await + { + debug!( + opctx.log, + "failed to unsubscribe VMM during member cleanup"; + "member_id" => %member.id, + "sled_id" => %sled_id, + "error" => %e + ); + } + } + // Use the consolidated cleanup helper with strict error handling self.cleanup_member_from_dataplane( opctx, diff --git a/nexus/src/app/background/tasks/multicast/mod.rs b/nexus/src/app/background/tasks/multicast/mod.rs index 8f592a41087..08c56c4154a 100644 --- a/nexus/src/app/background/tasks/multicast/mod.rs +++ b/nexus/src/app/background/tasks/multicast/mod.rs @@ -84,7 +84,7 @@ //! - Unlike linear probing (`h + i`), scattered outputs avoid clustering //! - **8-bit salt**: 256 unique underlay addresses per external IP //! - **Resolution**: Exhaustion requires 256 other groups to occupy exactly -//! those 256 scattered addresses—effectively impossible in 2^64 space +//! those 256 scattered addresses, effectively impossible in 2^64 space //! //! 
### Forwarding Architecture (Incoming multicast traffic to guests) //! @@ -105,6 +105,8 @@ //! - **Group lifecycle**: "Creating" → "Active" → "Deleting" → hard-deleted //! - **Member lifecycle**: "Joining" → "Joined" → "Left" → soft-deleted → hard-deleted //! - **Dataplane updates**: DPD API calls for P4 table updates +//! - **Sled propagation**: M2P mappings and forwarding entries pushed to sled-agents +//! - **OPTE subscriptions**: Per-VMM multicast group subscriptions on target sleds //! - **Topology mapping**: Sled-to-switch-port resolution (with caching) //! //! ## Deletion Semantics: Groups vs Members @@ -151,6 +153,7 @@ use sled_hardware_types::BaseboardId; use crate::app::background::BackgroundTask; use crate::app::multicast::dataplane::MulticastDataplaneClient; +use crate::app::multicast::sled::MulticastSledClient; use crate::app::saga::StartSaga; pub(crate) mod groups; @@ -362,7 +365,7 @@ impl MulticastGroupReconciler { /// │ 6 │ 0xa ⊕ 6 │ 0xc │ /// │ 7 │ 0xa ⊕ 7 │ 0xd │ /// └──────┴─────────┴────────┘ -/// Outputs: [a, b, 8, 9, e, f, c, d] — scattered, not sequential +/// Outputs: [a, b, 8, 9, e, f, c, d] (scattered, not sequential) /// ``` /// /// On collision (i.e., underlay IP already in use), we increment salt and retry. @@ -533,6 +536,10 @@ impl MulticastGroupReconciler { } }; + // Create sled-agent client for OPTE subscriptions and + // M2P/forwarding propagation. 
+ let sled_client = MulticastSledClient::new(self.datastore.clone()); + // Process creating groups match self.reconcile_creating_groups(opctx).await { Ok(count) => status.groups_created += count, @@ -543,7 +550,10 @@ impl MulticastGroupReconciler { } // Process member state changes - match self.reconcile_member_states(opctx, &dataplane_client).await { + match self + .reconcile_member_states(opctx, &dataplane_client, &sled_client) + .await + { Ok(count) => status.members_processed += count, Err(e) => { let msg = format!("failed to reconcile member states: {e:#}"); @@ -574,7 +584,10 @@ impl MulticastGroupReconciler { } // Reconcile active groups (verify state, update dataplane as needed) - match self.reconcile_active_groups(opctx, &dataplane_client).await { + match self + .reconcile_active_groups(opctx, &dataplane_client, &sled_client) + .await + { Ok(count) => status.groups_verified += count, Err(e) => { let msg = format!("failed to reconcile active groups: {e:#}"); @@ -583,7 +596,10 @@ impl MulticastGroupReconciler { } // Process deleting groups (DPD cleanup + hard-delete from DB) - match self.reconcile_deleting_groups(opctx, &dataplane_client).await { + match self + .reconcile_deleting_groups(opctx, &dataplane_client, &sled_client) + .await + { Ok(count) => status.groups_deleted += count, Err(e) => { let msg = format!("failed to reconcile deleting groups: {e:#}"); diff --git a/nexus/src/app/background/tasks/sync_switch_configuration.rs b/nexus/src/app/background/tasks/sync_switch_configuration.rs index 71b6f97c837..a0e10c930a8 100644 --- a/nexus/src/app/background/tasks/sync_switch_configuration.rs +++ b/nexus/src/app/background/tasks/sync_switch_configuration.rs @@ -982,11 +982,11 @@ impl BackgroundTask for SwitchPortSettingsManager { "switch_slot" => ?switch_slot, "config" => ?config, ); - if let Err(e) = client.bgp_apply_v2(config).await { + if let Err(e) = client.bgp_apply(config).await { error!(log, "error while applying bgp configuration"; "error" => ?e); } 
- if let Err(e) = client.update_rib_bestpath_fanout(fanout).await { + if let Err(e) = client.update_bestpath_fanout(fanout).await { error!(log, "error while updating bestpath fanout"; "error" => ?e); } } diff --git a/nexus/src/app/bgp.rs b/nexus/src/app/bgp.rs index 6c0a3fc1f51..53cc41996c5 100644 --- a/nexus/src/app/bgp.rs +++ b/nexus/src/app/bgp.rs @@ -121,7 +121,7 @@ impl super::Nexus { for r in &router_info { let asn = r.asn; - let peers = match client.get_neighbors_v4(asn).await { + let peers = match client.get_neighbors(asn).await { Ok(result) => result.into_inner(), Err(e) => { error!( @@ -183,7 +183,7 @@ impl super::Nexus { peer: None, }; - let exported = match client.get_exported_v3(&selector).await { + let exported = match client.get_exported(&selector).await { Ok(result) => result.into_inner(), Err(e) => { error!( @@ -237,7 +237,7 @@ impl super::Nexus { )) })? { let history = match client - .message_history_v3(&MessageHistoryRequest { + .message_history(&MessageHistoryRequest { asn: sel.asn, direction: None, peer: None, @@ -280,7 +280,7 @@ impl super::Nexus { )) })? 
{ let mut imported: Vec = Vec::new(); - match client.get_rib_imported_v2(None, None).await { + match client.get_rib_imported(None, None).await { Ok(result) => { for (prefix, paths) in result.into_inner().iter() { let ipnet = match prefix.parse() { diff --git a/nexus/src/app/multicast/dataplane.rs b/nexus/src/app/multicast/dataplane.rs index 5d79df7d078..8d858154b2a 100644 --- a/nexus/src/app/multicast/dataplane.rs +++ b/nexus/src/app/multicast/dataplane.rs @@ -113,7 +113,8 @@ trait IntoUnderlayMulticast { impl IntoUnderlayMulticast for IpAddr { fn into_underlay_multicast(self) -> Result { match self { - IpAddr::V6(ipv6) => Ok(UnderlayMulticastIpv6(ipv6)), + IpAddr::V6(ipv6) => UnderlayMulticastIpv6::try_from(ipv6) + .map_err(|e| Error::invalid_request(e.to_string())), IpAddr::V4(_) => Err(Error::invalid_request( "underlay multicast groups must use IPv6 addresses", )), @@ -179,14 +180,36 @@ impl MulticastDataplaneClient { fn select_one_switch( &self, ) -> MulticastDataplaneResult<(&SwitchSlot, &dpd_client::Client)> { - let mut switches: Vec<_> = self.dpd_clients.iter().collect(); - switches.sort_by_key(|(loc, _)| *loc); - switches - .into_iter() - .next() + self.dpd_clients + .iter() + .min_by_key(|(loc, _)| *loc) .ok_or_else(|| Error::internal_error("no DPD clients available")) } + /// Compute DPD source filter from aggregated member source state. + /// + /// For SSM addresses, always returns specific sources. For ASM addresses, + /// returns `None` (any source) if any member omitted sources, otherwise + /// returns the union of all member sources. 
+ fn compute_sources_for_dpd( + external_group_ip: IpAddr, + source_filter: &SourceFilterState, + ) -> Option> { + if is_ssm_address(external_group_ip) + || !source_filter.has_any_source_member + { + Some( + source_filter + .specific_sources + .iter() + .map(|ip| dpd_client::types::IpSrc::Exact(*ip)) + .collect(), + ) + } else { + None + } + } + async fn dpd_ensure_underlay_created( &self, client: &dpd_client::Client, @@ -413,33 +436,9 @@ impl MulticastDataplaneClient { inner_mac: MacAddr { a: underlay_ipv6.derive_multicast_mac() }, vni: Vni::from(u32::from(external_group.vni.0)), }; - let external_group_ip = external_group.multicast_ip.ip(); - - // Source filtering per RFC 4607: - // - SSM (232/8, ff3x::/32): always use specific sources. API - // validation prevents SSM joins without sources. - // - ASM: use specific sources when all members specify sources, - // otherwise None to allow any source at the switch level. - let sources_dpd = if is_ssm_address(external_group_ip) { - Some( - source_filter - .specific_sources - .iter() - .map(|ip| IpSrc::Exact(*ip)) - .collect::>(), - ) - } else if source_filter.has_any_source_member { - None - } else { - Some( - source_filter - .specific_sources - .iter() - .map(|ip| IpSrc::Exact(*ip)) - .collect::>(), - ) - }; + let sources_dpd = + Self::compute_sources_for_dpd(external_group_ip, source_filter); let create_operations = dpd_clients.into_iter().map(|(switch_slot, client)| { @@ -570,36 +569,12 @@ impl MulticastDataplaneClient { inner_mac: MacAddr { a: underlay_ipv6.derive_multicast_mac() }, vni: Vni::from(u32::from(params.external_group.vni.0)), }; - let new_name_str = params.new_name.to_string(); let external_group_ip = params.external_group.multicast_ip.ip(); - - // Source filtering per RFC 4607: - // - SSM (232/8, ff3x::/32): always use specific sources. API - // validation prevents SSM joins without sources. 
- // - ASM: use specific sources when all members specify sources, - // otherwise None to allow any source at the switch level. - let sources_dpd = if is_ssm_address(external_group_ip) { - Some( - params - .source_filter - .specific_sources - .iter() - .map(|ip| IpSrc::Exact(*ip)) - .collect::>(), - ) - } else if params.source_filter.has_any_source_member { - None - } else { - Some( - params - .source_filter - .specific_sources - .iter() - .map(|ip| IpSrc::Exact(*ip)) - .collect::>(), - ) - }; + let sources_dpd = Self::compute_sources_for_dpd( + external_group_ip, + params.source_filter, + ); let update_operations = dpd_clients.into_iter().map(|(switch_slot, client)| { diff --git a/nexus/src/app/multicast/mod.rs b/nexus/src/app/multicast/mod.rs index 629d1253c89..57b48188d77 100644 --- a/nexus/src/app/multicast/mod.rs +++ b/nexus/src/app/multicast/mod.rs @@ -70,6 +70,7 @@ use omicron_common::api::external::{ use omicron_uuid_kinds::{GenericUuid, InstanceUuid, MulticastGroupUuid}; pub(crate) mod dataplane; +pub(crate) mod sled; /// Validate that SSM addresses have source IPs. 
/// @@ -887,4 +888,32 @@ mod tests { 0xff1e, 0, 0, 0, 0, 0, 0, 1 )))); } + + #[test] + fn test_generate_group_name_from_ip() { + let v4 = IpAddr::V4(Ipv4Addr::new(224, 1, 2, 3)); + assert_eq!( + generate_group_name_from_ip(v4).unwrap().as_str(), + "mcast-224-1-2-3" + ); + + let v4_zeros = IpAddr::V4(Ipv4Addr::new(224, 0, 0, 1)); + assert_eq!( + generate_group_name_from_ip(v4_zeros).unwrap().as_str(), + "mcast-224-0-0-1" + ); + + let v6: IpAddr = IpAddr::V6(Ipv6Addr::new(0xff0e, 0, 0, 0, 0, 0, 0, 1)); + assert_eq!( + generate_group_name_from_ip(v6).unwrap().as_str(), + "mcast-ff0e-0-0-0-0-0-0-1" + ); + + let v6_ssm: IpAddr = + IpAddr::V6(Ipv6Addr::new(0xff3e, 0, 0, 0, 0, 0, 0, 0xabcd)); + assert_eq!( + generate_group_name_from_ip(v6_ssm).unwrap().as_str(), + "mcast-ff3e-0-0-0-0-0-0-abcd" + ); + } } diff --git a/nexus/src/app/multicast/sled.rs b/nexus/src/app/multicast/sled.rs new file mode 100644 index 00000000000..73ea17bd71d --- /dev/null +++ b/nexus/src/app/multicast/sled.rs @@ -0,0 +1,582 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Sled-agent multicast operations for OPTE subscriptions, M2P mappings, +//! and forwarding entries. +//! +//! Parallel to [`dataplane`] which handles DPD switch operations, this +//! module manages sled-local multicast state via sled-agent: +//! +//! - **OPTE subscriptions**: Per-VMM multicast group filters on the +//! hosting sled +//! - **M2P mappings**: Overlay multicast IP to underlay IPv6 address +//! translation, installed on all sleds +//! - **Forwarding entries**: Underlay multicast address to next-hop sled +//! replication lists, installed on all sleds +//! +//! 
[`dataplane`]: super::dataplane + +use std::collections::BTreeSet; +use std::net::{IpAddr, Ipv6Addr}; +use std::sync::Arc; + +use anyhow::Context; +use slog::{debug, info, warn}; + +use nexus_db_model::{ + MulticastGroup, MulticastGroupMember, MulticastGroupMemberState, +}; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::DataStore; +use nexus_types::deployment::SledFilter; +use nexus_types::identity::{Asset, Resource}; +use omicron_common::api::external::DataPageParams; +use omicron_uuid_kinds::{ + GenericUuid, InstanceUuid, MulticastGroupUuid, PropolisUuid, SledUuid, +}; +use sled_agent_client::types::{ + ClearMcast2Phys, ClearMcastForwarding, Mcast2PhysMapping, McastFilterMode, + McastForwardingEntry, McastForwardingNextHop, McastReplication, + McastSourceFilter, +}; + +/// Client for sled-agent multicast operations. +/// +/// Unlike [`MulticastDataplaneClient`] which pre-builds per-switch clients, +/// sled clients are constructed on demand since the target sled set varies +/// per group. +/// +/// [`MulticastDataplaneClient`]: super::dataplane::MulticastDataplaneClient +pub(crate) struct MulticastSledClient { + datastore: Arc, +} + +impl MulticastSledClient { + pub(crate) fn new(datastore: Arc) -> Self { + Self { datastore } + } + + /// Create a sled-agent client for the given sled. + /// + /// Looks up the sled's address in the database and constructs an HTTP + /// client. Follows the same pattern as V2P mapping propagation. + async fn sled_client( + &self, + opctx: &OpContext, + sled_id: SledUuid, + ) -> Result + { + nexus_networking::sled_client( + &self.datastore, + opctx, + sled_id, + &opctx.log, + ) + .await + } + + /// Look up the current `propolis_id` for an instance. 
+ async fn lookup_propolis_id( + &self, + opctx: &OpContext, + instance_id: InstanceUuid, + ) -> Result, anyhow::Error> { + let instance_state = self + .datastore + .instance_get_state(opctx, &instance_id) + .await + .context("failed to look up instance state")?; + + Ok(instance_state + .and_then(|s| s.propolis_id) + .map(PropolisUuid::from_untyped_uuid)) + } + + /// Build the membership descriptor sent to sled-agent for + /// subscribe/unsubscribe calls. + fn membership_for( + group: &MulticastGroup, + member: &MulticastGroupMember, + ) -> sled_agent_client::types::InstanceMulticastMembership { + sled_agent_client::types::InstanceMulticastMembership { + group_ip: group.multicast_ip.ip(), + sources: member.source_ips.iter().map(|s| s.ip()).collect(), + } + } + + /// Subscribe a VMM to a multicast group via sled-agent. + /// + /// Looks up the instance's current `propolis_id` and calls the sled-agent + /// endpoint to configure OPTE port-level multicast filters. The member's + /// per-instance source IPs are passed for SSM filtering. + pub(crate) async fn subscribe_vmm( + &self, + opctx: &OpContext, + group: &MulticastGroup, + member: &MulticastGroupMember, + sled_id: SledUuid, + cached_propolis_id: Option, + ) -> Result<(), anyhow::Error> { + let instance_id = InstanceUuid::from_untyped_uuid(member.parent_id); + // If the instance has no propolis_id (already stopped/destroyed), + // the OPTE port is gone and there's nothing to subscribe. + let propolis_id = match cached_propolis_id { + Some(id) => id, + None => match self.lookup_propolis_id(opctx, instance_id).await? 
+ { + Some(id) => id, + None => { + debug!( + opctx.log, + "no propolis_id for instance, skipping subscribe"; + "member_id" => %member.id, + "instance_id" => %instance_id + ); + return Ok(()); + } + }, + }; + + let client = self + .sled_client(opctx, sled_id) + .await + .context("failed to create sled-agent client")?; + + let membership = Self::membership_for(group, member); + + client + .vmm_join_multicast_group(&propolis_id, &membership) + .await + .context("sled-agent vmm_join_multicast_group call failed")?; + + debug!( + opctx.log, + "subscribed VMM to multicast group via sled-agent"; + "member_id" => %member.id, + "propolis_id" => %propolis_id, + "sled_id" => %sled_id, + "group_ip" => %group.multicast_ip + ); + + Ok(()) + } + + /// Unsubscribe a VMM from a multicast group via sled-agent. + /// + /// Best-effort: if the VMM or sled is already gone, the unsubscribe + /// is effectively a no-op because the OPTE port was destroyed. + pub(crate) async fn unsubscribe_vmm( + &self, + opctx: &OpContext, + group: &MulticastGroup, + member: &MulticastGroupMember, + sled_id: SledUuid, + cached_propolis_id: Option, + ) -> Result<(), anyhow::Error> { + let instance_id = InstanceUuid::from_untyped_uuid(member.parent_id); + + // If the instance has no propolis_id (already stopped/destroyed), + // the OPTE port is gone and there's nothing to unsubscribe. + let propolis_id = match cached_propolis_id { + Some(id) => id, + None => match self.lookup_propolis_id(opctx, instance_id).await?
{ + Some(id) => id, + None => { + debug!( + opctx.log, + "no propolis_id for instance, skipping unsubscribe"; + "member_id" => %member.id, + "instance_id" => %instance_id + ); + return Ok(()); + } + }, + }; + + let client = self + .sled_client(opctx, sled_id) + .await + .context("failed to create sled-agent client")?; + + let membership = Self::membership_for(group, member); + + client + .vmm_leave_multicast_group(&propolis_id, &membership) + .await + .context("sled-agent vmm_leave_multicast_group call failed")?; + + debug!( + opctx.log, + "unsubscribed VMM from multicast group via sled-agent"; + "member_id" => %member.id, + "propolis_id" => %propolis_id, + "sled_id" => %sled_id, + "group_ip" => %group.multicast_ip + ); + + Ok(()) + } + + /// Propagate M2P mappings and forwarding entries to all VPC-routing sleds. + /// + /// Performs convergent per-sled propagation: each sled's current state + /// is queried and diffed against desired state. New entries are added + /// and stale state is removed (member leaves, instance stops). When no + /// joined members remain, every sled has stale state and it is cleared. + /// + /// # Scope + /// + /// M2P mappings and forwarding entries are pushed to all VPC-routing + /// sleds, not just member sleds. Any instance on any sled may send to + /// a multicast group address. Hence, without the M2P mapping, OPTE's + /// overlay layer silently drops the packet. Forwarding entries are needed + /// on sender sleds so OPTE can replicate to member sleds. Subscriptions + /// (per-port group membership) remain member-sled-only. 
+ pub(crate) async fn propagate_m2p_and_forwarding( + &self, + opctx: &OpContext, + group: &MulticastGroup, + ) -> Result<(), anyhow::Error> { + let underlay_group_id = group + .underlay_group_id + .context("group missing underlay_group_id")?; + + let underlay_group = self + .datastore + .underlay_multicast_group_fetch(opctx, underlay_group_id) + .await + .context("failed to fetch underlay group")?; + + let underlay_ip = match underlay_group.multicast_ip.ip() { + IpAddr::V6(v6) => v6, + other => anyhow::bail!( + "underlay multicast address for group {} is {other}, expected IPv6", + group.id() + ), + }; + + let group_ip = group.multicast_ip.ip(); + + // Compute desired state from DB, determining which sleds should have + // M2P and forwarding entries for this group. + let group_id = MulticastGroupUuid::from_untyped_uuid(group.id()); + let members = self + .datastore + .multicast_group_members_list( + opctx, + group_id, + &DataPageParams::max_page(), + ) + .await + .context("failed to list group members")?; + + let member_sled_ids: BTreeSet = members + .iter() + .filter(|m| m.state == MulticastGroupMemberState::Joined) + .filter_map(|m| m.sled_id.map(SledUuid::from)) + .collect(); + + // Build desired M2P entry. + let desired_m2p = + Mcast2PhysMapping { group: group_ip, underlay: underlay_ip }; + + // Look up member sled underlay IPs for forwarding next-hop + // computation. These are the sleds that host "Joined" members + // and should appear as next hops in every sled's forwarding + // entry. 
+ let mut member_sled_ips: Vec<(SledUuid, Ipv6Addr)> = Vec::new(); + let mut failed_lookups: usize = 0; + for sled_id in &member_sled_ids { + let lookup = match nexus_networking::sled_lookup( + &self.datastore, + opctx, + *sled_id, + ) { + Ok(found) => found, + Err(e) => { + warn!( + opctx.log, + "failed to resolve sled for M2P/forwarding"; + "sled_id" => %sled_id, + "error" => %e + ); + failed_lookups += 1; + continue; + } + }; + + match lookup.fetch().await { + Ok((.., sled)) => { + member_sled_ips.push((*sled_id, sled.ip())); + } + Err(e) => { + warn!( + opctx.log, + "failed to resolve sled for M2P/forwarding"; + "sled_id" => %sled_id, + "error" => %e + ); + failed_lookups += 1; + } + } + } + + // Abort before mutating sled state if any member lookups failed. + // Pushing the partial member set would prune forwarding entries + // for the unresolved sleds, turning a transient lookup failure + // into packet loss for still-joined members. + if failed_lookups > 0 { + anyhow::bail!( + "aborting convergence: {failed_lookups} member sled \ + lookup(s) failed out of {} joined members", + member_sled_ids.len() + ); + } + + // The group is active if any members are "Joined". M2P and + // forwarding are pushed to all sleds when active, cleared + // from all sleds when inactive. + let group_is_active = !member_sled_ids.is_empty(); + + // Query all VPC-routing sleds for current state and converge. 
+ let all_sleds = self + .datastore + .sled_list_all_batched(opctx, SledFilter::VpcRouting) + .await + .context("failed to enumerate sleds")?; + + let convergence_params = GroupConvergenceParams { + group_ip, + underlay_ip, + group_is_active, + desired_m2p: &desired_m2p, + member_sled_ips: &member_sled_ips, + }; + + let mut failed_sleds: usize = 0; + + for sled in &all_sleds { + let sled_id: SledUuid = sled.id(); + let client = match self.sled_client(opctx, sled_id).await { + Ok(c) => c, + Err(e) => { + warn!( + opctx.log, + "failed to create sled-agent client for \ + M2P/forwarding convergence"; + "sled_id" => %sled_id, + "error" => %e + ); + failed_sleds += 1; + continue; + } + }; + + if let Err(e) = converge_sled_m2p_and_forwarding( + &client, + sled_id, + &convergence_params, + ) + .await + { + warn!( + opctx.log, + "failed to converge M2P/forwarding on sled"; + "sled_id" => %sled_id, + "group_ip" => %group_ip, + "error" => %e + ); + failed_sleds += 1; + } + } + + info!( + opctx.log, + "converged M2P and forwarding state"; + "group_id" => %group.id(), + "group_ip" => %group_ip, + "underlay_ip" => %underlay_ip, + "member_sleds" => member_sled_ids.len(), + "total_sleds_checked" => all_sleds.len(), + "failed_sleds" => failed_sleds + ); + + if failed_sleds > 0 { + anyhow::bail!( + "failed to converge M2P/forwarding: \ + {failed_sleds} sled convergence failures \ + (out of {} sleds)", + all_sleds.len() + ); + } + + Ok(()) + } + + /// Clear M2P mappings and forwarding entries from all sleds for + /// this group. + /// + /// Delegates to the convergent [`propagate_m2p_and_forwarding`] which + /// will detect that no joined members remain and clear stale state + /// from all sleds. 
+ /// + /// [`propagate_m2p_and_forwarding`]: Self::propagate_m2p_and_forwarding + pub(crate) async fn clear_m2p_and_forwarding( + &self, + opctx: &OpContext, + group: &MulticastGroup, + ) -> Result<(), anyhow::Error> { + self.propagate_m2p_and_forwarding(opctx, group).await + } +} + +/// Resolved group state used to converge M2P and forwarding on each sled. +struct GroupConvergenceParams<'a> { + group_ip: IpAddr, + underlay_ip: Ipv6Addr, + group_is_active: bool, + desired_m2p: &'a Mcast2PhysMapping, + member_sled_ips: &'a [(SledUuid, Ipv6Addr)], +} + +/// Per-sled convergence of M2P and forwarding state. +/// +/// # Errors +/// +/// Returns an error when any sled-agent RPC fails (list, set, or clear). +/// The caller increments `failed_sleds` and continues to the next sled. +async fn converge_sled_m2p_and_forwarding( + client: &sled_agent_client::Client, + sled_id: SledUuid, + params: &GroupConvergenceParams<'_>, +) -> Result<(), anyhow::Error> { + converge_m2p(client, params).await?; + converge_forwarding(client, sled_id, params).await?; + Ok(()) +} + +/// Converge a single sled's M2P mapping for one group. +/// +/// Installs the mapping when the group is active and the mapping is missing; +/// clears it when the group is inactive and the mapping is present. +/// Already-correct state is left alone. +async fn converge_m2p( + client: &sled_agent_client::Client, + params: &GroupConvergenceParams<'_>, +) -> Result<(), anyhow::Error> { + let found = client + .list_mcast_m2p() + .await + .context("failed to list M2P mappings on sled")? + .into_inner(); + + let has_m2p = found.iter().any(|m| { + m.group == params.group_ip && m.underlay == params.underlay_ip + }); + + match (params.group_is_active, has_m2p) { + // Active group missing M2P: install it. + (true, false) => { + client + .set_mcast_m2p(params.desired_m2p) + .await + .context("failed to add M2P mapping to sled")?; + } + // Inactive group has stale M2P: remove it.
+ (false, true) => { + let clear = ClearMcast2Phys { + group: params.group_ip, + underlay: params.underlay_ip, + }; + client + .clear_mcast_m2p(&clear) + .await + .context("failed to clear stale M2P from sled")?; + } + // Already converged. + _ => {} + } + + Ok(()) +} + +/// Converge a single sled's forwarding entries for one group. +/// +/// When the group is active, computes desired next hops (all member +/// sleds except this one) and updates only if the current state +/// differs. When inactive, clears any stale entries. +async fn converge_forwarding( + client: &sled_agent_client::Client, + sled_id: SledUuid, + params: &GroupConvergenceParams<'_>, +) -> Result<(), anyhow::Error> { + let found = client + .list_mcast_fwd() + .await + .context("failed to list forwarding on sled")? + .into_inner(); + + let current_entry = found.iter().find(|f| f.underlay == params.underlay_ip); + + if !params.group_is_active { + if current_entry.is_some() { + let clear = ClearMcastForwarding { underlay: params.underlay_ip }; + client + .clear_mcast_fwd(&clear) + .await + .context("failed to clear stale forwarding from sled")?; + } + return Ok(()); + } + + let desired_next_hops: Vec = params + .member_sled_ips + .iter() + .filter(|(id, _)| *id != sled_id) + .map(|(_, ip)| McastForwardingNextHop { + next_hop: *ip, + replication: McastReplication::Underlay, + filter: McastSourceFilter { + mode: McastFilterMode::Exclude, + sources: Vec::new(), + }, + }) + .collect(); + + // Order-insensitive comparison: OPTE may return next hops in a different + // order than we build them, so a naive Vec comparison would cause spurious + // clear+set cycles on every reconciliation pass.
+ let needs_update = match current_entry { + Some(f) if f.next_hops.len() == desired_next_hops.len() => { + !desired_next_hops.iter().all(|d| f.next_hops.contains(d)) + } + Some(_) => true, + // Always create the entry when the group is active; even an + // empty next-hops list signals to OPTE that the underlay + // address is known. + None => true, + }; + + if needs_update { + // OPTE's set_mcast_fwd handler is additive: it inserts next + // hops but never removes stale ones. Clear first so the + // subsequent set produces an exact replacement. + if current_entry.is_some() { + let clear = ClearMcastForwarding { underlay: params.underlay_ip }; + client + .clear_mcast_fwd(&clear) + .await + .context("failed to clear forwarding before update")?; + } + let desired_fwd = McastForwardingEntry { + underlay: params.underlay_ip, + next_hops: desired_next_hops, + }; + client + .set_mcast_fwd(&desired_fwd) + .await + .context("failed to set forwarding on sled")?; + } + + Ok(()) +} diff --git a/nexus/tests/integration_tests/multicast/instances.rs b/nexus/tests/integration_tests/multicast/instances.rs index 245e284248e..521d85d0405 100644 --- a/nexus/tests/integration_tests/multicast/instances.rs +++ b/nexus/tests/integration_tests/multicast/instances.rs @@ -377,7 +377,7 @@ async fn test_multicast_group_attach_conflicts( } #[nexus_test] -async fn test_multicast_group_attach_limits( +async fn test_multicast_group_attach_multiple( cptestctx: &ControlPlaneTestContext, ) { let client = &cptestctx.external_client; @@ -390,14 +390,8 @@ async fn test_multicast_group_attach_limits( ) .await; - // Group names for implicit groups (implicitly created when first member joins) - let group_names = [ - "limit-test-group-0", - "limit-test-group-1", - "limit-test-group-2", - "limit-test-group-3", - "limit-test-group-4", - ]; + let group_names = + ["limit-test-group-0", "limit-test-group-1", "limit-test-group-2"]; // Create instance first (groups will be implicitly created when attached) let 
instance = instance_for_multicast_groups( @@ -409,8 +403,8 @@ async fn test_multicast_group_attach_limits( ) .await; - // Attach instance to 3 groups (implicitly creates each group) - let multicast_group_names = &group_names[0..3]; + // Attach instance to multiple groups (implicitly creates each group) + let multicast_group_names = &group_names; for group_name in multicast_group_names { multicast_group_attach( cptestctx, @@ -585,29 +579,23 @@ async fn test_multicast_concurrent_operations( // Wait for final state to be consistent (should still have 2 members) wait_for_member_count(client, "concurrent-test-group", 2).await; - // Concurrent operations during reconciler processing - - // Start a member addition and immediately follow with another operation - // This tests handling of operations that arrive while reconciler is processing - let rapid_ops_future = async { - multicast_group_attach( - cptestctx, - PROJECT_NAME, - "concurrent-instance-3", - "concurrent-test-group", - ) - .await; - // Don't wait for reconciler; immediately do another operation - multicast_group_detach( - client, - PROJECT_NAME, - "concurrent-instance-4", - "concurrent-test-group", - ) - .await; - }; - - rapid_ops_future.await; + // Back-to-back operations without waiting for reconciler between them. + // Tests that the reconciler handles state changes that arrive while it + // is still processing a previous batch. 
+ multicast_group_attach( + cptestctx, + PROJECT_NAME, + "concurrent-instance-3", + "concurrent-test-group", + ) + .await; + multicast_group_detach( + client, + PROJECT_NAME, + "concurrent-instance-4", + "concurrent-test-group", + ) + .await; // Wait for system to reach consistent final state (should have 2 members) wait_for_member_count(client, "concurrent-test-group", 2).await; @@ -896,6 +884,94 @@ async fn test_multicast_migration_scenarios( .await .expect("Group should exist in DPD after migration"); + // Verify sled-agent state after migration: the target sled should + // have the VMM subscription and M2P mapping. The source sled should + // not have any subscription for the old propolis. + { + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + + let external_group = datastore + .multicast_group_lookup_by_ip(&opctx, multicast_ip) + .await + .expect("Should look up multicast group by IP"); + + let underlay_group_id = external_group + .underlay_group_id + .expect("Active group should have underlay_group_id"); + + let underlay_group = datastore + .underlay_multicast_group_fetch(&opctx, underlay_group_id) + .await + .expect("Should fetch underlay group"); + + let underlay_ipv6 = match underlay_group.multicast_ip.ip() { + IpAddr::V6(v6) => v6, + other => { + panic!("Expected IPv6 underlay address, got {other}") + } + }; + + // Target sled should have the VMM subscription after the + // reconciler pushes it via verify_members. Poll because the + // reconciler may still be propagating state to the sled-agent. 
+ let post_info = nexus + .active_instance_info(&instance1_id, None) + .await + .unwrap() + .unwrap(); + + let target_agent = cptestctx + .sled_agents + .iter() + .find(|sa| sa.sled_agent_id() == target_sled) + .unwrap() + .sled_agent(); + + wait_for_condition_with_reconciler( + &cptestctx.lockstep_client, + || async { + let groups = target_agent.multicast_groups.lock().unwrap(); + let has_sub = + groups.get(&post_info.propolis_id).map_or(false, |g| { + g.iter().any(|m| m.group_ip == multicast_ip) + }); + if has_sub { Ok(()) } else { Err(CondCheckError::NotYet::<()>) } + }, + &POLL_INTERVAL, + &POLL_TIMEOUT, + ) + .await + .expect("Target sled should have VMM subscription after migration"); + + // Target sled should have M2P mapping. + wait_for_condition_with_reconciler( + &cptestctx.lockstep_client, + || async { + let m2p = target_agent.m2p_mappings.lock().unwrap(); + if m2p.contains(&(multicast_ip, underlay_ipv6)) { + Ok(()) + } else { + Err(CondCheckError::NotYet::<()>) + } + }, + &POLL_INTERVAL, + &POLL_TIMEOUT, + ) + .await + .expect("Target sled should have M2P mapping after migration"); + + // TODO: assert the source sled no longer holds a multicast + // subscription for the old propolis_id. On real hardware, + // VMM teardown (release_opte_ports -> PortTicket::release_inner) + // clears it. The sim does not model per-propolis cleanup on + // unregister for any of the networking maps (external_ips, + // attached_subnets, multicast_groups). 
+ } + // Case: Concurrent migrations let group2_name = "concurrent-migration-group"; @@ -911,7 +987,9 @@ async fn test_multicast_migration_scenarios( group2_name, ) .await; + wait_for_group_active(client, group2_name).await; + multicast_group_attach( cptestctx, project_name, @@ -1787,14 +1865,13 @@ async fn test_multicast_ipv6_lifecycle(cptestctx: &ControlPlaneTestContext) { instance_wait_for_state(client, instance_id, InstanceState::Running).await; wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; - let member_joined = wait_for_member_state( + wait_for_member_state( cptestctx, group_name, instance.identity.id, nexus_db_model::MulticastGroupMemberState::Joined, ) .await; - assert_eq!(member_joined.state, "Joined"); // Stop the instance - member should transition to "Left" let stop_url = @@ -1813,14 +1890,13 @@ async fn test_multicast_ipv6_lifecycle(cptestctx: &ControlPlaneTestContext) { instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; - let member_left = wait_for_member_state( + wait_for_member_state( cptestctx, group_name, instance.identity.id, nexus_db_model::MulticastGroupMemberState::Left, ) .await; - assert_eq!(member_left.state, "Left"); // Delete the instance - this should delete the group since it's the only member cleanup_instances(cptestctx, client, project_name, &["ipv6-instance"]) diff --git a/nexus/tests/integration_tests/multicast/mod.rs b/nexus/tests/integration_tests/multicast/mod.rs index cc3c947008c..742ea6df22d 100644 --- a/nexus/tests/integration_tests/multicast/mod.rs +++ b/nexus/tests/integration_tests/multicast/mod.rs @@ -69,6 +69,7 @@ mod pool_selection; // Timeout constants for test operations const POLL_INTERVAL: Duration = Duration::from_millis(50); +const POLL_TIMEOUT: Duration = Duration::from_secs(30); const MULTICAST_OPERATION_TIMEOUT: Duration = Duration::from_secs(120); /// Generic helper for PUT upsert requests that return 201 
Created. @@ -211,6 +212,11 @@ pub(crate) async fn create_multicast_ip_pool_v6( pool } +/// The reconciler can take longer than the default 10s timeout under +/// parallel test load, especially after the CRDB graceful-shutdown +/// change (eb8ae2f8f). 30s matches other heavy background task timeouts. +const RECONCILER_ACTIVATION_TIMEOUT: Duration = Duration::from_secs(30); + /// Waits for the multicast group reconciler to complete. /// /// This wraps wait_background_task with the correct task name. @@ -231,9 +237,10 @@ pub(crate) async fn wait_for_multicast_reconciler( pub(crate) async fn activate_multicast_reconciler( lockstep_client: &ClientTestContext, ) -> nexus_lockstep_client::types::BackgroundTask { - nexus_test_utils::background::activate_background_task( + nexus_test_utils::background::activate_background_task_with_timeout( lockstep_client, "multicast_reconciler", + RECONCILER_ACTIVATION_TIMEOUT, ) .await } @@ -307,8 +314,8 @@ where /// This function verifies that inventory has SP data for EVERY in-service sled, /// not just that inventory completed. /// -/// This is required for multicast member operations which map `sled_id` → `sp_slot` -/// → switch ports via inventory. +/// This is required for multicast member operations which map `sled_id` to +/// `sp_slot` to switch ports via inventory. 
pub(crate) async fn ensure_inventory_ready( cptestctx: &ControlPlaneTestContext, ) { @@ -358,9 +365,8 @@ pub(crate) async fn ensure_inventory_ready( let mut missing_sleds = Vec::new(); for sled in &sleds { let has_sp = inventory.sps.iter().any(|(bb, _)| { - (bb.serial_number == sled.serial_number() - && bb.part_number == sled.part_number()) - || bb.serial_number == sled.serial_number() + bb.serial_number == sled.serial_number() + && bb.part_number == sled.part_number() }); if !has_sp { @@ -385,8 +391,8 @@ pub(crate) async fn ensure_inventory_ready( Err(CondCheckError::::NotYet) } }, - &Duration::from_millis(500), // Check every 500ms - &Duration::from_secs(120), // Wait up to 120s + &Duration::from_millis(500), + &MULTICAST_OPERATION_TIMEOUT, ) .await { @@ -448,8 +454,8 @@ pub(crate) async fn ensure_dpd_ready(cptestctx: &ControlPlaneTestContext) { } } }, - &Duration::from_millis(200), // Check every 200ms - &Duration::from_secs(30), // Wait up to 30 seconds for switches + &Duration::from_millis(200), + &POLL_TIMEOUT, ) .await { @@ -1067,19 +1073,16 @@ pub(crate) async fn wait_for_group_deleted( lockstep_client, || async { let group_url = mcast_group_url(group_name); - match NexusRequest::object_get(client, &group_url) - .authn_as(AuthnMode::PrivilegedUser) - .execute() - .await - { - Ok(response) => { - if response.status == StatusCode::NOT_FOUND { - Ok(()) - } else { - Err(CondCheckError::<()>::NotYet) - } - } - Err(_) => Ok(()), // Assume 404 or similar error means deleted + let response = NexusRequest::new( + RequestBuilder::new(client, Method::GET, &group_url) + .expect_status(Some(StatusCode::NOT_FOUND)), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await; + match response { + Ok(_) => Ok(()), + Err(_) => Err(CondCheckError::<()>::NotYet), } }, &POLL_INTERVAL, diff --git a/nexus/tests/integration_tests/multicast/networking_integration.rs b/nexus/tests/integration_tests/multicast/networking_integration.rs index 3b28892ef82..6103633e9ef 100644 --- 
a/nexus/tests/integration_tests/multicast/networking_integration.rs +++ b/nexus/tests/integration_tests/multicast/networking_integration.rs @@ -8,10 +8,14 @@ //! //! - External IPs: Instances with ephemeral/floating IPs can join multicast groups //! - Floating IP attach/detach: Multicast membership unaffected by IP changes +//! - Sled-agent M2P/forwarding propagation on member join and group deletion +//! - Per-VMM multicast subscriptions via sled-agent -use std::time::Duration; +use std::net::IpAddr; use http::{Method, StatusCode}; +use nexus_db_lookup::LookupPath; +use nexus_db_queries::context::OpContext; use nexus_test_utils::http_testing::{AuthnMode, NexusRequest, RequestBuilder}; use nexus_test_utils::resource_helpers::create_floating_ip; use nexus_test_utils::resource_helpers::{ @@ -30,6 +34,7 @@ use omicron_common::api::external::{ ByteCount, IdentityMetadataCreateParams, Instance, InstanceCpuCount, NameOrId, }; +use omicron_nexus::TestInterfaces; use omicron_test_utils::dev::poll::{CondCheckError, wait_for_condition}; use omicron_uuid_kinds::{GenericUuid, InstanceUuid}; @@ -637,13 +642,13 @@ async fn test_multicast_with_floating_ip_basic( Err(CondCheckError::::NotYet) } }, - &Duration::from_millis(200), - &Duration::from_secs(30), + &POLL_INTERVAL, + &POLL_TIMEOUT, ) .await .unwrap_or_else(|e| { panic!( - "instance did not show floating IP {} as attached within 30s: {e:?}", + "instance did not show floating IP {} as attached within {POLL_TIMEOUT:?}: {e:?}", floating_ip.ip ) }); @@ -694,13 +699,13 @@ async fn test_multicast_with_floating_ip_basic( Err(CondCheckError::::NotYet) } }, - &Duration::from_millis(200), - &Duration::from_secs(30), + &POLL_INTERVAL, + &POLL_TIMEOUT, ) .await .unwrap_or_else(|e| { panic!( - "instance still showed floating IP {} as attached after 30s: {e:?}", + "instance still showed floating IP {} as attached after {POLL_TIMEOUT:?}: {e:?}", floating_ip.ip ) }); @@ -713,3 +718,987 @@ async fn test_multicast_with_floating_ip_basic( 
cleanup_instances(cptestctx, client, project_name, &[instance_name]).await;
     wait_for_group_deleted(cptestctx, group_name).await;
 }
+
+/// Verify that when an instance joins a multicast group, the reconciler
+/// pushes M2P mappings, forwarding entries, and per-VMM subscriptions
+/// to the sim sled-agent. Also verify cleanup on instance deletion.
+///
+/// Sled-agent state is polled (driving the reconciler between checks)
+/// rather than asserted once, because the push to the sled-agent may
+/// land on a reconciler pass after the member is reported "Joined".
+#[nexus_test]
+async fn test_multicast_sled_agent_m2p_and_subscriptions(
+    cptestctx: &nexus_test_utils::ControlPlaneTestContext<
+        omicron_nexus::Server,
+    >,
+) {
+    let client = &cptestctx.external_client;
+    let project_name = "sled-agent-mcast-project";
+    let group_name = "sled-agent-mcast-group";
+    let instance_name = "sled-agent-mcast-instance";
+
+    ops::join3(
+        create_project(client, project_name),
+        create_default_ip_pools(client),
+        create_multicast_ip_pool_with_range(
+            client,
+            "sled-agent-mcast-pool",
+            (224, 150, 0, 1),
+            (224, 150, 0, 255),
+        ),
+    )
+    .await;
+
+    ensure_multicast_test_ready(cptestctx).await;
+
+    // Create and start an instance.
+    let instance_params = InstanceCreate {
+        identity: IdentityMetadataCreateParams {
+            name: instance_name.parse().unwrap(),
+            description: "Instance for sled-agent multicast test".to_string(),
+        },
+        ncpus: InstanceCpuCount::try_from(1).unwrap(),
+        memory: ByteCount::from_gibibytes_u32(1),
+        hostname: instance_name.parse().unwrap(),
+        user_data: vec![],
+        ssh_public_keys: None,
+        network_interfaces: InstanceNetworkInterfaceAttachment::DefaultIpv4,
+        external_ips: vec![],
+        multicast_groups: vec![],
+        disks: vec![],
+        boot_disk: None,
+        cpu_platform: None,
+        start: true,
+        auto_restart_policy: Default::default(),
+        anti_affinity_groups: Vec::new(),
+    };
+
+    let instance_url = format!("/v1/instances?project={project_name}");
+    let instance: Instance =
+        object_create(client, &instance_url, &instance_params).await;
+    let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id);
+
+    instance_wait_for_running_with_simulation(cptestctx, instance_id).await;
+    wait_for_multicast_reconciler(&cptestctx.lockstep_client).await;
+
+    // Attach instance to a multicast group.
+    multicast_group_attach(cptestctx, project_name, instance_name, group_name)
+        .await;
+    wait_for_group_active(client, group_name).await;
+
+    // Wait for the member to reach "Joined" state (reconciler processes it).
+    wait_for_member_state(
+        cptestctx,
+        group_name,
+        instance.identity.id,
+        nexus_db_model::MulticastGroupMemberState::Joined,
+    )
+    .await;
+
+    // Look up the underlay multicast IPv6 address for verification.
+    let nexus = &cptestctx.server.server_context().nexus;
+    let datastore = nexus.datastore();
+    let opctx =
+        OpContext::for_tests(cptestctx.logctx.log.clone(), datastore.clone());
+
+    let group_view = get_multicast_group(client, group_name).await;
+    let multicast_ip = group_view.multicast_ip;
+
+    let external_group = datastore
+        .multicast_group_lookup_by_ip(&opctx, multicast_ip)
+        .await
+        .expect("Should look up multicast group by IP");
+
+    let underlay_group_id = external_group
+        .underlay_group_id
+        .expect("Active group should have underlay_group_id");
+
+    let underlay_group = datastore
+        .underlay_multicast_group_fetch(&opctx, underlay_group_id)
+        .await
+        .expect("Should fetch underlay multicast group");
+
+    let underlay_ipv6 = match underlay_group.multicast_ip.ip() {
+        IpAddr::V6(v6) => v6,
+        other => panic!("Expected IPv6 underlay address, got {other}"),
+    };
+
+    // Wait for the M2P mapping on the sim sled-agent.
+    let sled_agent = cptestctx.first_sled_agent();
+    wait_for_condition_with_reconciler(
+        &cptestctx.lockstep_client,
+        || async {
+            let m2p = sled_agent.m2p_mappings.lock().unwrap();
+            if m2p.contains(&(multicast_ip, underlay_ipv6)) {
+                Ok(())
+            } else {
+                Err(CondCheckError::<()>::NotYet)
+            }
+        },
+        &POLL_INTERVAL,
+        &MULTICAST_OPERATION_TIMEOUT,
+    )
+    .await
+    .expect("Sled-agent should have M2P mapping after member joins");
+
+    // Wait for the forwarding entry on the sim sled-agent.
+    // With a single sled, the forwarding entry exists but has no next hops
+    // (no other sleds to forward to).
+    wait_for_condition_with_reconciler(
+        &cptestctx.lockstep_client,
+        || async {
+            let fwd = sled_agent.mcast_fwd.lock().unwrap();
+            match fwd.get(&underlay_ipv6) {
+                Some(next_hops) if next_hops.is_empty() => Ok(()),
+                _ => Err(CondCheckError::<()>::NotYet),
+            }
+        },
+        &POLL_INTERVAL,
+        &MULTICAST_OPERATION_TIMEOUT,
+    )
+    .await
+    .expect(
+        "Single-sled setup should have a forwarding entry with empty \
+         next_hops",
+    );
+
+    // Wait for the per-VMM multicast subscription on the sim sled-agent.
+    let info = nexus
+        .active_instance_info(&instance_id, None)
+        .await
+        .unwrap()
+        .expect("Running instance should have active info");
+
+    wait_for_condition_with_reconciler(
+        &cptestctx.lockstep_client,
+        || async {
+            let groups = sled_agent.multicast_groups.lock().unwrap();
+            match groups.get(&info.propolis_id) {
+                Some(vmm_groups)
+                    if vmm_groups
+                        .iter()
+                        .any(|m| m.group_ip == multicast_ip) =>
+                {
+                    Ok(())
+                }
+                _ => Err(CondCheckError::<()>::NotYet),
+            }
+        },
+        &POLL_INTERVAL,
+        &MULTICAST_OPERATION_TIMEOUT,
+    )
+    .await
+    .expect("VMM should be subscribed to the multicast group");
+
+    // Stop the instance. The member transitions "Joined" -> "Left".
+    let stop_url =
+        format!("/v1/instances/{instance_name}/stop?project={project_name}");
+    NexusRequest::new(
+        RequestBuilder::new(client, Method::POST, &stop_url)
+            .body(None as Option<&serde_json::Value>)
+            .expect_status(Some(StatusCode::ACCEPTED)),
+    )
+    .authn_as(AuthnMode::PrivilegedUser)
+    .execute()
+    .await
+    .expect("Should stop instance");
+
+    wait_for_instance_stopped(cptestctx, client, instance_id, instance_name)
+        .await;
+
+    wait_for_member_state(
+        cptestctx,
+        group_name,
+        instance.identity.id,
+        nexus_db_model::MulticastGroupMemberState::Left,
+    )
+    .await;
+
+    // Per-VMM subscription cleanup after stop is not asserted here.
+    // In production, destroying the VMM tears down the OPTE port, which
+    // implicitly removes multicast subscriptions. The reconciler's
+    // unsubscribe path correctly skips when the propolis_id is gone
+    // (matching production semantics where the port no longer exists).
+    //
+    // V2P follows the same pattern: sled-agent cleanup is keyed by
+    // network identity, not VMM identity.
+
+    // M2P and forwarding should be cleared since there are no "Joined"
+    // members remaining.
+    wait_for_condition_with_reconciler(
+        &cptestctx.lockstep_client,
+        || async {
+            let m2p = sled_agent.m2p_mappings.lock().unwrap();
+            if !m2p.contains(&(multicast_ip, underlay_ipv6)) {
+                Ok(())
+            } else {
+                Err(CondCheckError::<()>::NotYet)
+            }
+        },
+        &POLL_INTERVAL,
+        &MULTICAST_OPERATION_TIMEOUT,
+    )
+    .await
+    .expect("M2P should be cleared when no Joined members remain");
+
+    // Forwarding should also be cleared when no "Joined" members remain.
+    wait_for_condition_with_reconciler(
+        &cptestctx.lockstep_client,
+        || async {
+            let fwd = sled_agent.mcast_fwd.lock().unwrap();
+            if !fwd.contains_key(&underlay_ipv6) {
+                Ok(())
+            } else {
+                Err(CondCheckError::<()>::NotYet)
+            }
+        },
+        &POLL_INTERVAL,
+        &MULTICAST_OPERATION_TIMEOUT,
+    )
+    .await
+    .expect("Forwarding should be cleared when no Joined members remain");
+
+    // Delete the instance, which should trigger group deletion.
+    cleanup_instances(cptestctx, client, project_name, &[instance_name]).await;
+    wait_for_group_deleted(cptestctx, group_name).await;
+
+    // Verify M2P and forwarding are cleared.
+    {
+        let m2p = sled_agent.m2p_mappings.lock().unwrap();
+        assert!(
+            !m2p.contains(&(multicast_ip, underlay_ipv6)),
+            "M2P mapping should be cleared after group deletion, got: {m2p:?}"
+        );
+    }
+    {
+        let fwd = sled_agent.mcast_fwd.lock().unwrap();
+        assert!(
+            !fwd.contains_key(&underlay_ipv6),
+            "Forwarding entry should be cleared after group deletion, \
+             got: {fwd:?}"
+        );
+    }
+}
+
+/// Verify M2P and forwarding entries propagate to all sleds, not just the
+/// hosting sled. Analogous to `test_instance_v2p_mappings` which verifies
+/// V2P mappings on all sleds.
+///
+/// Also verifies cleanup: after instance deletion, M2P and forwarding
+/// entries are removed from every sled.
+#[nexus_test(extra_sled_agents = 1)]
+async fn test_multicast_multi_sled_m2p_propagation(
+    cptestctx: &ControlPlaneTestContext,
+) {
+    let client = &cptestctx.external_client;
+    let nexus = &cptestctx.server.server_context().nexus;
+    let project_name = "multi-sled-mcast-project";
+    let group_name = "multi-sled-mcast-group";
+    let instance_name = "multi-sled-mcast-instance";
+
+    ops::join3(
+        create_project(client, project_name),
+        create_default_ip_pools(client),
+        create_multicast_ip_pool_with_range(
+            client,
+            "multi-sled-mcast-pool",
+            (224, 160, 0, 1),
+            (224, 160, 0, 255),
+        ),
+    )
+    .await;
+
+    ensure_multicast_test_ready(cptestctx).await;
+
+    // Collect all sled agents (2 total: 1 default + 1 extra).
+    // We use extra_sled_agents = 1 (not 2) because the gateway sim only
+    // provides SP data for the two well-known sled UUIDs. A 3rd sled with
+    // a random UUID would have no SP entry, causing inventory readiness
+    // to time out. Two sleds is sufficient to verify cross-sled propagation.
+    let all_sled_agents: Vec<_> =
+        cptestctx.sled_agents.iter().map(|sa| sa.sled_agent()).collect();
+    assert_eq!(all_sled_agents.len(), 2, "expected 2 sled agents");
+
+    // Create and start an instance.
+    let instance = instance_for_multicast_groups(
+        cptestctx,
+        project_name,
+        instance_name,
+        true,
+        &[],
+    )
+    .await;
+    let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id);
+
+    instance_wait_for_running_with_simulation(cptestctx, instance_id).await;
+    wait_for_multicast_reconciler(&cptestctx.lockstep_client).await;
+
+    // Attach to a multicast group.
+    multicast_group_attach(cptestctx, project_name, instance_name, group_name)
+        .await;
+    wait_for_group_active(client, group_name).await;
+
+    wait_for_member_state(
+        cptestctx,
+        group_name,
+        instance.identity.id,
+        nexus_db_model::MulticastGroupMemberState::Joined,
+    )
+    .await;
+
+    // Look up the underlay IPv6 address for verification.
+    let datastore = nexus.datastore();
+    let opctx =
+        OpContext::for_tests(cptestctx.logctx.log.clone(), datastore.clone());
+
+    let group_view = get_multicast_group(client, group_name).await;
+    let multicast_ip = group_view.multicast_ip;
+
+    let external_group = datastore
+        .multicast_group_lookup_by_ip(&opctx, multicast_ip)
+        .await
+        .expect("Should look up multicast group by IP");
+
+    let underlay_group_id = external_group
+        .underlay_group_id
+        .expect("Active group should have underlay_group_id");
+
+    let underlay_group = datastore
+        .underlay_multicast_group_fetch(&opctx, underlay_group_id)
+        .await
+        .expect("Should fetch underlay multicast group");
+
+    let underlay_ipv6 = match underlay_group.multicast_ip.ip() {
+        IpAddr::V6(v6) => v6,
+        other => panic!("Expected IPv6 underlay address, got {other}"),
+    };
+
+    // Look up the hosting sled for subscription verification.
+    let info = nexus
+        .active_instance_info(&instance_id, None)
+        .await
+        .unwrap()
+        .expect("Running instance should have active info");
+
+    let hosting_sled_id = info.sled_id;
+
+    // M2P and forwarding are pushed to all sleds (like V2P). Any
+    // instance on any sled may send to a multicast group; without the
+    // M2P mapping OPTE's overlay layer silently drops the packet.
+    // Forwarding entries let sender sleds replicate to member sleds.
+    for (i, sled_agent) in cptestctx.sled_agents.iter().enumerate() {
+        let agent = sled_agent.sled_agent();
+
+        // Wait for M2P on every sled. The reconciler may need an
+        // additional pass after the member reaches "Joined": during
+        // reconcile_member_states, propagate_m2p_and_forwarding may
+        // see member_sleds=0 (member still "Joining" in DB), so the
+        // actual push happens in reconcile_active_groups or the next
+        // full pass.
+        wait_for_condition_with_reconciler(
+            &cptestctx.lockstep_client,
+            || async {
+                let m2p = agent.m2p_mappings.lock().unwrap();
+                if m2p.contains(&(multicast_ip, underlay_ipv6)) {
+                    Ok(())
+                } else {
+                    Err(CondCheckError::NotYet::<()>)
+                }
+            },
+            &POLL_INTERVAL,
+            &MULTICAST_OPERATION_TIMEOUT,
+        )
+        .await
+        .unwrap_or_else(|e| {
+            panic!("Sled {i} should have M2P mapping within timeout: {e:?}")
+        });
+
+        // Verify forwarding on every sled. With a single member on
+        // one sled, the hosting sled's forwarding has no next hops
+        // (local delivery via subscription). Non-hosting sleds list
+        // the hosting sled as a next hop so senders can reach it.
+        //
+        // The expected next-hop count is part of the polled condition
+        // (not a one-shot assert after the key appears) so that an
+        // entry observed before the reconciler has converged cannot
+        // fail the test.
+        let is_hosting_sled = sled_agent.sled_agent_id() == hosting_sled_id;
+        let expected_hops = if is_hosting_sled { 0 } else { 1 };
+        wait_for_condition_with_reconciler(
+            &cptestctx.lockstep_client,
+            || async {
+                let fwd = agent.mcast_fwd.lock().unwrap();
+                match fwd.get(&underlay_ipv6) {
+                    Some(hops) if hops.len() == expected_hops => Ok(()),
+                    _ => Err(CondCheckError::NotYet::<()>),
+                }
+            },
+            &POLL_INTERVAL,
+            &MULTICAST_OPERATION_TIMEOUT,
+        )
+        .await
+        .unwrap_or_else(|e| {
+            panic!(
+                "Sled {i} should have a forwarding entry with \
+                 {expected_hops} next_hop(s) within timeout: {e:?}"
+            )
+        });
+    }
+
+    // Verify per-VMM subscription on the hosting sled only.
+    // Subscriptions are member-sled-only (not all sleds).
+    let hosting_agent = cptestctx
+        .sled_agents
+        .iter()
+        .find(|sa| sa.sled_agent_id() == hosting_sled_id)
+        .unwrap()
+        .sled_agent();
+
+    wait_for_condition_with_reconciler(
+        &cptestctx.lockstep_client,
+        || async {
+            let groups = hosting_agent.multicast_groups.lock().unwrap();
+            match groups.get(&info.propolis_id) {
+                Some(vmm_groups)
+                    if vmm_groups
+                        .iter()
+                        .any(|m| m.group_ip == multicast_ip) =>
+                {
+                    Ok(())
+                }
+                _ => Err(CondCheckError::NotYet::<()>),
+            }
+        },
+        &POLL_INTERVAL,
+        &MULTICAST_OPERATION_TIMEOUT,
+    )
+    .await
+    .unwrap_or_else(|e| {
+        panic!(
+            "VMM should be subscribed to {multicast_ip} within timeout: {e:?}"
+        )
+    });
+
+    // Delete the instance, which triggers group deletion.
+    cleanup_instances(cptestctx, client, project_name, &[instance_name]).await;
+    wait_for_group_deleted(cptestctx, group_name).await;
+
+    // Verify cleanup on every sled: M2P and forwarding removed.
+    for (i, sled_agent) in all_sled_agents.iter().enumerate() {
+        wait_for_condition_with_reconciler(
+            &cptestctx.lockstep_client,
+            || async {
+                let m2p = sled_agent.m2p_mappings.lock().unwrap();
+                let fwd = sled_agent.mcast_fwd.lock().unwrap();
+                if !m2p.contains(&(multicast_ip, underlay_ipv6))
+                    && !fwd.contains_key(&underlay_ipv6)
+                {
+                    Ok(())
+                } else {
+                    Err(CondCheckError::NotYet::<()>)
+                }
+            },
+            &POLL_INTERVAL,
+            &MULTICAST_OPERATION_TIMEOUT,
+        )
+        .await
+        .unwrap_or_else(|e| {
+            panic!(
+                "Sled {i} M2P/forwarding not cleaned up within timeout: {e:?}"
+            )
+        });
+    }
+}
+
+/// Verify cross-sled forwarding when members exist on both sleds.
+///
+/// With one member on sled A and another on sled B, each sled's forwarding
+/// entry should list the other sled as its sole next hop (self-exclusion).
+/// This exercises the `.filter(|(id, _)| *id != sled_id)` logic in
+/// `converge_forwarding`.
+#[nexus_test(extra_sled_agents = 1)]
+async fn test_multicast_cross_sled_forwarding(
+    cptestctx: &ControlPlaneTestContext,
+) {
+    let client = &cptestctx.external_client;
+    let nexus = &cptestctx.server.server_context().nexus;
+    let datastore = nexus.datastore();
+    let opctx =
+        OpContext::for_tests(cptestctx.logctx.log.clone(), datastore.clone());
+    let project_name = "bidir-fwd-project";
+    let group_name = "bidir-fwd-group";
+    let instance_a_name = "bidir-instance-a";
+    let instance_b_name = "bidir-instance-b";
+
+    // Create the project, default IP pools, and a multicast pool
+    // concurrently.
+    ops::join3(
+        create_project(client, project_name),
+        create_default_ip_pools(client),
+        create_multicast_ip_pool_with_range(
+            client,
+            "bidir-fwd-pool",
+            (224, 170, 0, 1),
+            (224, 170, 0, 255),
+        ),
+    )
+    .await;
+
+    ensure_multicast_test_ready(cptestctx).await;
+
+    // "Sled A" and "sled B" are the first and second entries of
+    // `cptestctx.sled_agents`.
+    let sled_a_id = cptestctx.sled_agents[0].sled_agent_id();
+    let sled_b_id = cptestctx.sled_agents[1].sled_agent_id();
+
+    // Pin instance A to sled A by making sled B non-provisionable.
+    {
+        let (authz_sled, ..) = LookupPath::new(&opctx, datastore)
+            .sled_id(sled_b_id)
+            .lookup_for(nexus_auth::authz::Action::Modify)
+            .await
+            .expect("lookup sled B");
+        datastore
+            .sled_set_provision_policy(
+                &opctx,
+                &authz_sled,
+                nexus_types::external_api::sled::SledProvisionPolicy::NonProvisionable,
+            )
+            .await
+            .expect("set sled B non-provisionable");
+    }
+
+    let instance_a = instance_for_multicast_groups(
+        cptestctx,
+        project_name,
+        instance_a_name,
+        true,
+        &[],
+    )
+    .await;
+    let instance_a_id = InstanceUuid::from_untyped_uuid(instance_a.identity.id);
+    instance_wait_for_running_with_simulation(cptestctx, instance_a_id).await;
+
+    // Verify instance A landed on sled A.
+    let info_a = nexus
+        .active_instance_info(&instance_a_id, None)
+        .await
+        .unwrap()
+        .expect("instance A should be running");
+    assert_eq!(info_a.sled_id, sled_a_id, "instance A should be on sled A");
+
+    // Swap provisionability: sled A non-provisionable, sled B provisionable.
+    // This must happen after instance A is placed and before instance B is
+    // created, so that B can only land on sled B.
+    {
+        let (authz_sled_a, ..) = LookupPath::new(&opctx, datastore)
+            .sled_id(sled_a_id)
+            .lookup_for(nexus_auth::authz::Action::Modify)
+            .await
+            .expect("lookup sled A");
+        let (authz_sled_b, ..) = LookupPath::new(&opctx, datastore)
+            .sled_id(sled_b_id)
+            .lookup_for(nexus_auth::authz::Action::Modify)
+            .await
+            .expect("lookup sled B");
+        datastore
+            .sled_set_provision_policy(
+                &opctx,
+                &authz_sled_a,
+                nexus_types::external_api::sled::SledProvisionPolicy::NonProvisionable,
+            )
+            .await
+            .expect("set sled A non-provisionable");
+        datastore
+            .sled_set_provision_policy(
+                &opctx,
+                &authz_sled_b,
+                nexus_types::external_api::sled::SledProvisionPolicy::Provisionable,
+            )
+            .await
+            .expect("set sled B provisionable");
+    }
+
+    let instance_b = instance_for_multicast_groups(
+        cptestctx,
+        project_name,
+        instance_b_name,
+        true,
+        &[],
+    )
+    .await;
+
+    let instance_b_id = InstanceUuid::from_untyped_uuid(instance_b.identity.id);
+    instance_wait_for_running_with_simulation(cptestctx, instance_b_id).await;
+
+    // Verify instance B landed on sled B.
+    let info_b = nexus
+        .active_instance_info(&instance_b_id, None)
+        .await
+        .unwrap()
+        .expect("instance B should be running");
+
+    assert_eq!(info_b.sled_id, sled_b_id, "instance B should be on sled B");
+
+    // Both instances join the same multicast group.
+    multicast_group_attach(
+        cptestctx,
+        project_name,
+        instance_a_name,
+        group_name,
+    )
+    .await;
+
+    multicast_group_attach(
+        cptestctx,
+        project_name,
+        instance_b_name,
+        group_name,
+    )
+    .await;
+
+    wait_for_group_active(client, group_name).await;
+
+    // Wait for both members to reach "Joined".
+    for instance in [&instance_a, &instance_b] {
+        wait_for_member_state(
+            cptestctx,
+            group_name,
+            instance.identity.id,
+            nexus_db_model::MulticastGroupMemberState::Joined,
+        )
+        .await;
+    }
+
+    // Resolve underlay IPv6 for forwarding assertions.
+    let group_view = get_multicast_group(client, group_name).await;
+    let external_group = datastore
+        .multicast_group_lookup_by_ip(&opctx, group_view.multicast_ip)
+        .await
+        .expect("lookup group by IP");
+
+    let underlay_group = datastore
+        .underlay_multicast_group_fetch(
+            &opctx,
+            external_group
+                .underlay_group_id
+                .expect("active group should have underlay_group_id"),
+        )
+        .await
+        .expect("fetch underlay group");
+
+    let underlay_ipv6 = match underlay_group.multicast_ip.ip() {
+        IpAddr::V6(v6) => v6,
+        other => panic!("Expected IPv6 underlay address, got {other}"),
+    };
+
+    // Wait until each sled's forwarding entry has exactly one next hop.
+    // With two member sleds and self-exclusion, that one hop is expected
+    // to be the other sled.
+    // NOTE(review): only the hop count is checked here, not the hop's
+    // address; asserting the address would need each sled's underlay IP.
+    let agent_a = cptestctx.sled_agents[0].sled_agent();
+    let agent_b = cptestctx.sled_agents[1].sled_agent();
+
+    for (label, agent) in [("sled A", &agent_a), ("sled B", &agent_b)] {
+        wait_for_condition_with_reconciler(
+            &cptestctx.lockstep_client,
+            || async {
+                let fwd = agent.mcast_fwd.lock().unwrap();
+                match fwd.get(&underlay_ipv6) {
+                    Some(hops) if hops.len() == 1 => Ok(()),
+                    _ => Err(CondCheckError::NotYet::<()>),
+                }
+            },
+            &POLL_INTERVAL,
+            &MULTICAST_OPERATION_TIMEOUT,
+        )
+        .await
+        .unwrap_or_else(|e| {
+            panic!("{label} should have exactly 1 forwarding next_hop: {e:?}")
+        });
+    }
+
+    // Cleanup.
+    cleanup_instances(
+        cptestctx,
+        client,
+        project_name,
+        &[instance_a_name, instance_b_name],
+    )
+    .await;
+    wait_for_group_deleted(cptestctx, group_name).await;
+}
+
+/// Verify multicast state is re-established after simulated cold start.
+/// Analogous to `test_instance_start_creates_networking_state` which tests
+/// V2P re-establishment after forcibly clearing sled-agent state.
+///
+/// Steps: a) create instance, b) join multicast, c) stop instance,
+/// d) forcibly clear all sim sled-agent multicast state, e) restart
+/// instance, f) verify M2P, forwarding, and per-VMM subscriptions are
+/// re-established.
+#[nexus_test(extra_sled_agents = 1)] +async fn test_multicast_cold_start_reestablishment( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let nexus = &cptestctx.server.server_context().nexus; + let project_name = "cold-start-mcast-project"; + let group_name = "cold-start-mcast-group"; + let instance_name = "cold-start-mcast-instance"; + + ops::join3( + create_project(client, project_name), + create_default_ip_pools(client), + create_multicast_ip_pool_with_range( + client, + "cold-start-mcast-pool", + (224, 170, 0, 1), + (224, 170, 0, 255), + ), + ) + .await; + + ensure_multicast_test_ready(cptestctx).await; + + let all_sled_agents: Vec<_> = + cptestctx.sled_agents.iter().map(|sa| sa.sled_agent()).collect(); + + // Create and start an instance, join a multicast group. + let instance = instance_for_multicast_groups( + cptestctx, + project_name, + instance_name, + true, + &[], + ) + .await; + let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); + + instance_wait_for_running_with_simulation(cptestctx, instance_id).await; + wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; + + multicast_group_attach(cptestctx, project_name, instance_name, group_name) + .await; + wait_for_group_active(client, group_name).await; + + wait_for_member_state( + cptestctx, + group_name, + instance.identity.id, + nexus_db_model::MulticastGroupMemberState::Joined, + ) + .await; + + // Look up the underlay IPv6. 
+ let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.clone(), datastore.clone()); + + let group_view = get_multicast_group(client, group_name).await; + let multicast_ip = group_view.multicast_ip; + + let external_group = datastore + .multicast_group_lookup_by_ip(&opctx, multicast_ip) + .await + .expect("Should look up multicast group by IP"); + + let underlay_group_id = external_group + .underlay_group_id + .expect("Active group should have underlay_group_id"); + + let underlay_group = datastore + .underlay_multicast_group_fetch(&opctx, underlay_group_id) + .await + .expect("Should fetch underlay multicast group"); + + let underlay_ipv6 = match underlay_group.multicast_ip.ip() { + IpAddr::V6(v6) => v6, + other => panic!("Expected IPv6 underlay address, got {other}"), + }; + + // M2P and forwarding are pushed to all sleds. Verify at least the + // hosting sled has M2P before we clear state. + let pre_info = nexus + .active_instance_info(&instance_id, None) + .await + .unwrap() + .expect("Running instance should have active info"); + + let pre_hosting_agent = cptestctx + .sled_agents + .iter() + .find(|sa| sa.sled_agent_id() == pre_info.sled_id) + .unwrap() + .sled_agent(); + + wait_for_condition_with_reconciler( + &cptestctx.lockstep_client, + || async { + let m2p = pre_hosting_agent.m2p_mappings.lock().unwrap(); + if m2p.contains(&(multicast_ip, underlay_ipv6)) { + Ok(()) + } else { + Err(CondCheckError::NotYet::<()>) + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .expect("Hosting sled M2P should exist before cold start simulation"); + + // Stop the instance. 
+ let stop_url = + format!("/v1/instances/{instance_name}/stop?project={project_name}"); + NexusRequest::new( + RequestBuilder::new(client, Method::POST, &stop_url) + .body(None as Option<&serde_json::Value>) + .expect_status(Some(StatusCode::ACCEPTED)), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("Should stop instance"); + + wait_for_instance_stopped(cptestctx, client, instance_id, instance_name) + .await; + + wait_for_member_state( + cptestctx, + group_name, + instance.identity.id, + nexus_db_model::MulticastGroupMemberState::Left, + ) + .await; + + // Forcibly clear all sim sled-agent multicast state, simulating a cold + // start where sled-agents lose in-memory state. + for sled_agent in &all_sled_agents { + sled_agent.m2p_mappings.lock().unwrap().clear(); + sled_agent.mcast_fwd.lock().unwrap().clear(); + sled_agent.multicast_groups.lock().unwrap().clear(); + } + + // Restart the instance. + let start_url = + format!("/v1/instances/{instance_name}/start?project={project_name}"); + NexusRequest::new( + RequestBuilder::new(client, Method::POST, &start_url) + .body(None as Option<&serde_json::Value>) + .expect_status(Some(StatusCode::ACCEPTED)), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("Should start instance"); + + // Use `try_instance_simulate` here instead of `instance_wait_for_running_with_simulation` + // because the old VMM may still be draining from the sim collection after + // the stop. `instance_simulate` would panic if it pokes a VMM that was just + // removed; `try_instance_simulate` handles that gracefully. + wait_for_condition( + || async { + let _ = + instance_helpers::try_instance_simulate(nexus, &instance_id) + .await; + + let url = format!("/v1/instances/{instance_id}"); + let instance: Instance = NexusRequest::object_get(client, &url) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .map_err(|_| CondCheckError::<()>::NotYet)? 
+ .parsed_body() + .map_err(|_| CondCheckError::<()>::NotYet)?; + + if instance.runtime.run_state == InstanceState::Running { + Ok(()) + } else { + Err(CondCheckError::<()>::NotYet) + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .expect("Instance should reach Running after restart"); + + // Wait for the reconciler to re-establish multicast state. + wait_for_member_state( + cptestctx, + group_name, + instance.identity.id, + nexus_db_model::MulticastGroupMemberState::Joined, + ) + .await; + + // Verify M2P and forwarding re-established on all sleds. + for (i, sled_agent) in all_sled_agents.iter().enumerate() { + wait_for_condition_with_reconciler( + &cptestctx.lockstep_client, + || async { + let m2p = sled_agent.m2p_mappings.lock().unwrap(); + if m2p.contains(&(multicast_ip, underlay_ipv6)) { + Ok(()) + } else { + Err(CondCheckError::NotYet::<()>) + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .unwrap_or_else(|e| { + panic!("Sled {i} M2P not re-established within timeout: {e:?}") + }); + + wait_for_condition_with_reconciler( + &cptestctx.lockstep_client, + || async { + let fwd = sled_agent.mcast_fwd.lock().unwrap(); + if fwd.contains_key(&underlay_ipv6) { + Ok(()) + } else { + Err(CondCheckError::NotYet::<()>) + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .unwrap_or_else(|e| { + panic!( + "Sled {i} forwarding not re-established within timeout: {e:?}" + ) + }); + } + + // Verify per-VMM subscription on the hosting sled (new propolis_id + // since restart creates a new VMM). 
+ let post_info = nexus + .active_instance_info(&instance_id, None) + .await + .unwrap() + .expect("Restarted instance should have active info"); + + let post_hosting_agent = cptestctx + .sled_agents + .iter() + .find(|sa| sa.sled_agent_id() == post_info.sled_id) + .unwrap() + .sled_agent(); + + wait_for_condition_with_reconciler( + &cptestctx.lockstep_client, + || async { + let groups = post_hosting_agent.multicast_groups.lock().unwrap(); + match groups.get(&post_info.propolis_id) { + Some(vmm_groups) + if vmm_groups + .iter() + .any(|m| m.group_ip == multicast_ip) => + { + Ok(()) + } + _ => Err(CondCheckError::NotYet::<()>), + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .unwrap_or_else(|e| { + panic!( + "New VMM should be subscribed to {multicast_ip} after restart: \ + {e:?}" + ) + }); + + // Cleanup. + cleanup_instances(cptestctx, client, project_name, &[instance_name]).await; + wait_for_group_deleted(cptestctx, group_name).await; +} diff --git a/openapi/sled-agent/sled-agent-32.0.0-d78e46.json.gitstub b/openapi/sled-agent/sled-agent-32.0.0-d78e46.json.gitstub new file mode 100644 index 00000000000..52aeb33df8f --- /dev/null +++ b/openapi/sled-agent/sled-agent-32.0.0-d78e46.json.gitstub @@ -0,0 +1 @@ +a6bb64efe80fcd43559bd0171051e45550983106:openapi/sled-agent/sled-agent-32.0.0-d78e46.json diff --git a/openapi/sled-agent/sled-agent-32.0.0-d78e46.json b/openapi/sled-agent/sled-agent-33.0.0-c33810.json similarity index 96% rename from openapi/sled-agent/sled-agent-32.0.0-d78e46.json rename to openapi/sled-agent/sled-agent-33.0.0-c33810.json index d14f54e5973..58a42bf4b42 100644 --- a/openapi/sled-agent/sled-agent-32.0.0-d78e46.json +++ b/openapi/sled-agent/sled-agent-33.0.0-c33810.json @@ -7,7 +7,7 @@ "url": "https://oxide.computer", "email": "api@oxide.computer" }, - "version": "32.0.0" + "version": "33.0.0" }, "paths": { "/artifacts": { @@ -389,6 +389,162 @@ } } }, + "/networking/mcast-fwd": { + "get": { + "summary": "List multicast 
forwarding entries present on this sled.", + "operationId": "list_mcast_fwd", + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "title": "Array_of_McastForwardingEntry", + "type": "array", + "items": { + "$ref": "#/components/schemas/McastForwardingEntry" + } + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + }, + "put": { + "summary": "Set multicast forwarding entries for an underlay address.", + "operationId": "set_mcast_fwd", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/McastForwardingEntry" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + }, + "delete": { + "summary": "Clear multicast forwarding entries for an underlay address.", + "operationId": "clear_mcast_fwd", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ClearMcastForwarding" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/networking/mcast-m2p": { + "get": { + "summary": "List M2P mappings present on this sled.", + "operationId": "list_mcast_m2p", + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "title": "Array_of_Mcast2PhysMapping", + "type": "array", + "items": { + "$ref": "#/components/schemas/Mcast2PhysMapping" + } + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + }, + "put": { + "summary": "Set a 
multicast-to-physical (M2P) mapping in OPTE.", + "operationId": "set_mcast_m2p", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Mcast2PhysMapping" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + }, + "delete": { + "summary": "Clear a multicast-to-physical (M2P) mapping in OPTE.", + "operationId": "clear_mcast_m2p", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ClearMcast2Phys" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/omicron-config": { "put": { "operationId": "omicron_config_put", @@ -2310,7 +2466,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/InstanceMulticastBody" + "$ref": "#/components/schemas/InstanceMulticastMembership" } } }, @@ -2344,7 +2500,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/InstanceMulticastBody" + "$ref": "#/components/schemas/InstanceMulticastMembership" } } }, @@ -3992,6 +4148,40 @@ } ] }, + "ClearMcast2Phys": { + "description": "Clear a mapping from an overlay multicast group to an underlay multicast address.", + "type": "object", + "properties": { + "group": { + "description": "Overlay multicast group address.", + "type": "string", + "format": "ip" + }, + "underlay": { + "description": "Underlay IPv6 multicast address. 
See [`Mcast2PhysMapping::underlay`].", + "type": "string", + "format": "ipv6" + } + }, + "required": [ + "group", + "underlay" + ] + }, + "ClearMcastForwarding": { + "description": "Clear all forwarding entries for an underlay multicast address.", + "type": "object", + "properties": { + "underlay": { + "description": "Underlay IPv6 multicast address. See [`Mcast2PhysMapping::underlay`].", + "type": "string", + "format": "ipv6" + } + }, + "required": [ + "underlay" + ] + }, "CombineError": { "type": "string", "enum": [ @@ -5793,35 +5983,6 @@ "src_propolis_addr" ] }, - "InstanceMulticastBody": { - "description": "Request body for multicast group operations.", - "oneOf": [ - { - "type": "object", - "properties": { - "join": { - "$ref": "#/components/schemas/InstanceMulticastMembership" - } - }, - "required": [ - "join" - ], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "leave": { - "$ref": "#/components/schemas/InstanceMulticastMembership" - } - }, - "required": [ - "leave" - ], - "additionalProperties": false - } - ] - }, "InstanceMulticastMembership": { "description": "Represents a multicast group membership for an instance.\n\nIntroduced in v7.", "type": "object", @@ -6643,6 +6804,151 @@ "minimum": 1, "maximum": 32 }, + "Mcast2PhysMapping": { + "description": "Mapping from an overlay multicast group to an underlay multicast address.\n\nThe underlay address must be within the underlay multicast subnet (ff04::/64). 
This invariant is enforced by mapping in Nexus, not validated at this layer.", + "type": "object", + "properties": { + "group": { + "description": "Overlay multicast group address.", + "type": "string", + "format": "ip" + }, + "underlay": { + "description": "Underlay IPv6 multicast address (ff04::/64).", + "type": "string", + "format": "ipv6" + } + }, + "required": [ + "group", + "underlay" + ] + }, + "McastFilterMode": { + "description": "Filter mode for multicast source filtering.", + "oneOf": [ + { + "description": "Accept only packets from listed sources (SSM).", + "type": "string", + "enum": [ + "include" + ] + }, + { + "description": "Accept packets from all sources except those listed. With an empty sources list this is any-source multicast (ASM).", + "type": "string", + "enum": [ + "exclude" + ] + } + ] + }, + "McastForwardingEntry": { + "description": "Forwarding entry for an underlay multicast address, specifying which next hops should receive replicated packets.", + "type": "object", + "properties": { + "next_hops": { + "description": "Next hops with replication and source filter configuration.", + "type": "array", + "items": { + "$ref": "#/components/schemas/McastForwardingNextHop" + } + }, + "underlay": { + "description": "Underlay IPv6 multicast address. 
See [`Mcast2PhysMapping::underlay`].", + "type": "string", + "format": "ipv6" + } + }, + "required": [ + "next_hops", + "underlay" + ] + }, + "McastForwardingNextHop": { + "description": "A forwarding next hop with replication mode and aggregated source filter.", + "type": "object", + "properties": { + "filter": { + "description": "Aggregated source filter for this destination.", + "allOf": [ + { + "$ref": "#/components/schemas/McastSourceFilter" + } + ] + }, + "next_hop": { + "description": "Unicast IPv6 address of the destination sled.", + "type": "string", + "format": "ipv6" + }, + "replication": { + "description": "Replication mode for this next hop.", + "allOf": [ + { + "$ref": "#/components/schemas/McastReplication" + } + ] + } + }, + "required": [ + "filter", + "next_hop", + "replication" + ] + }, + "McastReplication": { + "description": "Replication mode for multicast forwarding.", + "oneOf": [ + { + "description": "Replicate to front panel ports (egress to external networks).", + "type": "string", + "enum": [ + "external" + ] + }, + { + "description": "Replicate to sled underlay ports.", + "type": "string", + "enum": [ + "underlay" + ] + }, + { + "description": "Replicate to both external and underlay ports.", + "type": "string", + "enum": [ + "both" + ] + } + ] + }, + "McastSourceFilter": { + "description": "Source filter for multicast forwarding.", + "type": "object", + "properties": { + "mode": { + "description": "Filter mode.", + "allOf": [ + { + "$ref": "#/components/schemas/McastFilterMode" + } + ] + }, + "sources": { + "description": "Source addresses to include or exclude.", + "type": "array", + "items": { + "type": "string", + "format": "ip" + } + } + }, + "required": [ + "mode", + "sources" + ] + }, "Measurement": { "description": "An RoT provided measurement which represents a digest of some component in the trusted computing base (TCB) for the attestor.", "oneOf": [ diff --git a/openapi/sled-agent/sled-agent-latest.json 
b/openapi/sled-agent/sled-agent-latest.json index b62f1c66560..df2f3f649c3 120000 --- a/openapi/sled-agent/sled-agent-latest.json +++ b/openapi/sled-agent/sled-agent-latest.json @@ -1 +1 @@ -sled-agent-32.0.0-d78e46.json \ No newline at end of file +sled-agent-33.0.0-c33810.json \ No newline at end of file diff --git a/sled-agent/api/src/lib.rs b/sled-agent/api/src/lib.rs index 5c8859e4c96..2485d74c803 100644 --- a/sled-agent/api/src/lib.rs +++ b/sled-agent/api/src/lib.rs @@ -19,6 +19,10 @@ use omicron_common::api::internal::{ SledIdentifiers, VirtualNetworkInterfaceHost, }, }; +use sled_agent_types_versions::latest::multicast::{ + ClearMcast2Phys, ClearMcastForwarding, Mcast2PhysMapping, + McastForwardingEntry, +}; use sled_agent_types_versions::{ latest, v1, v4, v6, v7, v9, v10, v11, v12, v14, v16, v17, v18, v20, v22, v24, v25, v26, v29, v30, v31, @@ -38,6 +42,7 @@ api_versions!([ // | example for the next person. // v // (next_int, IDENT), + (33, MCAST_M2P_FORWARDING), (32, MAKE_ALL_EXTERNAL_IP_FIELDS_OPTIONAL), (31, ADD_ICMPV6_FIREWALL_SUPPORT), (30, STRONGER_BGP_UNNUMBERED_TYPES), @@ -630,25 +635,79 @@ pub trait SledAgentApi { #[endpoint { method = PUT, path = "/vmms/{propolis_id}/multicast-group", - versions = VERSION_MULTICAST_SUPPORT.., + versions = VERSION_MCAST_M2P_FORWARDING.., }] async fn vmm_join_multicast_group( rqctx: RequestContext, path_params: Path, - body: TypedBody, + body: TypedBody, ) -> Result; #[endpoint { method = DELETE, path = "/vmms/{propolis_id}/multicast-group", - versions = VERSION_MULTICAST_SUPPORT.., + versions = VERSION_MCAST_M2P_FORWARDING.., }] async fn vmm_leave_multicast_group( rqctx: RequestContext, path_params: Path, - body: TypedBody, + body: TypedBody, ) -> Result; + /// Join a multicast group. + /// + /// Accepts a tagged `InstanceMulticastBody` request. + /// Superseded in MCAST_M2P_FORWARDING. 
+ #[endpoint { + operation_id = "vmm_join_multicast_group", + method = PUT, + path = "/vmms/{propolis_id}/multicast-group", + versions = VERSION_MULTICAST_SUPPORT..VERSION_MCAST_M2P_FORWARDING, + }] + async fn vmm_join_multicast_group_v7( + rqctx: RequestContext, + path_params: Path, + body: TypedBody, + ) -> Result { + let body = body.try_map(|b| match b { + v7::instance::InstanceMulticastBody::Join(m) => Ok(m), + v7::instance::InstanceMulticastBody::Leave(_) => { + Err(HttpError::for_bad_request( + None, + "Join endpoint cannot process Leave operations".to_string(), + )) + } + })?; + Self::vmm_join_multicast_group(rqctx, path_params, body).await + } + + /// Leave a multicast group. + /// + /// Accepts a tagged `InstanceMulticastBody` request. + /// Superseded in MCAST_M2P_FORWARDING. + #[endpoint { + operation_id = "vmm_leave_multicast_group", + method = DELETE, + path = "/vmms/{propolis_id}/multicast-group", + versions = VERSION_MULTICAST_SUPPORT..VERSION_MCAST_M2P_FORWARDING, + }] + async fn vmm_leave_multicast_group_v7( + rqctx: RequestContext, + path_params: Path, + body: TypedBody, + ) -> Result { + let body = body.try_map(|b| match b { + v7::instance::InstanceMulticastBody::Leave(m) => Ok(m), + v7::instance::InstanceMulticastBody::Join(_) => { + Err(HttpError::for_bad_request( + None, + "Leave endpoint cannot process Join operations".to_string(), + )) + } + })?; + Self::vmm_leave_multicast_group(rqctx, path_params, body).await + } + #[endpoint { method = PUT, path = "/disks/{disk_id}", @@ -805,6 +864,70 @@ pub trait SledAgentApi { rqctx: RequestContext, ) -> Result>, HttpError>; + /// Set a multicast-to-physical (M2P) mapping in OPTE. + #[endpoint { + method = PUT, + path = "/networking/mcast-m2p", + versions = VERSION_MCAST_M2P_FORWARDING.., + }] + async fn set_mcast_m2p( + rqctx: RequestContext, + body: TypedBody, + ) -> Result; + + /// Clear a multicast-to-physical (M2P) mapping in OPTE. 
+ #[endpoint { + method = DELETE, + path = "/networking/mcast-m2p", + versions = VERSION_MCAST_M2P_FORWARDING.., + }] + async fn clear_mcast_m2p( + rqctx: RequestContext, + body: TypedBody, + ) -> Result; + + /// Set multicast forwarding entries for an underlay address. + #[endpoint { + method = PUT, + path = "/networking/mcast-fwd", + versions = VERSION_MCAST_M2P_FORWARDING.., + }] + async fn set_mcast_fwd( + rqctx: RequestContext, + body: TypedBody, + ) -> Result; + + /// Clear multicast forwarding entries for an underlay address. + #[endpoint { + method = DELETE, + path = "/networking/mcast-fwd", + versions = VERSION_MCAST_M2P_FORWARDING.., + }] + async fn clear_mcast_fwd( + rqctx: RequestContext, + body: TypedBody, + ) -> Result; + + /// List M2P mappings present on this sled. + #[endpoint { + method = GET, + path = "/networking/mcast-m2p", + versions = VERSION_MCAST_M2P_FORWARDING.., + }] + async fn list_mcast_m2p( + rqctx: RequestContext, + ) -> Result>, HttpError>; + + /// List multicast forwarding entries present on this sled. 
+ #[endpoint { + method = GET, + path = "/networking/mcast-fwd", + versions = VERSION_MCAST_M2P_FORWARDING.., + }] + async fn list_mcast_fwd( + rqctx: RequestContext, + ) -> Result>, HttpError>; + #[endpoint { method = POST, path = "/switch-ports", diff --git a/sled-agent/src/bootstrap/early_networking.rs b/sled-agent/src/bootstrap/early_networking.rs index ade0edbe289..c88a17819b3 100644 --- a/sled-agent/src/bootstrap/early_networking.rs +++ b/sled-agent/src/bootstrap/early_networking.rs @@ -697,7 +697,7 @@ impl<'a> EarlyNetworkSetup<'a> { fanout: config.max_paths.as_nonzero_u8(), }; - if let Err(e) = mgd.bgp_apply_v2(&request).await { + if let Err(e) = mgd.bgp_apply(&request).await { error!( self.log, "BGP peer configuration failed"; @@ -706,7 +706,7 @@ impl<'a> EarlyNetworkSetup<'a> { ); } - if let Err(e) = mgd.update_rib_bestpath_fanout(&fanout).await { + if let Err(e) = mgd.update_bestpath_fanout(&fanout).await { error!( self.log, "error while updating bestpath fanout"; diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index d359b2be89d..3eb78a4e43f 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -44,12 +44,16 @@ use sled_agent_types::disk::{DiskEnsureBody, DiskPathParam}; use sled_agent_types::early_networking::EarlyNetworkConfigEnvelope; use sled_agent_types::firewall_rules::VpcFirewallRulesEnsureBody; use sled_agent_types::instance::{ - InstanceEnsureBody, InstanceExternalIpBody, InstanceMulticastBody, + InstanceEnsureBody, InstanceExternalIpBody, InstanceMulticastMembership, VmmIssueDiskSnapshotRequestBody, VmmIssueDiskSnapshotRequestPathParam, VmmIssueDiskSnapshotRequestResponse, VmmPathParam, VmmPutStateBody, VmmPutStateResponse, VmmUnregisterResponse, VpcPathParam, }; use sled_agent_types::inventory::{Inventory, OmicronSledConfig}; +use sled_agent_types::multicast::{ + ClearMcast2Phys, ClearMcastForwarding, Mcast2PhysMapping, + McastForwardingEntry, +}; use 
sled_agent_types::probes::ProbeSet; use sled_agent_types::rot::{ Attestation, CertificateChain, MeasurementLog, Nonce, RotPathParams, @@ -709,14 +713,14 @@ impl SledAgentApi for SledAgentImpl { async fn vmm_join_multicast_group( rqctx: RequestContext, path_params: Path, - body: TypedBody, + body: TypedBody, ) -> Result { let sa = rqctx.context(); let id = path_params.into_inner().propolis_id; - let body_args = body.into_inner(); + let membership = body.into_inner(); sa.latencies() .instrument_dropshot_handler(&rqctx, async { - sa.instance_join_multicast_group(id, &body_args).await?; + sa.instance_join_multicast_group(id, &membership).await?; Ok(HttpResponseUpdatedNoContent()) }) .await @@ -725,14 +729,14 @@ impl SledAgentApi for SledAgentImpl { async fn vmm_leave_multicast_group( rqctx: RequestContext, path_params: Path, - body: TypedBody, + body: TypedBody, ) -> Result { let sa = rqctx.context(); let id = path_params.into_inner().propolis_id; - let body_args = body.into_inner(); + let membership = body.into_inner(); sa.latencies() .instrument_dropshot_handler(&rqctx, async { - sa.instance_leave_multicast_group(id, &body_args).await?; + sa.instance_leave_multicast_group(id, &membership).await?; Ok(HttpResponseUpdatedNoContent()) }) .await @@ -933,6 +937,86 @@ impl SledAgentApi for SledAgentImpl { .await } + async fn set_mcast_m2p( + rqctx: RequestContext, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + let body_args = body.into_inner(); + sa.latencies() + .instrument_dropshot_handler(&rqctx, async { + sa.set_mcast_m2p(&body_args).await.map_err(Error::from)?; + Ok(HttpResponseUpdatedNoContent()) + }) + .await + } + + async fn clear_mcast_m2p( + rqctx: RequestContext, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + let body_args = body.into_inner(); + sa.latencies() + .instrument_dropshot_handler(&rqctx, async { + sa.clear_mcast_m2p(&body_args).await.map_err(Error::from)?; + Ok(HttpResponseUpdatedNoContent()) + }) + .await + } + + 
async fn set_mcast_fwd( + rqctx: RequestContext, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + let body_args = body.into_inner(); + sa.latencies() + .instrument_dropshot_handler(&rqctx, async { + sa.set_mcast_fwd(&body_args).await.map_err(Error::from)?; + Ok(HttpResponseUpdatedNoContent()) + }) + .await + } + + async fn clear_mcast_fwd( + rqctx: RequestContext, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + let body_args = body.into_inner(); + sa.latencies() + .instrument_dropshot_handler(&rqctx, async { + sa.clear_mcast_fwd(&body_args).await.map_err(Error::from)?; + Ok(HttpResponseUpdatedNoContent()) + }) + .await + } + + async fn list_mcast_m2p( + rqctx: RequestContext, + ) -> Result>, HttpError> { + let sa = rqctx.context(); + sa.latencies() + .instrument_dropshot_handler(&rqctx, async { + let m2p = sa.list_mcast_m2p().await.map_err(Error::from)?; + Ok(HttpResponseOk(m2p)) + }) + .await + } + + async fn list_mcast_fwd( + rqctx: RequestContext, + ) -> Result>, HttpError> { + let sa = rqctx.context(); + sa.latencies() + .instrument_dropshot_handler(&rqctx, async { + let fwd = sa.list_mcast_fwd().await.map_err(Error::from)?; + Ok(HttpResponseOk(fwd)) + }) + .await + } + async fn uplink_ensure( rqctx: RequestContext, body: TypedBody, diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 6d187ae8881..a09aa8463a1 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -2332,7 +2332,11 @@ impl InstanceRunner { // for them. let mut opte_ports = Vec::with_capacity(self.requested_nics.len()); let mut opte_port_names = Vec::with_capacity(self.requested_nics.len()); + let mcast_cfg = self.multicast_group_cfgs(); for nic in self.requested_nics.iter() { + // Multicast subscriptions target the primary NIC only. + // See the TODO on ensure_multicast_groups. 
+ let groups: &[_] = if nic.primary { &mcast_cfg } else { &[] }; let port = self.port_manager.create_port(PortCreateParams { nic, external_ips: &self.external_ips, @@ -2344,6 +2348,7 @@ impl InstanceRunner { .copied() .map(Into::into) .collect(), + multicast_groups: groups, })?; opte_port_names.push(port.0.name().to_string()); opte_ports.push(port); @@ -2625,12 +2630,13 @@ impl InstanceRunner { &mut self, membership: &InstanceMulticastMembership, ) -> Result<(), Error> { - // Similar logic to add_external_ip - save state for rollback + // Save pre-call state so rollback restores exactly what was + // present, mirroring add_external_ip's old_config pattern. + let old_groups = self.multicast_groups.clone(); let out = self.join_multicast_group_inner(membership).await; if out.is_err() { - // Rollback state on error - self.multicast_groups.retain(|m| m != membership); + self.multicast_groups = old_groups; } out } @@ -2639,14 +2645,13 @@ impl InstanceRunner { &mut self, membership: &InstanceMulticastMembership, ) -> Result<(), Error> { - // Similar logic to delete_external_ip - save state for rollback + // Save pre-call state so rollback restores exactly what was + // present, mirroring delete_external_ip's old_config pattern. 
+ let old_groups = self.multicast_groups.clone(); let out = self.leave_multicast_group_inner(membership).await; if out.is_err() { - // Rollback state on error - readd the membership if it was removed - if !self.multicast_groups.contains(membership) { - self.multicast_groups.push(membership.clone()); - } + self.multicast_groups = old_groups; } out } @@ -2655,105 +2660,75 @@ impl InstanceRunner { self.refresh_multicast_groups_inner() } - async fn join_multicast_group_inner( - &mut self, - membership: &InstanceMulticastMembership, - ) -> Result<(), Error> { - // Check for duplicate membership (idempotency) - if self.multicast_groups.contains(membership) { - return Ok(()); - } - - // Add to local state - self.multicast_groups.push(membership.clone()); + /// Convert `InstanceMulticastMembership` list to OPTE + /// `MulticastGroupCfg` list. + fn multicast_group_cfgs( + &self, + ) -> Vec { + self.multicast_groups + .iter() + .map(|m| illumos_utils::opte::MulticastGroupCfg { + group_ip: m.group_ip, + sources: m.sources.clone(), + }) + .collect() + } - // Update OPTE configuration + /// Sync the current multicast group memberships to OPTE via the + /// port manager. + /// + // TODO: subscriptions target the primary NIC only. + // InstanceMulticastMembership carries no NIC identifier, same as + // external IPs and attached subnets (though not firewall rules, + // which fan out across all VPC ports by VNI). If per-NIC multicast + // is needed, the membership type needs a NIC field and both this + // function and setup_propolis_zone must be updated. 
+ fn ensure_multicast_groups(&self) -> Result<(), Error> { let Some(primary_nic) = self.primary_nic() else { return Err(Error::Opte(illumos_utils::opte::Error::NoPrimaryNic)); }; - // Convert InstanceMulticastMembership to MulticastGroupCfg - let multicast_cfg: Vec = self - .multicast_groups - .iter() - .map(|membership| illumos_utils::opte::MulticastGroupCfg { - group_ip: membership.group_ip, - sources: membership.sources.clone(), - }) - .collect(); - - // Validate multicast configuration with OPTE self.port_manager.multicast_groups_ensure( primary_nic.id, primary_nic.kind, - &multicast_cfg, + &self.multicast_group_cfgs(), )?; - // TODO: Configure underlay multicast group addresses on the zone's vNIC. - // This should add the multicast group addresses to the zone's network - // interface so it can receive underlay multicast traffic (physical - // network layer). Rack-wide dataplane forwarding is handled by the - // RPW reconciler + DPD. - // See also: port_manager.rs multicast_groups_ensure() TODO about - // configuring OPTE port-level multicast group membership. - Ok(()) } - async fn leave_multicast_group_inner( + async fn join_multicast_group_inner( &mut self, membership: &InstanceMulticastMembership, ) -> Result<(), Error> { - // Remove from local state - self.multicast_groups.retain(|m| m != membership); - - // Update OPTE configuration - let Some(primary_nic) = self.primary_nic() else { - return Err(Error::Opte(illumos_utils::opte::Error::NoPrimaryNic)); - }; + // Idempotent -> skip if already subscribed. 
+ if self.multicast_groups.contains(membership) { + return Ok(()); + } - // Convert InstanceMulticastMembership to MulticastGroupCfg - let multicast_cfg: Vec = self - .multicast_groups - .iter() - .map(|membership| illumos_utils::opte::MulticastGroupCfg { - group_ip: membership.group_ip, - sources: membership.sources.clone(), - }) - .collect(); + self.multicast_groups.push(membership.clone()); + self.ensure_multicast_groups()?; - self.port_manager.multicast_groups_ensure( - primary_nic.id, - primary_nic.kind, - &multicast_cfg, - )?; + // OPTE's xde driver uses mac_siphon_set on the underlay NIC to + // receive all packets (including multicast) at the MAC layer. + // + // Subscription filtering and delivery happen inside OPTE via + // mcast_subscribe. Rack-wide dataplane forwarding is handled by + // the RPW reconciler + DPD. Ok(()) } - fn refresh_multicast_groups_inner(&mut self) -> Result<(), Error> { - // Update OPTE configuration - let Some(primary_nic) = self.primary_nic() else { - return Err(Error::Opte(illumos_utils::opte::Error::NoPrimaryNic)); - }; - - // Convert InstanceMulticastMembership to MulticastGroupCfg - let multicast_cfg: Vec = self - .multicast_groups - .iter() - .map(|membership| illumos_utils::opte::MulticastGroupCfg { - group_ip: membership.group_ip, - sources: membership.sources.clone(), - }) - .collect(); - - self.port_manager.multicast_groups_ensure( - primary_nic.id, - primary_nic.kind, - &multicast_cfg, - )?; + async fn leave_multicast_group_inner( + &mut self, + membership: &InstanceMulticastMembership, + ) -> Result<(), Error> { + self.multicast_groups.retain(|m| m != membership); + self.ensure_multicast_groups() + } - Ok(()) + fn refresh_multicast_groups_inner(&mut self) -> Result<(), Error> { + self.ensure_multicast_groups() } } diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index 71c39e93d82..5d941c9cc86 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ 
-26,7 +26,9 @@ use sled_agent_config_reconciler::CurrentlyManagedZpoolsReceiver; use sled_agent_types::attached_subnet::AttachedSubnet; use sled_agent_types::attached_subnet::AttachedSubnets; use sled_agent_types::instance::*; -use sled_agent_types::instance::{InstanceEnsureBody, InstanceMulticastBody}; +use sled_agent_types::instance::{ + InstanceEnsureBody, InstanceMulticastMembership, +}; use slog::Logger; use slog_error_chain::InlineErrorChain; use std::collections::BTreeMap; @@ -308,14 +310,14 @@ impl InstanceManager { pub async fn join_multicast_group( &self, propolis_id: PropolisUuid, - multicast_body: &InstanceMulticastBody, + membership: &InstanceMulticastMembership, ) -> Result<(), Error> { let (tx, rx) = oneshot::channel(); self.inner .tx .send(InstanceManagerRequest::JoinMulticastGroup { propolis_id, - multicast_body: multicast_body.clone(), + membership: membership.clone(), tx, }) .await @@ -327,14 +329,14 @@ impl InstanceManager { pub async fn leave_multicast_group( &self, propolis_id: PropolisUuid, - multicast_body: &InstanceMulticastBody, + membership: &InstanceMulticastMembership, ) -> Result<(), Error> { let (tx, rx) = oneshot::channel(); self.inner .tx .send(InstanceManagerRequest::LeaveMulticastGroup { propolis_id, - multicast_body: multicast_body.clone(), + membership: membership.clone(), tx, }) .await @@ -486,12 +488,12 @@ enum InstanceManagerRequest { }, JoinMulticastGroup { propolis_id: PropolisUuid, - multicast_body: InstanceMulticastBody, + membership: InstanceMulticastMembership, tx: oneshot::Sender>, }, LeaveMulticastGroup { propolis_id: PropolisUuid, - multicast_body: InstanceMulticastBody, + membership: InstanceMulticastMembership, tx: oneshot::Sender>, }, GetState { @@ -632,11 +634,11 @@ impl InstanceManagerRunner { Some(RefreshExternalIps { tx }) => { self.refresh_external_ips(tx) }, - Some(JoinMulticastGroup { propolis_id, multicast_body, tx }) => { - self.join_multicast_group(tx, propolis_id, &multicast_body) + 
Some(JoinMulticastGroup { propolis_id, membership, tx }) => { + self.join_multicast_group(tx, propolis_id, &membership) }, - Some(LeaveMulticastGroup { propolis_id, multicast_body, tx }) => { - self.leave_multicast_group(tx, propolis_id, &multicast_body) + Some(LeaveMulticastGroup { propolis_id, membership, tx }) => { + self.leave_multicast_group(tx, propolis_id, &membership) } Some(GetState { propolis_id, tx }) => { // TODO(eliza): it could potentially be nice to @@ -909,20 +911,12 @@ impl InstanceManagerRunner { &self, tx: oneshot::Sender>, propolis_id: PropolisUuid, - multicast_body: &InstanceMulticastBody, + membership: &InstanceMulticastMembership, ) -> Result<(), Error> { let Some(instance) = self.get_propolis(propolis_id) else { return Err(Error::NoSuchVmm(propolis_id)); }; - - match multicast_body { - InstanceMulticastBody::Join(membership) => { - instance.join_multicast_group(tx, membership)?; - } - InstanceMulticastBody::Leave(membership) => { - instance.leave_multicast_group(tx, membership)?; - } - } + instance.join_multicast_group(tx, membership)?; Ok(()) } @@ -930,20 +924,12 @@ impl InstanceManagerRunner { &self, tx: oneshot::Sender>, propolis_id: PropolisUuid, - multicast_body: &InstanceMulticastBody, + membership: &InstanceMulticastMembership, ) -> Result<(), Error> { let Some(instance) = self.get_propolis(propolis_id) else { return Err(Error::NoSuchVmm(propolis_id)); }; - - match multicast_body { - InstanceMulticastBody::Join(membership) => { - instance.join_multicast_group(tx, membership)?; - } - InstanceMulticastBody::Leave(membership) => { - instance.leave_multicast_group(tx, membership)?; - } - } + instance.leave_multicast_group(tx, membership)?; Ok(()) } diff --git a/sled-agent/src/probe_manager.rs b/sled-agent/src/probe_manager.rs index 2ce98d54da7..f5d60df198e 100644 --- a/sled-agent/src/probe_manager.rs +++ b/sled-agent/src/probe_manager.rs @@ -381,6 +381,7 @@ impl ProbeManagerInner { // but probes are supposed to mimic instances as closely 
as // possible. We should consider if we want to support them here. attached_subnets: vec![], + multicast_groups: &[], })?; let installed_zone = ZoneBuilderFactory::new() diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 74fb7d07779..cd7c084564a 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -1198,6 +1198,7 @@ impl ServiceManager { dhcp_config: DhcpCfg::default(), // Services do not use attached subnets, only instances. attached_subnets: vec![], + multicast_groups: &[], }) .map_err(|err| Error::ServicePortCreation { service: zone_kind, diff --git a/sled-agent/src/sim/collection.rs b/sled-agent/src/sim/collection.rs index 09c08cc808c..58ded77d0b3 100644 --- a/sled-agent/src/sim/collection.rs +++ b/sled-agent/src/sim/collection.rs @@ -236,18 +236,15 @@ impl SimCollection { while should_step { let (new_state, to_destroy) = { - // The object must be present in `objects` because it only gets - // removed when it comes to rest in the "Destroyed" state, but - // we can only get here if there's an asynchronous state - // transition desired. - // // We do as little as possible with the lock held. In // particular, we want to finish this work before calling out to // notify the nexus. let mut objects = self.objects.lock().await; + + // The object may already have been destroyed and removed by a + // concurrent poke (e.g., sim_step racing with an explicit poke + // from a test). In that case there is nothing left to do. let Some(mut object) = objects.remove(&id) else { - // Instance was already removed (e.g., destroyed by a - // concurrent transition). Nothing left to do. 
break; }; object.transition_finish(); diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs index e85c2d55e45..1667dfaef46 100644 --- a/sled-agent/src/sim/http_entrypoints.rs +++ b/sled-agent/src/sim/http_entrypoints.rs @@ -56,12 +56,16 @@ use sled_agent_types::disk::{DiskEnsureBody, DiskPathParam}; use sled_agent_types::early_networking::EarlyNetworkConfigEnvelope; use sled_agent_types::firewall_rules::VpcFirewallRulesEnsureBody; use sled_agent_types::instance::{ - InstanceEnsureBody, InstanceExternalIpBody, InstanceMulticastBody, + InstanceEnsureBody, InstanceExternalIpBody, InstanceMulticastMembership, VmmIssueDiskSnapshotRequestBody, VmmIssueDiskSnapshotRequestPathParam, VmmIssueDiskSnapshotRequestResponse, VmmPathParam, VmmPutStateBody, VmmPutStateResponse, VmmUnregisterResponse, VpcPathParam, }; use sled_agent_types::inventory::{Inventory, OmicronSledConfig}; +use sled_agent_types::multicast::ClearMcast2Phys; +use sled_agent_types::multicast::ClearMcastForwarding; +use sled_agent_types::multicast::Mcast2PhysMapping; +use sled_agent_types::multicast::McastForwardingEntry; use sled_agent_types::probes::ProbeSet; use sled_agent_types::rot::{ Attestation, CertificateChain, MeasurementLog, Nonce, RotPathParams, @@ -190,52 +194,24 @@ impl SledAgentApi for SledAgentSimImpl { async fn vmm_join_multicast_group( rqctx: RequestContext, path_params: Path, - body: TypedBody, + body: TypedBody, ) -> Result { let sa = rqctx.context(); let propolis_id = path_params.into_inner().propolis_id; - let body_args = body.into_inner(); - - match body_args { - InstanceMulticastBody::Join(membership) => { - sa.instance_join_multicast_group(propolis_id, &membership) - .await?; - } - InstanceMulticastBody::Leave(_) => { - // This endpoint is for joining - reject leave operations - return Err(HttpError::for_bad_request( - None, - "Join endpoint cannot process Leave operations".to_string(), - )); - } - } - + let membership = body.into_inner(); + 
sa.instance_join_multicast_group(propolis_id, &membership).await?; Ok(HttpResponseUpdatedNoContent()) } async fn vmm_leave_multicast_group( rqctx: RequestContext, path_params: Path, - body: TypedBody, + body: TypedBody, ) -> Result { let sa = rqctx.context(); let propolis_id = path_params.into_inner().propolis_id; - let body_args = body.into_inner(); - - match body_args { - InstanceMulticastBody::Leave(membership) => { - sa.instance_leave_multicast_group(propolis_id, &membership) - .await?; - } - InstanceMulticastBody::Join(_) => { - // This endpoint is for leaving - reject join operations - return Err(HttpError::for_bad_request( - None, - "Leave endpoint cannot process Join operations".to_string(), - )); - } - } - + let membership = body.into_inner(); + sa.instance_leave_multicast_group(propolis_id, &membership).await?; Ok(HttpResponseUpdatedNoContent()) } @@ -389,6 +365,66 @@ impl SledAgentApi for SledAgentSimImpl { Ok(HttpResponseOk(vnics)) } + async fn set_mcast_m2p( + rqctx: RequestContext, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + sa.set_mcast_m2p(&body.into_inner()) + .map_err(|e| HttpError::for_internal_error(e.to_string()))?; + Ok(HttpResponseUpdatedNoContent()) + } + + async fn clear_mcast_m2p( + rqctx: RequestContext, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + sa.clear_mcast_m2p(&body.into_inner()) + .map_err(|e| HttpError::for_internal_error(e.to_string()))?; + Ok(HttpResponseUpdatedNoContent()) + } + + async fn set_mcast_fwd( + rqctx: RequestContext, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + sa.set_mcast_fwd(&body.into_inner()) + .map_err(|e| HttpError::for_internal_error(e.to_string()))?; + Ok(HttpResponseUpdatedNoContent()) + } + + async fn clear_mcast_fwd( + rqctx: RequestContext, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + sa.clear_mcast_fwd(&body.into_inner()) + .map_err(|e| HttpError::for_internal_error(e.to_string()))?; + Ok(HttpResponseUpdatedNoContent()) + 
} + + async fn list_mcast_m2p( + rqctx: RequestContext, + ) -> Result>, HttpError> { + let sa = rqctx.context(); + let m2p = sa + .list_mcast_m2p() + .map_err(|e| HttpError::for_internal_error(e.to_string()))?; + Ok(HttpResponseOk(m2p)) + } + + async fn list_mcast_fwd( + rqctx: RequestContext, + ) -> Result>, HttpError> { + let sa = rqctx.context(); + let fwd = sa + .list_mcast_fwd() + .map_err(|e| HttpError::for_internal_error(e.to_string()))?; + Ok(HttpResponseOk(fwd)) + } + async fn uplink_ensure( _rqctx: RequestContext, _body: TypedBody, diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 6ad61297ff5..000dec5b1b0 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -69,6 +69,10 @@ use sled_agent_types::inventory::{ OmicronFileSourceResolverInventory, OmicronSledConfig, OmicronZonesConfig, SingleMeasurementInventory, SledRole, ZpoolHealth, }; +use sled_agent_types::multicast::{ + ClearMcast2Phys, ClearMcastForwarding, Mcast2PhysMapping, + McastForwardingEntry, McastForwardingNextHop, +}; use sled_agent_types::support_bundle::SupportBundleMetadata; use slog::Logger; @@ -98,6 +102,8 @@ pub struct SledAgent { pub nexus_client: Arc, pub simulated_upstairs: Arc, pub v2p_mappings: Mutex>, + pub m2p_mappings: Mutex>, + pub mcast_fwd: Mutex>>, mock_propolis: futures::lock::Mutex< Option<(propolis_mock_server::Server, PropolisClient)>, >, @@ -187,6 +193,8 @@ impl SledAgent { nexus_client, simulated_upstairs, v2p_mappings: Mutex::new(HashSet::new()), + m2p_mappings: Mutex::new(HashSet::new()), + mcast_fwd: Mutex::new(HashMap::new()), external_ips: Mutex::new(HashMap::new()), attached_subnets: Mutex::new(HashMap::new()), multicast_groups: Mutex::new(HashMap::new()), @@ -675,6 +683,58 @@ impl SledAgent { Ok(Vec::from_iter(v2p_mappings.clone())) } + pub fn set_mcast_m2p(&self, req: &Mcast2PhysMapping) -> Result<(), Error> { + let mut m2p = self.m2p_mappings.lock().unwrap(); + m2p.insert((req.group, 
req.underlay)); + Ok(()) + } + + pub fn clear_mcast_m2p(&self, req: &ClearMcast2Phys) -> Result<(), Error> { + let mut m2p = self.m2p_mappings.lock().unwrap(); + m2p.remove(&(req.group, req.underlay)); + Ok(()) + } + + pub fn set_mcast_fwd( + &self, + req: &McastForwardingEntry, + ) -> Result<(), Error> { + let mut fwd = self.mcast_fwd.lock().unwrap(); + fwd.insert(req.underlay, req.next_hops.clone()); + Ok(()) + } + + pub fn clear_mcast_fwd( + &self, + req: &ClearMcastForwarding, + ) -> Result<(), Error> { + let mut fwd = self.mcast_fwd.lock().unwrap(); + fwd.remove(&req.underlay); + Ok(()) + } + + pub fn list_mcast_m2p(&self) -> Result, Error> { + let m2p = self.m2p_mappings.lock().unwrap(); + Ok(m2p + .iter() + .map(|(group, underlay)| Mcast2PhysMapping { + group: *group, + underlay: *underlay, + }) + .collect()) + } + + pub fn list_mcast_fwd(&self) -> Result, Error> { + let fwd = self.mcast_fwd.lock().unwrap(); + Ok(fwd + .iter() + .map(|(underlay, next_hops)| McastForwardingEntry { + underlay: *underlay, + next_hops: next_hops.clone(), + }) + .collect()) + } + pub async fn instance_put_external_ip( &self, propolis_id: PropolisUuid, diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 07c9270722d..59cac0e525d 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -79,10 +79,14 @@ use sled_agent_types::disk::DiskStateRequested; use sled_agent_types::early_networking::EarlyNetworkConfigEnvelope; use sled_agent_types::early_networking::RackNetworkConfig; use sled_agent_types::instance::{ - InstanceEnsureBody, InstanceExternalIpBody, InstanceMulticastBody, + InstanceEnsureBody, InstanceExternalIpBody, InstanceMulticastMembership, VmmPutStateResponse, VmmStateRequested, VmmUnregisterResponse, }; use sled_agent_types::inventory::{Inventory, OmicronSledConfig, SledRole}; +use sled_agent_types::multicast::{ + ClearMcast2Phys, ClearMcastForwarding, Mcast2PhysMapping, + McastForwardingEntry, +}; use 
sled_agent_types::probes::ProbeCreate; use sled_agent_types::resolvable_files::{ PreparedOmicronZone, RemoveMupdateOverrideResult, ResolverStatus, @@ -415,7 +419,6 @@ struct SledAgentInner { // A handle to the trust quorum. trust_quorum: trust_quorum::NodeTaskHandle, - // A handle to the hardware monitor. hardware_monitor: HardwareMonitorHandle, @@ -1013,26 +1016,28 @@ impl SledAgent { .map_err(|e| Error::Instance(e)) } + /// Subscribe a VMM's OPTE port to a multicast group. pub async fn instance_join_multicast_group( &self, propolis_id: PropolisUuid, - multicast_body: &InstanceMulticastBody, + membership: &InstanceMulticastMembership, ) -> Result<(), Error> { self.inner .instances - .join_multicast_group(propolis_id, multicast_body) + .join_multicast_group(propolis_id, membership) .await .map_err(|e| Error::Instance(e)) } + /// Unsubscribe a VMM's OPTE port from a multicast group. pub async fn instance_leave_multicast_group( &self, propolis_id: PropolisUuid, - multicast_body: &InstanceMulticastBody, + membership: &InstanceMulticastMembership, ) -> Result<(), Error> { self.inner .instances - .leave_multicast_group(propolis_id, multicast_body) + .leave_multicast_group(propolis_id, membership) .await .map_err(|e| Error::Instance(e)) } @@ -1117,6 +1122,52 @@ impl SledAgent { .map_err(Error::from) } + /// Install a multicast overlay-to-underlay (M2P) mapping in OPTE. + pub async fn set_mcast_m2p( + &self, + req: &Mcast2PhysMapping, + ) -> Result<(), Error> { + self.inner.port_manager.set_mcast_m2p(req).map_err(Error::from) + } + + /// Remove a multicast overlay-to-underlay (M2P) mapping from OPTE. + pub async fn clear_mcast_m2p( + &self, + req: &ClearMcast2Phys, + ) -> Result<(), Error> { + self.inner.port_manager.clear_mcast_m2p(req).map_err(Error::from) + } + + /// Set multicast forwarding next hops for an underlay group address. 
+ pub async fn set_mcast_fwd( + &self, + req: &McastForwardingEntry, + ) -> Result<(), Error> { + self.inner.port_manager.set_mcast_fwd(req).map_err(Error::from) + } + + /// Remove multicast forwarding entries for an underlay group address. + pub async fn clear_mcast_fwd( + &self, + req: &ClearMcastForwarding, + ) -> Result<(), Error> { + self.inner.port_manager.clear_mcast_fwd(req).map_err(Error::from) + } + + /// List all multicast M2P mappings from OPTE. + pub async fn list_mcast_m2p( + &self, + ) -> Result, Error> { + self.inner.port_manager.list_mcast_m2p().map_err(Error::from) + } + + /// List all multicast forwarding entries from OPTE. + pub async fn list_mcast_fwd( + &self, + ) -> Result, Error> { + self.inner.port_manager.list_mcast_fwd().map_err(Error::from) + } + pub async fn ensure_scrimlet_host_ports( &self, uplinks: Vec, diff --git a/sled-agent/types/src/lib.rs b/sled-agent/types/src/lib.rs index 04776eaca6f..27dd198a590 100644 --- a/sled-agent/types/src/lib.rs +++ b/sled-agent/types/src/lib.rs @@ -16,6 +16,7 @@ pub mod early_networking; pub mod firewall_rules; pub mod instance; pub mod inventory; +pub mod multicast; pub mod probes; pub mod rack_init; pub mod rack_ops; diff --git a/sled-agent/types/src/multicast.rs b/sled-agent/types/src/multicast.rs new file mode 100644 index 00000000000..27e95a0d94c --- /dev/null +++ b/sled-agent/types/src/multicast.rs @@ -0,0 +1,7 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Multicast networking types for the Sled Agent API. 
+ +pub use sled_agent_types_versions::latest::multicast::*; diff --git a/sled-agent/types/versions/src/latest.rs b/sled-agent/types/versions/src/latest.rs index 3aa22f7742c..e62a2acc379 100644 --- a/sled-agent/types/versions/src/latest.rs +++ b/sled-agent/types/versions/src/latest.rs @@ -89,6 +89,18 @@ pub mod firewall_rules { pub use crate::v31::firewall_rules::VpcFirewallRulesEnsureBody; } +pub mod multicast { + pub use crate::v33::multicast::ClearMcast2Phys; + pub use crate::v33::multicast::ClearMcastForwarding; + pub use crate::v33::multicast::Mcast2PhysMapping; + pub use crate::v33::multicast::McastFilterMode; + pub use crate::v33::multicast::McastForwardingEntry; + pub use crate::v33::multicast::McastForwardingNextHop; + pub use crate::v33::multicast::McastReplication; + pub use crate::v33::multicast::McastSourceFilter; + pub use crate::v33::multicast::MulticastGroupCfg; +} + pub mod instance { pub use crate::v1::instance::InstanceExternalIpBody; pub use crate::v1::instance::InstanceMetadata; diff --git a/sled-agent/types/versions/src/lib.rs b/sled-agent/types/versions/src/lib.rs index 7467e5edad0..a79b31399ac 100644 --- a/sled-agent/types/versions/src/lib.rs +++ b/sled-agent/types/versions/src/lib.rs @@ -77,6 +77,8 @@ pub mod v30; pub mod v31; #[path = "make_all_external_ip_fields_optional/mod.rs"] pub mod v32; +#[path = "mcast_m2p_forwarding/mod.rs"] +pub mod v33; #[path = "add_nexus_lockstep_port_to_inventory/mod.rs"] pub mod v4; #[path = "add_probe_put_endpoint/mod.rs"] diff --git a/sled-agent/types/versions/src/mcast_m2p_forwarding/mod.rs b/sled-agent/types/versions/src/mcast_m2p_forwarding/mod.rs new file mode 100644 index 00000000000..8c9d1bb1c4a --- /dev/null +++ b/sled-agent/types/versions/src/mcast_m2p_forwarding/mod.rs @@ -0,0 +1,10 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
Version `MCAST_M2P_FORWARDING` of the Sled Agent API. +//! +//! Adds multicast-to-physical mapping and forwarding types used by +//! the multicast-to-physical and forwarding endpoints. + +pub mod multicast; diff --git a/sled-agent/types/versions/src/mcast_m2p_forwarding/multicast.rs b/sled-agent/types/versions/src/mcast_m2p_forwarding/multicast.rs new file mode 100644 index 00000000000..5c2247c1159 --- /dev/null +++ b/sled-agent/types/versions/src/mcast_m2p_forwarding/multicast.rs @@ -0,0 +1,132 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// Copyright 2026 Oxide Computer Company + +//! Multicast networking types for the sled-agent API. +//! +//! These types support overlay-to-underlay multicast mapping and +//! multicast forwarding configuration via OPTE. The underlay address +//! space is ff04::/64, a subset of admin-local scope per +//! [RFC 7346](https://www.rfc-editor.org/rfc/rfc7346). + +use std::net::IpAddr; +use std::net::Ipv6Addr; + +use schemars::JsonSchema; +use serde::Deserialize; +use serde::Serialize; + +/// Mapping from an overlay multicast group to an underlay multicast +/// address. +/// +/// The underlay address must be within the underlay multicast subnet +/// (ff04::/64). This invariant is enforced by mapping in Nexus, not +/// validated at this layer. +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] +pub struct Mcast2PhysMapping { + /// Overlay multicast group address. + pub group: IpAddr, + /// Underlay IPv6 multicast address (ff04::/64). + pub underlay: Ipv6Addr, +} + +/// Clear a mapping from an overlay multicast group to an underlay +/// multicast address. +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] +pub struct ClearMcast2Phys { + /// Overlay multicast group address. + pub group: IpAddr, + /// Underlay IPv6 multicast address. 
See [`Mcast2PhysMapping::underlay`]. + pub underlay: Ipv6Addr, +} + +/// Forwarding entry for an underlay multicast address, specifying +/// which next hops should receive replicated packets. +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] +pub struct McastForwardingEntry { + /// Underlay IPv6 multicast address. See [`Mcast2PhysMapping::underlay`]. + pub underlay: Ipv6Addr, + /// Next hops with replication and source filter configuration. + pub next_hops: Vec, +} + +/// Clear all forwarding entries for an underlay multicast address. +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] +pub struct ClearMcastForwarding { + /// Underlay IPv6 multicast address. See [`Mcast2PhysMapping::underlay`]. + pub underlay: Ipv6Addr, +} + +/// A forwarding next hop with replication mode and aggregated +/// source filter. +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq)] +pub struct McastForwardingNextHop { + /// Unicast IPv6 address of the destination sled. + pub next_hop: Ipv6Addr, + /// Replication mode for this next hop. + pub replication: McastReplication, + /// Aggregated source filter for this destination. + pub filter: McastSourceFilter, +} + +/// Replication mode for multicast forwarding. +#[derive( + Clone, Copy, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq, +)] +#[serde(rename_all = "snake_case")] +pub enum McastReplication { + /// Replicate to front panel ports (egress to external networks). + External, + /// Replicate to sled underlay ports. + Underlay, + /// Replicate to both external and underlay ports. + Both, +} + +/// Source filter for multicast forwarding. +#[derive( + Clone, Debug, Default, Deserialize, Serialize, JsonSchema, PartialEq, +)] +pub struct McastSourceFilter { + /// Filter mode. + pub mode: McastFilterMode, + /// Source addresses to include or exclude. + pub sources: Vec, +} + +/// Filter mode for multicast source filtering. 
+#[derive( + Clone, + Copy, + Debug, + Default, + Deserialize, + Serialize, + JsonSchema, + PartialEq, + Eq, +)] +#[serde(rename_all = "snake_case")] +pub enum McastFilterMode { + /// Accept only packets from listed sources (SSM). + Include, + /// Accept packets from all sources except those listed. + /// With an empty sources list this is any-source multicast (ASM). + #[default] + Exclude, +} + +/// Declarative multicast group subscription for an OPTE port. +/// +/// Represents a single group membership with optional source filtering. +/// Empty `sources` means any-source multicast (ASM) and non-empty means +/// source-specific multicast (SSM). +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq)] +pub struct MulticastGroupCfg { + /// The multicast group IP address (IPv4 or IPv6). + pub group_ip: IpAddr, + /// Source addresses for source-filtered multicast. + pub sources: Vec, +} diff --git a/tools/install_opte.sh b/tools/install_opte.sh index 35e2999cc76..1f649ec473d 100755 --- a/tools/install_opte.sh +++ b/tools/install_opte.sh @@ -112,9 +112,9 @@ else fi RC=0 -pfexec pkg freeze -c "$OMICRON_FROZEN_PKG_COMMENT" driver/network/opte || RC=$? +pfexec pkg freeze -c "$OMICRON_FROZEN_PKG_COMMENT" driver/network/opte@"$OPTE_VERSION" || RC=$? 
if [[ "$RC" -ne 0 ]]; then - echo "Failed to pin opte package" + echo "Failed to pin opte package to $OPTE_VERSION" exit $RC fi From a28f6cf0fd9afd495fd4073594d1a373c58f29f6 Mon Sep 17 00:00:00 2001 From: Zeeshan Lakhani Date: Tue, 31 Mar 2026 23:06:25 +0000 Subject: [PATCH 3/8] [review] Address feedback, nexthop is now the selected switch Changes: - Remove global eip_gateways map from PortManagerInner, as the VPC route manager RPW activates after instance start - Refactor member reconciler methods to take &MemberReconcileCtx - Change forwarding next hop from member sleds to a single switch zone IP - Add resolver to MulticastSledClient for switch zone address lookup --- .github/buildomat/jobs/deploy.sh | 1 - illumos-utils/src/opte/port_manager.rs | 51 +--- .../app/background/tasks/multicast/groups.rs | 4 +- .../app/background/tasks/multicast/members.rs | 218 +++++------------- .../src/app/background/tasks/multicast/mod.rs | 5 +- nexus/src/app/multicast/sled.rs | 167 ++++++-------- .../multicast/networking_integration.rs | 36 ++- sled-agent/src/instance.rs | 8 - 8 files changed, 153 insertions(+), 337 deletions(-) diff --git a/.github/buildomat/jobs/deploy.sh b/.github/buildomat/jobs/deploy.sh index dd07bcc6757..f95b8bf8585 100755 --- a/.github/buildomat/jobs/deploy.sh +++ b/.github/buildomat/jobs/deploy.sh @@ -190,7 +190,6 @@ source .github/buildomat/ci-env.sh # swap them in. The deploy target is a ramdisk image without pkg(5), so we # use rem_drv/add_drv instead of the p5p approach used by install_opte.sh # and releng. 
-# shellcheck source=/dev/null source tools/opte_version_override if [[ "x$OPTE_COMMIT" != "x" ]]; then curl -sSfOL "https://buildomat.eng.oxide.computer/public/file/oxidecomputer/opte/module/$OPTE_COMMIT/xde" diff --git a/illumos-utils/src/opte/port_manager.rs b/illumos-utils/src/opte/port_manager.rs index 30ecae011d8..ecac68ff287 100644 --- a/illumos-utils/src/opte/port_manager.rs +++ b/illumos-utils/src/opte/port_manager.rs @@ -109,14 +109,6 @@ struct RouteSet { /// /// The only lock nesting is: /// - `routes` then `ports` in `vpc_routes_ensure` -/// - `ports` then `eip_gateways` in `create_port` -/// -/// `set_eip_gateways` acquires each lock separately (global map first, -/// then ports), so there is no nesting. A concurrent `create_port` -/// between the two reads the already-updated global map. The subsequent -/// port iteration is redundant but idempotent. Neither path is hot: -/// `set_eip_gateways` runs once per background task pass and -/// `create_port` runs at instance boot. /// /// Note: `release_inner` acquires `ports` then `routes` sequentially /// (dropping each before acquiring the next). @@ -137,11 +129,6 @@ struct PortManagerInner { /// Map of all current resolved routes. routes: Mutex>, - - /// Most recent EIP gateway mappings, keyed by NIC ID. We store this here so - /// that ports created after `set_eip_gateways` can seed their initial - /// gateway state. - eip_gateways: Mutex>>>, } /// Mutable per-port state tracked alongside the immutable `Port`. @@ -403,7 +390,6 @@ impl PortManager { underlay_ip, ports: Mutex::new(BTreeMap::new()), routes: Mutex::new(Default::default()), - eip_gateways: Mutex::new(HashMap::new()), }); Self { inner } @@ -479,17 +465,7 @@ impl PortManager { vni, gateway, }); - let mut new_port_state = PortState::new(port.clone()); - - // Seed gateway mappings from the global map so that a port - // created after set_eip_gateways has the correct state - // immediately. Lock order: ports then eip_gateways. 
- if let Some(gw) = - self.inner.eip_gateways.lock().unwrap().get(&nic.id).cloned() - { - new_port_state.eip_gateways = gw; - } - + let new_port_state = PortState::new(port.clone()); let old = ports.insert((nic.id, nic.kind), new_port_state); assert!( old.is_none(), @@ -761,24 +737,17 @@ impl PortManager { /// /// Returns whether the internal mappings were changed. pub fn set_eip_gateways(&self, mappings: ExternalIpGatewayMap) -> bool { - // Update global map (single lock). A concurrent create_port - // between these two locks will read the updated global map and - // seed correctly; the port iteration below is then a redundant - // but idempotent overwrite. - let mut global_gw = self.inner.eip_gateways.lock().unwrap(); - let changed = &*global_gw != &mappings.mappings; - *global_gw = mappings.mappings.clone(); - drop(global_gw); - - // Push into existing ports. let mut ports = self.inner.ports.lock().unwrap(); - for ((nic_id, _), port_state) in ports.iter_mut() { + ports.iter_mut().fold(false, |changed, ((nic_id, _), port_state)| { let new_gw = mappings.mappings.get(nic_id).cloned().unwrap_or_default(); - port_state.eip_gateways = new_gw; - } - - changed + if port_state.eip_gateways != new_gw { + port_state.eip_gateways = new_gw; + true + } else { + changed + } + }) } /// Lookup an OPTE port, and ensure its external IP config is up to date. @@ -1627,7 +1596,7 @@ mod tests { const SERVICES_VPC_VNI: Vni = Vni::SERVICES_VNI; let handle = Handle::new().unwrap(); - handle.set_xde_underlay("underlay0", "underlay1").unwrap(); + handle.set_xde_underlay("foo0", "foo1").unwrap(); // First, create a port for a service. 
// diff --git a/nexus/src/app/background/tasks/multicast/groups.rs b/nexus/src/app/background/tasks/multicast/groups.rs index b05fd90697f..0db31b033dc 100644 --- a/nexus/src/app/background/tasks/multicast/groups.rs +++ b/nexus/src/app/background/tasks/multicast/groups.rs @@ -973,8 +973,8 @@ impl MulticastGroupReconciler { // Clear M2P/forwarding from all sleds before DPD cleanup. // This must succeed before deleting DB records, otherwise - // stale OPTE state would persist on failed sleds with no - // source of truth to drive a later cleanup pass. + // stale OPTE state would persist on sleds where the clear + // failed, with no DB record to drive a retry on a later pass. sled_client .clear_m2p_and_forwarding(opctx, group) .await diff --git a/nexus/src/app/background/tasks/multicast/members.rs b/nexus/src/app/background/tasks/multicast/members.rs index 14175619611..294afe76831 100644 --- a/nexus/src/app/background/tasks/multicast/members.rs +++ b/nexus/src/app/background/tasks/multicast/members.rs @@ -507,9 +507,7 @@ impl MulticastGroupReconciler { let reconcile_res = self .execute_joining_reconciliation( - ctx.opctx, - ctx.group, - ctx.member, + ctx, instance_state.valid, instance_state.sled_id, ) @@ -535,9 +533,7 @@ impl MulticastGroupReconciler { /// Execute the reconciliation CAS operation for a member in "Joining" state. 
async fn execute_joining_reconciliation( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, + ctx: &MemberReconcileCtx<'_>, instance_valid: bool, current_sled_id: Option, ) -> Result { @@ -545,9 +541,9 @@ impl MulticastGroupReconciler { self.datastore .multicast_group_member_reconcile_joining( - opctx, - MulticastGroupUuid::from_untyped_uuid(group.id()), - InstanceUuid::from_untyped_uuid(member.parent_id), + ctx.opctx, + MulticastGroupUuid::from_untyped_uuid(ctx.group.id()), + InstanceUuid::from_untyped_uuid(ctx.member.parent_id), instance_valid, current_sled_id_db, ) @@ -564,10 +560,7 @@ impl MulticastGroupReconciler { ) -> Result { match reconcile_result.action { ReconcileAction::TransitionedToLeft => { - self.handle_transitioned_to_left( - ctx.opctx, ctx.group, ctx.member, - ) - .await + self.handle_transitioned_to_left(ctx).await } ReconcileAction::UpdatedSledId { old, new } => { @@ -588,18 +581,16 @@ impl MulticastGroupReconciler { /// Handle the case where a member was transitioned to "Left" state. 
async fn handle_transitioned_to_left( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, + ctx: &MemberReconcileCtx<'_>, ) -> Result { info!( - opctx.log, + ctx.opctx.log, "multicast member lifecycle transition: 'Joining' → 'Left'"; - "member_id" => %member.id, - "instance_id" => %member.parent_id, - "group_id" => %group.id(), - "group_name" => group.name().as_str(), - "group_multicast_ip" => %group.multicast_ip, + "member_id" => %ctx.member.id, + "instance_id" => %ctx.member.parent_id, + "group_id" => %ctx.group.id(), + "group_name" => ctx.group.name().as_str(), + "group_multicast_ip" => %ctx.group.multicast_ip, "reason" => "instance_not_valid_for_multicast_traffic" ); Ok(StateTransition::StateChanged) @@ -699,16 +690,7 @@ impl MulticastGroupReconciler { .unwrap_or_default(); match (instance_state.valid, instance_state.sled_id) { - (false, _) => { - self.handle_invalid_instance( - ctx.opctx, - ctx.group, - ctx.member, - ctx.dataplane_client, - ctx.sled_client, - ) - .await - } + (false, _) => self.handle_invalid_instance(ctx).await, (true, Some(sled_id)) if ctx.member.sled_id != Some(sled_id.into()) => @@ -722,14 +704,7 @@ impl MulticastGroupReconciler { } (true, Some(_)) => { - self.verify_members( - ctx.opctx, - ctx.group, - ctx.member, - ctx.dataplane_client, - ctx.sled_client, - ) - .await?; + self.verify_members(ctx).await?; trace!( ctx.opctx.log, "member configuration verified, no changes needed"; @@ -739,32 +714,18 @@ impl MulticastGroupReconciler { Ok(StateTransition::NoChange) } - (true, None) => { - self.handle_joined_without_sled( - ctx.opctx, - ctx.group, - ctx.member, - ctx.dataplane_client, - ) - .await - } + (true, None) => self.handle_joined_without_sled(ctx).await, } } /// Handle a joined member whose instance became invalid. 
async fn handle_invalid_instance( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - dataplane_client: &MulticastDataplaneClient, - sled_client: &MulticastSledClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result { + let MemberReconcileCtx { opctx, group, member, sled_client, .. } = ctx; // Remove from dataplane first - if let Err(e) = self - .remove_member_from_dataplane(opctx, member, dataplane_client) - .await - { + if let Err(e) = self.remove_member_from_dataplane(ctx).await { warn!( opctx.log, "failed to remove member from dataplane, will retry"; @@ -864,14 +825,7 @@ impl MulticastGroupReconciler { ); // Remove from old sled's dataplane first - if let Err(e) = self - .remove_member_from_dataplane( - ctx.opctx, - ctx.member, - ctx.dataplane_client, - ) - .await - { + if let Err(e) = self.remove_member_from_dataplane(ctx).await { warn!( ctx.opctx.log, "failed to remove member from old sled, will retry"; @@ -1013,11 +967,9 @@ impl MulticastGroupReconciler { /// Handle edge case where a "Joined" member has no sled_id. async fn handle_joined_without_sled( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result { + let MemberReconcileCtx { opctx, group, member, .. 
} = ctx; warn!( opctx.log, "'Joined' member has no sled_id: transitioning to 'Left'"; @@ -1026,10 +978,7 @@ impl MulticastGroupReconciler { ); // Remove from dataplane and transition to "Left" - if let Err(e) = self - .remove_member_from_dataplane(opctx, member, dataplane_client) - .await - { + if let Err(e) = self.remove_member_from_dataplane(ctx).await { warn!( opctx.log, "failed to remove member with no sled_id from dataplane"; @@ -1093,14 +1042,7 @@ impl MulticastGroupReconciler { .unwrap_or_default(); if ctx.member.time_deleted.is_some() { - self.cleanup_deleted_member( - ctx.opctx, - ctx.group, - ctx.member, - ctx.dataplane_client, - ctx.sled_client, - ) - .await?; + self.cleanup_deleted_member(ctx).await?; return Ok(StateTransition::NeedsCleanup); } @@ -1110,14 +1052,7 @@ impl MulticastGroupReconciler { // The cleanup is idempotent and handles cases where: // - sled_id is None (uses fallback path) // - member was already removed from DPD - if let Err(e) = self - .remove_member_from_dataplane( - ctx.opctx, - ctx.member, - ctx.dataplane_client, - ) - .await - { + if let Err(e) = self.remove_member_from_dataplane(ctx).await { warn!( ctx.opctx.log, "failed to clean up DPD state for 'Left' member (will retry)"; @@ -1152,14 +1087,7 @@ impl MulticastGroupReconciler { } if instance_valid && ctx.group.state == MulticastGroupState::Active { - return self - .reactivate_left_member( - ctx.opctx, - ctx.group, - ctx.member, - current_sled_id, - ) - .await; + return self.reactivate_left_member(ctx, current_sled_id).await; } Ok(StateTransition::NoChange) @@ -1169,11 +1097,10 @@ impl MulticastGroupReconciler { /// Transitions the member back to "Joining" state so it can rejoin the group. async fn reactivate_left_member( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, + ctx: &MemberReconcileCtx<'_>, current_sled_id: Option, ) -> Result { + let MemberReconcileCtx { opctx, group, member, .. 
} = ctx; debug!( opctx.log, "transitioning member from 'Left' to 'Joining': instance became valid and group active"; @@ -1307,9 +1234,9 @@ impl MulticastGroupReconciler { /// Returns `None` if the instance has no sled assignment or cannot be found. async fn lookup_and_update_member_sled_id( &self, - opctx: &OpContext, - member: &MulticastGroupMember, + ctx: &MemberReconcileCtx<'_>, ) -> Result>, anyhow::Error> { + let MemberReconcileCtx { opctx, member, .. } = ctx; debug!( opctx.log, "member has no sled_id, attempting to look up instance sled"; @@ -1426,21 +1353,14 @@ impl MulticastGroupReconciler { { id } else if let Some(id) = - self.lookup_and_update_member_sled_id(ctx.opctx, ctx.member).await? + self.lookup_and_update_member_sled_id(ctx).await? { id.into() } else { return Ok(false); }; - self.add_member_to_dataplane( - ctx.opctx, - ctx.group, - ctx.member, - sled_id, - ctx.dataplane_client, - ) - .await?; + self.add_member_to_dataplane(ctx, sled_id).await?; // If the member is already in a "Joined" state (migration path), skip // the state transition but still propagate and subscribe. During @@ -1540,12 +1460,12 @@ impl MulticastGroupReconciler { /// Apply member dataplane configuration (via DPD-client). async fn add_member_to_dataplane( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, + ctx: &MemberReconcileCtx<'_>, sled_id: SledUuid, - dataplane_client: &MulticastDataplaneClient, ) -> Result<(), anyhow::Error> { + let MemberReconcileCtx { + opctx, group, member, dataplane_client, .. + } = ctx; let underlay_group_id = group.underlay_group_id.with_context(|| { format!("no underlay group for external group {}", group.id()) })?; @@ -1848,18 +1768,11 @@ impl MulticastGroupReconciler { /// Remove member dataplane configuration (via DPD-client). 
async fn remove_member_from_dataplane( &self, - opctx: &OpContext, - member: &MulticastGroupMember, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result<(), anyhow::Error> { - let group = self - .datastore - .multicast_group_fetch( - opctx, - MulticastGroupUuid::from_untyped_uuid(member.external_group_id), - ) - .await - .context("failed to fetch group for member removal")?; + let MemberReconcileCtx { + opctx, group, member, dataplane_client, .. + } = ctx; let underlay_group_id = group.underlay_group_id.with_context(|| { format!( @@ -1914,11 +1827,9 @@ impl MulticastGroupReconciler { /// Ensures dataplane consistency by failing if removal operations fail. async fn cleanup_member_from_dataplane( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result<(), anyhow::Error> { + let MemberReconcileCtx { opctx, group, member, .. } = ctx; debug!( opctx.log, "cleaning up member from dataplane"; @@ -1930,11 +1841,9 @@ impl MulticastGroupReconciler { ); // Strict removal from dataplane (fail on errors) - self.remove_member_from_dataplane(opctx, member, dataplane_client) - .await - .context( - "failed to remove member configuration via DPD during cleanup", - )?; + self.remove_member_from_dataplane(ctx).await.context( + "failed to remove member configuration via DPD during cleanup", + )?; info!( opctx.log, @@ -1962,12 +1871,16 @@ impl MulticastGroupReconciler { /// location changed but the `sled_id` stayed the same. async fn verify_members( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - dataplane_client: &MulticastDataplaneClient, - sled_client: &MulticastSledClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result<(), anyhow::Error> { + let MemberReconcileCtx { + opctx, + group, + member, + dataplane_client, + sled_client, + .. 
+ } = ctx; debug!( opctx.log, "verifying joined member consistency"; @@ -2021,13 +1934,7 @@ impl MulticastGroupReconciler { ); // Best effort removal on verification - let _ = self - .remove_member_from_dataplane( - opctx, - member, - dataplane_client, - ) - .await; + let _ = self.remove_member_from_dataplane(ctx).await; // Unsubscribe the VMM before the CAS clears sled_id; // otherwise, the OPTE subscription is stranded with no @@ -2750,12 +2657,9 @@ impl MulticastGroupReconciler { /// it from the dataplane, and hard-deleting the DB row. async fn cleanup_deleted_member( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - dataplane_client: &MulticastDataplaneClient, - sled_client: &MulticastSledClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result<(), anyhow::Error> { + let MemberReconcileCtx { opctx, group, member, sled_client, .. } = ctx; // Unsubscribe from sled-agent (best-effort, VMM may be gone). if let Some(sled_id) = member.sled_id { if let Err(e) = sled_client @@ -2773,13 +2677,7 @@ impl MulticastGroupReconciler { } // Use the consolidated cleanup helper with strict error handling - self.cleanup_member_from_dataplane( - opctx, - group, - member, - dataplane_client, - ) - .await + self.cleanup_member_from_dataplane(ctx).await } /// Get all multicast groups that need member reconciliation. diff --git a/nexus/src/app/background/tasks/multicast/mod.rs b/nexus/src/app/background/tasks/multicast/mod.rs index 08c56c4154a..6ab622179be 100644 --- a/nexus/src/app/background/tasks/multicast/mod.rs +++ b/nexus/src/app/background/tasks/multicast/mod.rs @@ -538,7 +538,10 @@ impl MulticastGroupReconciler { // Create sled-agent client for OPTE subscriptions and // M2P/forwarding propagation. 
- let sled_client = MulticastSledClient::new(self.datastore.clone()); + let sled_client = MulticastSledClient::new( + self.datastore.clone(), + self.resolver.clone(), + ); // Process creating groups match self.reconcile_creating_groups(opctx).await { diff --git a/nexus/src/app/multicast/sled.rs b/nexus/src/app/multicast/sled.rs index 73ea17bd71d..49cd6447dd2 100644 --- a/nexus/src/app/multicast/sled.rs +++ b/nexus/src/app/multicast/sled.rs @@ -12,8 +12,8 @@ //! hosting sled //! - **M2P mappings**: Overlay multicast IP to underlay IPv6 address //! translation, installed on all sleds -//! - **Forwarding entries**: Underlay multicast address to next-hop sled -//! replication lists, installed on all sleds +//! - **Forwarding entries**: Underlay multicast address to switch next-hop, +//! installed on all sleds so OPTE forwards to the switch for replication //! //! [`dataplane`]: super::dataplane @@ -41,20 +41,29 @@ use sled_agent_client::types::{ McastSourceFilter, }; -/// Client for sled-agent multicast operations. +/// Utility methods for sled-agent multicast operations used by the +/// background task reconciler. /// -/// Unlike [`MulticastDataplaneClient`] which pre-builds per-switch clients, -/// sled clients are constructed on demand since the target sled set varies -/// per group. +/// Groups sled-agent HTTP calls (OPTE subscriptions, M2P mappings, +/// forwarding entries) behind a single type to keep the reconciler +/// logic focused on state transitions rather than client construction. +/// +/// Unlike [`MulticastDataplaneClient`] which pre-builds per-switch +/// clients, sled clients are constructed on demand since the target +/// sled set varies per group. 
/// /// [`MulticastDataplaneClient`]: super::dataplane::MulticastDataplaneClient pub(crate) struct MulticastSledClient { datastore: Arc, + resolver: internal_dns_resolver::Resolver, } impl MulticastSledClient { - pub(crate) fn new(datastore: Arc) -> Self { - Self { datastore } + pub(crate) fn new( + datastore: Arc, + resolver: internal_dns_resolver::Resolver, + ) -> Self { + Self { datastore, resolver } } /// Create a sled-agent client for the given sled. @@ -229,9 +238,10 @@ impl MulticastSledClient { /// M2P mappings and forwarding entries are pushed to all VPC-routing /// sleds, not just member sleds. Any instance on any sled may send to /// a multicast group address. Hence, without the M2P mapping, OPTE's - /// overlay layer silently drops the packet. Forwarding entries are needed - /// on sender sleds so OPTE can replicate to member sleds. Subscriptions - /// (per-port group membership) remain member-sled-only. + /// overlay layer silently drops the packet. Forwarding entries point + /// each sled at a switch, which replicates to member ports via DPD + /// multicast group config. Subscriptions (per-port group membership) remain + /// member-sled-only. pub(crate) async fn propagate_m2p_and_forwarding( &self, opctx: &OpContext, @@ -280,59 +290,6 @@ impl MulticastSledClient { let desired_m2p = Mcast2PhysMapping { group: group_ip, underlay: underlay_ip }; - // Look up member sled underlay IPs for forwarding next-hop - // computation. These are the sleds that host "Joined" members - // and should appear as next hops in every sled's forwarding - // entry. 
- let mut member_sled_ips: Vec<(SledUuid, Ipv6Addr)> = Vec::new(); - let mut failed_lookups: usize = 0; - for sled_id in &member_sled_ids { - let lookup = match nexus_networking::sled_lookup( - &self.datastore, - opctx, - *sled_id, - ) { - Ok(found) => found, - Err(e) => { - warn!( - opctx.log, - "failed to resolve sled for M2P/forwarding"; - "sled_id" => %sled_id, - "error" => %e - ); - failed_lookups += 1; - continue; - } - }; - - match lookup.fetch().await { - Ok((.., sled)) => { - member_sled_ips.push((*sled_id, sled.ip())); - } - Err(e) => { - warn!( - opctx.log, - "failed to resolve sled for M2P/forwarding"; - "sled_id" => %sled_id, - "error" => %e - ); - failed_lookups += 1; - } - } - } - - // Abort before mutating sled state if any member lookups failed. - // Pushing the partial member set would prune forwarding entries - // for the unresolved sleds, turning a transient lookup failure - // into packet loss for still-joined members. - if failed_lookups > 0 { - anyhow::bail!( - "aborting convergence: {failed_lookups} member sled \ - lookup(s) failed out of {} joined members", - member_sled_ids.len() - ); - } - // The group is active if any members are "Joined". M2P and // forwarding are pushed to all sleds when active, cleared // from all sleds when inactive. @@ -345,12 +302,36 @@ impl MulticastSledClient { .await .context("failed to enumerate sleds")?; + // Select one of the available switches as the forwarding next hop. + // + // OPTE treats each next hop as a duplication it performs itself, so + // pointing at individual member sleds would cause O(n) copies over + // cxgbe per sender. + // + // A single switch next hop means one copy to the switch, which + // replicates to member sled ports via DPD multicast group membership. + // ECMP over both switches is the more correct longer-term answer, + // but OPTE and mgd lack the tooling to express that today. 
+ let switch_zone_addrs = crate::app::switch_zone_address_mappings( + &self.resolver, + &opctx.log, + ) + .await + .map_err(|e| anyhow::anyhow!(e)) + .context("failed to resolve switch zone addresses")?; + + let switch_ip = switch_zone_addrs + .iter() + .min_by_key(|(slot, _)| *slot) + .map(|(_, ip)| *ip) + .context("no switch zone found for forwarding next hop")?; + let convergence_params = GroupConvergenceParams { group_ip, underlay_ip, group_is_active, desired_m2p: &desired_m2p, - member_sled_ips: &member_sled_ips, + switch_ip, }; let mut failed_sleds: usize = 0; @@ -372,12 +353,9 @@ impl MulticastSledClient { } }; - if let Err(e) = converge_sled_m2p_and_forwarding( - &client, - sled_id, - &convergence_params, - ) - .await + if let Err(e) = + converge_sled_m2p_and_forwarding(&client, &convergence_params) + .await { warn!( opctx.log, @@ -436,7 +414,9 @@ struct GroupConvergenceParams<'a> { underlay_ip: Ipv6Addr, group_is_active: bool, desired_m2p: &'a Mcast2PhysMapping, - member_sled_ips: &'a [(SledUuid, Ipv6Addr)], + /// Switch zone underlay IP chosen as the forwarding next hop. + /// The switch replicates to member sled ports via DPD config. + switch_ip: Ipv6Addr, } /// Per-sled convergence of M2P and forwarding state. @@ -447,11 +427,10 @@ struct GroupConvergenceParams<'a> { /// The caller increments `failed_sleds` and continues to the next sled. async fn converge_sled_m2p_and_forwarding( client: &sled_agent_client::Client, - sled_id: SledUuid, params: &GroupConvergenceParams<'_>, ) -> Result<(), anyhow::Error> { converge_m2p(client, params).await?; - converge_forwarding(client, sled_id, params).await?; + converge_forwarding(client, params).await?; Ok(()) } @@ -502,12 +481,12 @@ async fn converge_m2p( /// Converge a single sled's forwarding entries for one group. /// -/// When the group is active, computes desired next hops (all member -/// sleds except this one) and updates only if the current state -/// differs. When inactive, clears any stale entries. 
+/// When the group is active, this sets a single next hop to the switch +/// zone. The switch replicates to member sled ports via its DPD +/// multicast group membership. When inactive, this clears any stale +/// entries. async fn converge_forwarding( client: &sled_agent_client::Client, - sled_id: SledUuid, params: &GroupConvergenceParams<'_>, ) -> Result<(), anyhow::Error> { let found = client @@ -529,31 +508,17 @@ async fn converge_forwarding( return Ok(()); } - let desired_next_hops: Vec = params - .member_sled_ips - .iter() - .filter(|(id, _)| *id != sled_id) - .map(|(_, ip)| McastForwardingNextHop { - next_hop: *ip, - replication: McastReplication::Underlay, - filter: McastSourceFilter { - mode: McastFilterMode::Exclude, - sources: Vec::new(), - }, - }) - .collect(); + let desired_next_hops = vec![McastForwardingNextHop { + next_hop: params.switch_ip, + replication: McastReplication::Underlay, + filter: McastSourceFilter { + mode: McastFilterMode::Exclude, + sources: Vec::new(), + }, + }]; - // Comparison via sets: OPTE may return next hops in a different order - // than we build them, so a naive Vec comparison would cause spurious - // clear+set cycles on every reconciliation pass. let needs_update = match current_entry { - Some(f) if f.next_hops.len() == desired_next_hops.len() => { - !desired_next_hops.iter().all(|d| f.next_hops.contains(d)) - } - Some(_) => true, - // Always create the entry when the group is active; even an - // empty next-hops list signals to OPTE that the underlay - // address is known. 
+ Some(f) => f.next_hops != desired_next_hops, None => true, }; diff --git a/nexus/tests/integration_tests/multicast/networking_integration.rs b/nexus/tests/integration_tests/multicast/networking_integration.rs index 6103633e9ef..f0f10a9ff15 100644 --- a/nexus/tests/integration_tests/multicast/networking_integration.rs +++ b/nexus/tests/integration_tests/multicast/networking_integration.rs @@ -831,8 +831,7 @@ async fn test_multicast_sled_agent_m2p_and_subscriptions( } // Verify forwarding entries on the sim sled-agent. - // With a single sled, the forwarding entry exists but has no next hops - // (no other sleds to forward to). + // The forwarding entry points at a switch for replication. { let fwd = sled_agent.mcast_fwd.lock().unwrap(); assert!( @@ -841,9 +840,10 @@ async fn test_multicast_sled_agent_m2p_and_subscriptions( got: {fwd:?}" ); let next_hops = &fwd[&underlay_ipv6]; - assert!( - next_hops.is_empty(), - "Single-sled setup should have empty next_hops, got: {next_hops:?}" + assert_eq!( + next_hops.len(), + 1, + "Should have 1 next_hop (a switch), got: {next_hops:?}" ); } @@ -1116,24 +1116,14 @@ async fn test_multicast_multi_sled_m2p_propagation( let fwd = agent.mcast_fwd.lock().unwrap(); let next_hops = &fwd[&underlay_ipv6]; - if sled_agent.sled_agent_id() == hosting_sled_id { - // Hosting sled: no next hops (only local member, OPTE - // delivers locally via subscription). - assert!( - next_hops.is_empty(), - "Hosting sled forwarding should have empty next_hops, \ - got: {next_hops:?}" - ); - } else { - // Non-hosting sled: next hop is the hosting sled so - // senders on this sled can reach the member. - assert_eq!( - next_hops.len(), - 1, - "Non-hosting sled {i} should have 1 next_hop (the hosting \ - sled), got: {next_hops:?}" - ); - } + // Every sled gets a single next hop pointing at a switch. + // The switch replicates to member sled ports via DPD config. 
+ assert_eq!( + next_hops.len(), + 1, + "Sled {i} should have 1 next_hop (a switch), \ + got: {next_hops:?}" + ); } // Verify per-VMM subscription on the hosting sled only. diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index a09aa8463a1..c043432eb46 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -2708,14 +2708,6 @@ impl InstanceRunner { self.multicast_groups.push(membership.clone()); self.ensure_multicast_groups()?; - - // OPTE's xde driver uses mac_siphon_set on the underlay NIC to - // receive all packets (including multicast) at the MAC layer. - // - // Subscription filtering and delivery happen inside OPTE via - // mcast_subscribe. Rack-wide dataplane forwarding is handled by - // the RPW reconciler + DPD. - Ok(()) } From c7fd6b80a3c779dade4c1bf5784a724a27c6d37a Mon Sep 17 00:00:00 2001 From: Zeeshan Lakhani Date: Wed, 1 Apr 2026 17:11:26 +0000 Subject: [PATCH 4/8] maghemite update --- Cargo.lock | 36 ++++++++++++++--------------- package-manifest.toml | 10 ++++---- tools/maghemite_ddm_openapi_version | 2 +- tools/maghemite_mg_openapi_version | 2 +- 4 files changed, 25 insertions(+), 25 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9fe6c424b3b..abe778618cd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1696,7 +1696,7 @@ version = "3.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "faf9468729b8cbcea668e36183cb69d317348c2e08e994829fb56ebfdfbaac34" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.48.0", ] [[package]] @@ -2513,7 +2513,7 @@ dependencies = [ [[package]] name = "ddm-admin-client" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/maghemite?branch=multicast-e2e#954f63a1b47f8199de44081d394ae50897855a3b" +source = "git+https://github.com/oxidecomputer/maghemite?branch=multicast-e2e#8456ce46352a1b5afd6c9a35b40ef2ab6620cc02" dependencies = [ "oxnet", "progenitor 0.13.0", @@ -3548,7 +3548,7 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -5007,7 +5007,7 @@ dependencies = [ "libc", "percent-encoding", "pin-project-lite", - "socket2 0.6.2", + "socket2 0.5.10", "system-configuration", "tokio", "tower-layer", @@ -5672,7 +5672,7 @@ checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46" dependencies = [ "hermit-abi 0.5.2", "libc", - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -5749,7 +5749,7 @@ dependencies = [ "portable-atomic", "portable-atomic-util", "serde_core", - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -6474,7 +6474,7 @@ dependencies = [ [[package]] name = "mg-admin-client" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/maghemite?branch=multicast-e2e#954f63a1b47f8199de44081d394ae50897855a3b" +source = "git+https://github.com/oxidecomputer/maghemite?branch=multicast-e2e#8456ce46352a1b5afd6c9a35b40ef2ab6620cc02" dependencies = [ "chrono", "colored 3.1.1", @@ -7915,7 +7915,7 @@ version = "0.50.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -11449,7 +11449,7 @@ dependencies = [ "quinn-udp", "rustc-hash", "rustls 0.23.37", - "socket2 0.6.2", + "socket2 0.5.10", "thiserror 2.0.18", "tokio", "tracing", @@ -11487,9 +11487,9 @@ dependencies = [ "cfg_aliases 0.2.1", "libc", "once_cell", - "socket2 0.6.2", + "socket2 0.5.10", "tracing", - "windows-sys 0.60.2", + "windows-sys 0.59.0", ] [[package]] @@ -11751,7 +11751,7 @@ dependencies = [ [[package]] name = "rdb-types" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/maghemite?branch=multicast-e2e#954f63a1b47f8199de44081d394ae50897855a3b" +source = 
"git+https://github.com/oxidecomputer/maghemite?branch=multicast-e2e#8456ce46352a1b5afd6c9a35b40ef2ab6620cc02" dependencies = [ "oxnet", "schemars 0.8.22", @@ -12404,7 +12404,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys 0.11.0", - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -12507,7 +12507,7 @@ dependencies = [ "security-framework", "security-framework-sys", "webpki-root-certs", - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -13961,7 +13961,7 @@ version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c1c97747dbf44bb1ca44a561ece23508e99cb592e862f22222dcf42f51d1e451" dependencies = [ - "heck 0.5.0", + "heck 0.4.1", "proc-macro2", "quote", "syn 2.0.117", @@ -14598,7 +14598,7 @@ dependencies = [ "getrandom 0.4.1", "once_cell", "rustix 1.1.3", - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -14618,7 +14618,7 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d8c27177b12a6399ffc08b98f76f7c9a1f4fe9fc967c784c5a071fa8d93cf7e1" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -16952,7 +16952,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.48.0", ] [[package]] diff --git a/package-manifest.toml b/package-manifest.toml index f6be5f04819..334fb5f48f4 100644 --- a/package-manifest.toml +++ b/package-manifest.toml @@ -678,10 +678,10 @@ source.repo = "maghemite" # `tools/maghemite_openapi_version`. Failing to do so will cause a failure when # building `ddm-admin-client` (which will instruct you to update # `tools/maghemite_openapi_version`). 
-source.commit = "954f63a1b47f8199de44081d394ae50897855a3b" +source.commit = "8456ce46352a1b5afd6c9a35b40ef2ab6620cc02" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image//mg-ddm-gz.sha256.txt -source.sha256 = "b305afa1efa5889cc5bbf6ec5b7c9d90b31298ab95c68168edfff797f56cafeb" +source.sha256 = "d5e57c64ae50e775a90cf8f0c821334106b52ecf02575bf430fa88957ca5c328" output.type = "tarball" [package.mg-ddm] @@ -694,10 +694,10 @@ source.repo = "maghemite" # `tools/maghemite_openapi_version`. Failing to do so will cause a failure when # building `ddm-admin-client` (which will instruct you to update # `tools/maghemite_openapi_version`). -source.commit = "954f63a1b47f8199de44081d394ae50897855a3b" +source.commit = "8456ce46352a1b5afd6c9a35b40ef2ab6620cc02" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image//mg-ddm.sha256.txt -source.sha256 = "e7301cd0d20940596f0d99836b2c57c48072539839aedc59cf44b09020aeb2bf" +source.sha256 = "a9529b2a653e2ede9fe18a7acc10181b3e7685eb897db73f83d892b07d939435" output.type = "zone" output.intermediate_only = true @@ -709,7 +709,7 @@ source.repo = "maghemite" # `tools/maghemite_openapi_version`. Failing to do so will cause a failure when # building `ddm-admin-client` (which will instruct you to update # `tools/maghemite_openapi_version`). 
-source.commit = "954f63a1b47f8199de44081d394ae50897855a3b" +source.commit = "8456ce46352a1b5afd6c9a35b40ef2ab6620cc02" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image//mgd.sha256.txt source.sha256 = "8802d8cf7e77c9280eab919734bc8fb7449b30e5dc5a9674f5d69a71b9ed40b9" diff --git a/tools/maghemite_ddm_openapi_version b/tools/maghemite_ddm_openapi_version index 397bb8448f2..53886c585ec 100644 --- a/tools/maghemite_ddm_openapi_version +++ b/tools/maghemite_ddm_openapi_version @@ -1 +1 @@ -COMMIT="954f63a1b47f8199de44081d394ae50897855a3b" +COMMIT="8456ce46352a1b5afd6c9a35b40ef2ab6620cc02" diff --git a/tools/maghemite_mg_openapi_version b/tools/maghemite_mg_openapi_version index 397bb8448f2..53886c585ec 100644 --- a/tools/maghemite_mg_openapi_version +++ b/tools/maghemite_mg_openapi_version @@ -1 +1 @@ -COMMIT="954f63a1b47f8199de44081d394ae50897855a3b" +COMMIT="8456ce46352a1b5afd6c9a35b40ef2ab6620cc02" From d98a51c7778777ab179c5475b6fbcda7be673c7f Mon Sep 17 00:00:00 2001 From: Zeeshan Lakhani Date: Fri, 8 May 2026 15:01:30 +0000 Subject: [PATCH 5/8] [maghemite] update --- Cargo.lock | 6 +++--- Cargo.toml | 6 +++--- package-manifest.toml | 12 ++++++------ tools/maghemite_ddm_openapi_version | 2 +- tools/maghemite_mg_openapi_version | 2 +- tools/maghemite_mgd_checksums | 2 +- 6 files changed, 15 insertions(+), 15 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8fd9d7e21b2..7d84e35ac9e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2526,7 +2526,7 @@ dependencies = [ [[package]] name = "ddm-admin-client" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/maghemite?rev=ebee0537a54c5dd6c7cc8bb47c62039cc4aee5f5#ebee0537a54c5dd6c7cc8bb47c62039cc4aee5f5" +source = "git+https://github.com/oxidecomputer/maghemite?rev=c11f21a5cdad8bb6bfaa7d1dd38bd8b2ee7ff4e7#c11f21a5cdad8bb6bfaa7d1dd38bd8b2ee7ff4e7" dependencies = [ "oxnet", "progenitor 0.14.0", @@ -6444,7 +6444,7 @@ dependencies = 
[ [[package]] name = "mg-admin-client" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/maghemite?rev=ebee0537a54c5dd6c7cc8bb47c62039cc4aee5f5#ebee0537a54c5dd6c7cc8bb47c62039cc4aee5f5" +source = "git+https://github.com/oxidecomputer/maghemite?rev=c11f21a5cdad8bb6bfaa7d1dd38bd8b2ee7ff4e7#c11f21a5cdad8bb6bfaa7d1dd38bd8b2ee7ff4e7" dependencies = [ "chrono", "colored 3.1.1", @@ -11701,7 +11701,7 @@ dependencies = [ [[package]] name = "rdb-types" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/maghemite?rev=ebee0537a54c5dd6c7cc8bb47c62039cc4aee5f5#ebee0537a54c5dd6c7cc8bb47c62039cc4aee5f5" +source = "git+https://github.com/oxidecomputer/maghemite?rev=c11f21a5cdad8bb6bfaa7d1dd38bd8b2ee7ff4e7#c11f21a5cdad8bb6bfaa7d1dd38bd8b2ee7ff4e7" dependencies = [ "oxnet", "schemars 0.8.22", diff --git a/Cargo.toml b/Cargo.toml index 625bb219e1a..b1a3cb162f3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -604,8 +604,8 @@ ntp-admin-api = { path = "ntp-admin/api" } ntp-admin-client = { path = "clients/ntp-admin-client" } ntp-admin-types = { path = "ntp-admin/types" } ntp-admin-types-versions = { path = "ntp-admin/types/versions" } -mg-admin-client = { git = "https://github.com/oxidecomputer/maghemite", rev = "ebee0537a54c5dd6c7cc8bb47c62039cc4aee5f5" } -ddm-admin-client = { git = "https://github.com/oxidecomputer/maghemite", rev = "ebee0537a54c5dd6c7cc8bb47c62039cc4aee5f5" } +mg-admin-client = { git = "https://github.com/oxidecomputer/maghemite", rev = "c11f21a5cdad8bb6bfaa7d1dd38bd8b2ee7ff4e7" } +ddm-admin-client = { git = "https://github.com/oxidecomputer/maghemite", rev = "c11f21a5cdad8bb6bfaa7d1dd38bd8b2ee7ff4e7" } multimap = "0.10.1" nexus-auth = { path = "nexus/auth" } nexus-background-task-interface = { path = "nexus/background-task-interface" } @@ -742,7 +742,7 @@ rats-corim = { git = "https://github.com/oxidecomputer/rats-corim.git", rev = "f raw-cpuid = { git = "https://github.com/oxidecomputer/rust-cpuid.git", rev = 
"a4cf01df76f35430ff5d39dc2fe470bcb953503b" } rayon = "1.10" rcgen = "0.12.1" -rdb-types = { git = "https://github.com/oxidecomputer/maghemite", rev = "ebee0537a54c5dd6c7cc8bb47c62039cc4aee5f5" } +rdb-types = { git = "https://github.com/oxidecomputer/maghemite", rev = "c11f21a5cdad8bb6bfaa7d1dd38bd8b2ee7ff4e7" } reconfigurator-cli = { path = "dev-tools/reconfigurator-cli" } reedline = "0.40.0" ref-cast = "1.0" diff --git a/package-manifest.toml b/package-manifest.toml index be9d27f41c5..36133b1d66c 100644 --- a/package-manifest.toml +++ b/package-manifest.toml @@ -683,10 +683,10 @@ source.repo = "maghemite" # `tools/maghemite_openapi_version`. Failing to do so will cause a failure when # building `ddm-admin-client` (which will instruct you to update # `tools/maghemite_openapi_version`). -source.commit = "ebee0537a54c5dd6c7cc8bb47c62039cc4aee5f5" +source.commit = "c11f21a5cdad8bb6bfaa7d1dd38bd8b2ee7ff4e7" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image//mg-ddm-gz.sha256.txt -source.sha256 = "6c197f7b92b359c177574a2e92bf72366cf4966c126a0b68499e3b5777df40fa" +source.sha256 = "4afcf57ef4d6a743d420344c09a18e8f59696d1ecbb9298507446fc9c24a396a" output.type = "tarball" [package.mg-ddm] @@ -699,10 +699,10 @@ source.repo = "maghemite" # `tools/maghemite_openapi_version`. Failing to do so will cause a failure when # building `ddm-admin-client` (which will instruct you to update # `tools/maghemite_openapi_version`). 
-source.commit = "ebee0537a54c5dd6c7cc8bb47c62039cc4aee5f5" +source.commit = "c11f21a5cdad8bb6bfaa7d1dd38bd8b2ee7ff4e7" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image//mg-ddm.sha256.txt -source.sha256 = "47de9f2516710292ae1870eabed80bb2484957c1ac6ee4b3c1c15469514dde13" +source.sha256 = "0c7a0bfbad39375d27ca81d4f3352f6828e6f98a270bf472e9bd3303175860d6" output.type = "zone" output.intermediate_only = true @@ -714,10 +714,10 @@ source.repo = "maghemite" # `tools/maghemite_openapi_version`. Failing to do so will cause a failure when # building `ddm-admin-client` (which will instruct you to update # `tools/maghemite_openapi_version`). -source.commit = "ebee0537a54c5dd6c7cc8bb47c62039cc4aee5f5" +source.commit = "c11f21a5cdad8bb6bfaa7d1dd38bd8b2ee7ff4e7" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image//mgd.sha256.txt -source.sha256 = "f3c3cef00ab75ab7dbc9453797ec18b43deaed20fbbd8ebfae01307f76e68c75" +source.sha256 = "ed23649aba3e7a8624f7c46ddbe5b87a05138ca6082b31890f9ed0cb74f20ca5" output.type = "zone" output.intermediate_only = true diff --git a/tools/maghemite_ddm_openapi_version b/tools/maghemite_ddm_openapi_version index 16cc10e4518..c32841b55c8 100644 --- a/tools/maghemite_ddm_openapi_version +++ b/tools/maghemite_ddm_openapi_version @@ -1 +1 @@ -COMMIT="ebee0537a54c5dd6c7cc8bb47c62039cc4aee5f5" +COMMIT="c11f21a5cdad8bb6bfaa7d1dd38bd8b2ee7ff4e7" diff --git a/tools/maghemite_mg_openapi_version b/tools/maghemite_mg_openapi_version index 16cc10e4518..c32841b55c8 100644 --- a/tools/maghemite_mg_openapi_version +++ b/tools/maghemite_mg_openapi_version @@ -1 +1 @@ -COMMIT="ebee0537a54c5dd6c7cc8bb47c62039cc4aee5f5" +COMMIT="c11f21a5cdad8bb6bfaa7d1dd38bd8b2ee7ff4e7" diff --git a/tools/maghemite_mgd_checksums b/tools/maghemite_mgd_checksums index 7ba3d1ca102..f1ccc3277bf 100644 --- a/tools/maghemite_mgd_checksums 
+++ b/tools/maghemite_mgd_checksums @@ -1,2 +1,2 @@ -CIDL_SHA256="f3c3cef00ab75ab7dbc9453797ec18b43deaed20fbbd8ebfae01307f76e68c75" +CIDL_SHA256="ed23649aba3e7a8624f7c46ddbe5b87a05138ca6082b31890f9ed0cb74f20ca5" MGD_LINUX_SHA256="5163a351f9d49ee610010eb6519b59ef5dba44612cb84cac78d734c5b3472baa" \ No newline at end of file From 3458dd8f62117f47a4059b94d11cea9e5e425b80 Mon Sep 17 00:00:00 2001 From: Zeeshan Lakhani Date: Sat, 16 May 2026 04:37:30 +0000 Subject: [PATCH 6/8] [multicast] add MULTICAST_SOURCE_LIMITS API version with per-member and per-group source IP caps New nexus external API version 2026_05_16_00 (MULTICAST_SOURCE_LIMITS) splits the multicast group join endpoint to introduce two policy bounds: - MAX_SOURCE_IPS_PER_MEMBER (32): caps a single member's source filter list and rejects duplicate entries explicitly rather than silently deduplicating. - MAX_SOURCE_IPS_PER_GROUP (256): caps the union of source IPs across all active members of a group. Enforced atomically inside the member-attach CTE plus a preflight check at the Nexus app layer for a descriptive 400. Both caps apply whenever a member declares a non-empty source list, covering SSM groups and ASM members using INCLUDE-mode source filtering. This quantifies the Oxide policy bounds and documents their rationale alongside the Linux igmp_max_msf and FreeBSD maxsocksrc precedents, and it gives the switch side a concrete threshold to work from.
--- common/src/address.rs | 26 ++++- .../src/db/datastore/multicast/members.rs | 4 + .../datastore/multicast/ops/member_attach.rs | 105 +++++++++++++++--- nexus/external-api/src/lib.rs | 40 ++++++- nexus/src/app/multicast/mod.rs | 81 +++++++++++++- nexus/src/external_api/http_entrypoints.rs | 27 +++++ .../nexus-2026050800.0.0-d2276f.json.gitstub | 1 + ....json => nexus-2026051600.0.0-fa596f.json} | 4 +- openapi/nexus/nexus-latest.json | 2 +- 9 files changed, 264 insertions(+), 26 deletions(-) create mode 100644 openapi/nexus/nexus-2026050800.0.0-d2276f.json.gitstub rename openapi/nexus/{nexus-2026050800.0.0-d2276f.json => nexus-2026051600.0.0-fa596f.json} (99%) diff --git a/common/src/address.rs b/common/src/address.rs index af34bcff8f7..e7e0cd0001a 100644 --- a/common/src/address.rs +++ b/common/src/address.rs @@ -58,10 +58,32 @@ pub const IPV4_SSM_SUBNET: Ipv4Net = pub const IPV6_SSM_SUBNET: Ipv6Net = Ipv6Net::new_unchecked(Ipv6Addr::new(0xff30, 0, 0, 0, 0, 0, 0, 0), 12); -/// Maximum source IPs per SSM group member (per [RFC 3376] IGMPv3). +/// Maximum source IPs a single multicast group member may declare for +/// source filtering. +/// +/// Applies to SSM members (which always declare sources) and to ASM members +/// using `INCLUDE`-mode filtering. The cap reflects `(S,G)` fanout cost, +/// which is identical regardless of group-address semantics. +/// +/// Oxide policy bound. [RFC 3376] §4.2.1 (IGMPv3) and [RFC 3810] §5.2.1 +/// (MLDv2) leave per-group source-list size implementation-defined, MTU-bound +/// at 16-bit max. For comparison: Linux defaults to 10 (`igmp_max_msf`), +/// FreeBSD to 128 (`maxsocksrc`). 32 was chosen to cover realistic workloads +/// (1-8 sources per channel typical) while protecting the shared `(S,G)` +/// forwarding state from a single tenant's fan-out. 
/// /// [RFC 3376]: https://www.rfc-editor.org/rfc/rfc3376 -pub const MAX_SSM_SOURCE_IPS: usize = 64; +/// [RFC 3810]: https://www.rfc-editor.org/rfc/rfc3810 +pub const MAX_SOURCE_IPS_PER_MEMBER: usize = 32; + +/// Maximum size of the union of source IPs across all members of a single +/// multicast group. +/// +/// Oxide policy bound. Bounds the `(S,G)` install count one group can produce +/// by aggregating fan-out across members. 256 leaves headroom for large +/// multi-tenant deployments while keeping dataplane forwarding state +/// predictable. +pub const MAX_SOURCE_IPS_PER_GROUP: usize = 256; /// Check if an IP is in the SSM (Source-Specific Multicast) range. /// diff --git a/nexus/db-queries/src/db/datastore/multicast/members.rs b/nexus/db-queries/src/db/datastore/multicast/members.rs index 1c2d25a703b..c9f2cd712b9 100644 --- a/nexus/db-queries/src/db/datastore/multicast/members.rs +++ b/nexus/db-queries/src/db/datastore/multicast/members.rs @@ -91,6 +91,10 @@ impl DataStore { /// - `None` → preserve existing `source_ips` (rejoin without changes) /// - `Some([])` → clear `source_ips` (switch to ASM) /// - `Some([a,b])` → replace with new `source_ips` (update sources) + /// + /// Atomically enforces the per-group source IP union cap + /// ([`omicron_common::address::MAX_SOURCE_IPS_PER_GROUP`]) when a + /// non-empty source list is being applied. 
pub async fn multicast_group_member_attach_to_instance( &self, opctx: &OpContext, diff --git a/nexus/db-queries/src/db/datastore/multicast/ops/member_attach.rs b/nexus/db-queries/src/db/datastore/multicast/ops/member_attach.rs index 254a2485bd7..22cd3676020 100644 --- a/nexus/db-queries/src/db/datastore/multicast/ops/member_attach.rs +++ b/nexus/db-queries/src/db/datastore/multicast/ops/member_attach.rs @@ -38,6 +38,7 @@ use uuid::Uuid; use nexus_db_lookup::DbConnection; use nexus_db_model::{MulticastGroupMember, MulticastGroupMemberState}; +use omicron_common::address::MAX_SOURCE_IPS_PER_GROUP; use omicron_common::api::external; use crate::db::true_or_cast_error::matches_sentinel; @@ -47,6 +48,7 @@ use crate::db::true_or_cast_error::matches_sentinel; // the specific failure reason from the error message. const GROUP_NOT_FOUND_SENTINEL: &str = "group-not-found"; const INSTANCE_NOT_FOUND_SENTINEL: &str = "instance-not-found"; +const UNION_EXCEEDED_SENTINEL: &str = "source-union-exceeded"; /// Result of attaching an instance to a multicast group. #[derive(Debug, Clone, PartialEq)] @@ -62,6 +64,9 @@ pub(crate) enum AttachMemberError { GroupNotFound, /// Instance doesn't exist or has been deleted InstanceNotFound, + /// Attaching this member would push the group's source IP union past + /// the per-group cap. + SourceUnionExceeded { cap: usize }, /// Database constraint violation (unique index, etc.) ConstraintViolation(String), /// Other database error @@ -72,16 +77,26 @@ impl AttachMemberError { /// Construct an [`AttachMemberError`] from a database error. /// /// This catches the sentinel errors that indicate validation failures - /// (group not found, instance not found) as well as constraint violations. - fn from_diesel(err: DieselError) -> Self { + /// (group not found, instance not found, source union cap) as well as + /// constraint violations. 
+ fn from_diesel(err: DieselError, max_union_size: usize) -> Self { // Check for sentinel errors first - let sentinels = [GROUP_NOT_FOUND_SENTINEL, INSTANCE_NOT_FOUND_SENTINEL]; + let sentinels = [ + GROUP_NOT_FOUND_SENTINEL, + INSTANCE_NOT_FOUND_SENTINEL, + UNION_EXCEEDED_SENTINEL, + ]; if let Some(sentinel) = matches_sentinel(&err, &sentinels) { return match sentinel { GROUP_NOT_FOUND_SENTINEL => AttachMemberError::GroupNotFound, INSTANCE_NOT_FOUND_SENTINEL => { AttachMemberError::InstanceNotFound } + UNION_EXCEEDED_SENTINEL => { + AttachMemberError::SourceUnionExceeded { + cap: max_union_size, + } + } _ => unreachable!("Unknown sentinel: {sentinel}"), }; } @@ -115,6 +130,12 @@ impl From for external::Error { "Instance does not exist or has been deleted", ) } + AttachMemberError::SourceUnionExceeded { cap } => { + external::Error::invalid_request(format!( + "attaching this member would exceed the per-group \ + source IP union cap of {cap}", + )) + } AttachMemberError::ConstraintViolation(msg) => { external::Error::invalid_request(&format!( "Constraint violation: {msg}" @@ -173,9 +194,11 @@ impl AttachMemberToGroupStatement { /// - `new_member_id`: UUID for new member row (if creating) /// - `source_ips`: Source IPs for filtering (`None` preserves existing on reactivation) /// - /// CTEs atomically validate group is not in a "Deleting" state, - /// that the instance exists, retrieves the current `sled_id` from - /// VMM table, then performs the upsert. + /// CTEs atomically validate group is not in a "Deleting" state, that the + /// instance exists, retrieves the current `sled_id` from VMM table, and + /// (when a non-empty source list is being applied) verifies that the + /// resulting per-group source IP union stays within + /// [`MAX_SOURCE_IPS_PER_GROUP`]. 
pub fn new( group_id: Uuid, instance_id: Uuid, @@ -201,7 +224,9 @@ impl AttachMemberToGroupStatement { ) -> Result { self.get_result_async::(conn) .await - .map_err(AttachMemberError::from_diesel) + .map_err(|e| { + AttachMemberError::from_diesel(e, MAX_SOURCE_IPS_PER_GROUP) + }) .map(|member| AttachMemberResult { member }) } } @@ -273,22 +298,16 @@ impl AttachMemberToGroupStatement { /// Uses CAST to trigger a predictable error when validation fails: /// - If group not found → CAST('group-not-found' AS BOOL) fails /// - If instance not found → CAST('instance-not-found' AS BOOL) fails - /// - If both valid → CAST('TRUE' AS BOOL) succeeds + /// - If the resulting source IP union would exceed the per-group cap + /// → CAST('source-union-exceeded' AS BOOL) fails (only checked when a + /// non-empty source list is being applied) + /// - If all valid → CAST('TRUE' AS BOOL) succeeds /// /// This follows the pattern used in `network_interface.rs` and `external_ip.rs`. fn push_validation_cte<'a>( &'a self, mut out: AstPass<'_, 'a, Pg>, ) -> QueryResult<()> { - // SELECT CAST( - // CASE - // WHEN NOT EXISTS (SELECT 1 FROM instance_sled) THEN 'instance-not-found' - // WHEN NOT EXISTS (SELECT 1 FROM valid_group) THEN 'group-not-found' - // ELSE 'TRUE' - // END AS BOOL - // ) AS validated - // - // Instance is checked first to provide more those errors up front out.push_sql("SELECT CAST(CASE "); out.push_sql("WHEN NOT EXISTS (SELECT 1 FROM instance_sled) THEN '"); out.push_sql(INSTANCE_NOT_FOUND_SENTINEL); @@ -296,10 +315,54 @@ impl AttachMemberToGroupStatement { out.push_sql("WHEN NOT EXISTS (SELECT 1 FROM valid_group) THEN '"); out.push_sql(GROUP_NOT_FOUND_SENTINEL); out.push_sql("' "); + if self.check_union_size() { + out.push_sql("WHEN (SELECT size FROM proposed_union_size) > "); + out.push_sql(&MAX_SOURCE_IPS_PER_GROUP.to_string()); + out.push_sql(" THEN '"); + out.push_sql(UNION_EXCEEDED_SENTINEL); + out.push_sql("' "); + } out.push_sql("ELSE 'TRUE' END AS BOOL) AS 
validated"); Ok(()) } + /// Whether the resulting source IP union should be checked against the + /// per-group cap. Skipped when the caller is preserving existing sources + /// (`None`) or explicitly clearing them (empty list), since neither path + /// grows the union. + fn check_union_size(&self) -> bool { + self.update_source_ips_on_reactivation + && !self.source_ips_for_insert.is_empty() + } + + /// Generates the `proposed_union_size` CTE. + /// + /// Computes the size of the source IP union that would result from this + /// attach: all other active members' source IPs unioned with the proposed + /// list. This member's existing row (if any) is excluded because its + /// sources are being replaced. + fn push_proposed_union_size_cte<'a>( + &'a self, + mut out: AstPass<'_, 'a, Pg>, + ) -> QueryResult<()> { + out.push_sql( + "SELECT count(DISTINCT source_ip) AS size FROM (\ + SELECT unnest(source_ips) AS source_ip \ + FROM multicast_group_member \ + WHERE external_group_id = ", + ); + out.push_bind_param::(&self.group_id)?; + out.push_sql(" AND parent_id != "); + out.push_bind_param::(&self.instance_id)?; + out.push_sql(" AND time_deleted IS NULL "); + out.push_sql("UNION ALL SELECT unnest("); + out.push_bind_param::, _>( + &self.source_ips_for_insert, + )?; + out.push_sql(") AS source_ip) s"); + Ok(()) + } + /// Generates the `upserted_member` CTE (performs unconditional upsert). /// /// SELECT joins with both `valid_group` and `instance_sled` CTEs to: @@ -433,6 +496,14 @@ impl QueryFragment for AttachMemberToGroupStatement { self.push_instance_sled_cte(out.reborrow())?; out.push_sql("), "); + // CTE: Compute the prospective per-group source IP union size when + // a non-empty source list is being applied. 
+ if self.check_union_size() { + out.push_sql("proposed_union_size AS ("); + self.push_proposed_union_size_cte(out.reborrow())?; + out.push_sql("), "); + } + // CTE: Validation that triggers sentinel errors on failure out.push_sql("validation AS MATERIALIZED ("); self.push_validation_cte(out.reborrow())?; diff --git a/nexus/external-api/src/lib.rs b/nexus/external-api/src/lib.rs index 9b81215750b..247a2ad6894 100644 --- a/nexus/external-api/src/lib.rs +++ b/nexus/external-api/src/lib.rs @@ -83,6 +83,7 @@ api_versions!([ // | date-based version should be at the top of the list. // v // (next_yyyy_mm_dd_nn, IDENT), + (2026_05_16_00, MULTICAST_SOURCE_LIMITS), (2026_05_08_00, MANUAL_DISK_ADOPTION), (2026_05_07_00, REMOVE_DUPLICATED_NETWORKING_TYPES), (2026_04_30_00, PROBE_AND_SAML_DOCS), @@ -5936,13 +5937,15 @@ pub trait NexusExternalApi { /// the group must already exist. /// /// Source IPs are optional for ASM addresses but required for SSM addresses - /// (232.0.0.0/8 for IPv4, ff3x::/32 for IPv6). Duplicate IPs in the request - /// are automatically deduplicated, with a maximum of 64 source IPs allowed. + /// (232.0.0.0/8 for IPv4, ff3x::/32 for IPv6). Duplicate source IPs in a + /// single request are rejected. Per-member source list is capped at 32, and + /// the union of source IPs across all members of a single group is capped + /// at 256. #[endpoint { method = PUT, path = "/v1/instances/{instance}/multicast-groups/{multicast_group}", tags = ["experimental"], - versions = VERSION_MULTICAST_IMPLICIT_LIFECYCLE_UPDATES.., + versions = VERSION_MULTICAST_SOURCE_LIMITS.., }] async fn instance_multicast_group_join( rqctx: RequestContext, @@ -5954,6 +5957,37 @@ pub trait NexusExternalApi { HttpError, >; + /// Join multicast group by name, IP address, or UUID + /// + /// Groups can be referenced by name, IP address, or UUID. If the group + /// doesn't exist, it's implicitly created with an auto-allocated IP from a + /// multicast pool linked to the caller's silo. 
When referencing by UUID, + /// the group must already exist. + /// + /// Source IPs are optional for ASM addresses but required for SSM addresses + /// (232.0.0.0/8 for IPv4, ff3x::/32 for IPv6). Duplicate IPs in the request + /// are automatically deduplicated, with a maximum of 64 source IPs allowed. + #[endpoint { + method = PUT, + path = "/v1/instances/{instance}/multicast-groups/{multicast_group}", + tags = ["experimental"], + operation_id = "instance_multicast_group_join", + versions = VERSION_MULTICAST_IMPLICIT_LIFECYCLE_UPDATES..VERSION_MULTICAST_SOURCE_LIMITS, + }] + async fn instance_multicast_group_join_v2026_01_08_00( + rqctx: RequestContext, + path_params: Path< + v2026_01_08_00::multicast::InstanceMulticastGroupPath, + >, + query_params: Query, + body_params: TypedBody< + v2026_01_08_00::multicast::InstanceMulticastGroupJoin, + >, + ) -> Result< + HttpResponseCreated, + HttpError, + >; + /// Join multicast group /// /// Deprecated: newer version supports implicit group creation, accepts group diff --git a/nexus/src/app/multicast/mod.rs b/nexus/src/app/multicast/mod.rs index 57b48188d77..a5eea32b8d4 100644 --- a/nexus/src/app/multicast/mod.rs +++ b/nexus/src/app/multicast/mod.rs @@ -61,7 +61,9 @@ use nexus_db_queries::db::datastore::multicast::ExternalMulticastGroupWithSource use nexus_db_queries::{authz, db}; use nexus_types::external_api::multicast; use nexus_types::multicast::MulticastGroupCreate; -use omicron_common::address::is_ssm_address; +use omicron_common::address::{ + MAX_SOURCE_IPS_PER_GROUP, MAX_SOURCE_IPS_PER_MEMBER, is_ssm_address, +}; use omicron_common::api::external::{ self, CreateResult, DataPageParams, DeleteResult, IdentityMetadataCreateParams, ListResultVec, LookupResult, @@ -112,6 +114,42 @@ pub(crate) fn validate_ssm_sources( Ok(()) } +/// Validate per-member source IP list shape. +/// +/// Applies whenever a member declares source IPs, irrespective of SSM or ASM +/// group semantics. 
Enforces: +/// +/// - At most [`MAX_SOURCE_IPS_PER_MEMBER`] entries +/// - No duplicates (rejected explicitly rather than silently deduplicated, so +/// client bugs are surfaced and downstream consumers can assume the list is +/// canonical) +pub(crate) fn validate_member_source_ips( + source_ips: Option<&[std::net::IpAddr]>, +) -> Result<(), external::Error> { + let Some(sources) = source_ips else { + return Ok(()); + }; + if sources.is_empty() { + return Ok(()); + } + let count = sources.len(); + if count > MAX_SOURCE_IPS_PER_MEMBER { + return Err(external::Error::invalid_request(format!( + "membership source IP count {count} exceeds per-member limit \ + of {MAX_SOURCE_IPS_PER_MEMBER}", + ))); + } + let mut seen = std::collections::BTreeSet::new(); + for ip in sources { + if !seen.insert(*ip) { + return Err(external::Error::invalid_request(format!( + "duplicate source IP {ip} in membership request", + ))); + } + } + Ok(()) +} + impl super::Nexus { /// Look up a fleet-scoped multicast group by name, ID, or IP address. /// @@ -352,6 +390,10 @@ impl super::Nexus { ))); } + // Per-member source IP shape (count + duplicate) check runs once + // up front, independent of group resolution. + validate_member_source_ips(source_ips)?; + // Find or create the group based on identifier type. // SSM validation happens inside resolve functions. let group_id = match group_identifier { @@ -373,6 +415,13 @@ impl super::Nexus { } }; + // Preflight per-group source IP union cap for a descriptive 400 in + // the non-racing common case. The datastore CTE enforces the same + // bound atomically inside `multicast_group_member_attach_to_instance`. 
+ if let Some(sources) = source_ips.filter(|s| !s.is_empty()) { + self.validate_group_source_union(opctx, group_id, sources).await?; + } + // Attach the member with its source IPs let member = self .db_datastore @@ -596,6 +645,36 @@ impl super::Nexus { Ok(MulticastGroupUuid::from_untyped_uuid(db_group.identity.id)) } + /// Preflight check that the union of existing member source IPs and + /// `proposed` for `group_id` stays within + /// [`MAX_SOURCE_IPS_PER_GROUP`]. + async fn validate_group_source_union( + &self, + opctx: &OpContext, + group_id: MulticastGroupUuid, + proposed: &[IpAddr], + ) -> Result<(), external::Error> { + let filter_state = self + .db_datastore + .multicast_groups_source_filter_state(opctx, &[group_id]) + .await?; + let mut union = filter_state + .get(&group_id.into_untyped_uuid()) + .map(|s| s.specific_sources.clone()) + .unwrap_or_default(); + union.extend(proposed.iter().copied()); + if union.len() > MAX_SOURCE_IPS_PER_GROUP { + return Err(external::Error::invalid_request(format!( + "adding {} source IP(s) would push group source union to \ + {}, exceeding per-group cap of {}", + proposed.len(), + union.len(), + MAX_SOURCE_IPS_PER_GROUP, + ))); + } + Ok(()) + } + /// Resolve a multicast group identifier to a UUID (lookup only). /// /// This is a lookup that does not create groups or perform validation. 
diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index b0fa2ab8408..0152fc20156 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -73,6 +73,7 @@ use nexus_types::external_api::user::{Group, User, UserBuiltin}; use nexus_types::external_api::vpc::{Vpc, VpcRouter, VpcSubnet}; use nexus_types_versions::latest::headers::RangeRequest; use nexus_types_versions::v2025_11_20_00; +use nexus_types_versions::v2026_01_08_00; use omicron_common::address::IpRange; use omicron_common::api::external::AddressLot; use omicron_common::api::external::AddressLotBlock; @@ -5289,6 +5290,32 @@ impl NexusExternalApi for NexusExternalApiImpl { .await } + // Pre-MULTICAST_SOURCE_LIMITS version: same types as the latest variant + // (re-exported through `latest::`), so delegate directly. The behavioral + // difference (per-member and per-group source IP caps) is enforced + // unconditionally in the Nexus app layer. + async fn instance_multicast_group_join_v2026_01_08_00( + rqctx: RequestContext, + path_params: Path< + v2026_01_08_00::multicast::InstanceMulticastGroupPath, + >, + query_params: Query, + body_params: TypedBody< + v2026_01_08_00::multicast::InstanceMulticastGroupJoin, + >, + ) -> Result< + HttpResponseCreated, + HttpError, + > { + Self::instance_multicast_group_join( + rqctx, + path_params, + query_params, + body_params, + ) + .await + } + // Cannot delegate to lib.rs: old API version has no body parameter, but the // new `instance_multicast_group_join` requires `TypedBody`. // TypedBody has no public constructor, so we can't create a default body for delegation. 
diff --git a/openapi/nexus/nexus-2026050800.0.0-d2276f.json.gitstub b/openapi/nexus/nexus-2026050800.0.0-d2276f.json.gitstub new file mode 100644 index 00000000000..f90adfcd6f7 --- /dev/null +++ b/openapi/nexus/nexus-2026050800.0.0-d2276f.json.gitstub @@ -0,0 +1 @@ +7a911ab47e46b902fe7acc5e98dca5f79138fbf4:openapi/nexus/nexus-2026050800.0.0-d2276f.json diff --git a/openapi/nexus/nexus-2026050800.0.0-d2276f.json b/openapi/nexus/nexus-2026051600.0.0-fa596f.json similarity index 99% rename from openapi/nexus/nexus-2026050800.0.0-d2276f.json rename to openapi/nexus/nexus-2026051600.0.0-fa596f.json index 71ee60f8ce9..bb780180396 100644 --- a/openapi/nexus/nexus-2026050800.0.0-d2276f.json +++ b/openapi/nexus/nexus-2026051600.0.0-fa596f.json @@ -7,7 +7,7 @@ "url": "https://oxide.computer", "email": "api@oxide.computer" }, - "version": "2026050800.0.0" + "version": "2026051600.0.0" }, "paths": { "/device/auth": { @@ -4770,7 +4770,7 @@ "experimental" ], "summary": "Join multicast group by name, IP address, or UUID", - "description": "Groups can be referenced by name, IP address, or UUID. If the group doesn't exist, it's implicitly created with an auto-allocated IP from a multicast pool linked to the caller's silo. When referencing by UUID, the group must already exist.\n\nSource IPs are optional for ASM addresses but required for SSM addresses (232.0.0.0/8 for IPv4, ff3x::/32 for IPv6). Duplicate IPs in the request are automatically deduplicated, with a maximum of 64 source IPs allowed.", + "description": "Groups can be referenced by name, IP address, or UUID. If the group doesn't exist, it's implicitly created with an auto-allocated IP from a multicast pool linked to the caller's silo. When referencing by UUID, the group must already exist.\n\nSource IPs are optional for ASM addresses but required for SSM addresses (232.0.0.0/8 for IPv4, ff3x::/32 for IPv6). Duplicate source IPs in a single request are rejected. 
Per-member source list is capped at 32, and the union of source IPs across all members of a single group is capped at 256.", "operationId": "instance_multicast_group_join", "parameters": [ { diff --git a/openapi/nexus/nexus-latest.json b/openapi/nexus/nexus-latest.json index 47e667754f5..f57d7e2f9c9 120000 --- a/openapi/nexus/nexus-latest.json +++ b/openapi/nexus/nexus-latest.json @@ -1 +1 @@ -nexus-2026050800.0.0-d2276f.json \ No newline at end of file +nexus-2026051600.0.0-fa596f.json \ No newline at end of file From 225723dd182737ca86b5f1ec3780c943c0ed8119 Mon Sep 17 00:00:00 2001 From: Zeeshan Lakhani Date: Fri, 22 May 2026 13:33:20 +0000 Subject: [PATCH 7/8] [review] fix InvalidMcastUnderlay error: ff04::/64 -> ff04::/16 (the actual opte-api check) --- illumos-utils/src/opte/illumos.rs | 2 +- illumos-utils/src/opte/non_illumos.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/illumos-utils/src/opte/illumos.rs b/illumos-utils/src/opte/illumos.rs index f17adacf52a..3dcdd8cfdcd 100644 --- a/illumos-utils/src/opte/illumos.rs +++ b/illumos-utils/src/opte/illumos.rs @@ -73,7 +73,7 @@ pub enum Error { AttachedSubnetUpdateMissingPort(uuid::Uuid, NetworkInterfaceKind), #[error( - "address {0} is not within the underlay multicast subnet (ff04::/64)" + "address {0} is not within the underlay multicast subnet (ff04::/16)" )] InvalidMcastUnderlay(Ipv6Addr), } diff --git a/illumos-utils/src/opte/non_illumos.rs b/illumos-utils/src/opte/non_illumos.rs index 016e3b22bca..dcc6dcd893d 100644 --- a/illumos-utils/src/opte/non_illumos.rs +++ b/illumos-utils/src/opte/non_illumos.rs @@ -90,7 +90,7 @@ pub enum Error { AttachedSubnetUpdateMissingPort(uuid::Uuid, NetworkInterfaceKind), #[error( - "address {0} is not within the underlay multicast subnet (ff04::/64)" + "address {0} is not within the underlay multicast subnet (ff04::/16)" )] InvalidMcastUnderlay(std::net::Ipv6Addr), } From e6c0a8bd2dd64454acd9c915b36972c2169ca5e5 Mon Sep 17 00:00:00 2001 From: Zeeshan 
Lakhani Date: Sat, 23 May 2026 02:52:59 +0000 Subject: [PATCH 8/8] [review] address nit --- .../src/db/datastore/multicast/ops/member_attach.rs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/multicast/ops/member_attach.rs b/nexus/db-queries/src/db/datastore/multicast/ops/member_attach.rs index 22cd3676020..6c5b43cfc85 100644 --- a/nexus/db-queries/src/db/datastore/multicast/ops/member_attach.rs +++ b/nexus/db-queries/src/db/datastore/multicast/ops/member_attach.rs @@ -79,7 +79,7 @@ impl AttachMemberError { /// This catches the sentinel errors that indicate validation failures /// (group not found, instance not found, source union cap) as well as /// constraint violations. - fn from_diesel(err: DieselError, max_union_size: usize) -> Self { + fn from_diesel(err: DieselError) -> Self { // Check for sentinel errors first let sentinels = [ GROUP_NOT_FOUND_SENTINEL, @@ -94,7 +94,7 @@ impl AttachMemberError { } UNION_EXCEEDED_SENTINEL => { AttachMemberError::SourceUnionExceeded { - cap: max_union_size, + cap: MAX_SOURCE_IPS_PER_GROUP, } } _ => unreachable!("Unknown sentinel: {sentinel}"), @@ -224,9 +224,7 @@ impl AttachMemberToGroupStatement { ) -> Result { self.get_result_async::(conn) .await - .map_err(|e| { - AttachMemberError::from_diesel(e, MAX_SOURCE_IPS_PER_GROUP) - }) + .map_err(AttachMemberError::from_diesel) .map(|member| AttachMemberResult { member }) } }