diff --git a/common/src/address.rs b/common/src/address.rs index af34bcff8f7..e7e0cd0001a 100644 --- a/common/src/address.rs +++ b/common/src/address.rs @@ -58,10 +58,32 @@ pub const IPV4_SSM_SUBNET: Ipv4Net = pub const IPV6_SSM_SUBNET: Ipv6Net = Ipv6Net::new_unchecked(Ipv6Addr::new(0xff30, 0, 0, 0, 0, 0, 0, 0), 12); -/// Maximum source IPs per SSM group member (per [RFC 3376] IGMPv3). +/// Maximum source IPs a single multicast group member may declare for +/// source filtering. +/// +/// Applies to SSM members (which always declare sources) and to ASM members +/// using `INCLUDE`-mode filtering. The cap reflects `(S,G)` fanout cost, +/// which is identical regardless of group-address semantics. +/// +/// Oxide policy bound. [RFC 3376] §4.2.1 (IGMPv3) and [RFC 3810] §5.2.1 +/// (MLDv2) leave per-group source-list size implementation-defined, MTU-bound +/// at 16-bit max. For comparison: Linux defaults to 10 (`igmp_max_msf`), +/// FreeBSD to 128 (`maxsocksrc`). 32 was chosen to cover realistic workloads +/// (1-8 sources per channel typical) while protecting the shared `(S,G)` +/// forwarding state from a single tenant's fan-out. /// /// [RFC 3376]: https://www.rfc-editor.org/rfc/rfc3376 -pub const MAX_SSM_SOURCE_IPS: usize = 64; +/// [RFC 3810]: https://www.rfc-editor.org/rfc/rfc3810 +pub const MAX_SOURCE_IPS_PER_MEMBER: usize = 32; + +/// Maximum size of the union of source IPs across all members of a single +/// multicast group. +/// +/// Oxide policy bound. Bounds the `(S,G)` install count one group can produce +/// by aggregating fan-out across members. 256 leaves headroom for large +/// multi-tenant deployments while keeping dataplane forwarding state +/// predictable. +pub const MAX_SOURCE_IPS_PER_GROUP: usize = 256; /// Check if an IP is in the SSM (Source-Specific Multicast) range. /// diff --git a/illumos-utils/src/opte/illumos.rs b/illumos-utils/src/opte/illumos.rs index 28ca9f85566..3dcdd8cfdcd 100644 --- a/illumos-utils/src/opte/illumos.rs +++ b/illumos-utils/src/opte/illumos.rs @@ -13,6 +13,7 @@ use sled_agent_types::inventory::NetworkInterfaceKind; use slog::Logger; use slog::info; use std::net::IpAddr; +use std::net::Ipv6Addr; #[derive(thiserror::Error, Debug)] pub enum Error { @@ -70,6 +71,11 @@ pub enum Error { "Tried to update attached subnets on non-existent port ({0}, {1:?})" )] AttachedSubnetUpdateMissingPort(uuid::Uuid, NetworkInterfaceKind), + + #[error( + "address {0} is not within the underlay multicast subnet (ff04::/16)" + )] + InvalidMcastUnderlay(Ipv6Addr), } /// Delete all xde devices on the system. diff --git a/illumos-utils/src/opte/mod.rs b/illumos-utils/src/opte/mod.rs index e9e2546cb0a..4903e61db8b 100644 --- a/illumos-utils/src/opte/mod.rs +++ b/illumos-utils/src/opte/mod.rs @@ -33,10 +33,10 @@ use oxnet::IpNet; use oxnet::Ipv4Net; use oxnet::Ipv6Net; pub use port::Port; -pub use port_manager::MulticastGroupCfg; pub use port_manager::PortCreateParams; pub use port_manager::PortManager; pub use port_manager::PortTicket; +pub use sled_agent_types::multicast::MulticastGroupCfg; use std::net::IpAddr; use std::net::Ipv4Addr; use std::net::Ipv6Addr; diff --git a/illumos-utils/src/opte/non_illumos.rs b/illumos-utils/src/opte/non_illumos.rs index 42487cde09c..dcc6dcd893d 100644 --- a/illumos-utils/src/opte/non_illumos.rs +++ b/illumos-utils/src/opte/non_illumos.rs @@ -2,25 +2,37 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! Mock / dummy versions of the OPTE module, for non-illumos platforms +//! Mock / dummy versions of the OPTE module, for non-illumos platforms. +//! +//! Most methods are either `unimplemented!()` or silent no-ops. +//! Multicast subscribe/unsubscribe is an exception, as it maintains real +//! in-memory state because port manager tests assert on subscription contents. use crate::addrobj::AddrObject; use oxide_vpc::api::AddRouterEntryReq; +use oxide_vpc::api::ClearMcast2PhysReq; +use oxide_vpc::api::ClearMcastForwardingReq; use oxide_vpc::api::ClearVirt2PhysReq; use oxide_vpc::api::DelRouterEntryReq; use oxide_vpc::api::DetachSubnetResp; -use oxide_vpc::api::Direction; +use oxide_vpc::api::DumpMcast2PhysResp; +use oxide_vpc::api::DumpMcastForwardingResp; use oxide_vpc::api::DumpVirt2PhysResp; use oxide_vpc::api::IpCfg; use oxide_vpc::api::IpCidr; use oxide_vpc::api::ListPortsResp; +use oxide_vpc::api::McastSubscribeReq; +use oxide_vpc::api::McastUnsubscribeReq; use oxide_vpc::api::NoResp; use oxide_vpc::api::PortInfo; use oxide_vpc::api::RouterClass; use oxide_vpc::api::RouterTarget; use oxide_vpc::api::SetExternalIpsReq; use oxide_vpc::api::SetFwRulesReq; +use oxide_vpc::api::SetMcast2PhysReq; +use oxide_vpc::api::SetMcastForwardingReq; use oxide_vpc::api::SetVirt2PhysReq; +use oxide_vpc::api::SourceFilter; use oxide_vpc::api::VpcCfg; use sled_agent_types::inventory::NetworkInterfaceKind; use slog::Logger; @@ -76,6 +88,11 @@ pub enum Error { "Tried to update attached subnets on non-existent port ({0}, {1:?})" )] AttachedSubnetUpdateMissingPort(uuid::Uuid, NetworkInterfaceKind), + + #[error( + "address {0} is not within the underlay multicast subnet (ff04::/16)" + )] + InvalidMcastUnderlay(std::net::Ipv6Addr), } pub fn initialize_xde_driver( @@ -172,6 +189,8 @@ pub(crate) struct PortData { pub port: PortInfo, /// The routes for this port. This simulates the router layer. pub routes: Vec, + /// Multicast group subscriptions: group IP → source filter. + pub mcast_subscriptions: HashMap, } #[derive(Debug)] @@ -237,7 +256,11 @@ impl Handle { return Err(OpteError::DuplicatePort(entry.key().to_string())); } Entry::Vacant(entry) => { - entry.insert(PortData { port, routes: Vec::new() }); + entry.insert(PortData { + port, + routes: Vec::new(), + mcast_subscriptions: HashMap::new(), + }); } } Ok(NO_RESPONSE) @@ -270,14 +293,46 @@ impl Handle { Ok(NO_RESPONSE) } - /// Allow traffic to / from a CIDR block on a port. - pub fn allow_cidr( + /// Subscribe a port to a multicast group. + pub fn mcast_subscribe( &self, - _: &str, - _: IpCidr, - _: Direction, + req: &McastSubscribeReq, ) -> Result { - unimplemented!("Not yet used in tests") + let mut inner = opte_state().lock().unwrap(); + let Some(port_data) = inner.ports.get_mut(&req.port_name) else { + return Err(OpteError::NoPort(req.port_name.clone())); + }; + let group_ip: IpAddr = match req.group { + oxide_vpc::api::IpAddr::Ip4(v4) => { + std::net::Ipv4Addr::from(v4).into() + } + oxide_vpc::api::IpAddr::Ip6(v6) => { + std::net::Ipv6Addr::from(v6).into() + } + }; + port_data.mcast_subscriptions.insert(group_ip, req.filter.clone()); + Ok(NO_RESPONSE) + } + + /// Unsubscribe a port from a multicast group. + pub fn mcast_unsubscribe( + &self, + req: &McastUnsubscribeReq, + ) -> Result { + let mut inner = opte_state().lock().unwrap(); + let Some(port_data) = inner.ports.get_mut(&req.port_name) else { + return Err(OpteError::NoPort(req.port_name.clone())); + }; + let group_ip: IpAddr = match req.group { + oxide_vpc::api::IpAddr::Ip4(v4) => { + std::net::Ipv4Addr::from(v4).into() + } + oxide_vpc::api::IpAddr::Ip6(v6) => { + std::net::Ipv6Addr::from(v6).into() + } + }; + port_data.mcast_subscriptions.remove(&group_ip); + Ok(NO_RESPONSE) } /// Delete a router entry from a port. @@ -323,6 +378,45 @@ impl Handle { unimplemented!("Not yet used in tests") } + /// Set a multicast-to-physical mapping. + pub fn set_m2p(&self, _: &SetMcast2PhysReq) -> Result { + Ok(NO_RESPONSE) + } + + /// Clear a multicast-to-physical mapping. + pub fn clear_m2p( + &self, + _: &ClearMcast2PhysReq, + ) -> Result { + Ok(NO_RESPONSE) + } + + /// Set multicast forwarding for a port. + pub fn set_mcast_fwd( + &self, + _: &SetMcastForwardingReq, + ) -> Result { + Ok(NO_RESPONSE) + } + + /// Clear multicast forwarding for a port. + pub fn clear_mcast_fwd( + &self, + _: &ClearMcastForwardingReq, + ) -> Result { + Ok(NO_RESPONSE) + } + + /// Dump all multicast-to-physical mappings. + pub fn dump_m2p(&self) -> Result { + Ok(DumpMcast2PhysResp { ip4: Vec::new(), ip6: Vec::new() }) + } + + /// Dump all multicast forwarding entries. + pub fn dump_mcast_fwd(&self) -> Result { + Ok(DumpMcastForwardingResp { entries: Vec::new() }) + } + /// List ports on the current system. #[allow(dead_code)] pub(crate) fn list_ports(&self) -> Result { diff --git a/illumos-utils/src/opte/port_manager.rs b/illumos-utils/src/opte/port_manager.rs index 638dd52de3d..8ffb155638c 100644 --- a/illumos-utils/src/opte/port_manager.rs +++ b/illumos-utils/src/opte/port_manager.rs @@ -17,8 +17,6 @@ use crate::opte::port::PortData; use ipnetwork::Ipv4Network; use ipnetwork::Ipv6Network; use macaddr::MacAddr6; -use omicron_common::address::IPV4_MULTICAST_RANGE; -use omicron_common::address::IPV6_MULTICAST_RANGE; use omicron_common::api::external; use omicron_common::api::internal::shared::ExternalIpGatewayMap; use omicron_common::api::internal::shared::InternetGatewayRouterTarget; @@ -35,10 +33,13 @@ use omicron_common::api::internal::shared::RouterVersion; use omicron_common::api::internal::shared::VirtualNetworkInterfaceHost; use oxide_vpc::api::AddRouterEntryReq; use oxide_vpc::api::AttachedSubnetConfig; +use oxide_vpc::api::ClearMcast2PhysReq; +use oxide_vpc::api::ClearMcastForwardingReq; use oxide_vpc::api::DelRouterEntryReq; use oxide_vpc::api::DetachSubnetResp; use oxide_vpc::api::DhcpCfg; use oxide_vpc::api::ExternalIpCfg; +use oxide_vpc::api::FilterMode; use oxide_vpc::api::IpCfg; use oxide_vpc::api::IpCidr; use oxide_vpc::api::Ipv4Cfg; @@ -46,10 +47,16 @@ use oxide_vpc::api::Ipv4Cidr; use oxide_vpc::api::Ipv6Cfg; use oxide_vpc::api::Ipv6Cidr; use oxide_vpc::api::MacAddr; +use oxide_vpc::api::McastSubscribeReq; +use oxide_vpc::api::McastUnsubscribeReq; +use oxide_vpc::api::MulticastUnderlay; use oxide_vpc::api::RouterClass; use oxide_vpc::api::SNat4Cfg; use oxide_vpc::api::SNat6Cfg; use oxide_vpc::api::SetExternalIpsReq; +use oxide_vpc::api::SetMcast2PhysReq; +use oxide_vpc::api::SetMcastForwardingReq; +use oxide_vpc::api::SourceFilter; use oxide_vpc::api::TransitIpConfig; use oxide_vpc::api::VpcCfg; use oxnet::IpNet; @@ -61,6 +68,15 @@ use sled_agent_types::instance::ExternalIpv6Config; use sled_agent_types::instance::ResolvedVpcFirewallRule; use sled_agent_types::inventory::NetworkInterface; use sled_agent_types::inventory::NetworkInterfaceKind; +use sled_agent_types::multicast::ClearMcast2Phys; +use sled_agent_types::multicast::ClearMcastForwarding; +use sled_agent_types::multicast::Mcast2PhysMapping; +use sled_agent_types::multicast::McastFilterMode; +use sled_agent_types::multicast::McastForwardingEntry; +use sled_agent_types::multicast::McastForwardingNextHop; +use sled_agent_types::multicast::McastReplication; +use sled_agent_types::multicast::McastSourceFilter; +use sled_agent_types::multicast::MulticastGroupCfg; use slog::Logger; use slog::debug; use slog::error; @@ -89,20 +105,36 @@ struct RouteSet { active_ports: usize, } -/// Configuration for multicast groups on an OPTE port. -/// -/// TODO: This type should be moved to [oxide_vpc::api] when OPTE dependencies -/// are updated, following the same pattern as other VPC configuration types -/// like [ExternalIpCfg], [IpCfg], etc. +/// Mutable per-port state tracked alongside the immutable `Port`. +#[derive(Debug)] +struct PortState { + port: Port, + /// Active multicast subscriptions, mapping group IP to source filter. + mcast_subscriptions: HashMap, +} + +impl PortState { + fn new(port: Port) -> Self { + Self { port, mcast_subscriptions: HashMap::new() } + } +} + +/// Convert a `MulticastGroupCfg` into OPTE's `SourceFilter`. /// -/// TODO: Eventually remove. -#[derive(Debug, Clone, PartialEq)] -pub struct MulticastGroupCfg { - /// The multicast group IP address (IPv4 or IPv6). - pub group_ip: IpAddr, - /// Source addresses for source-filtered multicast (optional for ASM, - /// required for SSM). - pub sources: Vec, +/// Empty sources maps to ASM (EXCLUDE with no entries, accepting all +/// sources). Non-empty sources maps to SSM (INCLUDE with the listed +/// sources). +fn multicast_cfg_to_source_filter(cfg: &MulticastGroupCfg) -> SourceFilter { + if cfg.sources.is_empty() { + SourceFilter::default() + } else { + SourceFilter::Include( + cfg.sources + .iter() + .map(|s| oxide_vpc::api::IpAddr::from(*s)) + .collect(), + ) + } } #[derive(Debug)] @@ -115,9 +147,10 @@ struct PortManagerInner { /// IP address of the hosting sled on the underlay. underlay_ip: Ipv6Addr, - /// Map of all ports, keyed on the interface Uuid and its kind - /// (which includes the Uuid of the parent instance or service) - ports: Mutex>, + /// Map of all ports and their mutable state, keyed on the interface + /// Uuid and its kind (which includes the Uuid of the parent instance + /// or service). + ports: Mutex>, /// Map of all current resolved routes. routes: Mutex>, @@ -147,6 +180,7 @@ pub struct PortCreateParams<'a> { pub firewall_rules: &'a [ResolvedVpcFirewallRule], pub dhcp_config: DhcpCfg, pub attached_subnets: Vec, + pub multicast_groups: &'a [MulticastGroupCfg], } impl<'a> TryFrom<&PortCreateParams<'a>> for IpCfg { @@ -371,6 +405,7 @@ impl PortManager { firewall_rules, dhcp_config, attached_subnets: _, + multicast_groups, } = params; let is_service = matches!(nic.kind, NetworkInterfaceKind::Service { .. }); @@ -434,7 +469,7 @@ impl PortManager { .ports .lock() .unwrap() - .insert((nic.id, nic.kind), port.clone()); + .insert((nic.id, nic.kind), PortState::new(port.clone())); assert!( old.is_none(), "Duplicate OPTE port detected: interface_id = {}, kind = {:?}", @@ -553,6 +588,12 @@ impl PortManager { } drop(route_map); + // Configure multicast group subscriptions if any were + // provided at instance start. + if !multicast_groups.is_empty() { + self.multicast_groups_ensure(nic.id, nic.kind, multicast_groups)?; + } + info!( self.inner.log, "Created OPTE port"; @@ -620,13 +661,14 @@ impl PortManager { } // Note: We're deliberately holding both locks here - // to prevent several nexuses computng and applying deltas + // to prevent several nexuses computing and applying deltas // out of order. let ports = self.inner.ports.lock().unwrap(); let hdl = Handle::new()?; // Propagate deltas out to all ports. - for port in ports.values() { + for port_state in ports.values() { + let port = &port_state.port; // Fetch deltas for all router keys: system, IPv4 subnet, and IPv6 // subnet. let system_delta = deltas.get(&port.system_router_key()); @@ -714,11 +756,11 @@ impl PortManager { external_ips: &ExternalIpConfig, ) -> Result<(), Error> { let ports = self.inner.ports.lock().unwrap(); - let port = ports.get(&(nic_id, nic_kind)).ok_or_else(|| { + let port_state = ports.get(&(nic_id, nic_kind)).ok_or_else(|| { Error::ExternalIpUpdateMissingPort(nic_id, nic_kind) })?; - self.external_ips_ensure_port(port, nic_id, external_ips) + self.external_ips_ensure_port(&port_state.port, nic_id, external_ips) } /// Ensure external IPs for an OPTE port are up to date. @@ -772,73 +814,332 @@ impl PortManager { Ok(()) } - /// Validate multicast group memberships for an OPTE port. - /// - /// This method validates multicast group configurations but does not yet - /// configure OPTE port-level multicast group membership. The actual - /// multicast forwarding is currently handled by the reconciler + DPD - /// at the dataplane switch level. - /// - /// TODO: Once OPTE kernel module supports multicast group APIs, this - /// method should be updated to configure OPTE port-level multicast - /// group membership. Note: multicast groups are fleet-scoped and can span - /// across VPCs. + /// Ensure multicast group subscriptions for an OPTE port match the + /// requested set. This diffs current vs new state and issues + /// subscribe/unsubscribe ioctls as needed. pub fn multicast_groups_ensure( &self, nic_id: Uuid, nic_kind: NetworkInterfaceKind, multicast_groups: &[MulticastGroupCfg], ) -> Result<(), Error> { - let ports = self.inner.ports.lock().unwrap(); - let port = ports.get(&(nic_id, nic_kind)).ok_or_else(|| { - Error::MulticastUpdateMissingPort(nic_id, nic_kind) - })?; - - debug!( - self.inner.log, - "Validating multicast group configuration for OPTE port"; - "port_name" => port.name(), - "nic_id" => ?nic_id, - "groups" => ?multicast_groups, - ); - - // Validate multicast group configurations + // Validate and build the new subscription set before acquiring locks. + let mut new_subs: HashMap = HashMap::new(); for group in multicast_groups { if !group.group_ip.is_multicast() { - error!( + return Err(Error::InvalidPortIpConfig(format!( + "not a multicast address: {}", + group.group_ip, + ))); + } + new_subs + .insert(group.group_ip, multicast_cfg_to_source_filter(group)); + } + + let hdl = Handle::new()?; + + let mut ports = self.inner.ports.lock().unwrap(); + let port_state = + ports.get_mut(&(nic_id, nic_kind)).ok_or_else(|| { + Error::MulticastUpdateMissingPort(nic_id, nic_kind) + })?; + let port_name = port_state.port.name().to_string(); + + // Unsubscribe groups that are no longer requested. + let to_remove: Vec = port_state + .mcast_subscriptions + .keys() + .filter(|g| !new_subs.contains_key(g)) + .copied() + .collect(); + + let removed = to_remove.len(); + for group_ip in &to_remove { + debug!( + self.inner.log, + "unsubscribing from multicast group"; + "port" => &port_name, + "group" => %group_ip, + ); + + // Effectively infallible, as the IPs are verified as multicast, + // the operation is idempotent, and the port exists. + hdl.mcast_unsubscribe(&McastUnsubscribeReq { + port_name: port_name.clone(), + group: (*group_ip).into(), + })?; + + port_state.mcast_subscriptions.remove(group_ip); + } + + // Subscribe to new groups or update changed filters. + let mut added = 0usize; + for (group_ip, filter) in &new_subs { + let needs_subscribe = + match port_state.mcast_subscriptions.get(group_ip) { + None => true, + Some(current) => current != filter, + }; + + if needs_subscribe { + added += 1; + debug!( self.inner.log, - "Invalid multicast IP address"; - "group_ip" => %group.group_ip, - "port_name" => port.name(), + "subscribing to multicast group"; + "port" => &port_name, + "group" => %group_ip, + "filter" => ?filter, ); - return Err(Error::InvalidPortIpConfig(String::from( - "invalid multicast IP address", - ))); + + // Effectively infallible as the IPs are verified as multicast, + // the operation is idempotent, and the port exists. + hdl.mcast_subscribe(&McastSubscribeReq { + port_name: port_name.clone(), + group: (*group_ip).into(), + filter: filter.clone(), + })?; + + port_state + .mcast_subscriptions + .insert(*group_ip, filter.clone()); } } - // TODO: Configure firewall rules to allow multicast traffic. - // Add exceptions in source/dest MAC/L3 addr checking for multicast - // addresses matching known groups, only doing cidr-checking on the - // multicasst destination side. + if added > 0 || removed > 0 { + info!( + self.inner.log, + "multicast subscriptions updated"; + "port" => &port_name, + "added" => added, + "removed" => removed, + "active_groups" => port_state.mcast_subscriptions.len(), + ); + } else { + debug!( + self.inner.log, + "multicast subscriptions reconciled, no change"; + "port" => &port_name, + "active_groups" => port_state.mcast_subscriptions.len(), + ); + } + + Ok(()) + } + + /// Install a multicast overlay-to-underlay (M2P) mapping in OPTE. + pub fn set_mcast_m2p(&self, req: &Mcast2PhysMapping) -> Result<(), Error> { + let addr: Ipv6Addr = req.underlay; info!( self.inner.log, - "OPTE port configured for multicast traffic"; - "port_name" => port.name(), - "ipv4_range" => %IPV4_MULTICAST_RANGE, - "ipv6_range" => %IPV6_MULTICAST_RANGE, - "multicast_groups" => multicast_groups.len(), + "Setting multicast overlay-to-underlay mapping"; + "group" => %req.group, + "underlay" => %addr, ); - // TODO: Configure OPTE port for specific multicast group membership - // once OPTE kernel module APIs are available. This is distinct from - // zone vNIC underlay configuration (see instance.rs - // `join_multicast_group_inner`). + let underlay = MulticastUnderlay::new(addr.into()) + .map_err(|_| Error::InvalidMcastUnderlay(addr))?; + let hdl = Handle::new()?; + hdl.set_m2p(&SetMcast2PhysReq { group: req.group.into(), underlay })?; + Ok(()) + } + + /// Remove a multicast overlay-to-underlay (M2P) mapping from OPTE. + pub fn clear_mcast_m2p(&self, req: &ClearMcast2Phys) -> Result<(), Error> { + let addr: Ipv6Addr = req.underlay; + + info!( + self.inner.log, + "Clearing multicast overlay-to-underlay mapping"; + "group" => %req.group, + "underlay" => %addr, + ); + let underlay = MulticastUnderlay::new(addr.into()) + .map_err(|_| Error::InvalidMcastUnderlay(addr))?; + let hdl = Handle::new()?; + hdl.clear_m2p(&ClearMcast2PhysReq { + group: req.group.into(), + underlay, + })?; Ok(()) } + /// Set multicast forwarding next hops for an underlay group address. + pub fn set_mcast_fwd( + &self, + req: &McastForwardingEntry, + ) -> Result<(), Error> { + // Safe to unwrap: 77 is well within the 24-bit VNI range. + let mcast_vni = + Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI).unwrap(); + let addr: Ipv6Addr = req.underlay; + + info!( + self.inner.log, + "Setting multicast forwarding"; + "underlay" => %addr, + "next_hops" => req.next_hops.len(), + ); + + let underlay = MulticastUnderlay::new(addr.into()) + .map_err(|_| Error::InvalidMcastUnderlay(addr))?; + let next_hops = req + .next_hops + .iter() + .map(|nexthop| oxide_vpc::api::McastForwardingNextHop { + next_hop: oxide_vpc::api::NextHopV6 { + addr: nexthop.next_hop.into(), + vni: mcast_vni, + }, + replication: match nexthop.replication { + McastReplication::External => { + oxide_vpc::api::Replication::External + } + McastReplication::Underlay => { + oxide_vpc::api::Replication::Underlay + } + McastReplication::Both => oxide_vpc::api::Replication::Both, + }, + source_filter: match nexthop.filter.mode { + McastFilterMode::Include => SourceFilter::Include( + nexthop + .filter + .sources + .iter() + .copied() + .map(Into::into) + .collect(), + ), + McastFilterMode::Exclude => SourceFilter::Exclude( + nexthop + .filter + .sources + .iter() + .copied() + .map(Into::into) + .collect(), + ), + }, + }) + .collect(); + let hdl = Handle::new()?; + hdl.set_mcast_fwd(&SetMcastForwardingReq { underlay, next_hops })?; + Ok(()) + } + + /// Remove all multicast forwarding entries for an underlay group address. + pub fn clear_mcast_fwd( + &self, + req: &ClearMcastForwarding, + ) -> Result<(), Error> { + let addr: Ipv6Addr = req.underlay; + + info!( + self.inner.log, + "Clearing multicast forwarding"; + "underlay" => %addr, + ); + + let underlay = MulticastUnderlay::new(addr.into()) + .map_err(|_| Error::InvalidMcastUnderlay(addr))?; + let hdl = Handle::new()?; + hdl.clear_mcast_fwd(&ClearMcastForwardingReq { underlay })?; + Ok(()) + } + + /// Dump all multicast overlay-to-underlay (M2P) mappings from OPTE. + pub fn list_mcast_m2p(&self) -> Result, Error> { + let hdl = Handle::new()?; + let resp = hdl.dump_m2p()?; + let mappings = resp + .ip4 + .into_iter() + .map(|(group, underlay)| Mcast2PhysMapping { + group: IpAddr::V4(group.into()), + underlay: Ipv6Addr::from(underlay.addr()), + }) + .chain(resp.ip6.into_iter().map(|(group, underlay)| { + Mcast2PhysMapping { + group: IpAddr::V6(group.into()), + underlay: Ipv6Addr::from(underlay.addr()), + } + })) + .collect(); + Ok(mappings) + } + + /// Dump all multicast forwarding entries from OPTE. + pub fn list_mcast_fwd(&self) -> Result, Error> { + let hdl = Handle::new()?; + let resp = hdl.dump_mcast_fwd()?; + resp.entries + .into_iter() + .map(|entry| { + let next_hops = entry + .next_hops + .into_iter() + .filter_map(|nexthop| { + let replication = match nexthop.replication { + oxide_vpc::api::Replication::External => { + McastReplication::External + } + oxide_vpc::api::Replication::Underlay => { + McastReplication::Underlay + } + oxide_vpc::api::Replication::Both => { + McastReplication::Both + } + oxide_vpc::api::Replication::Reserved => { + // Reserved is a 2-bit padding value with + // no valid semantic meaning. Its presence + // in the forwarding table indicates a bug + // or manual opteadm intervention. Skip + // this hop rather than failing the entire + // list so the reconciler can still program + // valid next-hops. + warn!( + self.inner.log, + "skipping next hop with Reserved \ + replication mode"; + "next_hop" => %nexthop.next_hop.addr + ); + return None; + } + }; + + Some(McastForwardingNextHop { + next_hop: nexthop.next_hop.addr.into(), + replication, + filter: McastSourceFilter { + mode: match nexthop.source_filter.mode() { + FilterMode::Include => { + McastFilterMode::Include + } + FilterMode::Exclude => { + McastFilterMode::Exclude + } + }, + sources: nexthop + .source_filter + .sources() + .iter() + .copied() + .map(Into::into) + .collect(), + }, + }) + }) + .collect(); + + Ok(McastForwardingEntry { + underlay: Ipv6Addr::from(entry.underlay.addr()), + next_hops, + }) + }) + .collect() + } + pub fn firewall_rules_ensure( &self, vni: external::Vni, @@ -856,10 +1157,11 @@ impl PortManager { // We update VPC rules as a set so grab only // the relevant ports using the VPC's VNI. - let vpc_ports = ports - .iter() - .filter(|((_, _), port)| u32::from(vni) == u32::from(*port.vni())); - for ((_, _), port) in vpc_ports { + let vpc_ports = ports.iter().filter(|((_, _), port_state)| { + u32::from(vni) == u32::from(*port_state.port.vni()) + }); + for ((_, _), port_state) in vpc_ports { + let port = &port_state.port; let rules = opte_firewall_rules(rules, port.vni(), port.mac()); let port_name = port.name().to_string(); info!( @@ -969,7 +1271,7 @@ impl PortManager { ensure_added: Vec, ) -> EnsureAttachedSubnetResult { let ports = self.inner.ports.lock().unwrap(); - let Some(port) = ports.get(&(nic_id, nic_kind)) else { + let Some(port_state) = ports.get(&(nic_id, nic_kind)) else { return EnsureAttachedSubnetResult { diff: Default::default(), error: Some(Error::AttachedSubnetUpdateMissingPort( @@ -977,7 +1279,11 @@ impl PortManager { )), }; }; - self.attached_subnets_ensure_port(port, ensure_removed, ensure_added) + self.attached_subnets_ensure_port( + &port_state.port, + ensure_removed, + ensure_added, + ) } fn attached_subnets_ensure_port( @@ -1029,10 +1335,10 @@ impl PortManager { subnet: AttachedSubnet, ) -> Result<(), Error> { let ports = self.inner.ports.lock().unwrap(); - let port = ports.get(&(nic_id, nic_kind)).ok_or_else(|| { + let port_state = ports.get(&(nic_id, nic_kind)).ok_or_else(|| { Error::AttachedSubnetUpdateMissingPort(nic_id, nic_kind) })?; - self.attach_subnet_port(port, subnet) + self.attach_subnet_port(&port_state.port, subnet) } fn attach_subnet_port( @@ -1078,10 +1384,10 @@ impl PortManager { subnet: IpCidr, ) -> Result<(), Error> { let ports = self.inner.ports.lock().unwrap(); - let port = ports.get(&(nic_id, nic_kind)).ok_or_else(|| { + let port_state = ports.get(&(nic_id, nic_kind)).ok_or_else(|| { Error::AttachedSubnetUpdateMissingPort(nic_id, nic_kind) })?; - self.detach_subnet_port(port, subnet) + self.detach_subnet_port(&port_state.port, subnet) } fn detach_subnet_port( @@ -1154,7 +1460,7 @@ impl PortTicket { fn release_inner(&mut self) -> Result<(), Error> { let mut ports = self.manager.ports.lock().unwrap(); - let Some(port) = ports.remove(&(self.id, self.kind)) else { + let Some(port_state) = ports.remove(&(self.id, self.kind)) else { error!( self.manager.log, "Tried to release non-existent port"; @@ -1163,6 +1469,7 @@ impl PortTicket { ); return Err(Error::ReleaseMissingPort(self.id, self.kind)); }; + let port = &port_state.port; drop(ports); // Cleanup the set of subnets we want to receive routes for. @@ -1199,7 +1506,7 @@ impl PortTicket { "Removed OPTE port from manager"; "id" => ?&self.id, "kind" => ?&self.kind, - "port" => ?&port, + "port" => ?&port_state, ); Ok(()) } @@ -1229,6 +1536,7 @@ impl Drop for PortTicket { mod tests { use super::PortCreateParams; use super::PortManager; + use crate::opte::Error; use crate::opte::Handle; use macaddr::MacAddr6; use omicron_common::api::external::{MacAddr, Vni}; @@ -1242,9 +1550,11 @@ mod tests { use omicron_common::api::internal::shared::RouterVersion; use omicron_test_utils::dev::test_setup_log; use oxide_vpc::api::DhcpCfg; + use oxide_vpc::api::FilterMode; use oxide_vpc::api::IpCfg; use oxide_vpc::api::Ipv4Cidr; use oxide_vpc::api::Ipv6Cidr; + use oxide_vpc::api::SourceFilter; use oxnet::IpNet; use oxnet::Ipv4Net; use oxnet::Ipv6Net; @@ -1255,11 +1565,16 @@ mod tests { use sled_agent_types::inventory::NetworkInterfaceKind; use sled_agent_types::inventory::SourceNatConfigV4; use sled_agent_types::inventory::SourceNatConfigV6; + use sled_agent_types::multicast::MulticastGroupCfg; use std::collections::HashSet; + use std::net::IpAddr; use std::net::Ipv4Addr; use std::net::Ipv6Addr; use uuid::Uuid; + // Maximum ephemeral port number for source NAT (14-bit range). + const MAX_PORT: u16 = (1 << 14) - 1; + // Regression for https://github.com/oxidecomputer/omicron/issues/7541. #[test] fn multiple_ports_does_not_destroy_default_route() { @@ -1310,7 +1625,6 @@ mod tests { }), v6: None, }; - const MAX_PORT: u16 = (1 << 14) - 1; let (port0, _ticket0) = manager .create_port(PortCreateParams { nic: &NetworkInterface { @@ -1335,6 +1649,7 @@ mod tests { dns6_servers: Vec::new(), }, attached_subnets: vec![], + multicast_groups: &[], }) .unwrap(); @@ -1514,6 +1829,7 @@ mod tests { dns6_servers: Vec::new(), }, attached_subnets: vec![], + multicast_groups: &[], }) .unwrap(); @@ -1685,6 +2001,7 @@ mod tests { dns6_servers: vec![], }, attached_subnets: vec![], + multicast_groups: &[], }; let IpCfg::Ipv4(oxide_vpc::api::Ipv4Cfg { vpc_subnet, @@ -1758,6 +2075,7 @@ mod tests { dns6_servers: vec![], }, attached_subnets: vec![], + multicast_groups: &[], }; let IpCfg::Ipv6(oxide_vpc::api::Ipv6Cfg { vpc_subnet, @@ -1842,6 +2160,7 @@ mod tests { dns6_servers: vec![], }, attached_subnets: vec![], + multicast_groups: &[], }; let IpCfg::DualStack { ipv4, ipv6 } = IpCfg::try_from(&prs).unwrap() else { @@ -1932,6 +2251,7 @@ mod tests { dns6_servers: vec![], }, attached_subnets: vec![], + multicast_groups: &[], }; let _ = IpCfg::try_from(&prs).expect_err( "Should fail to convert with public IPv6 and private IPv4", @@ -1978,9 +2298,274 @@ mod tests { dns6_servers: vec![], }, attached_subnets: vec![], + multicast_groups: &[], }; let _ = IpCfg::try_from(&prs).expect_err( "Should fail to convert with public IPv4 and private IPv6", ); } + + #[test] + fn multicast_groups_ensure_diffing() { + let logctx = test_setup_log("multicast_groups_ensure_diffing"); + let manager = PortManager::new(logctx.log.clone(), Ipv6Addr::LOCALHOST); + + let handle = Handle::new().unwrap(); + handle.set_xde_underlay("underlay0", "underlay1").unwrap(); + + let nic_id = Uuid::new_v4(); + let nic_kind = NetworkInterfaceKind::Service { id: Uuid::new_v4() }; + + let private_subnet = + Ipv4Net::new(Ipv4Addr::new(172, 20, 0, 0), 24).unwrap(); + let private_ip = Ipv4Addr::new(172, 20, 0, 4); + let ip_config = + PrivateIpConfig::new_ipv4(private_ip, private_subnet).unwrap(); + let public_ip = Ipv4Addr::new(10, 0, 0, 4); + + let external_ips = ExternalIpConfig { + v4: Some(ExternalIpv4Config { + source_nat: Some( + SourceNatConfigV4::new(public_ip, 0, MAX_PORT).unwrap(), + ), + ..Default::default() + }), + v6: None, + }; + + // Bindings keep the port registered in the manager for this scope. + let (_port, _ticket) = manager + .create_port(PortCreateParams { + nic: &NetworkInterface { + id: nic_id, + kind: nic_kind, + name: "opte0".parse().unwrap(), + ip_config, + mac: MacAddr(MacAddr6::new( + 0xa8, 0x40, 0x25, 0x00, 0x00, 0x01, + )), + vni: Vni::SERVICES_VNI, + primary: true, + slot: 0, + }, + external_ips: &external_ips, + firewall_rules: &[], + dhcp_config: DhcpCfg { + hostname: None, + host_domain: None, + domain_search_list: Vec::new(), + dns4_servers: Vec::new(), + dns6_servers: Vec::new(), + }, + attached_subnets: vec![], + multicast_groups: &[], + }) + .unwrap(); + + let group1: IpAddr = "239.1.1.1".parse().unwrap(); + let group2: IpAddr = "239.1.1.2".parse().unwrap(); + let source_a: IpAddr = "10.0.0.1".parse().unwrap(); + + // Subscribe to two groups: one ASM, one SSM. + manager + .multicast_groups_ensure( + nic_id, + nic_kind, + &[ + MulticastGroupCfg { group_ip: group1, sources: vec![] }, + MulticastGroupCfg { + group_ip: group2, + sources: vec![source_a], + }, + ], + ) + .unwrap(); + + // Verify port manager tracking. + { + let ports = manager.inner.ports.lock().unwrap(); + let port_state = ports.get(&(nic_id, nic_kind)).unwrap(); + assert_eq!(port_state.mcast_subscriptions.len(), 2); + assert_eq!( + *port_state.mcast_subscriptions.get(&group1).unwrap(), + SourceFilter::default(), + ); + assert_eq!( + port_state.mcast_subscriptions.get(&group2).unwrap().mode(), + FilterMode::Include, + ); + } + + // Verify mock OPTE state matches. + { + let opte = handle.state().lock().unwrap(); + let port = opte.ports.get("opte0").unwrap(); + assert_eq!(port.mcast_subscriptions.len(), 2); + assert!(port.mcast_subscriptions.contains_key(&group1)); + assert!(port.mcast_subscriptions.contains_key(&group2)); + } + + // Remove group2, keep group1. + manager + .multicast_groups_ensure( + nic_id, + nic_kind, + &[MulticastGroupCfg { group_ip: group1, sources: vec![] }], + ) + .unwrap(); + + { + let ports = manager.inner.ports.lock().unwrap(); + let port_state = ports.get(&(nic_id, nic_kind)).unwrap(); + assert_eq!(port_state.mcast_subscriptions.len(), 1); + assert!(port_state.mcast_subscriptions.contains_key(&group1)); + assert!(!port_state.mcast_subscriptions.contains_key(&group2)); + } + + { + let opte = handle.state().lock().unwrap(); + let port = opte.ports.get("opte0").unwrap(); + assert_eq!(port.mcast_subscriptions.len(), 1); + assert!(!port.mcast_subscriptions.contains_key(&group2)); + } + + // Remove all groups. + manager.multicast_groups_ensure(nic_id, nic_kind, &[]).unwrap(); + + { + let ports = manager.inner.ports.lock().unwrap(); + let port_state = ports.get(&(nic_id, nic_kind)).unwrap(); + assert!(port_state.mcast_subscriptions.is_empty()); + } + + { + let opte = handle.state().lock().unwrap(); + let port = opte.ports.get("opte0").unwrap(); + assert!(port.mcast_subscriptions.is_empty()); + } + + logctx.cleanup_successful(); + } + + #[test] + fn multicast_port_deletion_cleanup() { + let logctx = test_setup_log("multicast_port_deletion_cleanup"); + let manager = PortManager::new(logctx.log.clone(), Ipv6Addr::LOCALHOST); + + let handle = Handle::new().unwrap(); + handle.set_xde_underlay("underlay0", "underlay1").unwrap(); + + let nic_id = Uuid::new_v4(); + let nic_kind = NetworkInterfaceKind::Service { id: Uuid::new_v4() }; + + let private_subnet = + Ipv4Net::new(Ipv4Addr::new(172, 20, 0, 0), 24).unwrap(); + let private_ip = Ipv4Addr::new(172, 20, 0, 4); + let ip_config = + PrivateIpConfig::new_ipv4(private_ip, private_subnet).unwrap(); + let public_ip = Ipv4Addr::new(10, 0, 0, 4); + + let external_ips = ExternalIpConfig { + v4: Some(ExternalIpv4Config { + source_nat: Some( + SourceNatConfigV4::new(public_ip, 0, MAX_PORT).unwrap(), + ), + ..Default::default() + }), + v6: None, + }; + + let (_port, ticket) = manager + .create_port(PortCreateParams { + nic: &NetworkInterface { + id: nic_id, + kind: nic_kind, + name: "opte0".parse().unwrap(), + ip_config, + mac: MacAddr(MacAddr6::new( + 0xa8, 0x40, 0x25, 0x00, 0x00, 0x01, + )), + vni: Vni::SERVICES_VNI, + primary: true, + slot: 0, + }, + external_ips: &external_ips, + firewall_rules: &[], + dhcp_config: DhcpCfg { + hostname: None, + host_domain: None, + domain_search_list: Vec::new(), + dns4_servers: Vec::new(), + dns6_servers: Vec::new(), + }, + attached_subnets: vec![], + multicast_groups: &[], + }) + .unwrap(); + + let group1: IpAddr = "239.2.2.1".parse().unwrap(); + + // Subscribe to a multicast group. + manager + .multicast_groups_ensure( + nic_id, + nic_kind, + &[MulticastGroupCfg { group_ip: group1, sources: vec![] }], + ) + .unwrap(); + + // Verify subscription tracking exists. + { + let ports = manager.inner.ports.lock().unwrap(); + let port_state = ports.get(&(nic_id, nic_kind)).unwrap(); + assert_eq!( + port_state.mcast_subscriptions.len(), + 1, + "subscription tracking should exist before release" + ); + } + + // Release the port ticket, which should clean up the port + // and its subscription tracking. + ticket.release(); + + // Verify port is removed entirely. + { + let ports = manager.inner.ports.lock().unwrap(); + assert!( + !ports.contains_key(&(nic_id, nic_kind)), + "port should be removed after release" + ); + } + + logctx.cleanup_successful(); + } + + #[test] + fn multicast_ensure_missing_port_error() { + let logctx = test_setup_log("multicast_ensure_missing_port_error"); + let manager = PortManager::new(logctx.log.clone(), Ipv6Addr::LOCALHOST); + + let nic_id = Uuid::new_v4(); + let nic_kind = NetworkInterfaceKind::Instance { id: Uuid::new_v4() }; + let group: IpAddr = "239.3.3.1".parse().unwrap(); + + let res = manager.multicast_groups_ensure( + nic_id, + nic_kind, + &[MulticastGroupCfg { group_ip: group, sources: vec![] }], + ); + + match res { + Err(Error::MulticastUpdateMissingPort(id, kind)) => { + assert_eq!(id, nic_id); + assert_eq!(kind, nic_kind); + } + other => { + panic!("expected MulticastUpdateMissingPort, got {other:?}") + } + } + + logctx.cleanup_successful(); + } } diff --git a/nexus/db-queries/src/db/datastore/multicast/members.rs b/nexus/db-queries/src/db/datastore/multicast/members.rs index 1c2d25a703b..c9f2cd712b9 100644 --- a/nexus/db-queries/src/db/datastore/multicast/members.rs +++ b/nexus/db-queries/src/db/datastore/multicast/members.rs @@ -91,6 +91,10 @@ impl DataStore { /// - `None` → preserve existing `source_ips` (rejoin without changes) /// - `Some([])` → clear `source_ips` (switch to ASM) /// - `Some([a,b])` → replace with new `source_ips` (update sources) + /// + /// Atomically enforces the per-group source IP union cap + /// ([`omicron_common::address::MAX_SOURCE_IPS_PER_GROUP`]) when a + /// non-empty source list is being applied. pub async fn multicast_group_member_attach_to_instance( &self, opctx: &OpContext, diff --git a/nexus/db-queries/src/db/datastore/multicast/ops/member_attach.rs b/nexus/db-queries/src/db/datastore/multicast/ops/member_attach.rs index 254a2485bd7..6c5b43cfc85 100644 --- a/nexus/db-queries/src/db/datastore/multicast/ops/member_attach.rs +++ b/nexus/db-queries/src/db/datastore/multicast/ops/member_attach.rs @@ -38,6 +38,7 @@ use uuid::Uuid; use nexus_db_lookup::DbConnection; use nexus_db_model::{MulticastGroupMember, MulticastGroupMemberState}; +use omicron_common::address::MAX_SOURCE_IPS_PER_GROUP; use omicron_common::api::external; use crate::db::true_or_cast_error::matches_sentinel; @@ -47,6 +48,7 @@ use crate::db::true_or_cast_error::matches_sentinel; // the specific failure reason from the error message. const GROUP_NOT_FOUND_SENTINEL: &str = "group-not-found"; const INSTANCE_NOT_FOUND_SENTINEL: &str = "instance-not-found"; +const UNION_EXCEEDED_SENTINEL: &str = "source-union-exceeded"; /// Result of attaching an instance to a multicast group. #[derive(Debug, Clone, PartialEq)] @@ -62,6 +64,9 @@ pub(crate) enum AttachMemberError { GroupNotFound, /// Instance doesn't exist or has been deleted InstanceNotFound, + /// Attaching this member would push the group's source IP union past + /// the per-group cap. + SourceUnionExceeded { cap: usize }, /// Database constraint violation (unique index, etc.) ConstraintViolation(String), /// Other database error @@ -72,16 +77,26 @@ impl AttachMemberError { /// Construct an [`AttachMemberError`] from a database error. /// /// This catches the sentinel errors that indicate validation failures - /// (group not found, instance not found) as well as constraint violations. + /// (group not found, instance not found, source union cap) as well as + /// constraint violations. fn from_diesel(err: DieselError) -> Self { // Check for sentinel errors first - let sentinels = [GROUP_NOT_FOUND_SENTINEL, INSTANCE_NOT_FOUND_SENTINEL]; + let sentinels = [ + GROUP_NOT_FOUND_SENTINEL, + INSTANCE_NOT_FOUND_SENTINEL, + UNION_EXCEEDED_SENTINEL, + ]; if let Some(sentinel) = matches_sentinel(&err, &sentinels) { return match sentinel { GROUP_NOT_FOUND_SENTINEL => AttachMemberError::GroupNotFound, INSTANCE_NOT_FOUND_SENTINEL => { AttachMemberError::InstanceNotFound } + UNION_EXCEEDED_SENTINEL => { + AttachMemberError::SourceUnionExceeded { + cap: MAX_SOURCE_IPS_PER_GROUP, + } + } _ => unreachable!("Unknown sentinel: {sentinel}"), }; } @@ -115,6 +130,12 @@ impl From for external::Error { "Instance does not exist or has been deleted", ) } + AttachMemberError::SourceUnionExceeded { cap } => { + external::Error::invalid_request(format!( + "attaching this member would exceed the per-group \ + source IP union cap of {cap}", + )) + } AttachMemberError::ConstraintViolation(msg) => { external::Error::invalid_request(&format!( "Constraint violation: {msg}" @@ -173,9 +194,11 @@ impl AttachMemberToGroupStatement { /// - `new_member_id`: UUID for new member row (if creating) /// - `source_ips`: Source IPs for filtering (`None` preserves existing on reactivation) /// - /// CTEs atomically validate group is not in a "Deleting" state, - /// that the instance exists, retrieves the current `sled_id` from - /// VMM table, then performs the upsert. + /// CTEs atomically validate group is not in a "Deleting" state, that the + /// instance exists, retrieves the current `sled_id` from VMM table, and + /// (when a non-empty source list is being applied) verifies that the + /// resulting per-group source IP union stays within + /// [`MAX_SOURCE_IPS_PER_GROUP`]. pub fn new( group_id: Uuid, instance_id: Uuid, @@ -273,22 +296,16 @@ impl AttachMemberToGroupStatement { /// Uses CAST to trigger a predictable error when validation fails: /// - If group not found → CAST('group-not-found' AS BOOL) fails /// - If instance not found → CAST('instance-not-found' AS BOOL) fails - /// - If both valid → CAST('TRUE' AS BOOL) succeeds + /// - If the resulting source IP union would exceed the per-group cap + /// → CAST('source-union-exceeded' AS BOOL) fails (only checked when a + /// non-empty source list is being applied) + /// - If all valid → CAST('TRUE' AS BOOL) succeeds /// /// This follows the pattern used in `network_interface.rs` and `external_ip.rs`. fn push_validation_cte<'a>( &'a self, mut out: AstPass<'_, 'a, Pg>, ) -> QueryResult<()> { - // SELECT CAST( - // CASE - // WHEN NOT EXISTS (SELECT 1 FROM instance_sled) THEN 'instance-not-found' - // WHEN NOT EXISTS (SELECT 1 FROM valid_group) THEN 'group-not-found' - // ELSE 'TRUE' - // END AS BOOL - // ) AS validated - // - // Instance is checked first to provide more those errors up front out.push_sql("SELECT CAST(CASE "); out.push_sql("WHEN NOT EXISTS (SELECT 1 FROM instance_sled) THEN '"); out.push_sql(INSTANCE_NOT_FOUND_SENTINEL); @@ -296,10 +313,54 @@ impl AttachMemberToGroupStatement { out.push_sql("WHEN NOT EXISTS (SELECT 1 FROM valid_group) THEN '"); out.push_sql(GROUP_NOT_FOUND_SENTINEL); out.push_sql("' "); + if self.check_union_size() { + out.push_sql("WHEN (SELECT size FROM proposed_union_size) > "); + out.push_sql(&MAX_SOURCE_IPS_PER_GROUP.to_string()); + out.push_sql(" THEN '"); + out.push_sql(UNION_EXCEEDED_SENTINEL); + out.push_sql("' "); + } out.push_sql("ELSE 'TRUE' END AS BOOL) AS validated"); Ok(()) } + /// Whether the resulting source IP union should be checked against the + /// per-group cap. Skipped when the caller is preserving existing sources + /// (`None`) or explicitly clearing them (empty list), since neither path + /// grows the union. + fn check_union_size(&self) -> bool { + self.update_source_ips_on_reactivation + && !self.source_ips_for_insert.is_empty() + } + + /// Generates the `proposed_union_size` CTE. + /// + /// Computes the size of the source IP union that would result from this + /// attach: all other active members' source IPs unioned with the proposed + /// list. This member's existing row (if any) is excluded because its + /// sources are being replaced. + fn push_proposed_union_size_cte<'a>( + &'a self, + mut out: AstPass<'_, 'a, Pg>, + ) -> QueryResult<()> { + out.push_sql( + "SELECT count(DISTINCT source_ip) AS size FROM (\ + SELECT unnest(source_ips) AS source_ip \ + FROM multicast_group_member \ + WHERE external_group_id = ", + ); + out.push_bind_param::(&self.group_id)?; + out.push_sql(" AND parent_id != "); + out.push_bind_param::(&self.instance_id)?; + out.push_sql(" AND time_deleted IS NULL "); + out.push_sql("UNION ALL SELECT unnest("); + out.push_bind_param::, _>( + &self.source_ips_for_insert, + )?; + out.push_sql(") AS source_ip) s"); + Ok(()) + } + /// Generates the `upserted_member` CTE (performs unconditional upsert). /// /// SELECT joins with both `valid_group` and `instance_sled` CTEs to: @@ -433,6 +494,14 @@ impl QueryFragment for AttachMemberToGroupStatement { self.push_instance_sled_cte(out.reborrow())?; out.push_sql("), "); + // CTE: Compute the prospective per-group source IP union size when + // a non-empty source list is being applied. + if self.check_union_size() { + out.push_sql("proposed_union_size AS ("); + self.push_proposed_union_size_cte(out.reborrow())?; + out.push_sql("), "); + } + // CTE: Validation that triggers sentinel errors on failure out.push_sql("validation AS MATERIALIZED ("); self.push_validation_cte(out.reborrow())?; diff --git a/nexus/external-api/src/lib.rs b/nexus/external-api/src/lib.rs index fdf511da6f8..4d069bf609f 100644 --- a/nexus/external-api/src/lib.rs +++ b/nexus/external-api/src/lib.rs @@ -83,6 +83,7 @@ api_versions!([ // | date-based version should be at the top of the list. // v // (next_yyyy_mm_dd_nn, IDENT), + (2026_05_22_00, MULTICAST_SOURCE_LIMITS), (2026_05_20_00, ADD_CONTACT_SUPPORT_TO_UPDATE_STATUS), (2026_05_08_00, MANUAL_DISK_ADOPTION), (2026_05_07_00, REMOVE_DUPLICATED_NETWORKING_TYPES), @@ -5937,13 +5938,15 @@ pub trait NexusExternalApi { /// the group must already exist. /// /// Source IPs are optional for ASM addresses but required for SSM addresses - /// (232.0.0.0/8 for IPv4, ff3x::/32 for IPv6). Duplicate IPs in the request - /// are automatically deduplicated, with a maximum of 64 source IPs allowed. + /// (232.0.0.0/8 for IPv4, ff3x::/32 for IPv6). Duplicate source IPs in a + /// single request are rejected. Per-member source list is capped at 32, and + /// the union of source IPs across all members of a single group is capped + /// at 256. #[endpoint { method = PUT, path = "/v1/instances/{instance}/multicast-groups/{multicast_group}", tags = ["experimental"], - versions = VERSION_MULTICAST_IMPLICIT_LIFECYCLE_UPDATES.., + versions = VERSION_MULTICAST_SOURCE_LIMITS.., }] async fn instance_multicast_group_join( rqctx: RequestContext, @@ -5955,6 +5958,37 @@ pub trait NexusExternalApi { HttpError, >; + /// Join multicast group by name, IP address, or UUID + /// + /// Groups can be referenced by name, IP address, or UUID. If the group + /// doesn't exist, it's implicitly created with an auto-allocated IP from a + /// multicast pool linked to the caller's silo. When referencing by UUID, + /// the group must already exist. + /// + /// Source IPs are optional for ASM addresses but required for SSM addresses + /// (232.0.0.0/8 for IPv4, ff3x::/32 for IPv6). Duplicate IPs in the request + /// are automatically deduplicated, with a maximum of 64 source IPs allowed. + #[endpoint { + method = PUT, + path = "/v1/instances/{instance}/multicast-groups/{multicast_group}", + tags = ["experimental"], + operation_id = "instance_multicast_group_join", + versions = VERSION_MULTICAST_IMPLICIT_LIFECYCLE_UPDATES..VERSION_MULTICAST_SOURCE_LIMITS, + }] + async fn instance_multicast_group_join_v2026_01_08_00( + rqctx: RequestContext, + path_params: Path< + v2026_01_08_00::multicast::InstanceMulticastGroupPath, + >, + query_params: Query, + body_params: TypedBody< + v2026_01_08_00::multicast::InstanceMulticastGroupJoin, + >, + ) -> Result< + HttpResponseCreated, + HttpError, + >; + /// Join multicast group /// /// Deprecated: newer version supports implicit group creation, accepts group diff --git a/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs b/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs index 0cede8c217c..be16db45055 100644 --- a/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs +++ b/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs @@ -224,7 +224,7 @@ mod api_impl { use sled_agent_types::firewall_rules::VpcFirewallRulesEnsureBody; use sled_agent_types::instance::InstanceEnsureBody; use sled_agent_types::instance::InstanceExternalIpBody; - use sled_agent_types::instance::InstanceMulticastBody; + use sled_agent_types::instance::InstanceMulticastMembership; use sled_agent_types::instance::SledVmmState; use sled_agent_types::instance::VmmIssueDiskSnapshotRequestBody; use sled_agent_types::instance::VmmIssueDiskSnapshotRequestPathParam; @@ -250,6 +250,10 @@ mod api_impl { use sled_agent_types::inventory::SledCpuFamily; use sled_agent_types::inventory::SledRole; use sled_agent_types::inventory::SvcsEnabledNotOnlineResult; + use sled_agent_types::multicast::{ + ClearMcast2Phys, ClearMcastForwarding, Mcast2PhysMapping, + McastForwardingEntry, + }; use sled_agent_types::probes::ProbeSet; use sled_agent_types::sled::AddSledRequest; use sled_agent_types::support_bundle::RangeRequestHeaders; @@ -631,45 +635,17 @@ mod api_impl { async fn vmm_join_multicast_group( _rqctx: RequestContext, _path_params: Path, - body: TypedBody, + _body: TypedBody, ) -> Result { - let body_args = body.into_inner(); - match body_args { - InstanceMulticastBody::Join(_) => { - // MGS test utility - just return success for test compatibility - Ok(HttpResponseUpdatedNoContent()) - } - InstanceMulticastBody::Leave(_) => { - // This endpoint is for joining - reject leave operations - Err(HttpError::for_bad_request( - None, - "Join endpoint cannot process Leave operations" - .to_string(), - )) - } - } + unimplemented!() } async fn vmm_leave_multicast_group( _rqctx: RequestContext, _path_params: Path, - body: TypedBody, + _body: TypedBody, ) -> Result { - let body_args = body.into_inner(); - match body_args { - InstanceMulticastBody::Leave(_) => { - // MGS test utility - just return success for test compatibility - Ok(HttpResponseUpdatedNoContent()) - } - InstanceMulticastBody::Join(_) => { - // This endpoint is for leaving - reject join operations - Err(HttpError::for_bad_request( - None, - "Leave endpoint cannot process Join operations" - .to_string(), - )) - } - } + unimplemented!() } async fn disk_put( @@ -760,6 +736,47 @@ mod api_impl { unimplemented!() } + async fn set_mcast_m2p( + _rqctx: RequestContext, + _body: TypedBody, + ) -> Result { + unimplemented!() + } + + async fn clear_mcast_m2p( + _rqctx: RequestContext, + _body: TypedBody, + ) -> Result { + unimplemented!() + } + + async fn set_mcast_fwd( + _rqctx: RequestContext, + _body: TypedBody, + ) -> Result { + unimplemented!() + } + + async fn clear_mcast_fwd( + _rqctx: RequestContext, + _body: TypedBody, + ) -> Result { + unimplemented!() + } + + async fn list_mcast_m2p( + _rqctx: RequestContext, + ) -> Result>, HttpError> { + unimplemented!() + } + + async fn list_mcast_fwd( + _rqctx: RequestContext, + ) -> Result>, HttpError> + { + unimplemented!() + } + async fn uplink_ensure( _rqctx: RequestContext, _body: TypedBody, diff --git a/nexus/src/app/background/tasks/multicast/groups.rs b/nexus/src/app/background/tasks/multicast/groups.rs index db2c51938a5..0db31b033dc 100644 --- a/nexus/src/app/background/tasks/multicast/groups.rs +++ b/nexus/src/app/background/tasks/multicast/groups.rs @@ -20,6 +20,9 @@ //! - **"Creating" state**: Initiate DPD "ensure" to apply configuration //! - **"Active" state**: Detect DPD drift and sync directly //! - **"Deleting" state**: Switch cleanup and database removal +//! - **M2P/forwarding propagation**: Convergent per-sled propagation of +//! M2P mappings and forwarding entries via sled-agent after member +//! state changes //! - **Extensible processing**: Support for different group types //! //! # Group State Transition Matrix @@ -93,6 +96,7 @@ use super::{ use crate::app::multicast::dataplane::{ GroupUpdateParams, MulticastDataplaneClient, }; +use crate::app::multicast::sled::MulticastSledClient; use crate::app::saga::create_saga_dag; use crate::app::sagas; @@ -100,7 +104,7 @@ use crate::app::sagas; /// /// This grace period avoids racing with in-progress member attachment operations /// that occur immediately after group creation. -const ORPHAN_GROUP_MIN_AGE: chrono::Duration = chrono::Duration::seconds(10); +const ORPHAN_GROUP_MIN_AGE: chrono::TimeDelta = chrono::TimeDelta::seconds(10); /// Check if DPD tag matches the database group's tag. /// @@ -130,35 +134,48 @@ fn dpd_state_matches_sources( let dpd_sources = dpd_group.sources.clone(); let group_ip = group.multicast_ip.ip(); - // Expected DPD state based on source filter logic (RFC 4607) - let expected_sources = if is_ssm_address(group_ip) { - Some(&source_filter.specific_sources) + if is_ssm_address(group_ip) { + // SSM: always expect specific sources + match dpd_sources { + None => false, + Some(dpd_srcs) => { + let mut dpd_ips: Vec<_> = dpd_srcs + .into_iter() + .filter_map(|src| match src { + dpd_client::types::IpSrc::Exact(ip) => Some(ip), + _ => None, + }) + .collect(); + dpd_ips.sort(); + + let mut expected: Vec<_> = + source_filter.specific_sources.iter().copied().collect(); + expected.sort(); + + dpd_ips == expected + } + } } else if source_filter.has_any_source_member { - None + dpd_sources.is_none() } else { - Some(&source_filter.specific_sources) - }; - - match (dpd_sources, expected_sources) { - (None, None) => true, - (Some(_), None) => false, // DPD has sources but shouldn't - (None, Some(_)) => false, // DPD missing sources - (Some(dpd_srcs), Some(expected)) => { - // Extract exact IPs from DPD sources - let mut dpd_ips: Vec<_> = dpd_srcs - .into_iter() - .filter_map(|src| match src { - dpd_client::types::IpSrc::Exact(ip) => Some(ip), - _ => None, - }) - .collect(); - dpd_ips.sort(); - - let mut expected_sorted: Vec<_> = - expected.iter().copied().collect(); - expected_sorted.sort(); - - dpd_ips == expected_sorted + match dpd_sources { + None => source_filter.specific_sources.is_empty(), + Some(dpd_srcs) => { + let mut dpd_ips: Vec<_> = dpd_srcs + .into_iter() + .filter_map(|src| match src { + dpd_client::types::IpSrc::Exact(ip) => Some(ip), + _ => None, + }) + .collect(); + dpd_ips.sort(); + + let mut expected: Vec<_> = + source_filter.specific_sources.iter().copied().collect(); + expected.sort(); + + dpd_ips == expected + } } } } @@ -180,6 +197,7 @@ trait GroupStateProcessor { opctx: &OpContext, group: &MulticastGroup, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result; /// Process a group in "Active" state (check DPD sync status). @@ -189,6 +207,7 @@ trait GroupStateProcessor { opctx: &OpContext, group: &MulticastGroup, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result; } @@ -213,9 +232,15 @@ impl GroupStateProcessor for ExternalGroupProcessor { opctx: &OpContext, group: &MulticastGroup, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result { reconciler - .handle_deleting_external_group(opctx, group, dataplane_client) + .handle_deleting_external_group( + opctx, + group, + dataplane_client, + sled_client, + ) .await } @@ -226,9 +251,15 @@ impl GroupStateProcessor for ExternalGroupProcessor { opctx: &OpContext, group: &MulticastGroup, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result { reconciler - .handle_active_external_group(opctx, group, dataplane_client) + .handle_active_external_group( + opctx, + group, + dataplane_client, + sled_client, + ) .await } } @@ -336,6 +367,7 @@ impl MulticastGroupReconciler { opctx: &OpContext, state: MulticastGroupState, dataplane_client: Option<&MulticastDataplaneClient>, + sled_client: Option<&MulticastSledClient>, ) -> Result { trace!(opctx.log, "searching for multicast groups"; "state" => %state); @@ -363,7 +395,12 @@ impl MulticastGroupReconciler { let results = stream::iter(groups) .map(|group| async move { let result = self - .process_group_state(opctx, &group, dataplane_client) + .process_group_state( + opctx, + &group, + dataplane_client, + sled_client, + ) .await; (group, result) }) @@ -404,7 +441,7 @@ impl MulticastGroupReconciler { processed += 1; } - debug!( + trace!( opctx.log, "processed multicast group"; "state" => %state, @@ -446,6 +483,7 @@ impl MulticastGroupReconciler { opctx, MulticastGroupState::Creating, None, + None, ) .await } @@ -455,11 +493,13 @@ impl MulticastGroupReconciler { &self, opctx: &OpContext, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result { self.reconcile_groups_by_state( opctx, MulticastGroupState::Deleting, Some(dataplane_client), + Some(sled_client), ) .await } @@ -469,11 +509,13 @@ impl MulticastGroupReconciler { &self, opctx: &OpContext, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result { self.reconcile_groups_by_state( opctx, MulticastGroupState::Active, Some(dataplane_client), + Some(sled_client), ) .await } @@ -485,6 +527,7 @@ impl MulticastGroupReconciler { opctx: &OpContext, group: &MulticastGroup, dataplane_client: Option<&MulticastDataplaneClient>, + sled_client: Option<&MulticastSledClient>, ) -> Result { // Future: Match on group type to select different processors if // we add more nuanced group types @@ -497,15 +540,31 @@ impl MulticastGroupReconciler { MulticastGroupState::Deleting => { let dataplane_client = dataplane_client .context("dataplane client required for deleting state")?; + let sled_client = sled_client + .context("sled client required for deleting state")?; processor - .process_deleting(self, opctx, group, dataplane_client) + .process_deleting( + self, + opctx, + group, + dataplane_client, + sled_client, + ) .await } MulticastGroupState::Active => { let dataplane_client = dataplane_client .context("dataplane client required for active state")?; + let sled_client = sled_client + .context("sled client required for active state")?; processor - .process_active(self, opctx, group, dataplane_client) + .process_active( + self, + opctx, + group, + dataplane_client, + sled_client, + ) .await } MulticastGroupState::Deleted => { @@ -623,6 +682,7 @@ impl MulticastGroupReconciler { opctx: &OpContext, group: &MulticastGroup, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result { debug!( opctx.log, @@ -635,8 +695,13 @@ impl MulticastGroupReconciler { "dpd_cleanup_required" => true ); - self.process_deleting_group_inner(opctx, group, dataplane_client) - .await?; + self.process_deleting_group_inner( + opctx, + group, + dataplane_client, + sled_client, + ) + .await?; Ok(StateTransition::StateChanged) } @@ -649,6 +714,7 @@ impl MulticastGroupReconciler { opctx: &OpContext, group: &MulticastGroup, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result { let underlay_group_id = group .underlay_group_id @@ -747,6 +813,22 @@ impl MulticastGroupReconciler { "group_id" => %group.id(), "multicast_ip" => %group.multicast_ip ); + + // Propagate M2P/forwarding to member sleds after DPD + // sync to ensure OPTE state is also consistent. + if let Err(e) = sled_client + .propagate_m2p_and_forwarding(opctx, group) + .await + { + warn!( + opctx.log, + "failed to propagate M2P/forwarding after \ + drift correction (will retry)"; + "group_id" => %group.id(), + "error" => %e + ); + } + Ok(StateTransition::StateChanged) } Err(e) => { @@ -761,6 +843,19 @@ impl MulticastGroupReconciler { } } } else { + // Even when DPD is in sync, propagate M2P/forwarding to + // member sleds to correct any sled-level drift. + if let Err(e) = + sled_client.propagate_m2p_and_forwarding(opctx, group).await + { + warn!( + opctx.log, + "failed to propagate M2P/forwarding (will retry)"; + "group_id" => %group.id(), + "error" => %e + ); + } + Ok(StateTransition::NoChange) } } @@ -772,7 +867,7 @@ impl MulticastGroupReconciler { opctx: &OpContext, group: &MulticastGroup, ) -> Result { - debug!( + trace!( opctx.log, "processing creating multicast group"; "group" => ?group @@ -789,7 +884,7 @@ impl MulticastGroupReconciler { format!("failed to fetch linked underlay group {underlay_id}") })?; - debug!( + trace!( opctx.log, "found linked underlay group"; "group" => ?group, @@ -798,7 +893,7 @@ impl MulticastGroupReconciler { underlay } None => { - debug!( + trace!( opctx.log, "creating new underlay group"; "group" => ?group @@ -860,6 +955,7 @@ impl MulticastGroupReconciler { opctx: &OpContext, group: &MulticastGroup, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result<(), anyhow::Error> { let tag = Self::get_multicast_tag(group) .context("multicast group missing tag")?; @@ -875,6 +971,15 @@ impl MulticastGroupReconciler { "cleanup_includes" => "[external_group, underlay_group, forwarding_rules, member_ports]" ); + // Clear M2P/forwarding from all sleds before DPD cleanup. + // This must succeed before deleting DB records, otherwise + // stale OPTE state would persist on sleds where the clear + // failed, with no DB record to drive a retry on a later pass. + sled_client + .clear_m2p_and_forwarding(opctx, group) + .await + .context("failed to clear M2P/forwarding from sleds")?; + // Use dataplane client from reconciliation pass to cleanup switch(es) // state by tag dataplane_client @@ -1034,9 +1139,8 @@ mod tests { } #[test] - fn test_dpd_state_matches_sources_asm_address() { - // ASM address with all members specifying sources: expect those - // sources in DPD. + fn test_dpd_state_matches_sources_asm_with_specific_sources() { + // ASM address with specific sources only (no any-source members) let source_filter = SourceFilterState { specific_sources: BTreeSet::from(["10.0.0.1" .parse::() @@ -1044,23 +1148,29 @@ mod tests { has_any_source_member: false, }; - let group = create_group("224.1.1.1"); // ASM address (not 232.x.x.x) + let group = create_group("224.1.1.1"); // ASM address - // DPD has matching sources (correct) + // DPD has matching specific sources let dpd_group = create_dpd_group(Some(vec![dpd_client::types::IpSrc::Exact( "10.0.0.1".parse().unwrap(), )])); assert!(dpd_state_matches_sources(&dpd_group, &source_filter, &group)); - // DPD has None (mismatch: ASM with all-specific should have sources) + // DPD has None (mismatch: should have specific sources) let dpd_group = create_dpd_group(None); assert!(!dpd_state_matches_sources(&dpd_group, &source_filter, &group)); + + // DPD has IpSrc::Any (mismatch: should have specific sources) + let dpd_group = + create_dpd_group(Some(vec![dpd_client::types::IpSrc::Any])); + assert!(!dpd_state_matches_sources(&dpd_group, &source_filter, &group)); } #[test] fn test_dpd_state_matches_sources_asm_with_any_source_member() { - // ASM address with has_any_source_member=true - expects None from DPD + // ASM address with has_any_source_member=true: we send None to DPD, + // and DPD canonicalizes any-source representations to None. let source_filter = SourceFilterState { specific_sources: BTreeSet::new(), has_any_source_member: true, @@ -1068,11 +1178,33 @@ mod tests { let group = create_group("224.1.1.1"); // ASM address - // DPD has None (correct for ASM with any-source members) + // DPD has None (correct: any-source canonicalizes to None) + let dpd_group = create_dpd_group(None); + assert!(dpd_state_matches_sources(&dpd_group, &source_filter, &group)); + + // DPD has specific sources (mismatch) + let dpd_group = + create_dpd_group(Some(vec![dpd_client::types::IpSrc::Exact( + "10.0.0.1".parse().unwrap(), + )])); + assert!(!dpd_state_matches_sources(&dpd_group, &source_filter, &group)); + } + + #[test] + fn test_dpd_state_matches_sources_asm_no_sources() { + // ASM with no source filters at all expects None + let source_filter = SourceFilterState { + specific_sources: BTreeSet::new(), + has_any_source_member: false, + }; + + let group = create_group("224.1.1.1"); // ASM address + + // DPD has None (correct: no sources configured) let dpd_group = create_dpd_group(None); assert!(dpd_state_matches_sources(&dpd_group, &source_filter, &group)); - // DPD has sources (mismatch: should be none) + // DPD has sources (mismatch) let dpd_group = create_dpd_group(Some(vec![dpd_client::types::IpSrc::Exact( "10.0.0.1".parse().unwrap(), diff --git a/nexus/src/app/background/tasks/multicast/members.rs b/nexus/src/app/background/tasks/multicast/members.rs index 1b7f81c6ab3..294afe76831 100644 --- a/nexus/src/app/background/tasks/multicast/members.rs +++ b/nexus/src/app/background/tasks/multicast/members.rs @@ -42,6 +42,11 @@ //! - **State transitions**: "Joining" → "Joined" → "Left" with reactivation //! - **Dataplane updates**: Applying and removing configuration via DPD //! client(s) on switches +//! - **M2P/forwarding propagation**: After join, leave, or migration, M2P +//! mappings and forwarding entries are propagated to all sleds via +//! sled-agent inline (not deferred to the next reconciliation pass) +//! - **OPTE subscriptions**: Per-VMM multicast group filters managed via +//! sled-agent on the hosting sled //! - **Sled migration**: Detecting moves and updating dataplane configuration //! (no transition to "Left") //! - **Cleanup**: Removing orphaned switch state for deleted members @@ -124,10 +129,31 @@ use omicron_uuid_kinds::{ use super::{MulticastGroupReconciler, StateTransition, SwitchBackplanePort}; use crate::app::multicast::dataplane::MulticastDataplaneClient; +use crate::app::multicast::sled::MulticastSledClient; + +/// Pre-fetched instance state for multicast reconciliation. +#[derive(Clone, Copy, Debug, Default)] +struct InstanceMulticastState { + /// Whether the instance is in a state that can receive multicast traffic. + valid: bool, + /// Current sled hosting the VMM, if any. + sled_id: Option, + /// Current propolis VMM identifier, if any. + propolis_id: Option, +} + +/// Context shared across member reconciliation operations. +struct MemberReconcileCtx<'a> { + opctx: &'a OpContext, + group: &'a MulticastGroup, + member: &'a MulticastGroupMember, + instance_states: &'a InstanceStateMap, + dataplane_client: &'a MulticastDataplaneClient, + sled_client: &'a MulticastSledClient, +} -/// Pre-fetched instance state data for batch processing. -/// Maps instance_id -> (is_valid_for_multicast, current_sled_id). -type InstanceStateMap = HashMap)>; +/// Maps instance_id to pre-fetched multicast-relevant state. +type InstanceStateMap = HashMap; /// Backplane port mapping from DPD-client. /// Maps switch port ID to backplane link configuration. @@ -168,33 +194,21 @@ trait MemberStateProcessor { async fn process_joining( &self, reconciler: &MulticastGroupReconciler, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_states: &InstanceStateMap, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result; /// Process a member in "Joined" state. async fn process_joined( &self, reconciler: &MulticastGroupReconciler, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_states: &InstanceStateMap, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result; /// Process a member in "Left" state. async fn process_left( &self, reconciler: &MulticastGroupReconciler, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_states: &InstanceStateMap, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result; } @@ -205,61 +219,25 @@ impl MemberStateProcessor for InstanceMemberProcessor { async fn process_joining( &self, reconciler: &MulticastGroupReconciler, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_states: &InstanceStateMap, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result { - reconciler - .handle_instance_joining( - opctx, - group, - member, - instance_states, - dataplane_client, - ) - .await + reconciler.handle_instance_joining(ctx).await } async fn process_joined( &self, reconciler: &MulticastGroupReconciler, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_states: &InstanceStateMap, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result { - reconciler - .handle_instance_joined( - opctx, - group, - member, - instance_states, - dataplane_client, - ) - .await + reconciler.handle_instance_joined(ctx).await } async fn process_left( &self, reconciler: &MulticastGroupReconciler, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_states: &InstanceStateMap, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result { - reconciler - .handle_instance_left( - opctx, - group, - member, - instance_states, - dataplane_client, - ) - .await + reconciler.handle_instance_left(ctx).await } } @@ -276,6 +254,7 @@ impl MulticastGroupReconciler { &self, opctx: &OpContext, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result { trace!(opctx.log, "reconciling member state changes"); @@ -286,7 +265,12 @@ impl MulticastGroupReconciler { for group in groups { match self - .process_group_member_states(opctx, &group, dataplane_client) + .process_group_member_states( + opctx, + &group, + dataplane_client, + sled_client, + ) .await { Ok(count) => { @@ -326,6 +310,7 @@ impl MulticastGroupReconciler { opctx: &OpContext, group: &MulticastGroup, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result { let mut processed = 0; @@ -348,6 +333,7 @@ impl MulticastGroupReconciler { &member, &instance_states, dataplane_client, + sled_client, ) .await; (member, res) @@ -364,7 +350,7 @@ impl MulticastGroupReconciler { StateTransition::StateChanged | StateTransition::NoChange => { processed += 1; - debug!( + trace!( opctx.log, "processed member state change"; "member" => ?member, @@ -374,7 +360,7 @@ impl MulticastGroupReconciler { } StateTransition::NeedsCleanup => { processed += 1; - debug!( + trace!( opctx.log, "member marked for cleanup"; "member" => ?member, @@ -382,7 +368,7 @@ impl MulticastGroupReconciler { ); } StateTransition::EntityGone => { - debug!( + trace!( opctx.log, "member deleted during processing"; "member" => ?member, @@ -407,7 +393,7 @@ impl MulticastGroupReconciler { /// Main dispatch function for processing member state changes. /// - /// Routes to appropriate node based on member type. + /// Routes to the appropriate handler based on member state. async fn process_member_state( &self, opctx: &OpContext, @@ -415,6 +401,7 @@ impl MulticastGroupReconciler { member: &MulticastGroupMember, instance_states: &InstanceStateMap, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result { // Check if the parent group has been deleted or is being deleted. // If so, delete the member so cleanup can proceed. @@ -444,43 +431,24 @@ impl MulticastGroupReconciler { // For now, all members are instance-based, but this is where we'd // dispatch to different processors for different member types let processor = InstanceMemberProcessor; + let ctx = MemberReconcileCtx { + opctx, + group, + member, + instance_states, + dataplane_client, + sled_client, + }; match member.state { MulticastGroupMemberState::Joining => { - processor - .process_joining( - self, - opctx, - group, - member, - instance_states, - dataplane_client, - ) - .await + processor.process_joining(self, &ctx).await } MulticastGroupMemberState::Joined => { - processor - .process_joined( - self, - opctx, - group, - member, - instance_states, - dataplane_client, - ) - .await + processor.process_joined(self, &ctx).await } MulticastGroupMemberState::Left => { - processor - .process_left( - self, - opctx, - group, - member, - instance_states, - dataplane_client, - ) - .await + processor.process_left(self, &ctx).await } } } @@ -495,7 +463,7 @@ impl MulticastGroupReconciler { ) -> Result { // Skip if member is already deleted if member.time_deleted.is_some() { - debug!( + trace!( opctx.log, "member already deleted, no action needed"; "member_id" => %member.id, @@ -532,35 +500,23 @@ impl MulticastGroupReconciler { /// when ready. Uses CAS operations for concurrent-safe state updates. async fn handle_instance_joining( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_states: &InstanceStateMap, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result { - // Extract pre-fetched instance state - let (instance_valid, current_sled_id) = - self.get_instance_state_from_cache(instance_states, member); + let instance_state = + self.get_instance_state_from_cache(ctx.instance_states, ctx.member); - // Execute reconciliation CAS operation let reconcile_res = self .execute_joining_reconciliation( - opctx, - group, - member, - instance_valid, - current_sled_id, + ctx, + instance_state.valid, + instance_state.sled_id, ) .await?; - // Process reconciliation result self.process_joining_reconcile_result( - opctx, - group, - member, - instance_valid, + ctx, + instance_state, reconcile_res, - dataplane_client, ) .await } @@ -570,16 +526,14 @@ impl MulticastGroupReconciler { &self, instance_states: &InstanceStateMap, member: &MulticastGroupMember, - ) -> (bool, Option) { - instance_states.get(&member.parent_id).copied().unwrap_or((false, None)) + ) -> InstanceMulticastState { + instance_states.get(&member.parent_id).copied().unwrap_or_default() } /// Execute the reconciliation CAS operation for a member in "Joining" state. async fn execute_joining_reconciliation( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, + ctx: &MemberReconcileCtx<'_>, instance_valid: bool, current_sled_id: Option, ) -> Result { @@ -587,9 +541,9 @@ impl MulticastGroupReconciler { self.datastore .multicast_group_member_reconcile_joining( - opctx, - MulticastGroupUuid::from_untyped_uuid(group.id()), - InstanceUuid::from_untyped_uuid(member.parent_id), + ctx.opctx, + MulticastGroupUuid::from_untyped_uuid(ctx.group.id()), + InstanceUuid::from_untyped_uuid(ctx.member.parent_id), instance_valid, current_sled_id_db, ) @@ -600,39 +554,26 @@ impl MulticastGroupReconciler { /// Process the result of a "Joining" state reconciliation operation. async fn process_joining_reconcile_result( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_valid: bool, + ctx: &MemberReconcileCtx<'_>, + instance_state: InstanceMulticastState, reconcile_result: ReconcileJoiningResult, - dataplane_client: &MulticastDataplaneClient, ) -> Result { match reconcile_result.action { ReconcileAction::TransitionedToLeft => { - self.handle_transitioned_to_left(opctx, group, member).await + self.handle_transitioned_to_left(ctx).await } ReconcileAction::UpdatedSledId { old, new } => { self.handle_sled_id_updated( - opctx, - group, - member, - instance_valid, + ctx, + instance_state, SledIdUpdate { old, new }, - dataplane_client, ) .await } ReconcileAction::NotFound | ReconcileAction::NoChange => { - self.handle_no_change_or_not_found( - opctx, - group, - member, - instance_valid, - dataplane_client, - ) - .await + self.handle_no_change_or_not_found(ctx, instance_state).await } } } @@ -640,18 +581,16 @@ impl MulticastGroupReconciler { /// Handle the case where a member was transitioned to "Left" state. async fn handle_transitioned_to_left( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, + ctx: &MemberReconcileCtx<'_>, ) -> Result { info!( - opctx.log, + ctx.opctx.log, "multicast member lifecycle transition: 'Joining' → 'Left'"; - "member_id" => %member.id, - "instance_id" => %member.parent_id, - "group_id" => %group.id(), - "group_name" => group.name().as_str(), - "group_multicast_ip" => %group.multicast_ip, + "member_id" => %ctx.member.id, + "instance_id" => %ctx.member.parent_id, + "group_id" => %ctx.group.id(), + "group_name" => ctx.group.name().as_str(), + "group_multicast_ip" => %ctx.group.multicast_ip, "reason" => "instance_not_valid_for_multicast_traffic" ); Ok(StateTransition::StateChanged) @@ -660,63 +599,43 @@ impl MulticastGroupReconciler { /// Handle the case where a member's sled_id was updated. async fn handle_sled_id_updated( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_valid: bool, + ctx: &MemberReconcileCtx<'_>, + instance_state: InstanceMulticastState, sled_id_update: SledIdUpdate, - dataplane_client: &MulticastDataplaneClient, ) -> Result { - debug!( - opctx.log, + trace!( + ctx.opctx.log, "updated member sled_id, checking if ready to join"; - "member_id" => %member.id, + "member_id" => %ctx.member.id, "old_sled_id" => ?sled_id_update.old, "new_sled_id" => ?sled_id_update.new, - "group_state" => ?group.state, - "instance_valid" => instance_valid + "group_state" => ?ctx.group.state, + "instance_valid" => instance_state.valid ); - self.try_complete_join_if_ready( - opctx, - group, - member, - instance_valid, - dataplane_client, - ) - .await + self.try_complete_join_if_ready(ctx, instance_state).await } /// Handle the case where no changes were made or member was not found. async fn handle_no_change_or_not_found( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_valid: bool, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, + instance_state: InstanceMulticastState, ) -> Result { // Check if member is already in Joined state - if member.state == MulticastGroupMemberState::Joined { - debug!( - opctx.log, + if ctx.member.state == MulticastGroupMemberState::Joined { + trace!( + ctx.opctx.log, "member already in 'Joined' state, no action needed"; - "member_id" => %member.id, - "group_id" => %group.id(), - "group_name" => group.name().as_str() + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id(), + "group_name" => ctx.group.name().as_str() ); return Ok(StateTransition::NoChange); } // Try to complete the join if conditions are met - self.try_complete_join_if_ready( - opctx, - group, - member, - instance_valid, - dataplane_client, - ) - .await + self.try_complete_join_if_ready(ctx, instance_state).await } fn is_ready_to_join( @@ -729,30 +648,31 @@ impl MulticastGroupReconciler { async fn try_complete_join_if_ready( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_valid: bool, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, + instance_state: InstanceMulticastState, ) -> Result { - if self.is_ready_to_join(group, instance_valid) { - self.complete_instance_member_join( - opctx, - group, - member, - dataplane_client, - ) - .await?; - Ok(StateTransition::StateChanged) + if self.is_ready_to_join(ctx.group, instance_state.valid) { + let joined = self + .complete_instance_member_join( + ctx, + None, + instance_state.propolis_id, + ) + .await?; + if joined { + Ok(StateTransition::StateChanged) + } else { + Ok(StateTransition::NoChange) + } } else { - debug!( - opctx.log, + trace!( + ctx.opctx.log, "member not ready to join: waiting for next run"; - "member_id" => %member.id, - "group_id" => %group.id(), - "group_name" => group.name().as_str(), - "instance_valid" => instance_valid, - "group_state" => ?group.state + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id(), + "group_name" => ctx.group.name().as_str(), + "instance_valid" => instance_state.valid, + "group_state" => ?ctx.group.state ); Ok(StateTransition::NoChange) } @@ -761,82 +681,52 @@ impl MulticastGroupReconciler { /// Instance-specific handler for members in "Joined" state. async fn handle_instance_joined( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_states: &InstanceStateMap, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result { - // Get pre-fetched instance state and sled_id - let (instance_valid, current_sled_id) = instance_states - .get(&member.parent_id) + let instance_state = ctx + .instance_states + .get(&ctx.member.parent_id) .copied() - .unwrap_or((false, None)); + .unwrap_or_default(); - match (instance_valid, current_sled_id) { - // Invalid instance -> remove from dataplane and transition to "Left" - (false, _) => { - self.handle_invalid_instance( - opctx, - group, - member, - dataplane_client, - ) - .await - } + match (instance_state.valid, instance_state.sled_id) { + (false, _) => self.handle_invalid_instance(ctx).await, - // Valid instance with sled, but sled changed (migration) - (true, Some(sled_id)) if member.sled_id != Some(sled_id.into()) => { + (true, Some(sled_id)) + if ctx.member.sled_id != Some(sled_id.into()) => + { self.handle_sled_migration( - opctx, - group, - member, + ctx, sled_id, - dataplane_client, + instance_state.propolis_id, ) .await } - // Valid instance with sled, sled unchanged -> verify configuration (true, Some(_)) => { - self.verify_members(opctx, group, member, dataplane_client) - .await?; + self.verify_members(ctx).await?; trace!( - opctx.log, + ctx.opctx.log, "member configuration verified, no changes needed"; - "member_id" => %member.id, - "group_id" => %group.id() + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id() ); Ok(StateTransition::NoChange) } - // Valid instance but no sled_id (shouldn't typically happen in "Joined" state) - (true, None) => { - self.handle_joined_without_sled( - opctx, - group, - member, - dataplane_client, - ) - .await - } + (true, None) => self.handle_joined_without_sled(ctx).await, } } /// Handle a joined member whose instance became invalid. async fn handle_invalid_instance( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result { + let MemberReconcileCtx { opctx, group, member, sled_client, .. } = ctx; // Remove from dataplane first - if let Err(e) = self - .remove_member_from_dataplane(opctx, member, dataplane_client) - .await - { - debug!( + if let Err(e) = self.remove_member_from_dataplane(ctx).await { + warn!( opctx.log, "failed to remove member from dataplane, will retry"; "member_id" => %member.id, @@ -845,6 +735,24 @@ impl MulticastGroupReconciler { return Err(e); } + // Unsubscribe the VMM from the multicast group before the CAS + // clears the sled ID. Best-effort since the VMM may already be torn + // down. + if let Some(sled_id) = member.sled_id { + if let Err(e) = sled_client + .unsubscribe_vmm(opctx, group, member, sled_id.into(), None) + .await + { + warn!( + opctx.log, + "failed to unsubscribe VMM during instance invalidation"; + "member_id" => %member.id, + "sled_id" => %sled_id, + "error" => %e + ); + } + } + // Update database state (atomically set "Left" and clear `sled_id`) let updated = self .datastore @@ -870,6 +778,21 @@ impl MulticastGroupReconciler { return Ok(StateTransition::NoChange); } + // Propagate updated M2P/forwarding to all sleds so the + // dataplane reflects the member's departure. Best-effort since + // group reconciliation will converge if this fails. + if let Err(e) = + sled_client.propagate_m2p_and_forwarding(opctx, group).await + { + warn!( + opctx.log, + "failed to propagate M2P/forwarding after member leave"; + "member_id" => %member.id, + "group_id" => %group.id(), + "error" => %e + ); + } + info!( opctx.log, "multicast member lifecycle transition: 'Joined' → 'Left' (instance invalid)"; @@ -877,7 +800,6 @@ impl MulticastGroupReconciler { "instance_id" => %member.parent_id, "group_id" => %group.id(), "group_multicast_ip" => %group.multicast_ip, - "dpd_operation" => "remove_member_from_underlay_group", "reason" => "instance_no_longer_valid_for_multicast_traffic" ); Ok(StateTransition::StateChanged) @@ -886,46 +808,51 @@ impl MulticastGroupReconciler { /// Handle sled migration for a "Joined" member. async fn handle_sled_migration( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, + ctx: &MemberReconcileCtx<'_>, new_sled_id: SledUuid, - dataplane_client: &MulticastDataplaneClient, + cached_propolis_id: Option, ) -> Result { info!( - opctx.log, + ctx.opctx.log, "detected sled migration for 'Joined' member: re-applying configuration"; - "member_id" => %member.id, - "instance_id" => %member.parent_id, - "group_id" => %group.id(), - "group_name" => group.name().as_str(), - "group_multicast_ip" => %group.multicast_ip, - "old_sled_id" => ?member.sled_id, + "member_id" => %ctx.member.id, + "instance_id" => %ctx.member.parent_id, + "group_id" => %ctx.group.id(), + "group_name" => ctx.group.name().as_str(), + "group_multicast_ip" => %ctx.group.multicast_ip, + "old_sled_id" => ?ctx.member.sled_id, "new_sled_id" => %new_sled_id ); // Remove from old sled's dataplane first - if let Err(e) = self - .remove_member_from_dataplane(opctx, member, dataplane_client) - .await - { - debug!( - opctx.log, + if let Err(e) = self.remove_member_from_dataplane(ctx).await { + warn!( + ctx.opctx.log, "failed to remove member from old sled, will retry"; - "member_id" => %member.id, - "old_sled_id" => ?member.sled_id, + "member_id" => %ctx.member.id, + "old_sled_id" => ?ctx.member.sled_id, "error" => ?e ); return Err(e); } - // Update sled_id in database using CAS + // Source-sled OPTE cleanup (M2P, forwarding, port subscription) + // is handled by VMM teardown: remove_propolis_zone -> + // release_opte_ports -> PortTicket::release_inner, which + // clears multicast subscriptions along with V2P and firewall + // rules. + // + // This is consistent with all other OPTE state. Nexus + // never explicitly calls sled-agent for source-sled cleanup + // after migration. + + // Update `sled_id` in database using CAS let updated = self .datastore .multicast_group_member_update_sled_id_if_current( - opctx, - InstanceUuid::from_untyped_uuid(member.parent_id), - member.sled_id, + ctx.opctx, + InstanceUuid::from_untyped_uuid(ctx.member.parent_id), + ctx.member.sled_id, Some(new_sled_id.into()), ) .await @@ -935,49 +862,53 @@ impl MulticastGroupReconciler { if !updated { debug!( - opctx.log, + ctx.opctx.log, "skipping sled_id update after migration due to concurrent change"; - "member_id" => %member.id, - "group_id" => %group.id(), - "old_sled_id" => ?member.sled_id, + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id(), + "old_sled_id" => ?ctx.member.sled_id, "new_sled_id" => %new_sled_id ); return Ok(StateTransition::NoChange); } - // Re-apply configuration on new sled - // If this fails (e.g., sled not yet in inventory), transition to "Joining" for retry + // Re-apply configuration on new sled. Pass `new_sled_id` explicitly + // because the in-memory member struct still has the old sled_id. match self .complete_instance_member_join( - opctx, - group, - member, - dataplane_client, + ctx, + Some(new_sled_id), + cached_propolis_id, ) .await { - Ok(()) => { + Ok(joined) => { info!( - opctx.log, + ctx.opctx.log, "member configuration re-applied after sled migration"; - "member_id" => %member.id, - "instance_id" => %member.parent_id, - "group_id" => %group.id(), - "group_name" => group.name().as_str(), - "group_multicast_ip" => %group.multicast_ip, + "member_id" => %ctx.member.id, + "instance_id" => %ctx.member.parent_id, + "group_id" => %ctx.group.id(), + "group_name" => ctx.group.name().as_str(), + "group_multicast_ip" => %ctx.group.multicast_ip, "new_sled_id" => %new_sled_id, - "dpd_operation" => "re_add_member_to_underlay_multicast_group" + "action" => "re_add_member_to_underlay_multicast_group", + "joined" => joined ); - Ok(StateTransition::StateChanged) + if joined { + Ok(StateTransition::StateChanged) + } else { + Ok(StateTransition::NoChange) + } } Err(e) => { // Failed to join on new sled. We transition to "Joining" and // retry next cycle/run. warn!( - opctx.log, + ctx.opctx.log, "failed to complete join on new sled after migration: transitioning to 'Joining' for retry"; - "member_id" => %member.id, - "group_id" => %group.id(), + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id(), "new_sled_id" => %new_sled_id, "error" => %e ); @@ -1005,9 +936,9 @@ impl MulticastGroupReconciler { let updated = self .datastore .multicast_group_member_set_state_if_current( - opctx, - MulticastGroupUuid::from_untyped_uuid(group.id()), - InstanceUuid::from_untyped_uuid(member.parent_id), + ctx.opctx, + MulticastGroupUuid::from_untyped_uuid(ctx.group.id()), + InstanceUuid::from_untyped_uuid(ctx.member.parent_id), MulticastGroupMemberState::Joined, MulticastGroupMemberState::Joining, ) @@ -1018,10 +949,10 @@ impl MulticastGroupReconciler { if updated { info!( - opctx.log, + ctx.opctx.log, "member transitioned to 'Joining': will retry on next reconciliation run"; - "member_id" => %member.id, - "group_id" => %group.id(), + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id(), "new_sled_id" => %new_sled_id ); Ok(StateTransition::StateChanged) @@ -1036,11 +967,9 @@ impl MulticastGroupReconciler { /// Handle edge case where a "Joined" member has no sled_id. async fn handle_joined_without_sled( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result { + let MemberReconcileCtx { opctx, group, member, .. } = ctx; warn!( opctx.log, "'Joined' member has no sled_id: transitioning to 'Left'"; @@ -1049,10 +978,7 @@ impl MulticastGroupReconciler { ); // Remove from dataplane and transition to "Left" - if let Err(e) = self - .remove_member_from_dataplane(opctx, member, dataplane_client) - .await - { + if let Err(e) = self.remove_member_from_dataplane(ctx).await { warn!( opctx.log, "failed to remove member with no sled_id from dataplane"; @@ -1094,7 +1020,7 @@ impl MulticastGroupReconciler { "instance_id" => %member.parent_id, "group_id" => %group.id(), "group_multicast_ip" => %group.multicast_ip, - "dpd_operation" => "remove_member_from_underlay_group", + "action" => "transition_to_left", "reason" => "inconsistent_state_sled_id_missing_in_joined_state" ); Ok(StateTransition::StateChanged) @@ -1103,22 +1029,20 @@ impl MulticastGroupReconciler { /// Instance-specific handler for members in "Left" state. async fn handle_instance_left( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_states: &InstanceStateMap, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result { - // Get pre-fetched instance state and sled_id - let (instance_valid, current_sled_id) = instance_states - .get(&member.parent_id) + let InstanceMulticastState { + valid: instance_valid, + sled_id: current_sled_id, + .. + } = ctx + .instance_states + .get(&ctx.member.parent_id) .copied() - .unwrap_or((false, None)); + .unwrap_or_default(); - // Handle permanent deletion first - if member.time_deleted.is_some() { - self.cleanup_deleted_member(opctx, group, member, dataplane_client) - .await?; + if ctx.member.time_deleted.is_some() { + self.cleanup_deleted_member(ctx).await?; return Ok(StateTransition::NeedsCleanup); } @@ -1128,28 +1052,44 @@ impl MulticastGroupReconciler { // The cleanup is idempotent and handles cases where: // - sled_id is None (uses fallback path) // - member was already removed from DPD - if let Err(e) = self - .remove_member_from_dataplane(opctx, member, dataplane_client) - .await - { - debug!( - opctx.log, + if let Err(e) = self.remove_member_from_dataplane(ctx).await { + warn!( + ctx.opctx.log, "failed to clean up DPD state for 'Left' member (will retry)"; - "member_id" => %member.id, + "member_id" => %ctx.member.id, "error" => ?e ); - // Continue to reactivation even on cleanup failure because - // the add operation may succeed if the port was already removed } - // Handle reactivation: instance valid and group active -> transition to "Joining" - if instance_valid && group.state == MulticastGroupState::Active { - return self - .reactivate_left_member(opctx, group, member, current_sled_id) - .await; + // Unsubscribe the VMM's OPTE port from this multicast group. + // Best-effort since if the VMM is already gone, there's nothing to + // unsubscribe (the OPTE port was destroyed with the VMM). + if let Some(sled_id) = ctx.member.sled_id { + if let Err(e) = ctx + .sled_client + .unsubscribe_vmm( + ctx.opctx, + ctx.group, + ctx.member, + sled_id.into(), + None, + ) + .await + { + warn!( + ctx.opctx.log, + "failed to unsubscribe VMM from multicast group"; + "member_id" => %ctx.member.id, + "sled_id" => %sled_id, + "error" => %e + ); + } + } + + if instance_valid && ctx.group.state == MulticastGroupState::Active { + return self.reactivate_left_member(ctx, current_sled_id).await; } - // Stay in "Left" state Ok(StateTransition::NoChange) } @@ -1157,11 +1097,10 @@ impl MulticastGroupReconciler { /// Transitions the member back to "Joining" state so it can rejoin the group. async fn reactivate_left_member( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, + ctx: &MemberReconcileCtx<'_>, current_sled_id: Option, ) -> Result { + let MemberReconcileCtx { opctx, group, member, .. } = ctx; debug!( opctx.log, "transitioning member from 'Left' to 'Joining': instance became valid and group active"; @@ -1250,10 +1189,10 @@ impl MulticastGroupReconciler { // Build the state map from the fetched data state_map.extend(members.iter().map(|member| { - let (is_valid, sled_id) = if let Some((instance, vmm_opt)) = + let state = if let Some((instance, vmm_opt)) = instance_vmm_data.get(&member.parent_id) { - let is_valid = matches!( + let valid = matches!( instance.nexus_state.state(), InstanceState::Creating | InstanceState::Starting @@ -1267,13 +1206,16 @@ impl MulticastGroupReconciler { SledUuid::from_untyped_uuid(vmm.sled_id.into_untyped_uuid()) }); - (is_valid, sled_id) + let propolis_id = vmm_opt + .as_ref() + .map(|vmm| PropolisUuid::from_untyped_uuid(vmm.id)); + + InstanceMulticastState { valid, sled_id, propolis_id } } else { - // Instance not found (mark as invalid) - (false, None) + InstanceMulticastState::default() }; - (member.parent_id, (is_valid, sled_id)) + (member.parent_id, state) })); debug!( @@ -1292,9 +1234,9 @@ impl MulticastGroupReconciler { /// Returns `None` if the instance has no sled assignment or cannot be found. async fn lookup_and_update_member_sled_id( &self, - opctx: &OpContext, - member: &MulticastGroupMember, + ctx: &MemberReconcileCtx<'_>, ) -> Result>, anyhow::Error> { + let MemberReconcileCtx { opctx, member, .. } = ctx; debug!( opctx.log, "member has no sled_id, attempting to look up instance sled"; @@ -1319,13 +1261,13 @@ impl MulticastGroupReconciler { return Ok(None); } Err(e) => { - debug!( + warn!( opctx.log, "failed to look up instance state"; "member" => ?member, "error" => ?e ); - return Ok(None); + return Err(e.into()); } }; @@ -1381,87 +1323,149 @@ impl MulticastGroupReconciler { } } - /// Complete a member join operation ("Joining" -> "Joined") for an instance. + /// Complete a member join by configuring the dataplane and subscribing + /// the VMM. + /// + /// When `sled_id_override` is provided (e.g., during migration), it + /// is used instead of the potentially stale `member.sled_id`. + /// + /// # Returns + /// + /// `Ok(true)` when the join completed successfully. `Ok(false)` when no + /// sled was available and the operation was a no-op. async fn complete_instance_member_join( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - dataplane_client: &MulticastDataplaneClient, - ) -> Result<(), anyhow::Error> { + ctx: &MemberReconcileCtx<'_>, + sled_id_override: Option, + cached_propolis_id: Option, + ) -> Result { debug!( - opctx.log, + ctx.opctx.log, "completing member join"; - "member" => ?member, - "group" => ?group + "member" => ?ctx.member, + "group" => ?ctx.group ); - // Get sled_id from member record, or look it up and update if missing - let sled_id = match member.sled_id { - Some(id) => id, - None => { - match self - .lookup_and_update_member_sled_id(opctx, member) - .await? - { - Some(id) => id, - None => return Ok(()), // No sled available, cannot join - } - } + // Use the override if provided, then the member's cached sled_id, + // then look it up from the instance as a last resort. + let sled_id: SledUuid = if let Some(id) = + sled_id_override.or(ctx.member.sled_id.map(Into::into)) + { + id + } else if let Some(id) = + self.lookup_and_update_member_sled_id(ctx).await? + { + id.into() + } else { + return Ok(false); }; - self.add_member_to_dataplane( - opctx, - group, - member, - sled_id.into(), - dataplane_client, - ) - .await?; + self.add_member_to_dataplane(ctx, sled_id).await?; - // Transition to "Joined" state (only if still in "Joining") - let updated = self - .datastore - .multicast_group_member_set_state_if_current( - opctx, - MulticastGroupUuid::from_untyped_uuid(group.id()), - InstanceUuid::from_untyped_uuid(member.parent_id), - MulticastGroupMemberState::Joining, - MulticastGroupMemberState::Joined, + // If the member is already in a "Joined" state (migration path), skip + // the state transition but still propagate and subscribe. During + // migration the caller updates the sled ID without changing state, + // so we must not gate propagation on this CAS. + if ctx.member.state != MulticastGroupMemberState::Joined { + let updated = self + .datastore + .multicast_group_member_set_state_if_current( + ctx.opctx, + MulticastGroupUuid::from_untyped_uuid(ctx.group.id()), + InstanceUuid::from_untyped_uuid(ctx.member.parent_id), + MulticastGroupMemberState::Joining, + MulticastGroupMemberState::Joined, + ) + .await + .context( + "failed to conditionally transition member to 'Joined' state", + )?; + + if !updated { + debug!( + ctx.opctx.log, + "skipping Joining→Joined transition due to concurrent update"; + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id() + ); + // Concurrent update moved the member away from the "Joining" + // state, so skip propagation and subscribe. + return Ok(false); + } + } + + // Propagate M2P mappings and forwarding entries to all sleds. + // + // Athis point, the member is now "Joined" in the database, so propagate + // includes this sled in forwarding next-hops. If propagation or + // subscribe fails below, the member remains "Joined" with incomplete + // sled state. The reconciler's next pass converges via + // `handle_instance_joined` -> `verify_members`. + // + // Propagation failures are best-effort since the reconciler will + // re-converge all sleds on the next cycle. Subscribe failures + // below are treated as hard errors because the VMM cannot + // receive traffic without an OPTE port subscription. + if let Err(e) = ctx + .sled_client + .propagate_m2p_and_forwarding(ctx.opctx, ctx.group) + .await + { + warn!( + ctx.opctx.log, + "failed to propagate M2P/forwarding after member join"; + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id(), + "error" => %e + ); + } + + // Subscribe the VMM's OPTE port last. Propagation above is + // best-effort, and any sleds that failed will be converged by the + // reconciler on the next cycle. + if let Err(e) = ctx + .sled_client + .subscribe_vmm( + ctx.opctx, + ctx.group, + ctx.member, + sled_id, + cached_propolis_id, ) .await - .context( - "failed to conditionally transition member to 'Joined' state", - )?; - if !updated { - debug!( - opctx.log, - "skipping Joining→Joined transition due to concurrent update"; - "member_id" => %member.id, - "group_id" => %group.id() + { + warn!( + ctx.opctx.log, + "failed to subscribe VMM to multicast group via sled-agent \ + (will retry next cycle)"; + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id(), + "sled_id" => %sled_id, + "error" => %e ); + return Err(e); } info!( - opctx.log, + ctx.opctx.log, "member join completed"; - "member_id" => %member.id, - "group_id" => %group.id(), + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id(), "sled_id" => %sled_id ); - Ok(()) + Ok(true) } /// Apply member dataplane configuration (via DPD-client). async fn add_member_to_dataplane( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, + ctx: &MemberReconcileCtx<'_>, sled_id: SledUuid, - dataplane_client: &MulticastDataplaneClient, ) -> Result<(), anyhow::Error> { + let MemberReconcileCtx { + opctx, group, member, dataplane_client, .. + } = ctx; let underlay_group_id = group.underlay_group_id.with_context(|| { format!("no underlay group for external group {}", group.id()) })?; @@ -1764,18 +1768,11 @@ impl MulticastGroupReconciler { /// Remove member dataplane configuration (via DPD-client). async fn remove_member_from_dataplane( &self, - opctx: &OpContext, - member: &MulticastGroupMember, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result<(), anyhow::Error> { - let group = self - .datastore - .multicast_group_fetch( - opctx, - MulticastGroupUuid::from_untyped_uuid(member.external_group_id), - ) - .await - .context("failed to fetch group for member removal")?; + let MemberReconcileCtx { + opctx, group, member, dataplane_client, .. + } = ctx; let underlay_group_id = group.underlay_group_id.with_context(|| { format!( @@ -1830,11 +1827,9 @@ impl MulticastGroupReconciler { /// Ensures dataplane consistency by failing if removal operations fail. async fn cleanup_member_from_dataplane( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result<(), anyhow::Error> { + let MemberReconcileCtx { opctx, group, member, .. } = ctx; debug!( opctx.log, "cleaning up member from dataplane"; @@ -1846,11 +1841,9 @@ impl MulticastGroupReconciler { ); // Strict removal from dataplane (fail on errors) - self.remove_member_from_dataplane(opctx, member, dataplane_client) - .await - .context( - "failed to remove member configuration via DPD during cleanup", - )?; + self.remove_member_from_dataplane(ctx).await.context( + "failed to remove member configuration via DPD during cleanup", + )?; info!( opctx.log, @@ -1870,15 +1863,24 @@ impl MulticastGroupReconciler { /// - Removing the member from any unexpected/stale rear ports /// - Adding the member to expected ports /// + /// If the sled cannot be resolved (e.g., decommissioned), the member + /// is transitioned to "Left" and M2P/forwarding is propagated inline + /// to remove stale entries. + /// /// This handles cases like `sp_slot` changes where the sled's physical /// location changed but the `sled_id` stayed the same. async fn verify_members( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result<(), anyhow::Error> { + let MemberReconcileCtx { + opctx, + group, + member, + dataplane_client, + sled_client, + .. + } = ctx; debug!( opctx.log, "verifying joined member consistency"; @@ -1932,13 +1934,24 @@ impl MulticastGroupReconciler { ); // Best effort removal on verification - let _ = self - .remove_member_from_dataplane( - opctx, - member, - dataplane_client, - ) - .await; + let _ = self.remove_member_from_dataplane(ctx).await; + + // Unsubscribe the VMM before the CAS clears sled_id; + // otherwise, the OPTE subscription is stranded with no + // way to identify the sled on later passes. Best-effort + // since the VMM may already be torn down. + if let Err(e) = sled_client + .unsubscribe_vmm(opctx, group, member, sled_id.into(), None) + .await + { + warn!( + opctx.log, + "failed to unsubscribe VMM during port resolution failure"; + "member_id" => %member.id, + "sled_id" => %sled_id, + "error" => %e + ); + } let updated = self .datastore @@ -1952,6 +1965,21 @@ impl MulticastGroupReconciler { .context("failed to transition member to 'Left' after port resolution failure")?; if updated { + // Propagate updated M2P/forwarding to remove + // stale entries for this now-Left member. + if let Err(e) = sled_client + .propagate_m2p_and_forwarding(opctx, group) + .await + { + warn!( + opctx.log, + "failed to propagate M2P/forwarding after \ + member left due to unresolvable sled"; + "member_id" => %member.id, + "group_id" => %group.id(), + "error" => %e + ); + } info!( opctx.log, "member transitioned to 'Left': sled no longer resolvable"; @@ -2105,6 +2133,23 @@ impl MulticastGroupReconciler { } } + // Ensure the VMM subscription is in place for the current propolis_id. + // This is idempotent and covers cases where the propolis_id changed + // (e.g., after live migration) but the sled_id stayed the same. + if let Err(e) = sled_client + .subscribe_vmm(opctx, group, member, sled_id.into(), None) + .await + { + warn!( + opctx.log, + "failed to verify VMM subscription during member verification"; + "member_id" => %member.id, + "sled_id" => %sled_id, + "error" => %e + ); + return Err(e); + } + info!( opctx.log, "member verification completed"; @@ -2607,21 +2652,32 @@ impl MulticastGroupReconciler { } /// Cleanup a member that is marked for deletion (time_deleted set). + /// + /// This includes unsubscribing a member from its VMM, removing + /// it from the dataplane, and hard-deleting the DB row. async fn cleanup_deleted_member( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result<(), anyhow::Error> { + let MemberReconcileCtx { opctx, group, member, sled_client, .. } = ctx; + // Unsubscribe from sled-agent (best-effort, VMM may be gone). + if let Some(sled_id) = member.sled_id { + if let Err(e) = sled_client + .unsubscribe_vmm(opctx, group, member, sled_id.into(), None) + .await + { + debug!( + opctx.log, + "failed to unsubscribe VMM during member cleanup"; + "member_id" => %member.id, + "sled_id" => %sled_id, + "error" => %e + ); + } + } + // Use the consolidated cleanup helper with strict error handling - self.cleanup_member_from_dataplane( - opctx, - group, - member, - dataplane_client, - ) - .await + self.cleanup_member_from_dataplane(ctx).await } /// Get all multicast groups that need member reconciliation. diff --git a/nexus/src/app/background/tasks/multicast/mod.rs b/nexus/src/app/background/tasks/multicast/mod.rs index 8f592a41087..6ab622179be 100644 --- a/nexus/src/app/background/tasks/multicast/mod.rs +++ b/nexus/src/app/background/tasks/multicast/mod.rs @@ -84,7 +84,7 @@ //! - Unlike linear probing (`h + i`), scattered outputs avoid clustering //! - **8-bit salt**: 256 unique underlay addresses per external IP //! - **Resolution**: Exhaustion requires 256 other groups to occupy exactly -//! those 256 scattered addresses—effectively impossible in 2^64 space +//! those 256 scattered addresses, effectively impossible in 2^64 space //! //! ### Forwarding Architecture (Incoming multicast traffic to guests) //! @@ -105,6 +105,8 @@ //! - **Group lifecycle**: "Creating" → "Active" → "Deleting" → hard-deleted //! - **Member lifecycle**: "Joining" → "Joined" → "Left" → soft-deleted → hard-deleted //! - **Dataplane updates**: DPD API calls for P4 table updates +//! - **Sled propagation**: M2P mappings and forwarding entries pushed to sled-agents +//! - **OPTE subscriptions**: Per-VMM multicast group subscriptions on target sleds //! - **Topology mapping**: Sled-to-switch-port resolution (with caching) //! //! ## Deletion Semantics: Groups vs Members @@ -151,6 +153,7 @@ use sled_hardware_types::BaseboardId; use crate::app::background::BackgroundTask; use crate::app::multicast::dataplane::MulticastDataplaneClient; +use crate::app::multicast::sled::MulticastSledClient; use crate::app::saga::StartSaga; pub(crate) mod groups; @@ -362,7 +365,7 @@ impl MulticastGroupReconciler { /// │ 6 │ 0xa ⊕ 6 │ 0xc │ /// │ 7 │ 0xa ⊕ 7 │ 0xd │ /// └──────┴─────────┴────────┘ -/// Outputs: [a, b, 8, 9, e, f, c, d] — scattered, not sequential +/// Outputs: [a, b, 8, 9, e, f, c, d] (scattered, not sequential) /// ``` /// /// On collision (i.e., underlay IP already in use), we increment salt and retry. @@ -533,6 +536,13 @@ impl MulticastGroupReconciler { } }; + // Create sled-agent client for OPTE subscriptions and + // M2P/forwarding propagation. + let sled_client = MulticastSledClient::new( + self.datastore.clone(), + self.resolver.clone(), + ); + // Process creating groups match self.reconcile_creating_groups(opctx).await { Ok(count) => status.groups_created += count, @@ -543,7 +553,10 @@ impl MulticastGroupReconciler { } // Process member state changes - match self.reconcile_member_states(opctx, &dataplane_client).await { + match self + .reconcile_member_states(opctx, &dataplane_client, &sled_client) + .await + { Ok(count) => status.members_processed += count, Err(e) => { let msg = format!("failed to reconcile member states: {e:#}"); @@ -574,7 +587,10 @@ impl MulticastGroupReconciler { } // Reconcile active groups (verify state, update dataplane as needed) - match self.reconcile_active_groups(opctx, &dataplane_client).await { + match self + .reconcile_active_groups(opctx, &dataplane_client, &sled_client) + .await + { Ok(count) => status.groups_verified += count, Err(e) => { let msg = format!("failed to reconcile active groups: {e:#}"); @@ -583,7 +599,10 @@ impl MulticastGroupReconciler { } // Process deleting groups (DPD cleanup + hard-delete from DB) - match self.reconcile_deleting_groups(opctx, &dataplane_client).await { + match self + .reconcile_deleting_groups(opctx, &dataplane_client, &sled_client) + .await + { Ok(count) => status.groups_deleted += count, Err(e) => { let msg = format!("failed to reconcile deleting groups: {e:#}"); diff --git a/nexus/src/app/multicast/dataplane.rs b/nexus/src/app/multicast/dataplane.rs index 5d79df7d078..8d858154b2a 100644 --- a/nexus/src/app/multicast/dataplane.rs +++ b/nexus/src/app/multicast/dataplane.rs @@ -113,7 +113,8 @@ trait IntoUnderlayMulticast { impl IntoUnderlayMulticast for IpAddr { fn into_underlay_multicast(self) -> Result { match self { - IpAddr::V6(ipv6) => Ok(UnderlayMulticastIpv6(ipv6)), + IpAddr::V6(ipv6) => UnderlayMulticastIpv6::try_from(ipv6) + .map_err(|e| Error::invalid_request(e.to_string())), IpAddr::V4(_) => Err(Error::invalid_request( "underlay multicast groups must use IPv6 addresses", )), @@ -179,14 +180,36 @@ impl MulticastDataplaneClient { fn select_one_switch( &self, ) -> MulticastDataplaneResult<(&SwitchSlot, &dpd_client::Client)> { - let mut switches: Vec<_> = self.dpd_clients.iter().collect(); - switches.sort_by_key(|(loc, _)| *loc); - switches - .into_iter() - .next() + self.dpd_clients + .iter() + .min_by_key(|(loc, _)| *loc) .ok_or_else(|| Error::internal_error("no DPD clients available")) } + /// Compute DPD source filter from aggregated member source state. + /// + /// For SSM addresses, always returns specific sources. For ASM addresses, + /// returns `None` (any source) if any member omitted sources, otherwise + /// returns the union of all member sources. + fn compute_sources_for_dpd( + external_group_ip: IpAddr, + source_filter: &SourceFilterState, + ) -> Option> { + if is_ssm_address(external_group_ip) + || !source_filter.has_any_source_member + { + Some( + source_filter + .specific_sources + .iter() + .map(|ip| dpd_client::types::IpSrc::Exact(*ip)) + .collect(), + ) + } else { + None + } + } + async fn dpd_ensure_underlay_created( &self, client: &dpd_client::Client, @@ -413,33 +436,9 @@ impl MulticastDataplaneClient { inner_mac: MacAddr { a: underlay_ipv6.derive_multicast_mac() }, vni: Vni::from(u32::from(external_group.vni.0)), }; - let external_group_ip = external_group.multicast_ip.ip(); - - // Source filtering per RFC 4607: - // - SSM (232/8, ff3x::/32): always use specific sources. API - // validation prevents SSM joins without sources. - // - ASM: use specific sources when all members specify sources, - // otherwise None to allow any source at the switch level. - let sources_dpd = if is_ssm_address(external_group_ip) { - Some( - source_filter - .specific_sources - .iter() - .map(|ip| IpSrc::Exact(*ip)) - .collect::>(), - ) - } else if source_filter.has_any_source_member { - None - } else { - Some( - source_filter - .specific_sources - .iter() - .map(|ip| IpSrc::Exact(*ip)) - .collect::>(), - ) - }; + let sources_dpd = + Self::compute_sources_for_dpd(external_group_ip, source_filter); let create_operations = dpd_clients.into_iter().map(|(switch_slot, client)| { @@ -570,36 +569,12 @@ impl MulticastDataplaneClient { inner_mac: MacAddr { a: underlay_ipv6.derive_multicast_mac() }, vni: Vni::from(u32::from(params.external_group.vni.0)), }; - let new_name_str = params.new_name.to_string(); let external_group_ip = params.external_group.multicast_ip.ip(); - - // Source filtering per RFC 4607: - // - SSM (232/8, ff3x::/32): always use specific sources. API - // validation prevents SSM joins without sources. - // - ASM: use specific sources when all members specify sources, - // otherwise None to allow any source at the switch level. - let sources_dpd = if is_ssm_address(external_group_ip) { - Some( - params - .source_filter - .specific_sources - .iter() - .map(|ip| IpSrc::Exact(*ip)) - .collect::>(), - ) - } else if params.source_filter.has_any_source_member { - None - } else { - Some( - params - .source_filter - .specific_sources - .iter() - .map(|ip| IpSrc::Exact(*ip)) - .collect::>(), - ) - }; + let sources_dpd = Self::compute_sources_for_dpd( + external_group_ip, + params.source_filter, + ); let update_operations = dpd_clients.into_iter().map(|(switch_slot, client)| { diff --git a/nexus/src/app/multicast/mod.rs b/nexus/src/app/multicast/mod.rs index 629d1253c89..a5eea32b8d4 100644 --- a/nexus/src/app/multicast/mod.rs +++ b/nexus/src/app/multicast/mod.rs @@ -61,7 +61,9 @@ use nexus_db_queries::db::datastore::multicast::ExternalMulticastGroupWithSource use nexus_db_queries::{authz, db}; use nexus_types::external_api::multicast; use nexus_types::multicast::MulticastGroupCreate; -use omicron_common::address::is_ssm_address; +use omicron_common::address::{ + MAX_SOURCE_IPS_PER_GROUP, MAX_SOURCE_IPS_PER_MEMBER, is_ssm_address, +}; use omicron_common::api::external::{ self, CreateResult, DataPageParams, DeleteResult, IdentityMetadataCreateParams, ListResultVec, LookupResult, @@ -70,6 +72,7 @@ use omicron_common::api::external::{ use omicron_uuid_kinds::{GenericUuid, InstanceUuid, MulticastGroupUuid}; pub(crate) mod dataplane; +pub(crate) mod sled; /// Validate that SSM addresses have source IPs. /// @@ -111,6 +114,42 @@ pub(crate) fn validate_ssm_sources( Ok(()) } +/// Validate per-member source IP list shape. +/// +/// Applies whenever a member declares source IPs, irrespective of SSM or ASM +/// group semantics. Enforces: +/// +/// - At most [`MAX_SOURCE_IPS_PER_MEMBER`] entries +/// - No duplicates (rejected explicitly rather than silently deduplicated, so +/// client bugs are surfaced and downstream consumers can assume the list is +/// canonical) +pub(crate) fn validate_member_source_ips( + source_ips: Option<&[std::net::IpAddr]>, +) -> Result<(), external::Error> { + let Some(sources) = source_ips else { + return Ok(()); + }; + if sources.is_empty() { + return Ok(()); + } + let count = sources.len(); + if count > MAX_SOURCE_IPS_PER_MEMBER { + return Err(external::Error::invalid_request(format!( + "membership source IP count {count} exceeds per-member limit \ + of {MAX_SOURCE_IPS_PER_MEMBER}", + ))); + } + let mut seen = std::collections::BTreeSet::new(); + for ip in sources { + if !seen.insert(*ip) { + return Err(external::Error::invalid_request(format!( + "duplicate source IP {ip} in membership request", + ))); + } + } + Ok(()) +} + impl super::Nexus { /// Look up a fleet-scoped multicast group by name, ID, or IP address. /// @@ -351,6 +390,10 @@ impl super::Nexus { ))); } + // Per-member source IP shape (count + duplicate) check runs once + // up front, independent of group resolution. + validate_member_source_ips(source_ips)?; + // Find or create the group based on identifier type. // SSM validation happens inside resolve functions. let group_id = match group_identifier { @@ -372,6 +415,13 @@ impl super::Nexus { } }; + // Preflight per-group source IP union cap for a descriptive 400 in + // the non-racing common case. The datastore CTE enforces the same + // bound atomically inside `multicast_group_member_attach_to_instance`. + if let Some(sources) = source_ips.filter(|s| !s.is_empty()) { + self.validate_group_source_union(opctx, group_id, sources).await?; + } + // Attach the member with its source IPs let member = self .db_datastore @@ -595,6 +645,36 @@ impl super::Nexus { Ok(MulticastGroupUuid::from_untyped_uuid(db_group.identity.id)) } + /// Preflight check that the union of existing member source IPs and + /// `proposed` for `group_id` stays within + /// [`MAX_SOURCE_IPS_PER_GROUP`]. + async fn validate_group_source_union( + &self, + opctx: &OpContext, + group_id: MulticastGroupUuid, + proposed: &[IpAddr], + ) -> Result<(), external::Error> { + let filter_state = self + .db_datastore + .multicast_groups_source_filter_state(opctx, &[group_id]) + .await?; + let mut union = filter_state + .get(&group_id.into_untyped_uuid()) + .map(|s| s.specific_sources.clone()) + .unwrap_or_default(); + union.extend(proposed.iter().copied()); + if union.len() > MAX_SOURCE_IPS_PER_GROUP { + return Err(external::Error::invalid_request(format!( + "adding {} source IP(s) would push group source union to \ + {}, exceeding per-group cap of {}", + proposed.len(), + union.len(), + MAX_SOURCE_IPS_PER_GROUP, + ))); + } + Ok(()) + } + /// Resolve a multicast group identifier to a UUID (lookup only). /// /// This is a lookup that does not create groups or perform validation. @@ -887,4 +967,32 @@ mod tests { 0xff1e, 0, 0, 0, 0, 0, 0, 1 )))); } + + #[test] + fn test_generate_group_name_from_ip() { + let v4 = IpAddr::V4(Ipv4Addr::new(224, 1, 2, 3)); + assert_eq!( + generate_group_name_from_ip(v4).unwrap().as_str(), + "mcast-224-1-2-3" + ); + + let v4_zeros = IpAddr::V4(Ipv4Addr::new(224, 0, 0, 1)); + assert_eq!( + generate_group_name_from_ip(v4_zeros).unwrap().as_str(), + "mcast-224-0-0-1" + ); + + let v6: IpAddr = IpAddr::V6(Ipv6Addr::new(0xff0e, 0, 0, 0, 0, 0, 0, 1)); + assert_eq!( + generate_group_name_from_ip(v6).unwrap().as_str(), + "mcast-ff0e-0-0-0-0-0-0-1" + ); + + let v6_ssm: IpAddr = + IpAddr::V6(Ipv6Addr::new(0xff3e, 0, 0, 0, 0, 0, 0, 0xabcd)); + assert_eq!( + generate_group_name_from_ip(v6_ssm).unwrap().as_str(), + "mcast-ff3e-0-0-0-0-0-0-abcd" + ); + } } diff --git a/nexus/src/app/multicast/sled.rs b/nexus/src/app/multicast/sled.rs new file mode 100644 index 00000000000..60d93d4ba8b --- /dev/null +++ b/nexus/src/app/multicast/sled.rs @@ -0,0 +1,555 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Sled-agent multicast operations for OPTE subscriptions, M2P mappings, +//! and forwarding entries. +//! +//! Parallel to [`dataplane`] which handles DPD switch operations, this +//! module manages sled-local multicast state via sled-agent: +//! +//! - **OPTE subscriptions**: Per-VMM multicast group filters on the +//! hosting sled +//! - **M2P mappings**: Overlay multicast IP to underlay IPv6 address +//! translation, installed on all sleds +//! - **Forwarding entries**: Underlay multicast address to switch next-hop, +//! installed on all sleds so OPTE forwards to the switch for replication +//! +//! [`dataplane`]: super::dataplane + +use std::collections::BTreeSet; +use std::collections::hash_map::DefaultHasher; +use std::hash::{Hash, Hasher}; +use std::net::{IpAddr, Ipv6Addr}; +use std::sync::Arc; + +use anyhow::Context; +use slog::{debug, info, warn}; + +use nexus_db_model::{ + MulticastGroup, MulticastGroupMember, MulticastGroupMemberState, +}; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::DataStore; +use nexus_types::deployment::SledFilter; +use nexus_types::identity::{Asset, Resource}; +use omicron_common::api::external::DataPageParams; +use omicron_uuid_kinds::{ + GenericUuid, InstanceUuid, MulticastGroupUuid, PropolisUuid, SledUuid, +}; +use sled_agent_client::types::{ + ClearMcast2Phys, ClearMcastForwarding, Mcast2PhysMapping, McastFilterMode, + McastForwardingEntry, McastForwardingNextHop, McastReplication, + McastSourceFilter, +}; + +/// Utility methods for sled-agent multicast operations used by the +/// background task reconciler. +/// +/// Groups sled-agent HTTP calls (OPTE subscriptions, M2P mappings, +/// forwarding entries) behind a single type to keep the reconciler +/// logic focused on state transitions rather than client construction. +/// +/// Unlike [`MulticastDataplaneClient`] which pre-builds per-switch +/// clients, sled clients are constructed on demand since the target +/// sled set varies per group. +/// +/// [`MulticastDataplaneClient`]: super::dataplane::MulticastDataplaneClient +pub(crate) struct MulticastSledClient { + datastore: Arc, + resolver: internal_dns_resolver::Resolver, +} + +impl MulticastSledClient { + pub(crate) fn new( + datastore: Arc, + resolver: internal_dns_resolver::Resolver, + ) -> Self { + Self { datastore, resolver } + } + + /// Create a sled-agent client for the given sled. + /// + /// Looks up the sled's address in the database and constructs an HTTP + /// client. Follows the same pattern as V2P mapping propagation. + async fn sled_client( + &self, + opctx: &OpContext, + sled_id: SledUuid, + ) -> Result + { + nexus_networking::sled_client( + &self.datastore, + opctx, + sled_id, + &opctx.log, + ) + .await + } + + /// Look up the current `propolis_id` for an instance. + async fn lookup_propolis_id( + &self, + opctx: &OpContext, + instance_id: InstanceUuid, + ) -> Result, anyhow::Error> { + let instance_state = self + .datastore + .instance_get_state(opctx, &instance_id) + .await + .context("failed to look up instance state")?; + + Ok(instance_state + .and_then(|s| s.propolis_id) + .map(PropolisUuid::from_untyped_uuid)) + } + + /// Build the membership descriptor sent to sled-agent for + /// subscribe/unsubscribe calls. + fn membership_for( + group: &MulticastGroup, + member: &MulticastGroupMember, + ) -> sled_agent_client::types::InstanceMulticastMembership { + sled_agent_client::types::InstanceMulticastMembership { + group_ip: group.multicast_ip.ip(), + sources: member.source_ips.iter().map(|s| s.ip()).collect(), + } + } + + /// Subscribe a VMM to a multicast group via sled-agent. + /// + /// Looks up the instance's current `propolis_id` and calls the sled-agent + /// endpoint to configure OPTE port-level multicast filters. The member's + /// per-instance source IPs are passed for SSM filtering. + pub(crate) async fn subscribe_vmm( + &self, + opctx: &OpContext, + group: &MulticastGroup, + member: &MulticastGroupMember, + sled_id: SledUuid, + cached_propolis_id: Option, + ) -> Result<(), anyhow::Error> { + let instance_id = InstanceUuid::from_untyped_uuid(member.parent_id); + // If the instance has no propolis_id (already stopped/destroyed), + // the OPTE port is gone and there's nothing to subscribe. + let propolis_id = match cached_propolis_id { + Some(id) => id, + None => match self.lookup_propolis_id(opctx, instance_id).await? { + Some(id) => id, + None => { + debug!( + opctx.log, + "no propolis_id for instance, skipping subscribe"; + "member_id" => %member.id, + "instance_id" => %instance_id + ); + return Ok(()); + } + }, + }; + + let client = self + .sled_client(opctx, sled_id) + .await + .context("failed to create sled-agent client")?; + + let membership = Self::membership_for(group, member); + + client + .vmm_join_multicast_group(&propolis_id, &membership) + .await + .context("sled-agent vmm_join_multicast_group call failed")?; + + debug!( + opctx.log, + "subscribed VMM to multicast group via sled-agent"; + "member_id" => %member.id, + "propolis_id" => %propolis_id, + "sled_id" => %sled_id, + "group_ip" => %group.multicast_ip + ); + + Ok(()) + } + + /// Unsubscribe a VMM from a multicast group via sled-agent. + /// + /// Best-effort since if the VMM or sled is already gone, the unsubscribe + /// is effectively a no-op since the OPTE port was destroyed. + pub(crate) async fn unsubscribe_vmm( + &self, + opctx: &OpContext, + group: &MulticastGroup, + member: &MulticastGroupMember, + sled_id: SledUuid, + cached_propolis_id: Option, + ) -> Result<(), anyhow::Error> { + let instance_id = InstanceUuid::from_untyped_uuid(member.parent_id); + + // If the instance has no propolis_id (already stopped/destroyed), + // the OPTE port is gone and there's nothing to unsubscribe. + let propolis_id = match cached_propolis_id { + Some(id) => id, + None => match self.lookup_propolis_id(opctx, instance_id).await? { + Some(id) => id, + None => { + debug!( + opctx.log, + "no propolis_id for instance, skipping unsubscribe"; + "member_id" => %member.id, + "instance_id" => %instance_id + ); + return Ok(()); + } + }, + }; + + let client = self + .sled_client(opctx, sled_id) + .await + .context("failed to create sled-agent client")?; + + let membership = Self::membership_for(group, member); + + client + .vmm_leave_multicast_group(&propolis_id, &membership) + .await + .context("sled-agent vmm_leave_multicast_group call failed")?; + + debug!( + opctx.log, + "unsubscribed VMM from multicast group via sled-agent"; + "member_id" => %member.id, + "propolis_id" => %propolis_id, + "sled_id" => %sled_id, + "group_ip" => %group.multicast_ip + ); + + Ok(()) + } + + /// Propagate M2P mappings and forwarding entries to all VPC-routing sleds. + /// + /// Performs convergent per-sled propagation: each sled's current state + /// is queried and diffed against desired state. New entries are added + /// and stale state is removed (member leaves, instance stops). When no + /// joined members remain, every sled has stale state and it is cleared. + /// + /// # Scope + /// + /// M2P mappings and forwarding entries are pushed to all VPC-routing + /// sleds, not just member sleds. Any instance on any sled may send to + /// a multicast group address. Hence, without the M2P mapping, OPTE's + /// overlay layer silently drops the packet. Forwarding entries point + /// each sled at a switch, which replicates to member ports via DPD + /// multicast group config. Subscriptions (per-port group membership) remain + /// member-sled-only. + pub(crate) async fn propagate_m2p_and_forwarding( + &self, + opctx: &OpContext, + group: &MulticastGroup, + ) -> Result<(), anyhow::Error> { + let underlay_group_id = group + .underlay_group_id + .context("group missing underlay_group_id")?; + + let underlay_group = self + .datastore + .underlay_multicast_group_fetch(opctx, underlay_group_id) + .await + .context("failed to fetch underlay group")?; + + let underlay_ip = match underlay_group.multicast_ip.ip() { + IpAddr::V6(v6) => v6, + other => anyhow::bail!( + "underlay multicast address for group {} is {other}, expected IPv6", + group.id() + ), + }; + + let group_ip = group.multicast_ip.ip(); + + // Compute desired state from DB, determining which sleds should have + // M2P and forwarding entries for this group. + let group_id = MulticastGroupUuid::from_untyped_uuid(group.id()); + let members = self + .datastore + .multicast_group_members_list( + opctx, + group_id, + &DataPageParams::max_page(), + ) + .await + .context("failed to list group members")?; + + let member_sled_ids: BTreeSet = members + .iter() + .filter(|m| m.state == MulticastGroupMemberState::Joined) + .filter_map(|m| m.sled_id.map(SledUuid::from)) + .collect(); + + // Build desired M2P entry. + let desired_m2p = + Mcast2PhysMapping { group: group_ip, underlay: underlay_ip }; + + // The group is active if any members are "Joined". M2P and + // forwarding are pushed to all sleds when active, cleared + // from all sleds when inactive. + let group_is_active = !member_sled_ids.is_empty(); + + // Query all VPC-routing sleds for current state and converge. + let all_sleds = self + .datastore + .sled_list_all_batched(opctx, SledFilter::VpcRouting) + .await + .context("failed to enumerate sleds")?; + + // Select one of the available switches as the forwarding next hop. + // + // OPTE treats each next hop as a duplication it performs itself, so + // pointing at individual member sleds would cause O(n) copies over + // cxgbe per sender. + // + // A single switch next hop means one copy to the switch, which + // replicates to member sled ports via DPD multicast group membership. + // ECMP over both switches is the more correct longer-term answer, + // but OPTE and mgd lack the tooling to express that today. + let switch_zone_addrs = crate::app::switch_zone_address_mappings( + &self.resolver, + &opctx.log, + ) + .await + .map_err(|e| anyhow::anyhow!(e)) + .context("failed to resolve switch zone addresses")?; + + // Hash the group UUID to distribute switch selection across both + // switches. All Nexuses compute the same hash for a given group, + // so they agree on the mapping without coordination. + let mut hasher = DefaultHasher::new(); + group_id.hash(&mut hasher); + let idx = (hasher.finish() as usize) % switch_zone_addrs.len(); + let switch_ip = switch_zone_addrs + .iter() + .nth(idx) + .map(|(_, ip)| *ip) + .context("no switch zone found for forwarding next hop")?; + + let convergence_params = GroupConvergenceParams { + group_ip, + underlay_ip, + group_is_active, + desired_m2p: &desired_m2p, + switch_ip, + }; + + let mut failed_sleds: usize = 0; + + for sled in &all_sleds { + let sled_id: SledUuid = sled.id(); + let client = match self.sled_client(opctx, sled_id).await { + Ok(c) => c, + Err(e) => { + warn!( + opctx.log, + "failed to create sled-agent client for \ + M2P/forwarding convergence"; + "sled_id" => %sled_id, + "error" => %e + ); + failed_sleds += 1; + continue; + } + }; + + if let Err(e) = + converge_sled_m2p_and_forwarding(&client, &convergence_params) + .await + { + warn!( + opctx.log, + "failed to converge M2P/forwarding on sled"; + "sled_id" => %sled_id, + "group_ip" => %group_ip, + "error" => %e + ); + failed_sleds += 1; + } + } + + info!( + opctx.log, + "converged M2P and forwarding state"; + "group_id" => %group.id(), + "group_ip" => %group_ip, + "underlay_ip" => %underlay_ip, + "member_sleds" => member_sled_ids.len(), + "total_sleds_checked" => all_sleds.len(), + "failed_sleds" => failed_sleds + ); + + if failed_sleds > 0 { + anyhow::bail!( + "failed to converge M2P/forwarding: \ + {failed_sleds} sled convergence failures \ + (out of {} sleds)", + all_sleds.len() + ); + } + + Ok(()) + } + + /// Clear M2P mappings and forwarding entries from all sleds for + /// this group. + /// + /// Delegates to the convergent [`propagate_m2p_and_forwarding`] which + /// will detect that no joined members remain and clear stale state + /// from all sleds. + /// + /// [`propagate_m2p_and_forwarding`]: Self::propagate_m2p_and_forwarding + pub(crate) async fn clear_m2p_and_forwarding( + &self, + opctx: &OpContext, + group: &MulticastGroup, + ) -> Result<(), anyhow::Error> { + self.propagate_m2p_and_forwarding(opctx, group).await + } +} + +/// Resolved group state used to converge M2P and forwarding on each sled. +struct GroupConvergenceParams<'a> { + group_ip: IpAddr, + underlay_ip: Ipv6Addr, + group_is_active: bool, + desired_m2p: &'a Mcast2PhysMapping, + /// Switch zone underlay IP chosen as the forwarding next hop. + /// The switch replicates to member sled ports via DPD config. + switch_ip: Ipv6Addr, +} + +/// Per-sled convergence of M2P and forwarding state. +/// +/// # Errors +/// +/// Returns an error when any sled-agent RPC fails (list, set, or clear). +/// The caller increments `failed_sleds` and continues to the next sled. +async fn converge_sled_m2p_and_forwarding( + client: &sled_agent_client::Client, + params: &GroupConvergenceParams<'_>, +) -> Result<(), anyhow::Error> { + converge_m2p(client, params).await?; + converge_forwarding(client, params).await?; + Ok(()) +} + +/// Converge a single sled's M2P mapping for one group. +/// +/// Sets the mapping when the group is active and missing, clears it +/// when the group is inactive and present. Already-correct state +/// is left alone. +async fn converge_m2p( + client: &sled_agent_client::Client, + params: &GroupConvergenceParams<'_>, +) -> Result<(), anyhow::Error> { + let found = client + .list_mcast_m2p() + .await + .context("failed to list M2P mappings on sled")? + .into_inner(); + + let has_m2p = found.iter().any(|m| { + m.group == params.group_ip && m.underlay == params.underlay_ip + }); + + match (params.group_is_active, has_m2p) { + // Active group missing M2P: install it. + (true, false) => { + client + .set_mcast_m2p(params.desired_m2p) + .await + .context("failed to add M2P mapping to sled")?; + } + // Inactive group has stale M2P: remove it. + (false, true) => { + let clear = ClearMcast2Phys { + group: params.group_ip, + underlay: params.underlay_ip, + }; + client + .clear_mcast_m2p(&clear) + .await + .context("failed to clear stale M2P from sled")?; + } + // Already converged. + _ => {} + } + + Ok(()) +} + +/// Converge a single sled's forwarding entries for one group. +/// +/// When the group is active, this sets a single next hop to the switch +/// zone. The switch replicates to member sled ports via its DPD +/// multicast group membership. When inactive, this clears any stale +/// entries. +async fn converge_forwarding( + client: &sled_agent_client::Client, + params: &GroupConvergenceParams<'_>, +) -> Result<(), anyhow::Error> { + let found = client + .list_mcast_fwd() + .await + .context("failed to list forwarding on sled")? + .into_inner(); + + let current_entry = found.iter().find(|f| f.underlay == params.underlay_ip); + + if !params.group_is_active { + if current_entry.is_some() { + let clear = ClearMcastForwarding { underlay: params.underlay_ip }; + client + .clear_mcast_fwd(&clear) + .await + .context("failed to clear stale forwarding from sled")?; + } + return Ok(()); + } + + let desired_next_hops = vec![McastForwardingNextHop { + next_hop: params.switch_ip, + replication: McastReplication::Underlay, + filter: McastSourceFilter { + mode: McastFilterMode::Exclude, + sources: Vec::new(), + }, + }]; + + let needs_update = match current_entry { + Some(f) => f.next_hops != desired_next_hops, + None => true, + }; + + if needs_update { + // OPTE's set_mcast_fwd handler is additive: it inserts next + // hops but never removes stale ones. Clear first so the + // subsequent set produces an exact replacement. + if current_entry.is_some() { + let clear = ClearMcastForwarding { underlay: params.underlay_ip }; + client + .clear_mcast_fwd(&clear) + .await + .context("failed to clear forwarding before update")?; + } + let desired_fwd = McastForwardingEntry { + underlay: params.underlay_ip, + next_hops: desired_next_hops, + }; + client + .set_mcast_fwd(&desired_fwd) + .await + .context("failed to set forwarding on sled")?; + } + + Ok(()) +} diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index b0fa2ab8408..0152fc20156 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -73,6 +73,7 @@ use nexus_types::external_api::user::{Group, User, UserBuiltin}; use nexus_types::external_api::vpc::{Vpc, VpcRouter, VpcSubnet}; use nexus_types_versions::latest::headers::RangeRequest; use nexus_types_versions::v2025_11_20_00; +use nexus_types_versions::v2026_01_08_00; use omicron_common::address::IpRange; use omicron_common::api::external::AddressLot; use omicron_common::api::external::AddressLotBlock; @@ -5289,6 +5290,32 @@ impl NexusExternalApi for NexusExternalApiImpl { .await } + // Pre-MULTICAST_SOURCE_LIMITS version: same types as the latest variant + // (re-exported through `latest::`), so delegate directly. The behavioral + // difference (per-member and per-group source IP caps) is enforced + // unconditionally in the Nexus app layer. + async fn instance_multicast_group_join_v2026_01_08_00( + rqctx: RequestContext, + path_params: Path< + v2026_01_08_00::multicast::InstanceMulticastGroupPath, + >, + query_params: Query, + body_params: TypedBody< + v2026_01_08_00::multicast::InstanceMulticastGroupJoin, + >, + ) -> Result< + HttpResponseCreated, + HttpError, + > { + Self::instance_multicast_group_join( + rqctx, + path_params, + query_params, + body_params, + ) + .await + } + // Cannot delegate to lib.rs: old API version has no body parameter, but the // new `instance_multicast_group_join` requires `TypedBody`. // TypedBody has no public constructor, so we can't create a default body for delegation. diff --git a/nexus/tests/integration_tests/multicast/groups.rs b/nexus/tests/integration_tests/multicast/groups.rs index 6f810eb04d7..9ae620b7dc2 100644 --- a/nexus/tests/integration_tests/multicast/groups.rs +++ b/nexus/tests/integration_tests/multicast/groups.rs @@ -866,14 +866,6 @@ async fn test_cannot_delete_multicast_pool_with_groups( "IP Pool cannot be deleted while it contains IP ranges" ); - // Verify we can't unlink the pool from the silo while groups are - // allocated from it. - let unlink_url = format!( - "/v1/system/ip-pools/{pool_name}/silos/{}", - DEFAULT_SILO.name().as_str() - ); - object_delete_error(client, &unlink_url, StatusCode::BAD_REQUEST).await; - cleanup_instances(cptestctx, client, project_name, &[instance_name]).await; wait_for_group_deleted(cptestctx, group_name).await; @@ -890,9 +882,6 @@ async fn test_cannot_delete_multicast_pool_with_groups( "Should be able to delete range after groups are implicitly deleted", ); - // And we can unlink the pool from the silo - object_delete(client, &unlink_url).await; - // And now we should be able to delete the pool NexusRequest::object_delete(client, &pool_url) .authn_as(AuthnMode::PrivilegedUser) diff --git a/nexus/tests/integration_tests/multicast/instances.rs b/nexus/tests/integration_tests/multicast/instances.rs index 245e284248e..521d85d0405 100644 --- a/nexus/tests/integration_tests/multicast/instances.rs +++ b/nexus/tests/integration_tests/multicast/instances.rs @@ -377,7 +377,7 @@ async fn test_multicast_group_attach_conflicts( } #[nexus_test] -async fn test_multicast_group_attach_limits( +async fn test_multicast_group_attach_multiple( cptestctx: &ControlPlaneTestContext, ) { let client = &cptestctx.external_client; @@ -390,14 +390,8 @@ async fn test_multicast_group_attach_limits( ) .await; - // Group names for implicit groups (implicitly created when first member joins) - let group_names = [ - "limit-test-group-0", - "limit-test-group-1", - "limit-test-group-2", - "limit-test-group-3", - "limit-test-group-4", - ]; + let group_names = + ["limit-test-group-0", "limit-test-group-1", "limit-test-group-2"]; // Create instance first (groups will be implicitly created when attached) let instance = instance_for_multicast_groups( @@ -409,8 +403,8 @@ async fn test_multicast_group_attach_limits( ) .await; - // Attach instance to 3 groups (implicitly creates each group) - let multicast_group_names = &group_names[0..3]; + // Attach instance to multiple groups (implicitly creates each group) + let multicast_group_names = &group_names; for group_name in multicast_group_names { multicast_group_attach( cptestctx, @@ -585,29 +579,23 @@ async fn test_multicast_concurrent_operations( // Wait for final state to be consistent (should still have 2 members) wait_for_member_count(client, "concurrent-test-group", 2).await; - // Concurrent operations during reconciler processing - - // Start a member addition and immediately follow with another operation - // This tests handling of operations that arrive while reconciler is processing - let rapid_ops_future = async { - multicast_group_attach( - cptestctx, - PROJECT_NAME, - "concurrent-instance-3", - "concurrent-test-group", - ) - .await; - // Don't wait for reconciler; immediately do another operation - multicast_group_detach( - client, - PROJECT_NAME, - "concurrent-instance-4", - "concurrent-test-group", - ) - .await; - }; - - rapid_ops_future.await; + // Back-to-back operations without waiting for reconciler between them. + // Tests that the reconciler handles state changes that arrive while it + // is still processing a previous batch. + multicast_group_attach( + cptestctx, + PROJECT_NAME, + "concurrent-instance-3", + "concurrent-test-group", + ) + .await; + multicast_group_detach( + client, + PROJECT_NAME, + "concurrent-instance-4", + "concurrent-test-group", + ) + .await; // Wait for system to reach consistent final state (should have 2 members) wait_for_member_count(client, "concurrent-test-group", 2).await; @@ -896,6 +884,94 @@ async fn test_multicast_migration_scenarios( .await .expect("Group should exist in DPD after migration"); + // Verify sled-agent state after migration: the target sled should + // have the VMM subscription and M2P mapping. The source sled should + // not have any subscription for the old propolis. + { + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + + let external_group = datastore + .multicast_group_lookup_by_ip(&opctx, multicast_ip) + .await + .expect("Should look up multicast group by IP"); + + let underlay_group_id = external_group + .underlay_group_id + .expect("Active group should have underlay_group_id"); + + let underlay_group = datastore + .underlay_multicast_group_fetch(&opctx, underlay_group_id) + .await + .expect("Should fetch underlay group"); + + let underlay_ipv6 = match underlay_group.multicast_ip.ip() { + IpAddr::V6(v6) => v6, + other => { + panic!("Expected IPv6 underlay address, got {other}") + } + }; + + // Target sled should have the VMM subscription after the + // reconciler pushes it via verify_members. Poll because the + // reconciler may still be propagating state to the sled-agent. + let post_info = nexus + .active_instance_info(&instance1_id, None) + .await + .unwrap() + .unwrap(); + + let target_agent = cptestctx + .sled_agents + .iter() + .find(|sa| sa.sled_agent_id() == target_sled) + .unwrap() + .sled_agent(); + + wait_for_condition_with_reconciler( + &cptestctx.lockstep_client, + || async { + let groups = target_agent.multicast_groups.lock().unwrap(); + let has_sub = + groups.get(&post_info.propolis_id).map_or(false, |g| { + g.iter().any(|m| m.group_ip == multicast_ip) + }); + if has_sub { Ok(()) } else { Err(CondCheckError::NotYet::<()>) } + }, + &POLL_INTERVAL, + &POLL_TIMEOUT, + ) + .await + .expect("Target sled should have VMM subscription after migration"); + + // Target sled should have M2P mapping. + wait_for_condition_with_reconciler( + &cptestctx.lockstep_client, + || async { + let m2p = target_agent.m2p_mappings.lock().unwrap(); + if m2p.contains(&(multicast_ip, underlay_ipv6)) { + Ok(()) + } else { + Err(CondCheckError::NotYet::<()>) + } + }, + &POLL_INTERVAL, + &POLL_TIMEOUT, + ) + .await + .expect("Target sled should have M2P mapping after migration"); + + // TODO: assert the source sled no longer holds a multicast + // subscription for the old propolis_id. On real hardware, + // VMM teardown (release_opte_ports -> PortTicket::release_inner) + // clears it. The sim does not model per-propolis cleanup on + // unregister for any of the networking maps (external_ips, + // attached_subnets, multicast_groups). + } + // Case: Concurrent migrations let group2_name = "concurrent-migration-group"; @@ -911,7 +987,9 @@ async fn test_multicast_migration_scenarios( group2_name, ) .await; + wait_for_group_active(client, group2_name).await; + multicast_group_attach( cptestctx, project_name, @@ -1787,14 +1865,13 @@ async fn test_multicast_ipv6_lifecycle(cptestctx: &ControlPlaneTestContext) { instance_wait_for_state(client, instance_id, InstanceState::Running).await; wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; - let member_joined = wait_for_member_state( + wait_for_member_state( cptestctx, group_name, instance.identity.id, nexus_db_model::MulticastGroupMemberState::Joined, ) .await; - assert_eq!(member_joined.state, "Joined"); // Stop the instance - member should transition to "Left" let stop_url = @@ -1813,14 +1890,13 @@ async fn test_multicast_ipv6_lifecycle(cptestctx: &ControlPlaneTestContext) { instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; - let member_left = wait_for_member_state( + wait_for_member_state( cptestctx, group_name, instance.identity.id, nexus_db_model::MulticastGroupMemberState::Left, ) .await; - assert_eq!(member_left.state, "Left"); // Delete the instance - this should delete the group since it's the only member cleanup_instances(cptestctx, client, project_name, &["ipv6-instance"]) diff --git a/nexus/tests/integration_tests/multicast/mod.rs b/nexus/tests/integration_tests/multicast/mod.rs index cc3c947008c..742ea6df22d 100644 --- a/nexus/tests/integration_tests/multicast/mod.rs +++ b/nexus/tests/integration_tests/multicast/mod.rs @@ -69,6 +69,7 @@ mod pool_selection; // Timeout constants for test operations const POLL_INTERVAL: Duration = Duration::from_millis(50); +const POLL_TIMEOUT: Duration = Duration::from_secs(30); const MULTICAST_OPERATION_TIMEOUT: Duration = Duration::from_secs(120); /// Generic helper for PUT upsert requests that return 201 Created. @@ -211,6 +212,11 @@ pub(crate) async fn create_multicast_ip_pool_v6( pool } +/// The reconciler can take longer than the default 10s timeout under +/// parallel test load, especially after the CRDB graceful-shutdown +/// change (eb8ae2f8f). 30s matches other heavy background task timeouts. +const RECONCILER_ACTIVATION_TIMEOUT: Duration = Duration::from_secs(30); + /// Waits for the multicast group reconciler to complete. /// /// This wraps wait_background_task with the correct task name. @@ -231,9 +237,10 @@ pub(crate) async fn wait_for_multicast_reconciler( pub(crate) async fn activate_multicast_reconciler( lockstep_client: &ClientTestContext, ) -> nexus_lockstep_client::types::BackgroundTask { - nexus_test_utils::background::activate_background_task( + nexus_test_utils::background::activate_background_task_with_timeout( lockstep_client, "multicast_reconciler", + RECONCILER_ACTIVATION_TIMEOUT, ) .await } @@ -307,8 +314,8 @@ where /// This function verifies that inventory has SP data for EVERY in-service sled, /// not just that inventory completed. /// -/// This is required for multicast member operations which map `sled_id` → `sp_slot` -/// → switch ports via inventory. +/// This is required for multicast member operations which map `sled_id` to +/// `sp_slot` to switch ports via inventory. pub(crate) async fn ensure_inventory_ready( cptestctx: &ControlPlaneTestContext, ) { @@ -358,9 +365,8 @@ pub(crate) async fn ensure_inventory_ready( let mut missing_sleds = Vec::new(); for sled in &sleds { let has_sp = inventory.sps.iter().any(|(bb, _)| { - (bb.serial_number == sled.serial_number() - && bb.part_number == sled.part_number()) - || bb.serial_number == sled.serial_number() + bb.serial_number == sled.serial_number() + && bb.part_number == sled.part_number() }); if !has_sp { @@ -385,8 +391,8 @@ pub(crate) async fn ensure_inventory_ready( Err(CondCheckError::::NotYet) } }, - &Duration::from_millis(500), // Check every 500ms - &Duration::from_secs(120), // Wait up to 120s + &Duration::from_millis(500), + &MULTICAST_OPERATION_TIMEOUT, ) .await { @@ -448,8 +454,8 @@ pub(crate) async fn ensure_dpd_ready(cptestctx: &ControlPlaneTestContext) { } } }, - &Duration::from_millis(200), // Check every 200ms - &Duration::from_secs(30), // Wait up to 30 seconds for switches + &Duration::from_millis(200), + &POLL_TIMEOUT, ) .await { @@ -1067,19 +1073,16 @@ pub(crate) async fn wait_for_group_deleted( lockstep_client, || async { let group_url = mcast_group_url(group_name); - match NexusRequest::object_get(client, &group_url) - .authn_as(AuthnMode::PrivilegedUser) - .execute() - .await - { - Ok(response) => { - if response.status == StatusCode::NOT_FOUND { - Ok(()) - } else { - Err(CondCheckError::<()>::NotYet) - } - } - Err(_) => Ok(()), // Assume 404 or similar error means deleted + let response = NexusRequest::new( + RequestBuilder::new(client, Method::GET, &group_url) + .expect_status(Some(StatusCode::NOT_FOUND)), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await; + match response { + Ok(_) => Ok(()), + Err(_) => Err(CondCheckError::<()>::NotYet), } }, &POLL_INTERVAL, diff --git a/nexus/tests/integration_tests/multicast/networking_integration.rs b/nexus/tests/integration_tests/multicast/networking_integration.rs index 3b28892ef82..f0f10a9ff15 100644 --- a/nexus/tests/integration_tests/multicast/networking_integration.rs +++ b/nexus/tests/integration_tests/multicast/networking_integration.rs @@ -8,10 +8,14 @@ //! //! - External IPs: Instances with ephemeral/floating IPs can join multicast groups //! - Floating IP attach/detach: Multicast membership unaffected by IP changes +//! - Sled-agent M2P/forwarding propagation on member join and group deletion +//! - Per-VMM multicast subscriptions via sled-agent -use std::time::Duration; +use std::net::IpAddr; use http::{Method, StatusCode}; +use nexus_db_lookup::LookupPath; +use nexus_db_queries::context::OpContext; use nexus_test_utils::http_testing::{AuthnMode, NexusRequest, RequestBuilder}; use nexus_test_utils::resource_helpers::create_floating_ip; use nexus_test_utils::resource_helpers::{ @@ -30,6 +34,7 @@ use omicron_common::api::external::{ ByteCount, IdentityMetadataCreateParams, Instance, InstanceCpuCount, NameOrId, }; +use omicron_nexus::TestInterfaces; use omicron_test_utils::dev::poll::{CondCheckError, wait_for_condition}; use omicron_uuid_kinds::{GenericUuid, InstanceUuid}; @@ -637,13 +642,13 @@ async fn test_multicast_with_floating_ip_basic( Err(CondCheckError::::NotYet) } }, - &Duration::from_millis(200), - &Duration::from_secs(30), + &POLL_INTERVAL, + &POLL_TIMEOUT, ) .await .unwrap_or_else(|e| { panic!( - "instance did not show floating IP {} as attached within 30s: {e:?}", + "instance did not show floating IP {} as attached within {POLL_TIMEOUT:?}: {e:?}", floating_ip.ip ) }); @@ -694,13 +699,13 @@ async fn test_multicast_with_floating_ip_basic( Err(CondCheckError::::NotYet) } }, - &Duration::from_millis(200), - &Duration::from_secs(30), + &POLL_INTERVAL, + &POLL_TIMEOUT, ) .await .unwrap_or_else(|e| { panic!( - "instance still showed floating IP {} as attached after 30s: {e:?}", + "instance still showed floating IP {} as attached after {POLL_TIMEOUT:?}: {e:?}", floating_ip.ip ) }); @@ -713,3 +718,977 @@ async fn test_multicast_with_floating_ip_basic( cleanup_instances(cptestctx, client, project_name, &[instance_name]).await; wait_for_group_deleted(cptestctx, group_name).await; } + +/// Verify that when an instance joins a multicast group, the reconciler +/// pushes M2P mappings, forwarding entries, and per-VMM subscriptions +/// to the sim sled-agent. Also verify cleanup on instance deletion. +#[nexus_test] +async fn test_multicast_sled_agent_m2p_and_subscriptions( + cptestctx: &nexus_test_utils::ControlPlaneTestContext< + omicron_nexus::Server, + >, +) { + let client = &cptestctx.external_client; + let project_name = "sled-agent-mcast-project"; + let group_name = "sled-agent-mcast-group"; + let instance_name = "sled-agent-mcast-instance"; + + ops::join3( + create_project(client, project_name), + create_default_ip_pools(client), + create_multicast_ip_pool_with_range( + client, + "sled-agent-mcast-pool", + (224, 150, 0, 1), + (224, 150, 0, 255), + ), + ) + .await; + + ensure_multicast_test_ready(cptestctx).await; + + // Create and start an instance. + let instance_params = InstanceCreate { + identity: IdentityMetadataCreateParams { + name: instance_name.parse().unwrap(), + description: "Instance for sled-agent multicast test".to_string(), + }, + ncpus: InstanceCpuCount::try_from(1).unwrap(), + memory: ByteCount::from_gibibytes_u32(1), + hostname: instance_name.parse().unwrap(), + user_data: vec![], + ssh_public_keys: None, + network_interfaces: InstanceNetworkInterfaceAttachment::DefaultIpv4, + external_ips: vec![], + multicast_groups: vec![], + disks: vec![], + boot_disk: None, + cpu_platform: None, + start: true, + auto_restart_policy: Default::default(), + anti_affinity_groups: Vec::new(), + }; + + let instance_url = format!("/v1/instances?project={project_name}"); + let instance: Instance = + object_create(client, &instance_url, &instance_params).await; + let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); + + instance_wait_for_running_with_simulation(cptestctx, instance_id).await; + wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; + + // Attach instance to a multicast group. + multicast_group_attach(cptestctx, project_name, instance_name, group_name) + .await; + wait_for_group_active(client, group_name).await; + + // Wait for the member to reach "Joined" state (reconciler processes it). + wait_for_member_state( + cptestctx, + group_name, + instance.identity.id, + nexus_db_model::MulticastGroupMemberState::Joined, + ) + .await; + + // Look up the underlay multicast IPv6 address for verification. + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.clone(), datastore.clone()); + + let group_view = get_multicast_group(client, group_name).await; + let multicast_ip = group_view.multicast_ip; + + let external_group = datastore + .multicast_group_lookup_by_ip(&opctx, multicast_ip) + .await + .expect("Should look up multicast group by IP"); + + let underlay_group_id = external_group + .underlay_group_id + .expect("Active group should have underlay_group_id"); + + let underlay_group = datastore + .underlay_multicast_group_fetch(&opctx, underlay_group_id) + .await + .expect("Should fetch underlay multicast group"); + + let underlay_ipv6 = match underlay_group.multicast_ip.ip() { + IpAddr::V6(v6) => v6, + other => panic!("Expected IPv6 underlay address, got {other}"), + }; + + // Verify M2P mapping on the sim sled-agent. + let sled_agent = cptestctx.first_sled_agent(); + { + let m2p = sled_agent.m2p_mappings.lock().unwrap(); + assert!( + m2p.contains(&(multicast_ip, underlay_ipv6)), + "Sled-agent should have M2P mapping ({multicast_ip}, \ + {underlay_ipv6}), got: {m2p:?}" + ); + } + + // Verify forwarding entries on the sim sled-agent. + // The forwarding entry points at a switch for replication. + { + let fwd = sled_agent.mcast_fwd.lock().unwrap(); + assert!( + fwd.contains_key(&underlay_ipv6), + "Sled-agent should have forwarding entry for {underlay_ipv6}, \ + got: {fwd:?}" + ); + let next_hops = &fwd[&underlay_ipv6]; + assert_eq!( + next_hops.len(), + 1, + "Should have 1 next_hop (a switch), got: {next_hops:?}" + ); + } + + // Verify per-VMM multicast subscription on the sim sled-agent. + { + let info = nexus + .active_instance_info(&instance_id, None) + .await + .unwrap() + .expect("Running instance should have active info"); + + let groups = sled_agent.multicast_groups.lock().unwrap(); + let vmm_groups = groups + .get(&info.propolis_id) + .expect("Sled-agent should have multicast groups for propolis"); + + assert!( + vmm_groups.iter().any(|m| m.group_ip == multicast_ip), + "VMM should be subscribed to multicast group {multicast_ip}, \ + got: {vmm_groups:?}" + ); + } + + // Stop the instance. The member transitions "Joined" -> "Left". + let stop_url = + format!("/v1/instances/{instance_name}/stop?project={project_name}"); + NexusRequest::new( + RequestBuilder::new(client, Method::POST, &stop_url) + .body(None as Option<&serde_json::Value>) + .expect_status(Some(StatusCode::ACCEPTED)), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("Should stop instance"); + + wait_for_instance_stopped(cptestctx, client, instance_id, instance_name) + .await; + + wait_for_member_state( + cptestctx, + group_name, + instance.identity.id, + nexus_db_model::MulticastGroupMemberState::Left, + ) + .await; + + // Per-VMM subscription cleanup after stop is not asserted here. + // In production, destroying the VMM tears down the OPTE port, which + // implicitly removes multicast subscriptions. The reconciler's + // unsubscribe path correctly skips when the propolis_id is gone + // (matching production semantics where the port no longer exists). + // + // V2P follows the same pattern: sled-agent cleanup is keyed by + // network identity, not VMM identity. + + // M2P and forwarding should be cleared since there are no "Joined" + // members remaining. + wait_for_condition_with_reconciler( + &cptestctx.lockstep_client, + || async { + let m2p = sled_agent.m2p_mappings.lock().unwrap(); + if !m2p.contains(&(multicast_ip, underlay_ipv6)) { + Ok(()) + } else { + Err(CondCheckError::<()>::NotYet) + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .expect("M2P should be cleared when no Joined members remain"); + + // Forwarding should also be cleared when no "Joined" members remain. + wait_for_condition_with_reconciler( + &cptestctx.lockstep_client, + || async { + let fwd = sled_agent.mcast_fwd.lock().unwrap(); + if !fwd.contains_key(&underlay_ipv6) { + Ok(()) + } else { + Err(CondCheckError::<()>::NotYet) + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .expect("Forwarding should be cleared when no Joined members remain"); + + // Delete the instance, which should trigger group deletion. + cleanup_instances(cptestctx, client, project_name, &[instance_name]).await; + wait_for_group_deleted(cptestctx, group_name).await; + + // Verify M2P and forwarding are cleared. + { + let m2p = sled_agent.m2p_mappings.lock().unwrap(); + assert!( + !m2p.contains(&(multicast_ip, underlay_ipv6)), + "M2P mapping should be cleared after group deletion, got: {m2p:?}" + ); + } + { + let fwd = sled_agent.mcast_fwd.lock().unwrap(); + assert!( + !fwd.contains_key(&underlay_ipv6), + "Forwarding entry should be cleared after group deletion, \ + got: {fwd:?}" + ); + } +} + +/// Verify M2P and forwarding entries propagate to all sleds, not just the +/// hosting sled. Analogous to `test_instance_v2p_mappings` which verifies +/// V2P mappings on all sleds. +/// +/// Also verifies cleanup: after instance deletion, M2P and forwarding +/// entries are removed from every sled. +#[nexus_test(extra_sled_agents = 1)] +async fn test_multicast_multi_sled_m2p_propagation( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let nexus = &cptestctx.server.server_context().nexus; + let project_name = "multi-sled-mcast-project"; + let group_name = "multi-sled-mcast-group"; + let instance_name = "multi-sled-mcast-instance"; + + ops::join3( + create_project(client, project_name), + create_default_ip_pools(client), + create_multicast_ip_pool_with_range( + client, + "multi-sled-mcast-pool", + (224, 160, 0, 1), + (224, 160, 0, 255), + ), + ) + .await; + + ensure_multicast_test_ready(cptestctx).await; + + // Collect all sled agents (2 total: 1 default + 1 extra). + // We use extra_sled_agents = 1 (not 2) because the gateway sim only + // provides SP data for the two well-known sled UUIDs. A 3rd sled with + // a random UUID would have no SP entry, causing inventory readiness + // to time out. Two sleds is sufficient to verify cross-sled propagation. + let all_sled_agents: Vec<_> = + cptestctx.sled_agents.iter().map(|sa| sa.sled_agent()).collect(); + assert_eq!(all_sled_agents.len(), 2, "expected 2 sled agents"); + + // Create and start an instance. + let instance = instance_for_multicast_groups( + cptestctx, + project_name, + instance_name, + true, + &[], + ) + .await; + let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); + + instance_wait_for_running_with_simulation(cptestctx, instance_id).await; + wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; + + // Attach to a multicast group. + multicast_group_attach(cptestctx, project_name, instance_name, group_name) + .await; + wait_for_group_active(client, group_name).await; + + wait_for_member_state( + cptestctx, + group_name, + instance.identity.id, + nexus_db_model::MulticastGroupMemberState::Joined, + ) + .await; + + // Look up the underlay IPv6 address for verification. + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.clone(), datastore.clone()); + + let group_view = get_multicast_group(client, group_name).await; + let multicast_ip = group_view.multicast_ip; + + let external_group = datastore + .multicast_group_lookup_by_ip(&opctx, multicast_ip) + .await + .expect("Should look up multicast group by IP"); + + let underlay_group_id = external_group + .underlay_group_id + .expect("Active group should have underlay_group_id"); + + let underlay_group = datastore + .underlay_multicast_group_fetch(&opctx, underlay_group_id) + .await + .expect("Should fetch underlay multicast group"); + + let underlay_ipv6 = match underlay_group.multicast_ip.ip() { + IpAddr::V6(v6) => v6, + other => panic!("Expected IPv6 underlay address, got {other}"), + }; + + // Look up the hosting sled for subscription verification. + let info = nexus + .active_instance_info(&instance_id, None) + .await + .unwrap() + .expect("Running instance should have active info"); + + let hosting_sled_id = info.sled_id; + + // M2P and forwarding are pushed to all sleds (like V2P). Any + // instance on any sled may send to a multicast group; without the + // M2P mapping OPTE's overlay layer silently drops the packet. + // Forwarding entries let sender sleds replicate to member sleds. + for (i, sled_agent) in cptestctx.sled_agents.iter().enumerate() { + let agent = sled_agent.sled_agent(); + + // Wait for M2P on every sled. The reconciler may need an + // additional pass after the member reaches "Joined": during + // reconcile_member_states, propagate_m2p_and_forwarding may + // see member_sleds=0 (member still "Joining" in DB), so the + // actual push happens in reconcile_active_groups or the next + // full pass. + wait_for_condition_with_reconciler( + &cptestctx.lockstep_client, + || async { + let m2p = agent.m2p_mappings.lock().unwrap(); + if m2p.contains(&(multicast_ip, underlay_ipv6)) { + Ok(()) + } else { + Err(CondCheckError::NotYet::<()>) + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .unwrap_or_else(|e| { + panic!("Sled {i} should have M2P mapping within timeout: {e:?}") + }); + + // Verify forwarding on every sled. With a single member on + // one sled, the hosting sled's forwarding has no next hops + // (local delivery via subscription). Non-hosting sleds list + // the hosting sled as a next hop so senders can reach it. + wait_for_condition_with_reconciler( + &cptestctx.lockstep_client, + || async { + let fwd = agent.mcast_fwd.lock().unwrap(); + if fwd.contains_key(&underlay_ipv6) { + Ok(()) + } else { + Err(CondCheckError::NotYet::<()>) + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .unwrap_or_else(|e| { + panic!( + "Sled {i} should have forwarding entry within timeout: {e:?}" + ) + }); + + let fwd = agent.mcast_fwd.lock().unwrap(); + let next_hops = &fwd[&underlay_ipv6]; + // Every sled gets a single next hop pointing at a switch. + // The switch replicates to member sled ports via DPD config. + assert_eq!( + next_hops.len(), + 1, + "Sled {i} should have 1 next_hop (a switch), \ + got: {next_hops:?}" + ); + } + + // Verify per-VMM subscription on the hosting sled only. + // Subscriptions are member-sled-only (not all sleds). + let hosting_agent = cptestctx + .sled_agents + .iter() + .find(|sa| sa.sled_agent_id() == hosting_sled_id) + .unwrap() + .sled_agent(); + + wait_for_condition_with_reconciler( + &cptestctx.lockstep_client, + || async { + let groups = hosting_agent.multicast_groups.lock().unwrap(); + match groups.get(&info.propolis_id) { + Some(vmm_groups) + if vmm_groups + .iter() + .any(|m| m.group_ip == multicast_ip) => + { + Ok(()) + } + _ => Err(CondCheckError::NotYet::<()>), + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .unwrap_or_else(|e| { + panic!( + "VMM should be subscribed to {multicast_ip} within timeout: {e:?}" + ) + }); + + // Delete the instance, which triggers group deletion. + cleanup_instances(cptestctx, client, project_name, &[instance_name]).await; + wait_for_group_deleted(cptestctx, group_name).await; + + // Verify cleanup on every sled: M2P and forwarding removed. + for (i, sled_agent) in all_sled_agents.iter().enumerate() { + wait_for_condition_with_reconciler( + &cptestctx.lockstep_client, + || async { + let m2p = sled_agent.m2p_mappings.lock().unwrap(); + let fwd = sled_agent.mcast_fwd.lock().unwrap(); + if !m2p.contains(&(multicast_ip, underlay_ipv6)) + && !fwd.contains_key(&underlay_ipv6) + { + Ok(()) + } else { + Err(CondCheckError::NotYet::<()>) + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .unwrap_or_else(|e| { + panic!( + "Sled {i} M2P/forwarding not cleaned up within timeout: {e:?}" + ) + }); + } +} + +/// Verify cross-sled forwarding when members exist on both sleds. +/// +/// With one member on sled A and another on sled B, each sled's forwarding +/// entry should list the other sled as its sole next hop (self-exclusion). +/// This exercises the `.filter(|(id, _)| *id != sled_id)` logic in +/// `converge_forwarding`. +#[nexus_test(extra_sled_agents = 1)] +async fn test_multicast_cross_sled_forwarding( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.clone(), datastore.clone()); + let project_name = "bidir-fwd-project"; + let group_name = "bidir-fwd-group"; + let instance_a_name = "bidir-instance-a"; + let instance_b_name = "bidir-instance-b"; + + ops::join3( + create_project(client, project_name), + create_default_ip_pools(client), + create_multicast_ip_pool_with_range( + client, + "bidir-fwd-pool", + (224, 170, 0, 1), + (224, 170, 0, 255), + ), + ) + .await; + + ensure_multicast_test_ready(cptestctx).await; + + let sled_a_id = cptestctx.sled_agents[0].sled_agent_id(); + let sled_b_id = cptestctx.sled_agents[1].sled_agent_id(); + + // Pin instance A to sled A by making sled B non-provisionable. + { + let (authz_sled, ..) = LookupPath::new(&opctx, datastore) + .sled_id(sled_b_id) + .lookup_for(nexus_auth::authz::Action::Modify) + .await + .expect("lookup sled B"); + datastore + .sled_set_provision_policy( + &opctx, + &authz_sled, + nexus_types::external_api::sled::SledProvisionPolicy::NonProvisionable, + ) + .await + .expect("set sled B non-provisionable"); + } + + let instance_a = instance_for_multicast_groups( + cptestctx, + project_name, + instance_a_name, + true, + &[], + ) + .await; + let instance_a_id = InstanceUuid::from_untyped_uuid(instance_a.identity.id); + instance_wait_for_running_with_simulation(cptestctx, instance_a_id).await; + + // Verify instance A landed on sled A. + let info_a = nexus + .active_instance_info(&instance_a_id, None) + .await + .unwrap() + .expect("instance A should be running"); + assert_eq!(info_a.sled_id, sled_a_id, "instance A should be on sled A"); + + // Swap provisionability: sled A non-provisionable, sled B provisionable. + { + let (authz_sled_a, ..) = LookupPath::new(&opctx, datastore) + .sled_id(sled_a_id) + .lookup_for(nexus_auth::authz::Action::Modify) + .await + .expect("lookup sled A"); + let (authz_sled_b, ..) = LookupPath::new(&opctx, datastore) + .sled_id(sled_b_id) + .lookup_for(nexus_auth::authz::Action::Modify) + .await + .expect("lookup sled B"); + datastore + .sled_set_provision_policy( + &opctx, + &authz_sled_a, + nexus_types::external_api::sled::SledProvisionPolicy::NonProvisionable, + ) + .await + .expect("set sled A non-provisionable"); + datastore + .sled_set_provision_policy( + &opctx, + &authz_sled_b, + nexus_types::external_api::sled::SledProvisionPolicy::Provisionable, + ) + .await + .expect("set sled B provisionable"); + } + + let instance_b = instance_for_multicast_groups( + cptestctx, + project_name, + instance_b_name, + true, + &[], + ) + .await; + + let instance_b_id = InstanceUuid::from_untyped_uuid(instance_b.identity.id); + instance_wait_for_running_with_simulation(cptestctx, instance_b_id).await; + + // Verify instance B landed on sled B. + let info_b = nexus + .active_instance_info(&instance_b_id, None) + .await + .unwrap() + .expect("instance B should be running"); + + assert_eq!(info_b.sled_id, sled_b_id, "instance B should be on sled B"); + + // Both instances join the same multicast group. + multicast_group_attach( + cptestctx, + project_name, + instance_a_name, + group_name, + ) + .await; + + multicast_group_attach( + cptestctx, + project_name, + instance_b_name, + group_name, + ) + .await; + + wait_for_group_active(client, group_name).await; + + // Wait for both members to reach "Joined". + for instance in [&instance_a, &instance_b] { + wait_for_member_state( + cptestctx, + group_name, + instance.identity.id, + nexus_db_model::MulticastGroupMemberState::Joined, + ) + .await; + } + + // Resolve underlay IPv6 for forwarding assertions. + let group_view = get_multicast_group(client, group_name).await; + let external_group = datastore + .multicast_group_lookup_by_ip(&opctx, group_view.multicast_ip) + .await + .expect("lookup group by IP"); + + let underlay_group = datastore + .underlay_multicast_group_fetch( + &opctx, + external_group + .underlay_group_id + .expect("active group should have underlay_group_id"), + ) + .await + .expect("fetch underlay group"); + + let underlay_ipv6 = match underlay_group.multicast_ip.ip() { + IpAddr::V6(v6) => v6, + other => panic!("Expected IPv6 underlay address, got {other}"), + }; + + // Wait for forwarding entries on both sleds, then verify each sled's + // forwarding lists exactly the other sled (not itself). + let agent_a = cptestctx.sled_agents[0].sled_agent(); + let agent_b = cptestctx.sled_agents[1].sled_agent(); + + for (label, agent) in [("sled A", &agent_a), ("sled B", &agent_b)] { + wait_for_condition_with_reconciler( + &cptestctx.lockstep_client, + || async { + let fwd = agent.mcast_fwd.lock().unwrap(); + match fwd.get(&underlay_ipv6) { + Some(hops) if hops.len() == 1 => Ok(()), + _ => Err(CondCheckError::NotYet::<()>), + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .unwrap_or_else(|e| { + panic!("{label} should have exactly 1 forwarding next_hop: {e:?}") + }); + } + + // Cleanup. + cleanup_instances( + cptestctx, + client, + project_name, + &[instance_a_name, instance_b_name], + ) + .await; + wait_for_group_deleted(cptestctx, group_name).await; +} + +/// Verify multicast state is re-established after simulated cold start. +/// Analogous to `test_instance_start_creates_networking_state` which tests +/// V2P re-establishment after forcibly clearing sled-agent state. +/// +/// Steps: a) create instance, b) join multicast, c) stop instance, +/// d) forcibly clear all sim sled-agent multicast state, e) restart +/// instance, f) verify M2P, forwarding, and per-VMM subscriptions are +/// re-established. +#[nexus_test(extra_sled_agents = 1)] +async fn test_multicast_cold_start_reestablishment( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let nexus = &cptestctx.server.server_context().nexus; + let project_name = "cold-start-mcast-project"; + let group_name = "cold-start-mcast-group"; + let instance_name = "cold-start-mcast-instance"; + + ops::join3( + create_project(client, project_name), + create_default_ip_pools(client), + create_multicast_ip_pool_with_range( + client, + "cold-start-mcast-pool", + (224, 170, 0, 1), + (224, 170, 0, 255), + ), + ) + .await; + + ensure_multicast_test_ready(cptestctx).await; + + let all_sled_agents: Vec<_> = + cptestctx.sled_agents.iter().map(|sa| sa.sled_agent()).collect(); + + // Create and start an instance, join a multicast group. + let instance = instance_for_multicast_groups( + cptestctx, + project_name, + instance_name, + true, + &[], + ) + .await; + let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); + + instance_wait_for_running_with_simulation(cptestctx, instance_id).await; + wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; + + multicast_group_attach(cptestctx, project_name, instance_name, group_name) + .await; + wait_for_group_active(client, group_name).await; + + wait_for_member_state( + cptestctx, + group_name, + instance.identity.id, + nexus_db_model::MulticastGroupMemberState::Joined, + ) + .await; + + // Look up the underlay IPv6. + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.clone(), datastore.clone()); + + let group_view = get_multicast_group(client, group_name).await; + let multicast_ip = group_view.multicast_ip; + + let external_group = datastore + .multicast_group_lookup_by_ip(&opctx, multicast_ip) + .await + .expect("Should look up multicast group by IP"); + + let underlay_group_id = external_group + .underlay_group_id + .expect("Active group should have underlay_group_id"); + + let underlay_group = datastore + .underlay_multicast_group_fetch(&opctx, underlay_group_id) + .await + .expect("Should fetch underlay multicast group"); + + let underlay_ipv6 = match underlay_group.multicast_ip.ip() { + IpAddr::V6(v6) => v6, + other => panic!("Expected IPv6 underlay address, got {other}"), + }; + + // M2P and forwarding are pushed to all sleds. Verify at least the + // hosting sled has M2P before we clear state. + let pre_info = nexus + .active_instance_info(&instance_id, None) + .await + .unwrap() + .expect("Running instance should have active info"); + + let pre_hosting_agent = cptestctx + .sled_agents + .iter() + .find(|sa| sa.sled_agent_id() == pre_info.sled_id) + .unwrap() + .sled_agent(); + + wait_for_condition_with_reconciler( + &cptestctx.lockstep_client, + || async { + let m2p = pre_hosting_agent.m2p_mappings.lock().unwrap(); + if m2p.contains(&(multicast_ip, underlay_ipv6)) { + Ok(()) + } else { + Err(CondCheckError::NotYet::<()>) + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .expect("Hosting sled M2P should exist before cold start simulation"); + + // Stop the instance. + let stop_url = + format!("/v1/instances/{instance_name}/stop?project={project_name}"); + NexusRequest::new( + RequestBuilder::new(client, Method::POST, &stop_url) + .body(None as Option<&serde_json::Value>) + .expect_status(Some(StatusCode::ACCEPTED)), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("Should stop instance"); + + wait_for_instance_stopped(cptestctx, client, instance_id, instance_name) + .await; + + wait_for_member_state( + cptestctx, + group_name, + instance.identity.id, + nexus_db_model::MulticastGroupMemberState::Left, + ) + .await; + + // Forcibly clear all sim sled-agent multicast state, simulating a cold + // start where sled-agents lose in-memory state. + for sled_agent in &all_sled_agents { + sled_agent.m2p_mappings.lock().unwrap().clear(); + sled_agent.mcast_fwd.lock().unwrap().clear(); + sled_agent.multicast_groups.lock().unwrap().clear(); + } + + // Restart the instance. + let start_url = + format!("/v1/instances/{instance_name}/start?project={project_name}"); + NexusRequest::new( + RequestBuilder::new(client, Method::POST, &start_url) + .body(None as Option<&serde_json::Value>) + .expect_status(Some(StatusCode::ACCEPTED)), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("Should start instance"); + + // Use `try_instance_simulate` here instead of `instance_wait_for_running_with_simulation` + // because the old VMM may still be draining from the sim collection after + // the stop. `instance_simulate` would panic if it pokes a VMM that was just + // removed; `try_instance_simulate` handles that gracefully. + wait_for_condition( + || async { + let _ = + instance_helpers::try_instance_simulate(nexus, &instance_id) + .await; + + let url = format!("/v1/instances/{instance_id}"); + let instance: Instance = NexusRequest::object_get(client, &url) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .map_err(|_| CondCheckError::<()>::NotYet)? + .parsed_body() + .map_err(|_| CondCheckError::<()>::NotYet)?; + + if instance.runtime.run_state == InstanceState::Running { + Ok(()) + } else { + Err(CondCheckError::<()>::NotYet) + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .expect("Instance should reach Running after restart"); + + // Wait for the reconciler to re-establish multicast state. + wait_for_member_state( + cptestctx, + group_name, + instance.identity.id, + nexus_db_model::MulticastGroupMemberState::Joined, + ) + .await; + + // Verify M2P and forwarding re-established on all sleds. + for (i, sled_agent) in all_sled_agents.iter().enumerate() { + wait_for_condition_with_reconciler( + &cptestctx.lockstep_client, + || async { + let m2p = sled_agent.m2p_mappings.lock().unwrap(); + if m2p.contains(&(multicast_ip, underlay_ipv6)) { + Ok(()) + } else { + Err(CondCheckError::NotYet::<()>) + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .unwrap_or_else(|e| { + panic!("Sled {i} M2P not re-established within timeout: {e:?}") + }); + + wait_for_condition_with_reconciler( + &cptestctx.lockstep_client, + || async { + let fwd = sled_agent.mcast_fwd.lock().unwrap(); + if fwd.contains_key(&underlay_ipv6) { + Ok(()) + } else { + Err(CondCheckError::NotYet::<()>) + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .unwrap_or_else(|e| { + panic!( + "Sled {i} forwarding not re-established within timeout: {e:?}" + ) + }); + } + + // Verify per-VMM subscription on the hosting sled (new propolis_id + // since restart creates a new VMM). + let post_info = nexus + .active_instance_info(&instance_id, None) + .await + .unwrap() + .expect("Restarted instance should have active info"); + + let post_hosting_agent = cptestctx + .sled_agents + .iter() + .find(|sa| sa.sled_agent_id() == post_info.sled_id) + .unwrap() + .sled_agent(); + + wait_for_condition_with_reconciler( + &cptestctx.lockstep_client, + || async { + let groups = post_hosting_agent.multicast_groups.lock().unwrap(); + match groups.get(&post_info.propolis_id) { + Some(vmm_groups) + if vmm_groups + .iter() + .any(|m| m.group_ip == multicast_ip) => + { + Ok(()) + } + _ => Err(CondCheckError::NotYet::<()>), + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .unwrap_or_else(|e| { + panic!( + "New VMM should be subscribed to {multicast_ip} after restart: \ + {e:?}" + ) + }); + + // Cleanup. + cleanup_instances(cptestctx, client, project_name, &[instance_name]).await; + wait_for_group_deleted(cptestctx, group_name).await; +} diff --git a/openapi/nexus/nexus-2026052000.0.0-ced7df.json.gitstub b/openapi/nexus/nexus-2026052000.0.0-ced7df.json.gitstub new file mode 100644 index 00000000000..d265307f58f --- /dev/null +++ b/openapi/nexus/nexus-2026052000.0.0-ced7df.json.gitstub @@ -0,0 +1 @@ +4c614ceb0b05e7ab9b2a1f19504a2328dc87a087:openapi/nexus/nexus-2026052000.0.0-ced7df.json diff --git a/openapi/nexus/nexus-2026052000.0.0-ced7df.json b/openapi/nexus/nexus-2026052200.0.0-c91da3.json similarity index 99% rename from openapi/nexus/nexus-2026052000.0.0-ced7df.json rename to openapi/nexus/nexus-2026052200.0.0-c91da3.json index c3b359aed5a..0f3fa92f5bf 100644 --- a/openapi/nexus/nexus-2026052000.0.0-ced7df.json +++ b/openapi/nexus/nexus-2026052200.0.0-c91da3.json @@ -7,7 +7,7 @@ "url": "https://oxide.computer", "email": "api@oxide.computer" }, - "version": "2026052000.0.0" + "version": "2026052200.0.0" }, "paths": { "/device/auth": { @@ -4770,7 +4770,7 @@ "experimental" ], "summary": "Join multicast group by name, IP address, or UUID", - "description": "Groups can be referenced by name, IP address, or UUID. If the group doesn't exist, it's implicitly created with an auto-allocated IP from a multicast pool linked to the caller's silo. When referencing by UUID, the group must already exist.\n\nSource IPs are optional for ASM addresses but required for SSM addresses (232.0.0.0/8 for IPv4, ff3x::/32 for IPv6). Duplicate IPs in the request are automatically deduplicated, with a maximum of 64 source IPs allowed.", + "description": "Groups can be referenced by name, IP address, or UUID. If the group doesn't exist, it's implicitly created with an auto-allocated IP from a multicast pool linked to the caller's silo. When referencing by UUID, the group must already exist.\n\nSource IPs are optional for ASM addresses but required for SSM addresses (232.0.0.0/8 for IPv4, ff3x::/32 for IPv6). Duplicate source IPs in a single request are rejected. Per-member source list is capped at 32, and the union of source IPs across all members of a single group is capped at 256.", "operationId": "instance_multicast_group_join", "parameters": [ { diff --git a/openapi/nexus/nexus-latest.json b/openapi/nexus/nexus-latest.json index 867f4785de9..07096de5e65 120000 --- a/openapi/nexus/nexus-latest.json +++ b/openapi/nexus/nexus-latest.json @@ -1 +1 @@ -nexus-2026052000.0.0-ced7df.json \ No newline at end of file +nexus-2026052200.0.0-c91da3.json \ No newline at end of file diff --git a/openapi/sled-agent/sled-agent-40.0.0-600e45.json.gitstub b/openapi/sled-agent/sled-agent-40.0.0-600e45.json.gitstub new file mode 100644 index 00000000000..23b7f916608 --- /dev/null +++ b/openapi/sled-agent/sled-agent-40.0.0-600e45.json.gitstub @@ -0,0 +1 @@ +a3a58cc520fcb12bfad1a946eae784ec08cf8718:openapi/sled-agent/sled-agent-40.0.0-600e45.json diff --git a/openapi/sled-agent/sled-agent-40.0.0-600e45.json b/openapi/sled-agent/sled-agent-41.0.0-6be62d.json similarity index 97% rename from openapi/sled-agent/sled-agent-40.0.0-600e45.json rename to openapi/sled-agent/sled-agent-41.0.0-6be62d.json index fa6c04f9104..5bf8a4a4b56 100644 --- a/openapi/sled-agent/sled-agent-40.0.0-600e45.json +++ b/openapi/sled-agent/sled-agent-41.0.0-6be62d.json @@ -7,7 +7,7 @@ "url": "https://oxide.computer", "email": "api@oxide.computer" }, - "version": "40.0.0" + "version": "41.0.0" }, "paths": { "/artifacts": { @@ -389,6 +389,162 @@ } } }, + "/networking/mcast-fwd": { + "get": { + "summary": "List multicast forwarding entries present on this sled.", + "operationId": "list_mcast_fwd", + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "title": "Array_of_McastForwardingEntry", + "type": "array", + "items": { + "$ref": "#/components/schemas/McastForwardingEntry" + } + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + }, + "put": { + "summary": "Set multicast forwarding entries for an underlay address.", + "operationId": "set_mcast_fwd", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/McastForwardingEntry" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + }, + "delete": { + "summary": "Clear multicast forwarding entries for an underlay address.", + "operationId": "clear_mcast_fwd", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ClearMcastForwarding" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/networking/mcast-m2p": { + "get": { + "summary": "List M2P mappings present on this sled.", + "operationId": "list_mcast_m2p", + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "title": "Array_of_Mcast2PhysMapping", + "type": "array", + "items": { + "$ref": "#/components/schemas/Mcast2PhysMapping" + } + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + }, + "put": { + "summary": "Set a multicast-to-physical (M2P) mapping in OPTE.", + "operationId": "set_mcast_m2p", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Mcast2PhysMapping" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + }, + "delete": { + "summary": "Clear a multicast-to-physical (M2P) mapping in OPTE.", + "operationId": "clear_mcast_m2p", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ClearMcast2Phys" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/omicron-config": { "put": { "operationId": "omicron_config_put", @@ -2310,7 +2466,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/InstanceMulticastBody" + "$ref": "#/components/schemas/InstanceMulticastMembership" } } }, @@ -2344,7 +2500,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/InstanceMulticastBody" + "$ref": "#/components/schemas/InstanceMulticastMembership" } } }, @@ -4018,6 +4174,40 @@ } ] }, + "ClearMcast2Phys": { + "description": "Clear a mapping from an overlay multicast group to an underlay multicast address.", + "type": "object", + "properties": { + "group": { + "description": "Overlay multicast group address.", + "type": "string", + "format": "ip" + }, + "underlay": { + "description": "Underlay IPv6 multicast address. See [`Mcast2PhysMapping::underlay`].", + "type": "string", + "format": "ipv6" + } + }, + "required": [ + "group", + "underlay" + ] + }, + "ClearMcastForwarding": { + "description": "Clear all forwarding entries for an underlay multicast address.", + "type": "object", + "properties": { + "underlay": { + "description": "Underlay IPv6 multicast address. See [`Mcast2PhysMapping::underlay`].", + "type": "string", + "format": "ipv6" + } + }, + "required": [ + "underlay" + ] + }, "CombineError": { "type": "string", "enum": [ @@ -5990,35 +6180,6 @@ "src_propolis_addr" ] }, - "InstanceMulticastBody": { - "description": "Request body for multicast group operations.", - "oneOf": [ - { - "type": "object", - "properties": { - "join": { - "$ref": "#/components/schemas/InstanceMulticastMembership" - } - }, - "required": [ - "join" - ], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "leave": { - "$ref": "#/components/schemas/InstanceMulticastMembership" - } - }, - "required": [ - "leave" - ], - "additionalProperties": false - } - ] - }, "InstanceMulticastMembership": { "description": "Represents a multicast group membership for an instance.\n\nIntroduced in v7.", "type": "object", @@ -6974,6 +7135,151 @@ "minimum": 1, "maximum": 32 }, + "Mcast2PhysMapping": { + "description": "Mapping from an overlay multicast group to an underlay multicast address.\n\nThe underlay address must be within the underlay multicast subnet (ff04::/64). This invariant is enforced by mapping in Nexus, not validated at this layer.", + "type": "object", + "properties": { + "group": { + "description": "Overlay multicast group address.", + "type": "string", + "format": "ip" + }, + "underlay": { + "description": "Underlay IPv6 multicast address (ff04::/64).", + "type": "string", + "format": "ipv6" + } + }, + "required": [ + "group", + "underlay" + ] + }, + "McastFilterMode": { + "description": "Filter mode for multicast source filtering.", + "oneOf": [ + { + "description": "Accept only packets from listed sources (SSM).", + "type": "string", + "enum": [ + "include" + ] + }, + { + "description": "Accept packets from all sources except those listed. With an empty sources list this is any-source multicast (ASM).", + "type": "string", + "enum": [ + "exclude" + ] + } + ] + }, + "McastForwardingEntry": { + "description": "Forwarding entry for an underlay multicast address, specifying which next hops should receive replicated packets.", + "type": "object", + "properties": { + "next_hops": { + "description": "Next hops with replication and source filter configuration.", + "type": "array", + "items": { + "$ref": "#/components/schemas/McastForwardingNextHop" + } + }, + "underlay": { + "description": "Underlay IPv6 multicast address. See [`Mcast2PhysMapping::underlay`].", + "type": "string", + "format": "ipv6" + } + }, + "required": [ + "next_hops", + "underlay" + ] + }, + "McastForwardingNextHop": { + "description": "A forwarding next hop with replication mode and aggregated source filter.", + "type": "object", + "properties": { + "filter": { + "description": "Aggregated source filter for this destination.", + "allOf": [ + { + "$ref": "#/components/schemas/McastSourceFilter" + } + ] + }, + "next_hop": { + "description": "Unicast IPv6 address of the destination sled.", + "type": "string", + "format": "ipv6" + }, + "replication": { + "description": "Replication mode for this next hop.", + "allOf": [ + { + "$ref": "#/components/schemas/McastReplication" + } + ] + } + }, + "required": [ + "filter", + "next_hop", + "replication" + ] + }, + "McastReplication": { + "description": "Replication mode for multicast forwarding.", + "oneOf": [ + { + "description": "Replicate to front panel ports (egress to external networks).", + "type": "string", + "enum": [ + "external" + ] + }, + { + "description": "Replicate to sled underlay ports.", + "type": "string", + "enum": [ + "underlay" + ] + }, + { + "description": "Replicate to both external and underlay ports.", + "type": "string", + "enum": [ + "both" + ] + } + ] + }, + "McastSourceFilter": { + "description": "Source filter for multicast forwarding.", + "type": "object", + "properties": { + "mode": { + "description": "Filter mode.", + "allOf": [ + { + "$ref": "#/components/schemas/McastFilterMode" + } + ] + }, + "sources": { + "description": "Source addresses to include or exclude.", + "type": "array", + "items": { + "type": "string", + "format": "ip" + } + } + }, + "required": [ + "mode", + "sources" + ] + }, "Measurement": { "description": "An RoT provided measurement which represents a digest of some component in the trusted computing base (TCB) for the attestor.", "oneOf": [ diff --git a/openapi/sled-agent/sled-agent-latest.json b/openapi/sled-agent/sled-agent-latest.json index 5f70b30a1ee..4d51056ae30 120000 --- a/openapi/sled-agent/sled-agent-latest.json +++ b/openapi/sled-agent/sled-agent-latest.json @@ -1 +1 @@ -sled-agent-40.0.0-600e45.json \ No newline at end of file +sled-agent-41.0.0-6be62d.json \ No newline at end of file diff --git a/sled-agent/api/src/lib.rs b/sled-agent/api/src/lib.rs index 0ee2567f5e1..17ec2abb23a 100644 --- a/sled-agent/api/src/lib.rs +++ b/sled-agent/api/src/lib.rs @@ -19,6 +19,10 @@ use omicron_common::api::internal::{ SledIdentifiers, VirtualNetworkInterfaceHost, }, }; +use sled_agent_types_versions::latest::multicast::{ + ClearMcast2Phys, ClearMcastForwarding, Mcast2PhysMapping, + McastForwardingEntry, +}; use sled_agent_types_versions::{ latest, v1, v4, v6, v7, v9, v10, v11, v12, v14, v16, v17, v18, v20, v22, v24, v25, v26, v28, v29, v30, v31, v33, v34, v37, v39, @@ -38,6 +42,7 @@ api_versions!([ // | example for the next person. // v // (next_int, IDENT), + (41, MCAST_M2P_FORWARDING), (40, ADD_FMD_TO_INVENTORY), (39, BOOTSTORE_SERVICE_NAT_GENERATION), (38, RENAME_PORT_FEC_SPEED_TO_LINK_FEC_SPEED), @@ -638,25 +643,79 @@ pub trait SledAgentApi { #[endpoint { method = PUT, path = "/vmms/{propolis_id}/multicast-group", - versions = VERSION_MULTICAST_SUPPORT.., + versions = VERSION_MCAST_M2P_FORWARDING.., }] async fn vmm_join_multicast_group( rqctx: RequestContext, path_params: Path, - body: TypedBody, + body: TypedBody, ) -> Result; #[endpoint { method = DELETE, path = "/vmms/{propolis_id}/multicast-group", - versions = VERSION_MULTICAST_SUPPORT.., + versions = VERSION_MCAST_M2P_FORWARDING.., }] async fn vmm_leave_multicast_group( rqctx: RequestContext, path_params: Path, - body: TypedBody, + body: TypedBody, ) -> Result; + /// Join a multicast group. + /// + /// Accepts a tagged `InstanceMulticastBody` request. + /// Superseded in MCAST_M2P_FORWARDING. + #[endpoint { + operation_id = "vmm_join_multicast_group", + method = PUT, + path = "/vmms/{propolis_id}/multicast-group", + versions = VERSION_MULTICAST_SUPPORT..VERSION_MCAST_M2P_FORWARDING, + }] + async fn vmm_join_multicast_group_v7( + rqctx: RequestContext, + path_params: Path, + body: TypedBody, + ) -> Result { + let body = body.try_map(|b| match b { + v7::instance::InstanceMulticastBody::Join(m) => Ok(m), + v7::instance::InstanceMulticastBody::Leave(_) => { + Err(HttpError::for_bad_request( + None, + "Join endpoint cannot process Leave operations".to_string(), + )) + } + })?; + Self::vmm_join_multicast_group(rqctx, path_params, body).await + } + + /// Leave a multicast group. + /// + /// Accepts a tagged `InstanceMulticastBody` request. + /// Superseded in MCAST_M2P_FORWARDING. + #[endpoint { + operation_id = "vmm_leave_multicast_group", + method = DELETE, + path = "/vmms/{propolis_id}/multicast-group", + versions = VERSION_MULTICAST_SUPPORT..VERSION_MCAST_M2P_FORWARDING, + }] + async fn vmm_leave_multicast_group_v7( + rqctx: RequestContext, + path_params: Path, + body: TypedBody, + ) -> Result { + let body = body.try_map(|b| match b { + v7::instance::InstanceMulticastBody::Leave(m) => Ok(m), + v7::instance::InstanceMulticastBody::Join(_) => { + Err(HttpError::for_bad_request( + None, + "Leave endpoint cannot process Join operations".to_string(), + )) + } + })?; + Self::vmm_leave_multicast_group(rqctx, path_params, body).await + } + #[endpoint { method = PUT, path = "/disks/{disk_id}", @@ -813,6 +872,70 @@ pub trait SledAgentApi { rqctx: RequestContext, ) -> Result>, HttpError>; + /// Set a multicast-to-physical (M2P) mapping in OPTE. + #[endpoint { + method = PUT, + path = "/networking/mcast-m2p", + versions = VERSION_MCAST_M2P_FORWARDING.., + }] + async fn set_mcast_m2p( + rqctx: RequestContext, + body: TypedBody, + ) -> Result; + + /// Clear a multicast-to-physical (M2P) mapping in OPTE. + #[endpoint { + method = DELETE, + path = "/networking/mcast-m2p", + versions = VERSION_MCAST_M2P_FORWARDING.., + }] + async fn clear_mcast_m2p( + rqctx: RequestContext, + body: TypedBody, + ) -> Result; + + /// Set multicast forwarding entries for an underlay address. + #[endpoint { + method = PUT, + path = "/networking/mcast-fwd", + versions = VERSION_MCAST_M2P_FORWARDING.., + }] + async fn set_mcast_fwd( + rqctx: RequestContext, + body: TypedBody, + ) -> Result; + + /// Clear multicast forwarding entries for an underlay address. + #[endpoint { + method = DELETE, + path = "/networking/mcast-fwd", + versions = VERSION_MCAST_M2P_FORWARDING.., + }] + async fn clear_mcast_fwd( + rqctx: RequestContext, + body: TypedBody, + ) -> Result; + + /// List M2P mappings present on this sled. + #[endpoint { + method = GET, + path = "/networking/mcast-m2p", + versions = VERSION_MCAST_M2P_FORWARDING.., + }] + async fn list_mcast_m2p( + rqctx: RequestContext, + ) -> Result>, HttpError>; + + /// List multicast forwarding entries present on this sled. + #[endpoint { + method = GET, + path = "/networking/mcast-fwd", + versions = VERSION_MCAST_M2P_FORWARDING.., + }] + async fn list_mcast_fwd( + rqctx: RequestContext, + ) -> Result>, HttpError>; + #[endpoint { method = POST, path = "/switch-ports", diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index 03a2b42a38e..fd8dfb57bf0 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -45,12 +45,16 @@ use sled_agent_types::early_networking::EarlyNetworkConfigEnvelope; use sled_agent_types::firewall_rules::VpcFirewallRulesEnsureBody; use sled_agent_types::instance::SledVmmState; use sled_agent_types::instance::{ - InstanceEnsureBody, InstanceExternalIpBody, InstanceMulticastBody, + InstanceEnsureBody, InstanceExternalIpBody, InstanceMulticastMembership, VmmIssueDiskSnapshotRequestBody, VmmIssueDiskSnapshotRequestPathParam, VmmIssueDiskSnapshotRequestResponse, VmmPathParam, VmmPutStateBody, VmmPutStateResponse, VmmUnregisterResponse, VpcPathParam, }; use sled_agent_types::inventory::{Inventory, OmicronSledConfig}; +use sled_agent_types::multicast::{ + ClearMcast2Phys, ClearMcastForwarding, Mcast2PhysMapping, + McastForwardingEntry, +}; use sled_agent_types::probes::ProbeSet; use sled_agent_types::rot::{ Attestation, CertificateChain, MeasurementLog, Nonce, RotPathParams, @@ -710,14 +714,14 @@ impl SledAgentApi for SledAgentImpl { async fn vmm_join_multicast_group( rqctx: RequestContext, path_params: Path, - body: TypedBody, + body: TypedBody, ) -> Result { let sa = rqctx.context(); let id = path_params.into_inner().propolis_id; - let body_args = body.into_inner(); + let membership = body.into_inner(); sa.latencies() .instrument_dropshot_handler(&rqctx, async { - sa.instance_join_multicast_group(id, &body_args).await?; + sa.instance_join_multicast_group(id, &membership).await?; Ok(HttpResponseUpdatedNoContent()) }) .await @@ -726,14 +730,14 @@ impl SledAgentApi for SledAgentImpl { async fn vmm_leave_multicast_group( rqctx: RequestContext, path_params: Path, - body: TypedBody, + body: TypedBody, ) -> Result { let sa = rqctx.context(); let id = path_params.into_inner().propolis_id; - let body_args = body.into_inner(); + let membership = body.into_inner(); sa.latencies() .instrument_dropshot_handler(&rqctx, async { - sa.instance_leave_multicast_group(id, &body_args).await?; + sa.instance_leave_multicast_group(id, &membership).await?; Ok(HttpResponseUpdatedNoContent()) }) .await @@ -932,6 +936,86 @@ impl SledAgentApi for SledAgentImpl { .await } + async fn set_mcast_m2p( + rqctx: RequestContext, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + let body_args = body.into_inner(); + sa.latencies() + .instrument_dropshot_handler(&rqctx, async { + sa.set_mcast_m2p(&body_args).await.map_err(Error::from)?; + Ok(HttpResponseUpdatedNoContent()) + }) + .await + } + + async fn clear_mcast_m2p( + rqctx: RequestContext, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + let body_args = body.into_inner(); + sa.latencies() + .instrument_dropshot_handler(&rqctx, async { + sa.clear_mcast_m2p(&body_args).await.map_err(Error::from)?; + Ok(HttpResponseUpdatedNoContent()) + }) + .await + } + + async fn set_mcast_fwd( + rqctx: RequestContext, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + let body_args = body.into_inner(); + sa.latencies() + .instrument_dropshot_handler(&rqctx, async { + sa.set_mcast_fwd(&body_args).await.map_err(Error::from)?; + Ok(HttpResponseUpdatedNoContent()) + }) + .await + } + + async fn clear_mcast_fwd( + rqctx: RequestContext, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + let body_args = body.into_inner(); + sa.latencies() + .instrument_dropshot_handler(&rqctx, async { + sa.clear_mcast_fwd(&body_args).await.map_err(Error::from)?; + Ok(HttpResponseUpdatedNoContent()) + }) + .await + } + + async fn list_mcast_m2p( + rqctx: RequestContext, + ) -> Result>, HttpError> { + let sa = rqctx.context(); + sa.latencies() + .instrument_dropshot_handler(&rqctx, async { + let m2p = sa.list_mcast_m2p().await.map_err(Error::from)?; + Ok(HttpResponseOk(m2p)) + }) + .await + } + + async fn list_mcast_fwd( + rqctx: RequestContext, + ) -> Result>, HttpError> { + let sa = rqctx.context(); + sa.latencies() + .instrument_dropshot_handler(&rqctx, async { + let fwd = sa.list_mcast_fwd().await.map_err(Error::from)?; + Ok(HttpResponseOk(fwd)) + }) + .await + } + async fn uplink_ensure( rqctx: RequestContext, body: TypedBody, diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index a009e7f3843..57716676fdf 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -2325,7 +2325,11 @@ impl InstanceRunner { // for them. let mut opte_ports = Vec::with_capacity(self.requested_nics.len()); let mut opte_port_names = Vec::with_capacity(self.requested_nics.len()); + let mcast_cfg = self.multicast_group_cfgs(); for nic in self.requested_nics.iter() { + // Multicast subscriptions target the primary NIC only. + // See the TODO on ensure_multicast_groups. + let groups: &[_] = if nic.primary { &mcast_cfg } else { &[] }; let port = self.port_manager.create_port(PortCreateParams { nic, external_ips: &self.external_ips, @@ -2337,6 +2341,7 @@ impl InstanceRunner { .copied() .map(Into::into) .collect(), + multicast_groups: groups, })?; opte_port_names.push(port.0.name().to_string()); opte_ports.push(port); @@ -2618,12 +2623,13 @@ impl InstanceRunner { &mut self, membership: &InstanceMulticastMembership, ) -> Result<(), Error> { - // Similar logic to add_external_ip - save state for rollback + // Save pre-call state so rollback restores exactly what was + // present, mirroring add_external_ip's old_config pattern. + let old_groups = self.multicast_groups.clone(); let out = self.join_multicast_group_inner(membership).await; if out.is_err() { - // Rollback state on error - self.multicast_groups.retain(|m| m != membership); + self.multicast_groups = old_groups; } out } @@ -2632,14 +2638,13 @@ impl InstanceRunner { &mut self, membership: &InstanceMulticastMembership, ) -> Result<(), Error> { - // Similar logic to delete_external_ip - save state for rollback + // Save pre-call state so rollback restores exactly what was + // present, mirroring delete_external_ip's old_config pattern. + let old_groups = self.multicast_groups.clone(); let out = self.leave_multicast_group_inner(membership).await; if out.is_err() { - // Rollback state on error - readd the membership if it was removed - if !self.multicast_groups.contains(membership) { - self.multicast_groups.push(membership.clone()); - } + self.multicast_groups = old_groups; } out } @@ -2648,48 +2653,54 @@ impl InstanceRunner { self.refresh_multicast_groups_inner() } - async fn join_multicast_group_inner( - &mut self, - membership: &InstanceMulticastMembership, - ) -> Result<(), Error> { - // Check for duplicate membership (idempotency) - if self.multicast_groups.contains(membership) { - return Ok(()); - } - - // Add to local state - self.multicast_groups.push(membership.clone()); + /// Convert `InstanceMulticastMembership` list to OPTE + /// `MulticastGroupCfg` list. + fn multicast_group_cfgs( + &self, + ) -> Vec { + self.multicast_groups + .iter() + .map(|m| illumos_utils::opte::MulticastGroupCfg { + group_ip: m.group_ip, + sources: m.sources.clone(), + }) + .collect() + } - // Update OPTE configuration + /// Sync the current multicast group memberships to OPTE via the + /// port manager. + /// + // TODO: subscriptions target the primary NIC only. + // InstanceMulticastMembership carries no NIC identifier, same as + // external IPs and attached subnets (though not firewall rules, + // which fan out across all VPC ports by VNI). If per-NIC multicast + // is needed, the membership type needs a NIC field and both this + // function and setup_propolis_zone must be updated. + fn ensure_multicast_groups(&self) -> Result<(), Error> { let Some(primary_nic) = self.primary_nic() else { return Err(Error::Opte(illumos_utils::opte::Error::NoPrimaryNic)); }; - // Convert InstanceMulticastMembership to MulticastGroupCfg - let multicast_cfg: Vec = self - .multicast_groups - .iter() - .map(|membership| illumos_utils::opte::MulticastGroupCfg { - group_ip: membership.group_ip, - sources: membership.sources.clone(), - }) - .collect(); - - // Validate multicast configuration with OPTE self.port_manager.multicast_groups_ensure( primary_nic.id, primary_nic.kind, - &multicast_cfg, + &self.multicast_group_cfgs(), )?; - // TODO: Configure underlay multicast group addresses on the zone's vNIC. - // This should add the multicast group addresses to the zone's network - // interface so it can receive underlay multicast traffic (physical - // network layer). Rack-wide dataplane forwarding is handled by the - // RPW reconciler + DPD. - // See also: port_manager.rs multicast_groups_ensure() TODO about - // configuring OPTE port-level multicast group membership. + Ok(()) + } + async fn join_multicast_group_inner( + &mut self, + membership: &InstanceMulticastMembership, + ) -> Result<(), Error> { + // Idempotent -> skip if already subscribed. + if self.multicast_groups.contains(membership) { + return Ok(()); + } + + self.multicast_groups.push(membership.clone()); + self.ensure_multicast_groups()?; Ok(()) } @@ -2697,56 +2708,12 @@ impl InstanceRunner { &mut self, membership: &InstanceMulticastMembership, ) -> Result<(), Error> { - // Remove from local state self.multicast_groups.retain(|m| m != membership); - - // Update OPTE configuration - let Some(primary_nic) = self.primary_nic() else { - return Err(Error::Opte(illumos_utils::opte::Error::NoPrimaryNic)); - }; - - // Convert InstanceMulticastMembership to MulticastGroupCfg - let multicast_cfg: Vec = self - .multicast_groups - .iter() - .map(|membership| illumos_utils::opte::MulticastGroupCfg { - group_ip: membership.group_ip, - sources: membership.sources.clone(), - }) - .collect(); - - self.port_manager.multicast_groups_ensure( - primary_nic.id, - primary_nic.kind, - &multicast_cfg, - )?; - - Ok(()) + self.ensure_multicast_groups() } fn refresh_multicast_groups_inner(&mut self) -> Result<(), Error> { - // Update OPTE configuration - let Some(primary_nic) = self.primary_nic() else { - return Err(Error::Opte(illumos_utils::opte::Error::NoPrimaryNic)); - }; - - // Convert InstanceMulticastMembership to MulticastGroupCfg - let multicast_cfg: Vec = self - .multicast_groups - .iter() - .map(|membership| illumos_utils::opte::MulticastGroupCfg { - group_ip: membership.group_ip, - sources: membership.sources.clone(), - }) - .collect(); - - self.port_manager.multicast_groups_ensure( - primary_nic.id, - primary_nic.kind, - &multicast_cfg, - )?; - - Ok(()) + self.ensure_multicast_groups() } } diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index f110a379470..28f4bf9a8d7 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -306,14 +306,14 @@ impl InstanceManager { pub async fn join_multicast_group( &self, propolis_id: PropolisUuid, - multicast_body: &InstanceMulticastBody, + membership: &InstanceMulticastMembership, ) -> Result<(), Error> { let (tx, rx) = oneshot::channel(); self.inner .tx .send(InstanceManagerRequest::JoinMulticastGroup { propolis_id, - multicast_body: multicast_body.clone(), + membership: membership.clone(), tx, }) .await @@ -325,14 +325,14 @@ impl InstanceManager { pub async fn leave_multicast_group( &self, propolis_id: PropolisUuid, - multicast_body: &InstanceMulticastBody, + membership: &InstanceMulticastMembership, ) -> Result<(), Error> { let (tx, rx) = oneshot::channel(); self.inner .tx .send(InstanceManagerRequest::LeaveMulticastGroup { propolis_id, - multicast_body: multicast_body.clone(), + membership: membership.clone(), tx, }) .await @@ -484,12 +484,12 @@ enum InstanceManagerRequest { }, JoinMulticastGroup { propolis_id: PropolisUuid, - multicast_body: InstanceMulticastBody, + membership: InstanceMulticastMembership, tx: oneshot::Sender>, }, LeaveMulticastGroup { propolis_id: PropolisUuid, - multicast_body: InstanceMulticastBody, + membership: InstanceMulticastMembership, tx: oneshot::Sender>, }, GetState { @@ -630,11 +630,11 @@ impl InstanceManagerRunner { Some(RefreshExternalIps { tx }) => { self.refresh_external_ips(tx) }, - Some(JoinMulticastGroup { propolis_id, multicast_body, tx }) => { - self.join_multicast_group(tx, propolis_id, &multicast_body) + Some(JoinMulticastGroup { propolis_id, membership, tx }) => { + self.join_multicast_group(tx, propolis_id, &membership) }, - Some(LeaveMulticastGroup { propolis_id, multicast_body, tx }) => { - self.leave_multicast_group(tx, propolis_id, &multicast_body) + Some(LeaveMulticastGroup { propolis_id, membership, tx }) => { + self.leave_multicast_group(tx, propolis_id, &membership) } Some(GetState { propolis_id, tx }) => { // TODO(eliza): it could potentially be nice to @@ -907,20 +907,12 @@ impl InstanceManagerRunner { &self, tx: oneshot::Sender>, propolis_id: PropolisUuid, - multicast_body: &InstanceMulticastBody, + membership: &InstanceMulticastMembership, ) -> Result<(), Error> { let Some(instance) = self.get_propolis(propolis_id) else { return Err(Error::NoSuchVmm(propolis_id)); }; - - match multicast_body { - InstanceMulticastBody::Join(membership) => { - instance.join_multicast_group(tx, membership)?; - } - InstanceMulticastBody::Leave(membership) => { - instance.leave_multicast_group(tx, membership)?; - } - } + instance.join_multicast_group(tx, membership)?; Ok(()) } @@ -928,20 +920,12 @@ impl InstanceManagerRunner { &self, tx: oneshot::Sender>, propolis_id: PropolisUuid, - multicast_body: &InstanceMulticastBody, + membership: &InstanceMulticastMembership, ) -> Result<(), Error> { let Some(instance) = self.get_propolis(propolis_id) else { return Err(Error::NoSuchVmm(propolis_id)); }; - - match multicast_body { - InstanceMulticastBody::Join(membership) => { - instance.join_multicast_group(tx, membership)?; - } - InstanceMulticastBody::Leave(membership) => { - instance.leave_multicast_group(tx, membership)?; - } - } + instance.leave_multicast_group(tx, membership)?; Ok(()) } diff --git a/sled-agent/src/probe_manager.rs b/sled-agent/src/probe_manager.rs index 2f2d5421204..38c0d0eba06 100644 --- a/sled-agent/src/probe_manager.rs +++ b/sled-agent/src/probe_manager.rs @@ -382,6 +382,7 @@ impl ProbeManagerInner { // but probes are supposed to mimic instances as closely as // possible. We should consider if we want to support them here. attached_subnets: vec![], + multicast_groups: &[], })?; let installed_zone = ZoneBuilderFactory::new() diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 2dfdb487c3b..ef483541de6 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -1177,6 +1177,7 @@ impl ServiceManager { dhcp_config: DhcpCfg::default(), // Services do not use attached subnets, only instances. attached_subnets: vec![], + multicast_groups: &[], }) .map_err(|err| Error::ServicePortCreation { service: zone_kind, diff --git a/sled-agent/src/sim/collection.rs b/sled-agent/src/sim/collection.rs index 84a31190cb8..935566bb242 100644 --- a/sled-agent/src/sim/collection.rs +++ b/sled-agent/src/sim/collection.rs @@ -236,18 +236,15 @@ impl SimCollection { while should_step { let (new_state, to_destroy) = { - // The object must be present in `objects` because it only gets - // removed when it comes to rest in the "Destroyed" state, but - // we can only get here if there's an asynchronous state - // transition desired. - // // We do as little as possible with the lock held. In // particular, we want to finish this work before calling out to // notify the nexus. let mut objects = self.objects.lock().await; + + // The object may already have been destroyed and removed by a + // concurrent poke (e.g., sim_step racing with an explicit poke + // from a test). In that case there is nothing left to do. let Some(mut object) = objects.remove(&id) else { - // Instance was already removed (e.g., destroyed by a - // concurrent transition). Nothing left to do. break; }; object.transition_finish(); diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs index e69c830c2f0..a57dc102f85 100644 --- a/sled-agent/src/sim/http_entrypoints.rs +++ b/sled-agent/src/sim/http_entrypoints.rs @@ -56,12 +56,16 @@ use sled_agent_types::early_networking::EarlyNetworkConfigEnvelope; use sled_agent_types::firewall_rules::VpcFirewallRulesEnsureBody; use sled_agent_types::instance::SledVmmState; use sled_agent_types::instance::{ - InstanceEnsureBody, InstanceExternalIpBody, InstanceMulticastBody, + InstanceEnsureBody, InstanceExternalIpBody, InstanceMulticastMembership, VmmIssueDiskSnapshotRequestBody, VmmIssueDiskSnapshotRequestPathParam, VmmIssueDiskSnapshotRequestResponse, VmmPathParam, VmmPutStateBody, VmmPutStateResponse, VmmUnregisterResponse, VpcPathParam, }; use sled_agent_types::inventory::{Inventory, OmicronSledConfig}; +use sled_agent_types::multicast::ClearMcast2Phys; +use sled_agent_types::multicast::ClearMcastForwarding; +use sled_agent_types::multicast::Mcast2PhysMapping; +use sled_agent_types::multicast::McastForwardingEntry; use sled_agent_types::probes::ProbeSet; use sled_agent_types::rot::{ Attestation, CertificateChain, MeasurementLog, Nonce, RotPathParams, @@ -192,52 +196,24 @@ impl SledAgentApi for SledAgentSimImpl { async fn vmm_join_multicast_group( rqctx: RequestContext, path_params: Path, - body: TypedBody, + body: TypedBody, ) -> Result { let sa = rqctx.context(); let propolis_id = path_params.into_inner().propolis_id; - let body_args = body.into_inner(); - - match body_args { - InstanceMulticastBody::Join(membership) => { - sa.instance_join_multicast_group(propolis_id, &membership) - .await?; - } - InstanceMulticastBody::Leave(_) => { - // This endpoint is for joining - reject leave operations - return Err(HttpError::for_bad_request( - None, - "Join endpoint cannot process Leave operations".to_string(), - )); - } - } - + let membership = body.into_inner(); + sa.instance_join_multicast_group(propolis_id, &membership).await?; Ok(HttpResponseUpdatedNoContent()) } async fn vmm_leave_multicast_group( rqctx: RequestContext, path_params: Path, - body: TypedBody, + body: TypedBody, ) -> Result { let sa = rqctx.context(); let propolis_id = path_params.into_inner().propolis_id; - let body_args = body.into_inner(); - - match body_args { - InstanceMulticastBody::Leave(membership) => { - sa.instance_leave_multicast_group(propolis_id, &membership) - .await?; - } - InstanceMulticastBody::Join(_) => { - // This endpoint is for leaving - reject join operations - return Err(HttpError::for_bad_request( - None, - "Leave endpoint cannot process Join operations".to_string(), - )); - } - } - + let membership = body.into_inner(); + sa.instance_leave_multicast_group(propolis_id, &membership).await?; Ok(HttpResponseUpdatedNoContent()) } @@ -391,6 +367,66 @@ impl SledAgentApi for SledAgentSimImpl { Ok(HttpResponseOk(vnics)) } + async fn set_mcast_m2p( + rqctx: RequestContext, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + sa.set_mcast_m2p(&body.into_inner()) + .map_err(|e| HttpError::for_internal_error(e.to_string()))?; + Ok(HttpResponseUpdatedNoContent()) + } + + async fn clear_mcast_m2p( + rqctx: RequestContext, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + sa.clear_mcast_m2p(&body.into_inner()) + .map_err(|e| HttpError::for_internal_error(e.to_string()))?; + Ok(HttpResponseUpdatedNoContent()) + } + + async fn set_mcast_fwd( + rqctx: RequestContext, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + sa.set_mcast_fwd(&body.into_inner()) + .map_err(|e| HttpError::for_internal_error(e.to_string()))?; + Ok(HttpResponseUpdatedNoContent()) + } + + async fn clear_mcast_fwd( + rqctx: RequestContext, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + sa.clear_mcast_fwd(&body.into_inner()) + .map_err(|e| HttpError::for_internal_error(e.to_string()))?; + Ok(HttpResponseUpdatedNoContent()) + } + + async fn list_mcast_m2p( + rqctx: RequestContext, + ) -> Result>, HttpError> { + let sa = rqctx.context(); + let m2p = sa + .list_mcast_m2p() + .map_err(|e| HttpError::for_internal_error(e.to_string()))?; + Ok(HttpResponseOk(m2p)) + } + + async fn list_mcast_fwd( + rqctx: RequestContext, + ) -> Result>, HttpError> { + let sa = rqctx.context(); + let fwd = sa + .list_mcast_fwd() + .map_err(|e| HttpError::for_internal_error(e.to_string()))?; + Ok(HttpResponseOk(fwd)) + } + async fn uplink_ensure( _rqctx: RequestContext, _body: TypedBody, diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index a992f103d81..0d36dd6591f 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -66,6 +66,10 @@ use sled_agent_types::inventory::{ OmicronFileSourceResolverInventory, OmicronSledConfig, OmicronZonesConfig, SingleMeasurementInventory, SledRole, ZpoolHealth, }; +use sled_agent_types::multicast::{ + ClearMcast2Phys, ClearMcastForwarding, Mcast2PhysMapping, + McastForwardingEntry, McastForwardingNextHop, +}; use sled_agent_types::support_bundle::SupportBundleMetadata; use sled_agent_types::system_networking::SystemNetworkingConfig; @@ -96,6 +100,8 @@ pub struct SledAgent { pub nexus_client: Arc, pub simulated_upstairs: Arc, pub v2p_mappings: Mutex>, + pub m2p_mappings: Mutex>, + pub mcast_fwd: Mutex>>, mock_propolis: futures::lock::Mutex< Option<(propolis_mock_server::Server, PropolisClient)>, >, @@ -188,6 +194,8 @@ impl SledAgent { nexus_client, simulated_upstairs, v2p_mappings: Mutex::new(HashSet::new()), + m2p_mappings: Mutex::new(HashSet::new()), + mcast_fwd: Mutex::new(HashMap::new()), external_ips: Mutex::new(HashMap::new()), attached_subnets: Mutex::new(HashMap::new()), multicast_groups: Mutex::new(HashMap::new()), @@ -676,6 +684,58 @@ impl SledAgent { Ok(Vec::from_iter(v2p_mappings.clone())) } + pub fn set_mcast_m2p(&self, req: &Mcast2PhysMapping) -> Result<(), Error> { + let mut m2p = self.m2p_mappings.lock().unwrap(); + m2p.insert((req.group, req.underlay)); + Ok(()) + } + + pub fn clear_mcast_m2p(&self, req: &ClearMcast2Phys) -> Result<(), Error> { + let mut m2p = self.m2p_mappings.lock().unwrap(); + m2p.remove(&(req.group, req.underlay)); + Ok(()) + } + + pub fn set_mcast_fwd( + &self, + req: &McastForwardingEntry, + ) -> Result<(), Error> { + let mut fwd = self.mcast_fwd.lock().unwrap(); + fwd.insert(req.underlay, req.next_hops.clone()); + Ok(()) + } + + pub fn clear_mcast_fwd( + &self, + req: &ClearMcastForwarding, + ) -> Result<(), Error> { + let mut fwd = self.mcast_fwd.lock().unwrap(); + fwd.remove(&req.underlay); + Ok(()) + } + + pub fn list_mcast_m2p(&self) -> Result, Error> { + let m2p = self.m2p_mappings.lock().unwrap(); + Ok(m2p + .iter() + .map(|(group, underlay)| Mcast2PhysMapping { + group: *group, + underlay: *underlay, + }) + .collect()) + } + + pub fn list_mcast_fwd(&self) -> Result, Error> { + let fwd = self.mcast_fwd.lock().unwrap(); + Ok(fwd + .iter() + .map(|(underlay, next_hops)| McastForwardingEntry { + underlay: *underlay, + next_hops: next_hops.clone(), + }) + .collect()) + } + pub async fn instance_put_external_ip( &self, propolis_id: PropolisUuid, diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 8c5e72a6466..c8c52e9980a 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -75,11 +75,15 @@ use sled_agent_types::disk::DiskStateRequested; use sled_agent_types::early_networking::EarlyNetworkConfigEnvelope; use sled_agent_types::instance::ResolvedVpcFirewallRule; use sled_agent_types::instance::{ - InstanceEnsureBody, InstanceExternalIpBody, InstanceMulticastBody, + InstanceEnsureBody, InstanceExternalIpBody, InstanceMulticastMembership, SledVmmState, VmmPutStateResponse, VmmStateRequested, VmmUnregisterResponse, }; use sled_agent_types::inventory::{Inventory, OmicronSledConfig, SledRole}; +use sled_agent_types::multicast::{ + ClearMcast2Phys, ClearMcastForwarding, Mcast2PhysMapping, + McastForwardingEntry, +}; use sled_agent_types::probes::ProbeCreate; use sled_agent_types::resolvable_files::{ PreparedOmicronZone, RemoveMupdateOverrideResult, ResolverStatus, @@ -413,7 +417,6 @@ struct SledAgentInner { // A handle to the trust quorum. trust_quorum: trust_quorum::NodeTaskHandle, - // A handle to the hardware monitor. hardware_monitor: HardwareMonitorHandle, @@ -1036,26 +1039,28 @@ impl SledAgent { .map_err(|e| Error::Instance(e)) } + /// Subscribe a VMM's OPTE port to a multicast group. pub async fn instance_join_multicast_group( &self, propolis_id: PropolisUuid, - multicast_body: &InstanceMulticastBody, + membership: &InstanceMulticastMembership, ) -> Result<(), Error> { self.inner .instances - .join_multicast_group(propolis_id, multicast_body) + .join_multicast_group(propolis_id, membership) .await .map_err(|e| Error::Instance(e)) } + /// Unsubscribe a VMM's OPTE port from a multicast group. pub async fn instance_leave_multicast_group( &self, propolis_id: PropolisUuid, - multicast_body: &InstanceMulticastBody, + membership: &InstanceMulticastMembership, ) -> Result<(), Error> { self.inner .instances - .leave_multicast_group(propolis_id, multicast_body) + .leave_multicast_group(propolis_id, membership) .await .map_err(|e| Error::Instance(e)) } @@ -1140,6 +1145,52 @@ impl SledAgent { .map_err(Error::from) } + /// Install a multicast overlay-to-underlay (M2P) mapping in OPTE. + pub async fn set_mcast_m2p( + &self, + req: &Mcast2PhysMapping, + ) -> Result<(), Error> { + self.inner.port_manager.set_mcast_m2p(req).map_err(Error::from) + } + + /// Remove a multicast overlay-to-underlay (M2P) mapping from OPTE. + pub async fn clear_mcast_m2p( + &self, + req: &ClearMcast2Phys, + ) -> Result<(), Error> { + self.inner.port_manager.clear_mcast_m2p(req).map_err(Error::from) + } + + /// Set multicast forwarding next hops for an underlay group address. + pub async fn set_mcast_fwd( + &self, + req: &McastForwardingEntry, + ) -> Result<(), Error> { + self.inner.port_manager.set_mcast_fwd(req).map_err(Error::from) + } + + /// Remove multicast forwarding entries for an underlay group address. + pub async fn clear_mcast_fwd( + &self, + req: &ClearMcastForwarding, + ) -> Result<(), Error> { + self.inner.port_manager.clear_mcast_fwd(req).map_err(Error::from) + } + + /// List all multicast M2P mappings from OPTE. + pub async fn list_mcast_m2p( + &self, + ) -> Result, Error> { + self.inner.port_manager.list_mcast_m2p().map_err(Error::from) + } + + /// List all multicast forwarding entries from OPTE. + pub async fn list_mcast_fwd( + &self, + ) -> Result, Error> { + self.inner.port_manager.list_mcast_fwd().map_err(Error::from) + } + pub async fn ensure_scrimlet_host_ports( &self, uplinks: Vec, diff --git a/sled-agent/types/src/lib.rs b/sled-agent/types/src/lib.rs index 2d87bbc1761..a1885b4aa66 100644 --- a/sled-agent/types/src/lib.rs +++ b/sled-agent/types/src/lib.rs @@ -16,6 +16,7 @@ pub mod early_networking; pub mod firewall_rules; pub mod instance; pub mod inventory; +pub mod multicast; pub mod probes; pub mod rack_init; pub mod resolvable_files; diff --git a/sled-agent/types/src/multicast.rs b/sled-agent/types/src/multicast.rs new file mode 100644 index 00000000000..27e95a0d94c --- /dev/null +++ b/sled-agent/types/src/multicast.rs @@ -0,0 +1,7 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Multicast networking types for the Sled Agent API. + +pub use sled_agent_types_versions::latest::multicast::*; diff --git a/sled-agent/types/versions/src/latest.rs b/sled-agent/types/versions/src/latest.rs index b4529cf1d27..8d5a4fd19b4 100644 --- a/sled-agent/types/versions/src/latest.rs +++ b/sled-agent/types/versions/src/latest.rs @@ -89,6 +89,18 @@ pub mod firewall_rules { pub use crate::v31::firewall_rules::VpcFirewallRulesEnsureBody; } +pub mod multicast { + pub use crate::v41::multicast::ClearMcast2Phys; + pub use crate::v41::multicast::ClearMcastForwarding; + pub use crate::v41::multicast::Mcast2PhysMapping; + pub use crate::v41::multicast::McastFilterMode; + pub use crate::v41::multicast::McastForwardingEntry; + pub use crate::v41::multicast::McastForwardingNextHop; + pub use crate::v41::multicast::McastReplication; + pub use crate::v41::multicast::McastSourceFilter; + pub use crate::v41::multicast::MulticastGroupCfg; +} + pub mod instance { pub use crate::v1::instance::InstanceExternalIpBody; pub use crate::v1::instance::InstanceMetadata; diff --git a/sled-agent/types/versions/src/lib.rs b/sled-agent/types/versions/src/lib.rs index 27b112414ab..ffce49cf852 100644 --- a/sled-agent/types/versions/src/lib.rs +++ b/sled-agent/types/versions/src/lib.rs @@ -87,6 +87,8 @@ pub mod v39; pub mod v4; #[path = "add_fmd_to_inventory/mod.rs"] pub mod v40; +#[path = "mcast_m2p_forwarding/mod.rs"] +pub mod v41; #[path = "add_probe_put_endpoint/mod.rs"] pub mod v6; #[path = "multicast_support/mod.rs"] diff --git a/sled-agent/types/versions/src/mcast_m2p_forwarding/mod.rs b/sled-agent/types/versions/src/mcast_m2p_forwarding/mod.rs new file mode 100644 index 00000000000..8c9d1bb1c4a --- /dev/null +++ b/sled-agent/types/versions/src/mcast_m2p_forwarding/mod.rs @@ -0,0 +1,10 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Version `MCAST_M2P_FORWARDING` of the Sled Agent API. +//! +//! Adds multicast-to-physical mapping and forwarding types used by +//! the multicast-to-physical and forwarding endpoints. + +pub mod multicast; diff --git a/sled-agent/types/versions/src/mcast_m2p_forwarding/multicast.rs b/sled-agent/types/versions/src/mcast_m2p_forwarding/multicast.rs new file mode 100644 index 00000000000..5c2247c1159 --- /dev/null +++ b/sled-agent/types/versions/src/mcast_m2p_forwarding/multicast.rs @@ -0,0 +1,132 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// Copyright 2026 Oxide Computer Company + +//! Multicast networking types for the sled-agent API. +//! +//! These types support overlay-to-underlay multicast mapping and +//! multicast forwarding configuration via OPTE. The underlay address +//! space is ff04::/64, a subset of admin-local scope per +//! [RFC 7346](https://www.rfc-editor.org/rfc/rfc7346). + +use std::net::IpAddr; +use std::net::Ipv6Addr; + +use schemars::JsonSchema; +use serde::Deserialize; +use serde::Serialize; + +/// Mapping from an overlay multicast group to an underlay multicast +/// address. +/// +/// The underlay address must be within the underlay multicast subnet +/// (ff04::/64). This invariant is enforced by mapping in Nexus, not +/// validated at this layer. +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] +pub struct Mcast2PhysMapping { + /// Overlay multicast group address. + pub group: IpAddr, + /// Underlay IPv6 multicast address (ff04::/64). + pub underlay: Ipv6Addr, +} + +/// Clear a mapping from an overlay multicast group to an underlay +/// multicast address. +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] +pub struct ClearMcast2Phys { + /// Overlay multicast group address. + pub group: IpAddr, + /// Underlay IPv6 multicast address. See [`Mcast2PhysMapping::underlay`]. + pub underlay: Ipv6Addr, +} + +/// Forwarding entry for an underlay multicast address, specifying +/// which next hops should receive replicated packets. +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] +pub struct McastForwardingEntry { + /// Underlay IPv6 multicast address. See [`Mcast2PhysMapping::underlay`]. + pub underlay: Ipv6Addr, + /// Next hops with replication and source filter configuration. + pub next_hops: Vec, +} + +/// Clear all forwarding entries for an underlay multicast address. +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] +pub struct ClearMcastForwarding { + /// Underlay IPv6 multicast address. See [`Mcast2PhysMapping::underlay`]. + pub underlay: Ipv6Addr, +} + +/// A forwarding next hop with replication mode and aggregated +/// source filter. +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq)] +pub struct McastForwardingNextHop { + /// Unicast IPv6 address of the destination sled. + pub next_hop: Ipv6Addr, + /// Replication mode for this next hop. + pub replication: McastReplication, + /// Aggregated source filter for this destination. + pub filter: McastSourceFilter, +} + +/// Replication mode for multicast forwarding. +#[derive( + Clone, Copy, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq, +)] +#[serde(rename_all = "snake_case")] +pub enum McastReplication { + /// Replicate to front panel ports (egress to external networks). + External, + /// Replicate to sled underlay ports. + Underlay, + /// Replicate to both external and underlay ports. + Both, +} + +/// Source filter for multicast forwarding. +#[derive( + Clone, Debug, Default, Deserialize, Serialize, JsonSchema, PartialEq, +)] +pub struct McastSourceFilter { + /// Filter mode. + pub mode: McastFilterMode, + /// Source addresses to include or exclude. + pub sources: Vec, +} + +/// Filter mode for multicast source filtering. +#[derive( + Clone, + Copy, + Debug, + Default, + Deserialize, + Serialize, + JsonSchema, + PartialEq, + Eq, +)] +#[serde(rename_all = "snake_case")] +pub enum McastFilterMode { + /// Accept only packets from listed sources (SSM). + Include, + /// Accept packets from all sources except those listed. + /// With an empty sources list this is any-source multicast (ASM). + #[default] + Exclude, +} + +/// Declarative multicast group subscription for an OPTE port. +/// +/// Represents a single group membership with optional source filtering. +/// Empty `sources` means any-source multicast (ASM) and non-empty means +/// source-specific multicast (SSM). +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq)] +pub struct MulticastGroupCfg { + /// The multicast group IP address (IPv4 or IPv6). + pub group_ip: IpAddr, + /// Source addresses for source-filtered multicast. + pub sources: Vec, +} diff --git a/tools/ci_check_opte_ver.sh b/tools/ci_check_opte_ver.sh index 8715792defb..92dc1f6fb27 100755 --- a/tools/ci_check_opte_ver.sh +++ b/tools/ci_check_opte_ver.sh @@ -81,6 +81,6 @@ BUILDOMAT_DEPLOY_TARGET=$(cat .github/buildomat/jobs/deploy.sh | sed -n 's/#:[ ] if [ "lab-3.0-opte-0.$API_VER" != "$BUILDOMAT_DEPLOY_TARGET" ]; then echo "OPTE version mismatch:" echo "Cargo.toml: $OPTE_REV ($OPTE_VER)" - echo "buildomat deploy job: $BUILDOMAT_DEPLOY_TARGET (expected lab-3.0-opte-0.$API_VER)" + echo "buildomat deploy job: $BUILDOMAT_DEPLOY_TARGET (expected lab-opte-0.$API_VER)" exit 1 fi