diff --git a/README.md b/README.md index fe2a537..0727f10 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ Today, Micromize attaches eBPF programs to LSM hooks and enforces: - **Strict container boundaries** — blocks filesystem escapes and host access - **Capability restriction** — prevents privilege escalation via `unshare`/`clone`/`setns` - **Ptrace blocking** — eliminates ptrace-based debugging/injection attacks -- **Socket restriction** — blocks `AF_ALG` (kernel crypto userspace API) socket usage in containers, mitigating CVE-2026-31431 and related attack surface +- **Socket restriction** — blocks niche/legacy socket address families (AF_ALG, AF_TIPC, AF_RDS, AF_SMC, AF_CAN, AF_NFC, AF_BLUETOOTH, AF_AX25, AF_ATMPVC, AF_ATMSVC, AF_X25, AF_KCM, AF_CAIF) out of the box, mitigating CVE-2026-31431. Defaults are conservative — AF_PACKET (MetalLB/keepalived/kube-proxy IPVS), AF_VSOCK (firecracker/kata), `NETLINK_NETFILTER` (nf_tables CVE family — incompatible with iptables-nft / nft-based CNI / Istio CNI), and `NETLINK_XFRM` (IPsec) are opt-in via `--socket-deny-families` and `--socket-deny-netlink-protocols`. See [gadgets/socket-restrict](gadgets/socket-restrict/README.md). - **Execution integrity** — SBOM + runtime hash validation via `bpf_ima_file_hash` Policies are loaded before container start and enforced at execution time. No runtime replacement. No learning mode. Kernel-native enforcement. @@ -77,6 +77,8 @@ helm install micromize ./charts/micromize \ | `--filter-image-digest` | `""` | Filter out containers running this image digest from monitoring | | `--disable-gadgets` | `""` | Comma-separated list of gadgets to disable (e.g. `ptrace-restrict,cap-restrict`) | | `--exempt-label` | `micromize.dev/exempt` | Kubernetes label key used to mark namespaces as exempt (value must be `true`). Evaluated at startup only. Set to `""` to disable. 
| +| `--socket-deny-families` | (conservative default; see [`gadgets/socket-restrict`](gadgets/socket-restrict/README.md)) | Address families to deny via `socket-restrict`. Accepts names (`AF_ALG`) or decimal numbers. Set to `""` to disable family-level denial. | +| `--socket-deny-netlink-protocols` | `""` | `AF_NETLINK` protocols to deny (e.g. `NETLINK_NETFILTER` to block the nf_tables LPE family — verify CNI compatibility first). | ## Requirements diff --git a/cmd/micromize/root.go b/cmd/micromize/root.go index 61faae5..ac1740b 100644 --- a/cmd/micromize/root.go +++ b/cmd/micromize/root.go @@ -43,14 +43,32 @@ const ( ) var ( - enforce bool - verbose bool - filterNamespaces string - filterImageDigest string - disableGadgets string - exemptLabel string + enforce bool + verbose bool + filterNamespaces string + filterImageDigest string + disableGadgets string + exemptLabel string + socketDenyFamilies string + socketDenyNetlinkProtocols string ) +// defaultSocketDenyFamilies is the conservative set of socket address +// families denied out of the box. It targets families that are essentially +// never used by cloud-native application containers but periodically ship +// kernel LPEs. AF_PACKET (used by MetalLB, keepalived, tcpdump-in-pod, +// kube-proxy IPVS, Cilium) and AF_VSOCK (firecracker/kata workloads) are +// deliberately excluded — operators that want them blocked can opt-in via +// --socket-deny-families. +const defaultSocketDenyFamilies = "AF_ALG,AF_TIPC,AF_RDS,AF_SMC,AF_CAN,AF_NFC,AF_BLUETOOTH,AF_AX25,AF_ATMPVC,AF_ATMSVC,AF_X25,AF_KCM,AF_CAIF" + +// defaultSocketDenyNetlinkProtocols is intentionally empty: blocking +// NETLINK_NETFILTER would break iptables-nft / nf_tables-based CNI plugins +// (Istio CNI, kube-proxy nft, etc.); NETLINK_XFRM would break IPsec +// sidecars. Operators concerned about nf_tables LPEs should opt-in via +// --socket-deny-netlink-protocols=NETLINK_NETFILTER. 
+const defaultSocketDenyNetlinkProtocols = "" + var rootCmd = &cobra.Command{ Use: "micromize", Short: "micromize is a security hardening tool for containerized applications", @@ -78,6 +96,8 @@ func init() { rootCmd.PersistentFlags().StringVar(&filterImageDigest, "filter-image-digest", "", "Filter out containers running this image digest from monitoring (e.g. sha256:abc123...)") rootCmd.PersistentFlags().StringVar(&disableGadgets, "disable-gadgets", "", "Comma-separated list of gadgets to disable (e.g. ptrace-restrict,cap-restrict)") rootCmd.PersistentFlags().StringVar(&exemptLabel, "exempt-label", "micromize.dev/exempt", "Kubernetes label key used to mark namespaces as exempt from monitoring (value must be 'true'). Set to empty string to disable. Changes take effect on restart.") + rootCmd.PersistentFlags().StringVar(&socketDenyFamilies, "socket-deny-families", defaultSocketDenyFamilies, "Comma-separated list of socket address families denied by socket-restrict. Names (e.g. AF_ALG) or decimal numbers; case-insensitive. Set to empty string to disable family-level denial.") + rootCmd.PersistentFlags().StringVar(&socketDenyNetlinkProtocols, "socket-deny-netlink-protocols", defaultSocketDenyNetlinkProtocols, "Comma-separated list of AF_NETLINK protocols denied by socket-restrict. Defaults to empty. Use NETLINK_NETFILTER to block nf_tables LPE chains (incompatible with iptables-nft / nf_tables-based CNI). 
Use NETLINK_XFRM to block XFRM/IPsec control plane.") } func run(ctx context.Context) error { @@ -124,7 +144,20 @@ func run(ctx context.Context) error { return fmt.Errorf("creating local manager operator: %w", err) } - contextManager := gadget.NewContextManager([]operators.DataOperator{ociHandlerOp, localManagerOp, imaOp, eventTypeOp, outputOp}) + socketDenyFamilyList, err := operators.ParseSocketDenyFamilies(socketDenyFamilies) + if err != nil { + return fmt.Errorf("parsing --socket-deny-families: %w", err) + } + socketDenyNetlinkProtocolList, err := operators.ParseSocketDenyNetlinkProtocols(socketDenyNetlinkProtocols) + if err != nil { + return fmt.Errorf("parsing --socket-deny-netlink-protocols: %w", err) + } + slog.Info("Socket-restrict deny-list", + "families", socketDenyFamilyList, + "netlinkProtocols", socketDenyNetlinkProtocolList) + socketRestrictOp := operators.NewSocketRestrictOperator(socketDenyFamilyList, socketDenyNetlinkProtocolList) + + contextManager := gadget.NewContextManager([]operators.DataOperator{ociHandlerOp, localManagerOp, imaOp, socketRestrictOp, eventTypeOp, outputOp}) // Create gadget registry registry := gadget.NewRegistry(contextManager, runtimeManager) diff --git a/cmd/micromize/root_test.go b/cmd/micromize/root_test.go index c549d01..95f804f 100644 --- a/cmd/micromize/root_test.go +++ b/cmd/micromize/root_test.go @@ -40,6 +40,12 @@ func TestBuildDisabledSet(t *testing.T) { wantDisabled: []string{"ptrace-restrict", "cap-restrict"}, wantEnabled: []string{"fs-restrict"}, }, + { + name: "socket restrict can be disabled alongside others", + disableGadgets: "socket-restrict,cap-restrict", + wantDisabled: []string{"socket-restrict", "cap-restrict"}, + wantEnabled: []string{"fs-restrict", "ptrace-restrict", "binary-attestation"}, + }, { name: "whitespace around names is trimmed", disableGadgets: " ptrace-restrict , cap-restrict ", diff --git a/gadgets/socket-restrict/README.md b/gadgets/socket-restrict/README.md index 723351d..19c2174 
100644 --- a/gadgets/socket-restrict/README.md +++ b/gadgets/socket-restrict/README.md @@ -1,24 +1,92 @@ # socket-restrict -Restrict dangerous socket primitives in containers. +Restrict dangerous socket address families and (optionally) high-risk +`AF_NETLINK` protocols in containers. -This gadget blocks all `AF_ALG` (kernel crypto userspace API) socket usage -inside containers. `AF_ALG` is rarely needed in containerized production -workloads — most TLS, SSH, and dm-crypt use cases never touch it — and -blocking it eliminates a class of kernel attack surface from the container -boundary. +This gadget started as an `AF_ALG` hardening control and now applies a +runtime-configurable deny-list for socket families that are rarely needed in +cloud-native workloads but repeatedly show up in container-escape and +local-privilege-escalation chains. The deny-list is **conservative by +default**: only niche/legacy address families are denied out of the box. +Families that have legitimate cloud-native uses (`AF_PACKET` for MetalLB / +keepalived / kube-proxy IPVS / tcpdump-in-pod, `AF_VSOCK` for +firecracker / kata-containers) and `AF_NETLINK` protocols (`NETLINK_NETFILTER` +for iptables-nft and nft-based CNI, `NETLINK_XFRM` for IPsec) are **opt-in**. The initial motivation is CVE-2026-31431 (Copy Fail), a Linux kernel local -privilege escalation in `algif_aead` that can be triggered via `AF_ALG` -sockets. This gadget blocks the entire killchain at socket creation time, -before any vulnerable kernel path is reached. +privilege escalation in `algif_aead` reachable via `AF_ALG` sockets. `AF_ALG` +is rarely needed in containerized production workloads — most TLS, SSH, and +dm-crypt use cases never touch it — and blocking it eliminates that attack +surface before any vulnerable path is reached. Opt-in coverage extends to +`AF_VSOCK` (e.g. 
CVE-2024-50264) and the `nf_tables` LPE family +(CVE-2022-32250, CVE-2022-34918, CVE-2023-32233, CVE-2024-1086, +CVE-2024-26925, CVE-2024-26581, CVE-2024-26809) reachable through +`AF_NETLINK / NETLINK_NETFILTER`. + +Allowed by default: `AF_INET`, `AF_INET6`, `AF_UNIX`, all `AF_NETLINK` +protocols (including `NETLINK_ROUTE`, `NETLINK_GENERIC`, `NETLINK_SOCK_DIAG`, +**and** `NETLINK_NETFILTER` / `NETLINK_XFRM`), `AF_PACKET`, `AF_VSOCK`, and +every other family not listed below. + +## Default deny-list (`--socket-deny-families`) + +| Family | Number | Rationale | +|---|---|---| +| `AF_ALG` | 38 | Kernel crypto userspace API; preserves the original CVE-2026-31431 mitigation. | +| `AF_TIPC` | 30 | Cluster-IPC protocol, multiple historical kernel LPEs. | +| `AF_RDS` | 21 | Reliable datagram sockets, multiple historical LPEs. | +| `AF_SMC` | 43 | Shared-memory comms, niche. | +| `AF_CAN` | 29 | Controller area network, automotive. | +| `AF_NFC` | 39 | Near-field comms. | +| `AF_BLUETOOTH` | 31 | Bluetooth stack. | +| `AF_AX25` | 3 | Amateur radio. | +| `AF_ATMPVC` | 8 | ATM permanent VC. | +| `AF_ATMSVC` | 20 | ATM switched VC. | +| `AF_X25` | 9 | X.25 networking. | +| `AF_KCM` | 41 | Kernel connection multiplexer. | +| `AF_CAIF` | 37 | Communication CPU interface. | + +## Opt-in family blocks + +`--socket-deny-families` **replaces** the default list; it does not append to +it. Passing `--socket-deny-families=AF_PACKET` on its own denies only +`AF_PACKET` and drops every default entry, including `AF_ALG`. To extend the +defaults, repeat them and append the extra families (`<defaults>` below stands +for the comma-separated default list from the table above). Examples: + +- `--socket-deny-families=<defaults>,AF_PACKET` — also block raw link-layer sockets. + Breaks: MetalLB, keepalived, `tcpdump` in pods, kube-proxy IPVS, + Cilium endpoint operations. +- `--socket-deny-families=<defaults>,AF_VSOCK` — also block virtio-vsock. + Breaks: firecracker host↔guest agents, kata-containers control plane. + +## Opt-in `AF_NETLINK` protocol blocks (`--socket-deny-netlink-protocols`) + +Empty by default. Add protocols by name (case-insensitive) or decimal number. 
+ +| Protocol | Number | What it blocks | Compatibility caveat | +|---|---|---|---| +| `NETLINK_NETFILTER` | 12 | `nf_tables` LPE family (CVE-2022-32250 → CVE-2024-26809) | **iptables-nft / nft-based CNI / kube-proxy nft mode / Istio CNI all use this.** Verify your data plane before enabling. | +| `NETLINK_XFRM` | 6 | XFRM/IPsec control plane attack surface | IPsec sidecars and pods using kernel IPsec will break. | +| `NETLINK_AUDIT` | 9 | Linux audit control plane | Auditd-style agents will break. | +| `NETLINK_KOBJECT_UEVENT` | 15 | Device-event channel | udev-like consumers will break. | + +Both flags accept symbolic names (`AF_VSOCK`, `NETLINK_NETFILTER`) or decimal +numbers. Whitespace is trimmed, duplicates are collapsed. + +## Recommended rollout + +1. Start with the default `--socket-deny-families` list (above). It is + designed to be safe for typical cloud-native pods. +2. To extend coverage, **run micromize in audit mode (`--enforce=false`) + first**, opt in the additional families / protocols, and observe the + `socket_family_denied_create` / `_bind` events for a representative + workload sample. +3. Move to `--enforce=true` only once you have zero unexpected events for + the workloads you care about. ## Hooks | Hook | Purpose | |---|---| -| `lsm/socket_create` | Block `AF_ALG` socket creation (main choke point) | -| `lsm/socket_bind` | Defense-in-depth: block `AF_ALG` bind if a socket FD exists from before policy load. Preserves `alg_type`/`alg_name` for visibility. | +| `lsm/socket_create` | Block denied socket families and selected `AF_NETLINK` protocols at creation time. | +| `lsm/socket_bind` | Defense-in-depth: block denied binds if a socket FD existed before policy load. Preserves `alg_type` / `alg_name` for `AF_ALG` visibility. 
| ## Getting Started diff --git a/gadgets/socket-restrict/gadget.yaml b/gadgets/socket-restrict/gadget.yaml index d3a3dfa..c515a15 100644 --- a/gadgets/socket-restrict/gadget.yaml +++ b/gadgets/socket-restrict/gadget.yaml @@ -18,6 +18,9 @@ datasources: family: annotations: description: Socket address family + protocol: + annotations: + description: Socket protocol (e.g. NETLINK_NETFILTER for AF_NETLINK) process: annotations: description: The process attempting a restricted socket operation diff --git a/gadgets/socket-restrict/program.bpf.c b/gadgets/socket-restrict/program.bpf.c index 95da65c..e0be20c 100644 --- a/gadgets/socket-restrict/program.bpf.c +++ b/gadgets/socket-restrict/program.bpf.c @@ -5,6 +5,7 @@ #include +#include #include #include #include @@ -16,17 +17,52 @@ GADGET_TRACER_MAP(events, 1024 * 256); GADGET_TRACER(socket_restrict, events, event); -// Block AF_ALG socket creation — main choke point. +// Runtime-populated deny-list of address families. Populated from userspace +// at gadget init from the --socket-deny-families flag. Empty by default +// means the BPF program is a no-op for that family. +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, MAX_DENIED_FAMILIES); + __type(key, __u16); + __type(value, __u8); +} denied_families SEC(".maps"); + +// Runtime-populated deny-list of AF_NETLINK protocols. Empty by default +// (NETLINK_NETFILTER/XFRM/AUDIT/KOBJECT_UEVENT are all opt-in). +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, MAX_DENIED_NETLINK_PROTOCOLS); + __type(key, __u32); + __type(value, __u8); +} denied_netlink_protocols SEC(".maps"); + +static __always_inline bool is_family_denied(__u16 family) { + return bpf_map_lookup_elem(&denied_families, &family) != NULL; +} + +static __always_inline bool is_netlink_protocol_denied(__u32 protocol) { + return bpf_map_lookup_elem(&denied_netlink_protocols, &protocol) != NULL; +} + +// Block denied socket families and selected AF_NETLINK protocols. 
SEC("lsm/socket_create") int BPF_PROG(micromize_socket_create, int family, int type, int protocol, int kern) { + (void)type; + if (kern) return 0; if (gadget_should_discard_data_current()) return 0; - if (family != AF_ALG) + __u16 fam = (__u16)family; + __u32 socket_protocol = protocol >= 0 ? (__u32)protocol : 0; + + bool denied = is_family_denied(fam); + if (!denied && fam == AF_NETLINK) + denied = is_netlink_protocol_denied(socket_protocol); + if (!denied) return 0; struct event *event; @@ -39,8 +75,10 @@ int BPF_PROG(micromize_socket_create, int family, int type, int protocol, gadget_process_populate(&event->process); event->timestamp_raw = bpf_ktime_get_boot_ns(); - event->event_type = EVENT_TYPE_SOCKET_AF_ALG_CREATE; - event->family = AF_ALG; + event->event_type = fam == AF_ALG ? EVENT_TYPE_SOCKET_AF_ALG_CREATE + : EVENT_TYPE_SOCKET_FAMILY_DENIED_CREATE; + event->family = (__u32)fam; + event->protocol = socket_protocol; event->alg_type[0] = '\0'; event->alg_name[0] = '\0'; @@ -52,22 +90,34 @@ int BPF_PROG(micromize_socket_create, int family, int type, int protocol, return 0; } -// Defense-in-depth: block AF_ALG bind if a socket FD exists from before -// policy load. Preserves alg_type/alg_name for visibility. +// Defense-in-depth: block denied socket binds if a socket FD exists from before +// policy load. Preserve AF_ALG alg_type/alg_name for visibility. SEC("lsm/socket_bind") int BPF_PROG(micromize_socket_bind, struct socket *sock, struct sockaddr *address, int addrlen) { - (void)sock; - if (gadget_should_discard_data_current()) return 0; - if (!address || addrlen < SOCKADDR_ALG_TYPE_END) + if (!address || addrlen < sizeof(__u16)) return 0; __u16 family = 0; bpf_probe_read_kernel(&family, sizeof(family), address); - if (family != AF_ALG) + + bool denied = is_family_denied(family); + __u32 protocol = 0; + + // Micro-optimization: sk_protocol is only meaningful (and only consulted by + // the deny-list) for AF_NETLINK. 
Avoid the BPF_CORE_READ on every other + // socket bind. + if (!denied && family == AF_NETLINK) { + struct sock *sk = BPF_CORE_READ(sock, sk); + if (sk) { + protocol = (__u32)BPF_CORE_READ_BITFIELD_PROBED(sk, sk_protocol); + denied = is_netlink_protocol_denied(protocol); + } + } + if (!denied) return 0; struct event *event; @@ -80,19 +130,23 @@ int BPF_PROG(micromize_socket_bind, struct socket *sock, gadget_process_populate(&event->process); event->timestamp_raw = bpf_ktime_get_boot_ns(); - event->event_type = EVENT_TYPE_SOCKET_AF_ALG_BIND; + event->event_type = family == AF_ALG ? EVENT_TYPE_SOCKET_AF_ALG_BIND + : EVENT_TYPE_SOCKET_FAMILY_DENIED_BIND; event->family = family; + event->protocol = protocol; + event->alg_type[0] = '\0'; + event->alg_name[0] = '\0'; - bpf_probe_read_kernel(event->alg_type, SOCKADDR_ALG_TYPE_LEN, - (const char *)address + SOCKADDR_ALG_TYPE_OFFSET); - event->alg_type[SOCKADDR_ALG_TYPE_LEN] = '\0'; + if (family == AF_ALG && addrlen >= SOCKADDR_ALG_TYPE_END) { + bpf_probe_read_kernel(event->alg_type, SOCKADDR_ALG_TYPE_LEN, + (const char *)address + SOCKADDR_ALG_TYPE_OFFSET); + event->alg_type[SOCKADDR_ALG_TYPE_LEN] = '\0'; - if (addrlen >= SOCKADDR_ALG_MIN_LEN) { - bpf_probe_read_kernel(event->alg_name, SOCKADDR_ALG_NAME_LEN, - (const char *)address + SOCKADDR_ALG_NAME_OFFSET); - event->alg_name[SOCKADDR_ALG_NAME_LEN - 1] = '\0'; - } else { - event->alg_name[0] = '\0'; + if (addrlen >= SOCKADDR_ALG_MIN_LEN) { + bpf_probe_read_kernel(event->alg_name, SOCKADDR_ALG_NAME_LEN, + (const char *)address + SOCKADDR_ALG_NAME_OFFSET); + event->alg_name[SOCKADDR_ALG_NAME_LEN - 1] = '\0'; + } } gadget_submit_buf(ctx, &events, event, sizeof(*event)); diff --git a/gadgets/socket-restrict/program.bpf.h b/gadgets/socket-restrict/program.bpf.h index 79f559d..3e42db3 100644 --- a/gadgets/socket-restrict/program.bpf.h +++ b/gadgets/socket-restrict/program.bpf.h @@ -8,10 +8,86 @@ #define EPERM 1 #endif +#ifndef AF_AX25 +#define AF_AX25 3 +#endif + +#ifndef 
AF_ATMPVC +#define AF_ATMPVC 8 +#endif + +#ifndef AF_X25 +#define AF_X25 9 +#endif + +#ifndef AF_NETLINK +#define AF_NETLINK 16 +#endif + +#ifndef AF_PACKET +#define AF_PACKET 17 +#endif + +#ifndef AF_ATMSVC +#define AF_ATMSVC 20 +#endif + +#ifndef AF_RDS +#define AF_RDS 21 +#endif + +#ifndef AF_CAN +#define AF_CAN 29 +#endif + +#ifndef AF_TIPC +#define AF_TIPC 30 +#endif + +#ifndef AF_BLUETOOTH +#define AF_BLUETOOTH 31 +#endif + +#ifndef AF_CAIF +#define AF_CAIF 37 +#endif + #ifndef AF_ALG #define AF_ALG 38 #endif +#ifndef AF_NFC +#define AF_NFC 39 +#endif + +#ifndef AF_VSOCK +#define AF_VSOCK 40 +#endif + +#ifndef AF_KCM +#define AF_KCM 41 +#endif + +#ifndef AF_SMC +#define AF_SMC 43 +#endif + +#ifndef NETLINK_XFRM +#define NETLINK_XFRM 6 +#endif + +#ifndef NETLINK_AUDIT +#define NETLINK_AUDIT 9 +#endif + +#ifndef NETLINK_NETFILTER +#define NETLINK_NETFILTER 12 +#endif + +#ifndef NETLINK_KOBJECT_UEVENT +#define NETLINK_KOBJECT_UEVENT 15 +#endif + #define SOCKADDR_ALG_TYPE_OFFSET 2 #define SOCKADDR_ALG_TYPE_LEN 14 #define SOCKADDR_ALG_TYPE_END (SOCKADDR_ALG_TYPE_OFFSET + SOCKADDR_ALG_TYPE_LEN) @@ -22,11 +98,15 @@ #define EVENT_ALG_TYPE_LEN (SOCKADDR_ALG_TYPE_LEN + 1) +#define MAX_DENIED_FAMILIES 64 +#define MAX_DENIED_NETLINK_PROTOCOLS 32 + struct event { gadget_timestamp timestamp_raw; struct gadget_process process; __u32 event_type; __u32 family; + __u32 protocol; char alg_type[EVENT_ALG_TYPE_LEN]; char alg_name[SOCKADDR_ALG_NAME_LEN]; }; diff --git a/include/micromize/event_types.h b/include/micromize/event_types.h index 90011ee..84e86ae 100644 --- a/include/micromize/event_types.h +++ b/include/micromize/event_types.h @@ -29,6 +29,8 @@ enum micromize_event_type { // socket-restrict EVENT_TYPE_SOCKET_AF_ALG_CREATE = 11, EVENT_TYPE_SOCKET_AF_ALG_BIND = 12, + EVENT_TYPE_SOCKET_FAMILY_DENIED_CREATE = 14, + EVENT_TYPE_SOCKET_FAMILY_DENIED_BIND = 15, }; #endif /* __MICROMIZE_EVENT_TYPES_H */ diff --git a/internal/gadget/registry_test.go 
b/internal/gadget/registry_test.go index 978a923..7fe499e 100644 --- a/internal/gadget/registry_test.go +++ b/internal/gadget/registry_test.go @@ -60,6 +60,25 @@ func TestRegistry_Register(t *testing.T) { } } +func TestRegistry_RegisterAllDefaultGadgets(t *testing.T) { + r := NewRegistry(&mockContextCreator{}, &mockRuntimeManager{}) + gadgets := []string{"fs-restrict", "cap-restrict", "ptrace-restrict", "socket-restrict", "binary-attestation"} + + for _, name := range gadgets { + r.Register(name, &GadgetConfig{ImageName: name + "-image"}) + } + + if len(r.gadgets) != len(gadgets) { + t.Fatalf("expected %d gadgets, got %d", len(gadgets), len(r.gadgets)) + } + + for _, name := range gadgets { + if _, ok := r.gadgets[name]; !ok { + t.Errorf("expected gadget %q to be registered", name) + } + } +} + func TestRegistry_RunAll(t *testing.T) { done := make(chan struct{}) var once sync.Once diff --git a/internal/operators/operators.go b/internal/operators/operators.go index c1b79ea..92cbefc 100644 --- a/internal/operators/operators.go +++ b/internal/operators/operators.go @@ -310,6 +310,8 @@ const ( eventTypeSocketAFAlgCreate = 11 eventTypeSocketAFAlgBind = 12 eventTypeCapModuleAutoload = 13 + eventTypeSocketFamilyDeniedCreate = 14 + eventTypeSocketFamilyDeniedBind = 15 ) var eventTypeNames = map[uint32]string{ @@ -327,6 +329,8 @@ var eventTypeNames = map[uint32]string{ eventTypeSocketAFAlgCreate: "af_alg_socket_create", eventTypeSocketAFAlgBind: "af_alg_socket_bind", eventTypeCapModuleAutoload: "module_autoload", + eventTypeSocketFamilyDeniedCreate: "socket_family_denied_create", + eventTypeSocketFamilyDeniedBind: "socket_family_denied_bind", } // NewEventTypeOperator creates an operator that enriches events with a diff --git a/internal/operators/output.go b/internal/operators/output.go index a55416d..9fbb0b3 100644 --- a/internal/operators/output.go +++ b/internal/operators/output.go @@ -42,6 +42,8 @@ var eventDescriptions = map[uint32]string{ eventTypeSocketAFAlgCreate: 
"AF_ALG socket creation blocked", eventTypeSocketAFAlgBind: "AF_ALG socket bind blocked", eventTypeCapModuleAutoload: "Kernel module auto-load blocked", + eventTypeSocketFamilyDeniedCreate: "Socket family denied (create)", + eventTypeSocketFamilyDeniedBind: "Socket family denied (bind)", } var eventEmojis = map[uint32]string{} @@ -51,6 +53,56 @@ var capNames = map[int32]string{ 21: "CAP_SYS_ADMIN", } +const ( + afAX25 = 3 + afATMPVC = 8 + afX25 = 9 + afNetlink = 16 + afPacket = 17 + afATMSVC = 20 + afRDS = 21 + afCAN = 29 + afTIPC = 30 + afBluetooth = 31 + afCAIF = 37 + afALG = 38 + afNFC = 39 + afVSOCK = 40 + afKCM = 41 + afSMC = 43 + + netlinkXFRM = 6 + netlinkAudit = 9 + netlinkNetfilter = 12 + netlinkKObjectUevent = 15 +) + +var socketFamilyNames = map[uint32]string{ + afAX25: "AF_AX25", + afATMPVC: "AF_ATMPVC", + afX25: "AF_X25", + afNetlink: "AF_NETLINK", + afPacket: "AF_PACKET", + afATMSVC: "AF_ATMSVC", + afRDS: "AF_RDS", + afCAN: "AF_CAN", + afTIPC: "AF_TIPC", + afBluetooth: "AF_BLUETOOTH", + afCAIF: "AF_CAIF", + afALG: "AF_ALG", + afNFC: "AF_NFC", + afVSOCK: "AF_VSOCK", + afKCM: "AF_KCM", + afSMC: "AF_SMC", +} + +var netlinkProtocolNames = map[uint32]string{ + netlinkXFRM: "NETLINK_XFRM", + netlinkAudit: "NETLINK_AUDIT", + netlinkNetfilter: "NETLINK_NETFILTER", + netlinkKObjectUevent: "NETLINK_KOBJECT_UEVENT", +} + type eventFields struct { eventType datasource.FieldAccessor timestampRaw datasource.FieldAccessor @@ -78,8 +130,10 @@ type eventFields struct { syscall datasource.FieldAccessor // socket-restrict specific - algType datasource.FieldAccessor - algName datasource.FieldAccessor + family datasource.FieldAccessor + protocol datasource.FieldAccessor + algType datasource.FieldAccessor + algName datasource.FieldAccessor // cap-restrict module autoload moduleName datasource.FieldAccessor @@ -183,6 +237,8 @@ func collectEventFields(ds datasource.DataSource, etField datasource.FieldAccess f.cap = ds.GetField("cap") f.syscall = ds.GetField("syscall") + 
f.family = ds.GetField("family") + f.protocol = ds.GetField("protocol") f.algType = ds.GetField("alg_type") f.algName = ds.GetField("alg_name") f.moduleName = ds.GetField("module_name") @@ -218,6 +274,9 @@ func formatAndPrintEvent(f *eventFields, data datasource.Data) { if filename := fieldStr(f.filename, data); filename != "" { fmt.Fprintf(&sb, ". Filename: %s", filename) } + if isSocketFamilyDeniedEvent(eventType) { + appendSocketFamilyDetails(&sb, f, data) + } if algType := fieldStr(f.algType, data); algType != "" { fmt.Fprintf(&sb, ". AF_ALG type: %s", algType) if algName := fieldStr(f.algName, data); algName != "" { @@ -265,6 +324,46 @@ func formatAndPrintEvent(f *eventFields, data datasource.Data) { outputMu.Unlock() } +func isSocketFamilyDeniedEvent(eventType uint32) bool { + return eventType == eventTypeSocketFamilyDeniedCreate || eventType == eventTypeSocketFamilyDeniedBind +} + +func appendSocketFamilyDetails(sb *strings.Builder, f *eventFields, data datasource.Data) { + if f.family == nil { + return + } + + family, err := f.family.Uint32(data) + if err != nil { + return + } + fmt.Fprintf(sb, ". Family: %s", formatSocketFamily(family)) + + if family != afNetlink || f.protocol == nil { + return + } + + protocol, err := f.protocol.Uint32(data) + if err != nil { + return + } + fmt.Fprintf(sb, ". 
Protocol: %s", formatNetlinkProtocol(protocol)) +} + +func formatSocketFamily(family uint32) string { + if name, ok := socketFamilyNames[family]; ok { + return fmt.Sprintf("%s (%d)", name, family) + } + return fmt.Sprintf("%d", family) +} + +func formatNetlinkProtocol(protocol uint32) string { + if name, ok := netlinkProtocolNames[protocol]; ok { + return fmt.Sprintf("%s (%d)", name, protocol) + } + return fmt.Sprintf("%d", protocol) +} + func containerIdentity(f *eventFields, data datasource.Data) string { ns := fieldStr(f.k8sNamespace, data) pod := fieldStr(f.k8sPodName, data) diff --git a/internal/operators/socket_restrict.go b/internal/operators/socket_restrict.go new file mode 100644 index 0000000..eaa4e34 --- /dev/null +++ b/internal/operators/socket_restrict.go @@ -0,0 +1,236 @@ +// Copyright The micromize authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package operators + +import ( + "fmt" + "log/slog" + "strconv" + "strings" + + "github.com/cilium/ebpf" + igoperators "github.com/inspektor-gadget/inspektor-gadget/pkg/operators" + "github.com/inspektor-gadget/inspektor-gadget/pkg/operators/simple" +) + +// Linux socket address-family constants relevant to the socket-restrict +// gadget. Mirrors gadgets/socket-restrict/program.bpf.h. 
var socketFamilyByName = map[string]uint16{
	"AF_UNIX":       1,
	"AF_INET":       2,
	"AF_AX25":       3,
	"AF_IPX":        4,
	"AF_APPLETALK":  5,
	"AF_NETROM":     6,
	"AF_BRIDGE":     7,
	"AF_ATMPVC":     8,
	"AF_X25":        9,
	"AF_INET6":      10,
	"AF_ROSE":       11,
	"AF_DECNET":     12,
	"AF_NETBEUI":    13,
	"AF_SECURITY":   14,
	"AF_KEY":        15,
	"AF_NETLINK":    16,
	"AF_PACKET":     17,
	"AF_ASH":        18,
	"AF_ECONET":     19,
	"AF_ATMSVC":     20,
	"AF_RDS":        21,
	"AF_SNA":        22,
	"AF_IRDA":       23,
	"AF_PPPOX":      24,
	"AF_WANPIPE":    25,
	"AF_LLC":        26,
	"AF_IB":         27,
	"AF_MPLS":       28,
	"AF_CAN":        29,
	"AF_TIPC":       30,
	"AF_BLUETOOTH":  31,
	"AF_IUCV":       32,
	"AF_RXRPC":      33,
	"AF_ISDN":       34,
	"AF_PHONET":     35,
	"AF_IEEE802154": 36,
	"AF_CAIF":       37,
	"AF_ALG":        38,
	"AF_NFC":        39,
	"AF_VSOCK":      40,
	"AF_KCM":        41,
	"AF_QIPCRTR":    42,
	"AF_SMC":        43,
	"AF_XDP":        44,
	"AF_MCTP":       45,
}

// Linux AF_NETLINK protocol numbers consulted by socket-restrict.
var netlinkProtocolByName = map[string]uint32{
	"NETLINK_ROUTE":          0,
	"NETLINK_UNUSED":         1,
	"NETLINK_USERSOCK":       2,
	"NETLINK_FIREWALL":       3,
	"NETLINK_SOCK_DIAG":      4,
	"NETLINK_NFLOG":          5,
	"NETLINK_XFRM":           6,
	"NETLINK_SELINUX":        7,
	"NETLINK_ISCSI":          8,
	"NETLINK_AUDIT":          9,
	"NETLINK_FIB_LOOKUP":     10,
	"NETLINK_CONNECTOR":      11,
	"NETLINK_NETFILTER":      12,
	"NETLINK_IP6_FW":         13,
	"NETLINK_DNRTMSG":        14,
	"NETLINK_KOBJECT_UEVENT": 15,
	"NETLINK_GENERIC":        16,
	"NETLINK_SCSITRANSPORT":  18,
	"NETLINK_ECRYPTFS":       19,
	"NETLINK_RDMA":           20,
	"NETLINK_CRYPTO":         21,
	"NETLINK_SMC":            22,
}

// parseDenyList is the shared implementation behind ParseSocketDenyFamilies
// and ParseSocketDenyNetlinkProtocols.
//
// It splits input on commas, trims whitespace, skips empty entries, and
// resolves each remaining token either through byName (case-insensitively)
// or as an unsigned decimal number that must fit in bitSize bits. Values are
// returned in first-seen order with duplicates removed. kind names the value
// category in error messages (e.g. "address family"). An input that yields no
// tokens returns a nil slice and a nil error.
func parseDenyList[T uint16 | uint32](input, kind string, byName map[string]T, bitSize int) ([]T, error) {
	var out []T
	seen := make(map[T]struct{})
	for _, raw := range strings.Split(input, ",") {
		token := strings.TrimSpace(raw)
		if token == "" {
			continue
		}
		val, known := byName[strings.ToUpper(token)]
		if !known {
			n, err := strconv.ParseUint(token, 10, bitSize)
			if err != nil {
				// Tell a well-formed but too-large number apart from a
				// token that is neither a known name nor a number, so the
				// operator sees which mistake they made.
				if numErr, ok := err.(*strconv.NumError); ok && numErr.Err == strconv.ErrRange {
					return nil, fmt.Errorf("%s %q out of range (max %d)", kind, token, uint64(1)<<bitSize-1)
				}
				return nil, fmt.Errorf("unknown %s %q", kind, token)
			}
			val = T(n)
		}
		if _, dup := seen[val]; dup {
			continue
		}
		seen[val] = struct{}{}
		out = append(out, val)
	}
	return out, nil
}

// ParseSocketDenyFamilies parses a comma-separated string of address-family
// names (e.g. "AF_ALG,AF_TIPC") or decimal numbers (e.g. "38,30") into a
// deduplicated slice of family numbers. Names are matched case-insensitively,
// whitespace is trimmed, and empty entries are skipped; an empty input yields
// a nil slice. It returns an error for an unknown name and a distinct
// "out of range" error for a number that does not fit in 16 bits.
func ParseSocketDenyFamilies(input string) ([]uint16, error) {
	return parseDenyList(input, "address family", socketFamilyByName, 16)
}

// ParseSocketDenyNetlinkProtocols parses a comma-separated string of
// AF_NETLINK protocol names (e.g. "NETLINK_NETFILTER,NETLINK_XFRM") or
// decimal numbers into a deduplicated slice of protocol numbers. It follows
// the same trimming, case-folding and error rules as ParseSocketDenyFamilies,
// with a 32-bit numeric range.
func ParseSocketDenyNetlinkProtocols(input string) ([]uint32, error) {
	return parseDenyList(input, "netlink protocol", netlinkProtocolByName, 32)
}

// Keep in sync with gadgets/socket-restrict/program.bpf.c.
const (
	socketDeniedFamiliesMapName         = "map/denied_families"
	socketDeniedNetlinkProtocolsMapName = "map/denied_netlink_protocols"
)

// NewSocketRestrictOperator returns a data operator that, on each gadget's
// init, populates the socket-restrict BPF deny-list maps from the supplied
// family / netlink-protocol slices.
The operator is a no-op for any gadget +// that does not expose those maps. +func NewSocketRestrictOperator(families []uint16, netlinkProtocols []uint32) igoperators.DataOperator { + slog.Debug("Creating socket-restrict operator", + "families", families, "netlinkProtocols", netlinkProtocols) + return simple.New("socketRestrictOperator", + simple.OnInit(func(gadgetCtx igoperators.GadgetContext) error { + if err := populateUint16Map(gadgetCtx, socketDeniedFamiliesMapName, families); err != nil { + return fmt.Errorf("populating %s: %w", socketDeniedFamiliesMapName, err) + } + if err := populateUint32Map(gadgetCtx, socketDeniedNetlinkProtocolsMapName, netlinkProtocols); err != nil { + return fmt.Errorf("populating %s: %w", socketDeniedNetlinkProtocolsMapName, err) + } + return nil + }), + ) +} + +func populateUint16Map(gadgetCtx igoperators.GadgetContext, name string, keys []uint16) error { + m, ok := lookupMap(gadgetCtx, name) + if !ok { + return nil + } + value := uint8(1) + for _, k := range keys { + key := k + if err := m.Put(key, value); err != nil { + return fmt.Errorf("inserting key %d: %w", k, err) + } + } + slog.Debug("Populated socket-restrict map", "name", name, "entries", len(keys)) + return nil +} + +func populateUint32Map(gadgetCtx igoperators.GadgetContext, name string, keys []uint32) error { + m, ok := lookupMap(gadgetCtx, name) + if !ok { + return nil + } + value := uint8(1) + for _, k := range keys { + key := k + if err := m.Put(key, value); err != nil { + return fmt.Errorf("inserting key %d: %w", k, err) + } + } + slog.Debug("Populated socket-restrict map", "name", name, "entries", len(keys)) + return nil +} + +func lookupMap(gadgetCtx igoperators.GadgetContext, name string) (*ebpf.Map, bool) { + v, ok := gadgetCtx.GetVar(name) + if !ok { + return nil, false + } + m, ok := v.(*ebpf.Map) + if !ok || m == nil { + return nil, false + } + return m, true +} diff --git a/internal/operators/socket_restrict_test.go 
b/internal/operators/socket_restrict_test.go new file mode 100644 index 0000000..aac0dfd --- /dev/null +++ b/internal/operators/socket_restrict_test.go @@ -0,0 +1,113 @@ +// Copyright The micromize authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package operators + +import ( + "reflect" + "sort" + "testing" +) + +func sortedU16(in []uint16) []uint16 { + out := append([]uint16(nil), in...) + sort.Slice(out, func(i, j int) bool { return out[i] < out[j] }) + return out +} + +func sortedU32(in []uint32) []uint32 { + out := append([]uint32(nil), in...) 
+ sort.Slice(out, func(i, j int) bool { return out[i] < out[j] }) + return out +} + +func TestParseSocketDenyFamilies(t *testing.T) { + tests := []struct { + name string + input string + want []uint16 + wantErr bool + }{ + {name: "empty", input: "", want: nil}, + {name: "whitespace only", input: " , ", want: nil}, + {name: "single name", input: "AF_ALG", want: []uint16{38}}, + {name: "case insensitive", input: "af_alg", want: []uint16{38}}, + {name: "decimal", input: "38", want: []uint16{38}}, + { + name: "mixed names and numbers", + input: " AF_ALG ,30, AF_VSOCK", + want: []uint16{30, 38, 40}, + }, + { + name: "duplicates collapsed", + input: "AF_ALG,38,AF_ALG", + want: []uint16{38}, + }, + {name: "unknown name", input: "AF_NOPE", wantErr: true}, + {name: "out of range", input: "1000000", wantErr: true}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := ParseSocketDenyFamilies(tt.input) + if (err != nil) != tt.wantErr { + t.Fatalf("ParseSocketDenyFamilies(%q) err=%v, wantErr=%v", + tt.input, err, tt.wantErr) + } + if tt.wantErr { + return + } + if !reflect.DeepEqual(sortedU16(got), sortedU16(tt.want)) { + t.Errorf("ParseSocketDenyFamilies(%q) = %v, want %v", + tt.input, got, tt.want) + } + }) + } +} + +func TestParseSocketDenyNetlinkProtocols(t *testing.T) { + tests := []struct { + name string + input string + want []uint32 + wantErr bool + }{ + {name: "empty default", input: "", want: nil}, + { + name: "all four common opt-ins", + input: "NETLINK_NETFILTER,NETLINK_XFRM,NETLINK_AUDIT,NETLINK_KOBJECT_UEVENT", + want: []uint32{6, 9, 12, 15}, + }, + {name: "case insensitive", input: "netlink_netfilter", want: []uint32{12}}, + {name: "decimal", input: "12", want: []uint32{12}}, + {name: "unknown", input: "NETLINK_NOPE", wantErr: true}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := ParseSocketDenyNetlinkProtocols(tt.input) + if (err != nil) != tt.wantErr { + 
t.Fatalf("ParseSocketDenyNetlinkProtocols(%q) err=%v, wantErr=%v", + tt.input, err, tt.wantErr) + } + if tt.wantErr { + return + } + if !reflect.DeepEqual(sortedU32(got), sortedU32(tt.want)) { + t.Errorf("ParseSocketDenyNetlinkProtocols(%q) = %v, want %v", + tt.input, got, tt.want) + } + }) + } +} diff --git a/tests/integration/cases/11_af_vsock_audit_mode.sh b/tests/integration/cases/11_af_vsock_audit_mode.sh new file mode 100755 index 0000000..f724e15 --- /dev/null +++ b/tests/integration/cases/11_af_vsock_audit_mode.sh @@ -0,0 +1,59 @@ +#!/usr/bin/env bash +# Test: AF_VSOCK socket-restrict in audit mode +# +# AF_VSOCK is intentionally NOT in the default socket-restrict deny-list — it +# can be used by firecracker / kata-containers / hypervisor agents. This test +# opts in AF_VSOCK via --socket-deny-families and runs micromize in audit +# mode (--enforce=false), so the socket call is *allowed* but the gadget +# emits an event. The probe asserts the socket() was not blocked; the harness +# is expected to additionally verify a "Socket family denied (create)" event +# was logged by micromize (this is harness-side and not part of the probe). +# +# Expected harness configuration: +# MICROMIZE_SOCKET_DENY_FAMILIES=AF_VSOCK +# MICROMIZE_ENFORCE=false +# These are read by the harness when launching micromize before this case +# executes, mirroring the convention used by the existing AF_ALG cases (which +# rely on the harness to have micromize already running in enforce mode). + +test_af_vsock_audit_mode() { + begin_test "AF_VSOCK socket allowed in audit mode while opted into deny-list" + + if ! command -v go &>/dev/null; then + fail_test "go is required to build the AF_VSOCK probe" + return + fi + + local probe_bin="${ROOTFS_DIR}/bin/af-vsock-probe" + if ! 
(cd "$REPO_ROOT" && CGO_ENABLED=0 GOOS=linux GOARCH="$ARCH" go build -o "$probe_bin" ./tests/integration/probes/af_vsock); then + fail_test "failed to build AF_VSOCK probe" + return + fi + + local bundle="${TEST_TMPDIR}/bundle-af-vsock" + local cid="micromize-test-af-vsock" + + create_bundle "$bundle" "$ROOTFS_DIR" /bin/af-vsock-probe + + local output + output=$(runc run "$cid" -b "$bundle" 2>&1) + local rc=$? + + if [[ $rc -ne 0 ]]; then + fail_test "AF_VSOCK probe exited with ${rc}: ${output}" + runc delete -f "$cid" 2>/dev/null || true + return + fi + + # In audit mode the socket call must succeed (audit only emits an event). + # Accept "skipped" for kernels without AF_VSOCK so the case is portable. + if echo "$output" | grep -qE "^(ok|skipped):"; then + pass_test + else + fail_test "Expected AF_VSOCK socket to be allowed in audit mode, got: ${output}" + fi + + runc delete -f "$cid" 2>/dev/null || true +} + +test_af_vsock_audit_mode diff --git a/tests/integration/probes/af_vsock/main.go b/tests/integration/probes/af_vsock/main.go new file mode 100644 index 0000000..38cbf5a --- /dev/null +++ b/tests/integration/probes/af_vsock/main.go @@ -0,0 +1,76 @@ +// Copyright The micromize authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "fmt" + "os" + "syscall" +) + +const ( + afVSOCK = 40 + sockStream = 1 + sockSeqpacket = 5 +) + +// AF_VSOCK probe. 
Exits 0 in both audit and enforce modes so the harness can +// drive the audit-vs-enforce distinction externally (by toggling micromize's +// --enforce flag) while still asserting on the probe's printed status. +// +// "ok: AF_VSOCK socket created (audit-mode or opt-out)" socket() returned an fd +// "blocked: AF_VSOCK socket creation denied: " socket() returned EPERM/EACCES +// "skipped: AF_VSOCK not supported on this kernel" socket() returned EAFNOSUPPORT/EPROTONOSUPPORT +// +// Exit codes: +// 0 ok / blocked / skipped (any expected outcome) +// 2 unexpected error +func main() { + fd, err := syscall.Socket(afVSOCK, sockStream, 0) + if err == nil { + syscall.Close(fd) //nolint:errcheck,gosec + fmt.Println("ok: AF_VSOCK socket created (audit-mode or opt-out)") + return + } + + switch err { + case syscall.EPERM, syscall.EACCES: + fmt.Printf("blocked: AF_VSOCK socket creation denied: %v\n", err) + return + case syscall.EAFNOSUPPORT, syscall.EPROTONOSUPPORT: + fmt.Printf("skipped: AF_VSOCK not supported on this kernel: %v\n", err) + return + } + + // SOCK_STREAM is the newer transport; some older kernels only support + // SOCK_SEQPACKET. Retry once before declaring the result unexpected. + fd, err = syscall.Socket(afVSOCK, sockSeqpacket, 0) + if err == nil { + syscall.Close(fd) //nolint:errcheck,gosec + fmt.Println("ok: AF_VSOCK socket created (audit-mode or opt-out)") + return + } + switch err { + case syscall.EPERM, syscall.EACCES: + fmt.Printf("blocked: AF_VSOCK socket creation denied: %v\n", err) + return + case syscall.EAFNOSUPPORT, syscall.EPROTONOSUPPORT: + fmt.Printf("skipped: AF_VSOCK not supported on this kernel: %v\n", err) + return + } + + fmt.Printf("socket-error: %v\n", err) + os.Exit(2) +}