Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
149 changes: 141 additions & 8 deletions pkg/registry/file/dynamicpathdetector/analyze_endpoints.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,28 +7,42 @@ import (
"strings"

mapset "github.com/deckarep/golang-set/v2"
"github.com/kubescape/go-logger"
loggerhelpers "github.com/kubescape/go-logger/helpers"
types "github.com/kubescape/storage/pkg/apis/softwarecomposition"
"github.com/kubescape/storage/pkg/apis/softwarecomposition/consts"
)

func isWildcardPort(port string) bool {
return port == "0"
}

func AnalyzeEndpoints(endpoints *[]types.HTTPEndpoint, analyzer *PathAnalyzer) []types.HTTPEndpoint {
if len(*endpoints) == 0 {
return nil
}

var newEndpoints []*types.HTTPEndpoint
// First pass: build the analyzer trie from each endpoint's true (port,
// path) tuple. Each port keys a separate sub-tree, so :0/foo and
// :443/foo are analyzed independently — :443/foo is NOT rewritten to
// :0/foo just because some unrelated endpoint also uses :0.
for _, endpoint := range *endpoints {
_, _ = AnalyzeURL(endpoint.Endpoint, analyzer)
}

// Second pass: process endpoints with their original ports.
var newEndpoints []*types.HTTPEndpoint
for _, endpoint := range *endpoints {
processedEndpoint, err := ProcessEndpoint(&endpoint, analyzer, newEndpoints)
ep := endpoint
processedEndpoint, err := ProcessEndpoint(&ep, analyzer, newEndpoints)
if processedEndpoint == nil && err == nil || err != nil {
continue
} else {
newEndpoints = append(newEndpoints, processedEndpoint)
}
newEndpoints = append(newEndpoints, processedEndpoint)
}

// Cross-port folding happens here: only same-(path, direction) siblings
// of an explicit :0 wildcard get absorbed into it.
newEndpoints = MergeDuplicateEndpoints(newEndpoints)

return convertPointerToValueSlice(newEndpoints)
Expand Down Expand Up @@ -88,6 +102,50 @@ func AnalyzeURL(urlString string, analyzer *PathAnalyzer) (string, error) {
return ":" + port + path, nil
}

// splitEndpointPortAndPath splits the canonical `:<port><path>` form
// produced by AnalyzeURL into its (port, path) parts.
//
// Defensive contract: AnalyzeURL guarantees a leading `:` and a port
// segment, but callers and tests sometimes pass bare paths (e.g.
// "/health") for ad-hoc lookups. To keep merge keys deterministic,
// this helper returns empty port + leading-slash-normalised path for
// any input that does not start with `:`. The empty string returns
// ("", "/") to match the original fall-through behavior.
func splitEndpointPortAndPath(endpoint string) (string, string) {
if !strings.HasPrefix(endpoint, ":") {
if endpoint == "" {
return "", "/"
}
if !strings.HasPrefix(endpoint, "/") {
endpoint = "/" + endpoint
}
return "", endpoint
}
s := endpoint[1:]
idx := strings.Index(s, "/")
if idx == -1 {
return s, "/"
}
return s[:idx], s[idx:]
}

// MergeDuplicateEndpoints folds duplicates and merges same-path specific-port
// endpoints into a wildcard-port (:0) sibling. Folding is symmetric and is
// keyed on the same triple HTTPEndpoint.Equal compares — (Endpoint,
// Direction, Internal). An Internal=false endpoint will therefore NOT merge
// with an Internal=true sibling even if their path and direction match.
//
// - If a specific-port endpoint is encountered AFTER its :0 sibling, the
// specific-port methods/headers are merged INTO the wildcard entry.
// - If a specific-port endpoint is encountered BEFORE its :0 sibling, it
// is initially recorded; when the wildcard arrives we sweep `seen` for
// same-(path, direction, Internal) specific-port siblings, fold them
// into the wildcard, and remove them from the output.
//
// This contract was tightened on the back of upstream review on
// kubescape/storage#316 — a single :0 entry must NOT cause unrelated
// concrete-port endpoints to be wildcarded; only same-path same-Internal
// siblings fold.
func MergeDuplicateEndpoints(endpoints []*types.HTTPEndpoint) []*types.HTTPEndpoint {
seen := make(map[string]*types.HTTPEndpoint)
var newEndpoints []*types.HTTPEndpoint
Expand All @@ -97,21 +155,91 @@ func MergeDuplicateEndpoints(endpoints []*types.HTTPEndpoint) []*types.HTTPEndpo
if existing, found := seen[key]; found {
existing.Methods = MergeStrings(existing.Methods, endpoint.Methods)
mergeHeaders(existing, endpoint)
} else {
continue
}

port, pathPart := splitEndpointPortAndPath(endpoint.Endpoint)

if isWildcardPort(port) {
// Wildcard arriving after specific-port siblings — sweep `seen`
// for any same-(path, direction, Internal) specific-port entries
// already recorded, fold them into the wildcard, then drop them
// from the output slice.
for k, e := range seen {
ePort, ePath := splitEndpointPortAndPath(e.Endpoint)
if isWildcardPort(ePort) || ePath != pathPart ||
e.Direction != endpoint.Direction || e.Internal != endpoint.Internal {
continue
}
endpoint.Methods = MergeStrings(endpoint.Methods, e.Methods)
mergeHeaders(endpoint, e)
delete(seen, k)
newEndpoints = removeEndpoint(newEndpoints, e)
}
seen[key] = endpoint
newEndpoints = append(newEndpoints, endpoint)
continue
}

// Specific port: if a wildcard sibling for the same
// (path, direction, Internal) is already in `seen`, fold this entry
// into it. The wildcardKey shape MUST match getEndpointKey exactly so
// the lookup hits the same map slot the wildcard was inserted under.
// CodeRabbit upstream PR #323 finding #5: the two formats are
// derived from the same helper to remove the duplication risk.
wildcardKey := buildEndpointKey(":0"+pathPart, endpoint.Direction, endpoint.Internal)
if existing, found := seen[wildcardKey]; found {
existing.Methods = MergeStrings(existing.Methods, endpoint.Methods)
mergeHeaders(existing, endpoint)
continue
}

seen[key] = endpoint
newEndpoints = append(newEndpoints, endpoint)
}

return newEndpoints
}

// removeEndpoint returns a new slice with the first occurrence of target
// removed (compared by pointer). Used by MergeDuplicateEndpoints when a
// previously-recorded specific-port entry is absorbed into a later wildcard.
//
// NOTE — in-place backing-array mutation: the `append(s[:i], s[i+1:]...)`
// pattern shifts elements left within the original backing array, leaving
// a stale pointer at s[len-1] outside the returned slice. The sole caller
// immediately replaces `newEndpoints` with the return value, so there is
// no live alias today. If a future refactor stores intermediate slice
// references, swap to a copy-based removal to avoid action-at-a-distance.
// CodeRabbit upstream PR #323 finding #9.
func removeEndpoint(s []*types.HTTPEndpoint, target *types.HTTPEndpoint) []*types.HTTPEndpoint {
for i, e := range s {
if e == target {
return append(s[:i], s[i+1:]...)
}
}
return s
}

// getEndpointKey returns a key that uniquely identifies an HTTPEndpoint by
// the same fields HTTPEndpoint.Equal compares: Endpoint, Direction, Internal.
// CodeRabbit upstream PR #323 finding #5: both this function and the
// wildcard-key construction in MergeDuplicateEndpoints route through
// buildEndpointKey so the format can't drift between the two callsites.
func getEndpointKey(endpoint *types.HTTPEndpoint) string {
return fmt.Sprintf("%s|%s", endpoint.Endpoint, endpoint.Direction)
return buildEndpointKey(endpoint.Endpoint, endpoint.Direction, endpoint.Internal)
}

// buildEndpointKey is the single source of truth for the endpoint-lookup
// key shape used by MergeDuplicateEndpoints. Inlining the fmt.Sprintf at
// two callsites is what allowed the two formats to drift in the past
// (e.g. one with %s|%s|%t and another with %s|%s|%v). Keep this helper
// the only producer of the key.
func buildEndpointKey(endpoint string, direction consts.NetworkDirection, internal bool) string {
return fmt.Sprintf("%s|%s|%t", endpoint, direction, internal)
}

func mergeHeaders(existing, new *types.HTTPEndpoint) {
// TODO: Find a better way to unmashal the headers
existingHeaders, err := existing.GetHeaders()
if err != nil {
return
Expand All @@ -133,7 +261,12 @@ func mergeHeaders(existing, new *types.HTTPEndpoint) {

rawJSON, err := json.Marshal(existingHeaders)
if err != nil {
fmt.Println("Error marshaling JSON:", err)
// Don't pollute stdout from a library function. The caller has
// no signal-back path here (mergeHeaders is a void helper) so
// log at Debug and bail — leaving Headers untouched is the
// safer choice than corrupting them with a partial marshal.
logger.L().Debug("mergeHeaders: failed to marshal merged headers, leaving existing untouched",
loggerhelpers.Error(err))
return
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
package dynamicpathdetector

import (
"testing"

"github.com/stretchr/testify/assert"
)

// TestSplitEndpointPortAndPath_DefensiveContract pins the inputs that
// AnalyzeURL is supposed to produce (`:<port><path>`) AND the defensive
// behavior for bare-path / empty / no-leading-slash inputs that may
// arrive via ad-hoc lookups or tests. Without the guard, "foo" was
// returned as ("foo", "/") — silently treating an opaque token as a
// port number. Flagged on upstream PR #316 by CodeRabbit (C7).
func TestSplitEndpointPortAndPath_DefensiveContract(t *testing.T) {
tests := []struct {
name string
input string
wantPort string
wantPath string
}{
// Canonical AnalyzeURL output.
{"empty", "", "", "/"},
{"port_only", ":80", "80", "/"},
{"port_with_root_path", ":80/", "80", "/"},
{"port_with_path", ":80/health", "80", "/health"},
{"wildcard_port", ":0", "0", "/"},
{"wildcard_port_with_path", ":0/api/users", "0", "/api/users"},
{"port_with_deep_path", ":443/v1/items/42", "443", "/v1/items/42"},

// Defensive — bare paths arriving without the `:` prefix.
{"bare_path", "/health", "", "/health"},
{"bare_root", "/", "", "/"},
{"bare_deep_path", "/v1/items/42", "", "/v1/items/42"},

// Defensive — opaque token without a leading slash. Previous
// behavior silently returned ("foo", "/") which would be
// indistinguishable from port="foo". The guard normalises this
// to ("", "/foo").
{"opaque_token", "foo", "", "/foo"},
{"opaque_with_dot", "host.example.com", "", "/host.example.com"},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
gotPort, gotPath := splitEndpointPortAndPath(tt.input)
assert.Equal(t, tt.wantPort, gotPort,
"splitEndpointPortAndPath(%q) port = %q, want %q",
tt.input, gotPort, tt.wantPort)
assert.Equal(t, tt.wantPath, gotPath,
"splitEndpointPortAndPath(%q) path = %q, want %q",
tt.input, gotPath, tt.wantPath)
})
}
}
94 changes: 92 additions & 2 deletions pkg/registry/file/dynamicpathdetector/analyze_opens.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,99 @@ func AnalyzeOpens(opens []types.OpenCalls, analyzer *PathAnalyzer, sbomSet mapse
}
}

return slices.SortedFunc(maps.Values(dynamicOpens), func(a, b types.OpenCalls) int {
result := slices.SortedFunc(maps.Values(dynamicOpens), func(a, b types.OpenCalls) int {
return strings.Compare(a.Path, b.Path)
}), nil
})

return consolidateOpens(result, sbomSet), nil
}

// consolidateOpens drops any literal Open whose path is already covered
// by a wildcard / dynamic-identifier sibling in the same result set, and
// merges the dropped entry's Flags into that sibling. This is a
// post-trie cleanup pass: AnalyzeOpens may emit both a collapsed pattern
// (e.g. /etc/⋯) AND the original literals (/etc/passwd) when only some
// children at that node hit threshold. Without this pass, downstream
// matchers see both forms and the literal acts as redundant noise.
//
// Two invariants:
//
// 1. Patterns are always preserved — if a path contains either
// WildcardIdentifier or DynamicIdentifier it counts as a pattern
// and is never absorbed.
// 2. SBOM-listed paths are always preserved — they are part of the
// image's manifest and must remain identifiable on their own,
// even if a wildcard pattern would otherwise cover them.
//
// Subsumption check uses the same CompareDynamic the runtime matcher
// (CEL `ap.was_opened`) uses, so both sides agree on what "covered"
// means at every depth.
//
// Complexity: O(n²) over the input slice — each literal walks the full
// pattern set to test coverage. For the profile sizes seen in practice
// (≤ a few thousand entries per container even on chatty workloads) the
// constant factor dominates and the quadratic shape is acceptable. If
// profiles ever balloon beyond ~10k entries, swap the inner scan to a
// precomputed prefix-trie lookup. CodeRabbit upstream PR #323 finding
// #6 (acknowledged, not yet optimised).
func consolidateOpens(opens []types.OpenCalls, sbomSet mapset.Set[string]) []types.OpenCalls {
if len(opens) <= 1 {
return opens
}

isPattern := make([]bool, len(opens))
// Sorted slice of pattern indices, longest-path-first. Order matters:
// when two patterns both cover the same literal, the more specific
// (longer) one wins so folded Flags land deterministically. With ties,
// the input's existing sort (alphabetical by Path) breaks them.
patternOrder := make([]int, 0, len(opens))
for i, o := range opens {
if strings.Contains(o.Path, WildcardIdentifier) || strings.Contains(o.Path, DynamicIdentifier) {
isPattern[i] = true
patternOrder = append(patternOrder, i)
}
}
if len(patternOrder) == 0 {
return opens
}
slices.SortFunc(patternOrder, func(a, b int) int {
if d := len(opens[b].Path) - len(opens[a].Path); d != 0 {
return d
}
return strings.Compare(opens[a].Path, opens[b].Path)
})

keep := make([]bool, len(opens))
for i := range opens {
keep[i] = true
}

for i, o := range opens {
if isPattern[i] {
continue // patterns always survive
}
if sbomSet != nil && sbomSet.ContainsOne(o.Path) {
continue // SBOM paths always survive
}
for _, pi := range patternOrder {
if CompareDynamic(opens[pi].Path, o.Path) {
// `o` is subsumed by the pattern at pi — fold its Flags
// into the pattern entry so all observed access modes
// remain represented.
opens[pi].Flags = mapset.Sorted(mapset.NewThreadUnsafeSet(slices.Concat(opens[pi].Flags, o.Flags)...))
keep[i] = false
break
}
}
}

out := make([]types.OpenCalls, 0, len(opens))
for i, o := range opens {
if keep[i] {
out = append(out, o)
}
}
return out
}

func AnalyzeOpen(path string, analyzer *PathAnalyzer) (string, error) {
Expand Down
Loading
Loading