Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 14 additions & 4 deletions plugin/action/hash/normalize/token_normalizer.go
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,17 @@ func (n *tokenNormalizer) normalizeByScanner(out []byte, scanner *lexmachine.Sca
prevEnd := 0
for tokRaw, err, eos := scanner.Next(); !eos; tokRaw, err, eos = scanner.Next() {
if ui, is := err.(*machines.UnconsumedInput); is {
scanner.TC = ui.FailTC // skip
// Jumping straight to FailTC may skip the start of the next token.
// Example input: part/offset = 54/5990
//   After matching 54, the lexer reports "/5" as unconsumed input.
//   FailTC points past "/5", so the 5 is skipped and the next scan starts at 990.
//   Normalization result before this change: part/offset = <int>/5990.
//
// Resuming at max(scanner.TC+1, ui.FailTC-1) instead:
//   FailTC-1 keeps the last byte of the unconsumed input, which may be the start of a token.
//   scanner.TC+1 guarantees forward progress, because FailTC-1 can be equal to scanner.TC.
//   Normalization result after this change: part/offset = <int>/<int>.
scanner.TC = max(scanner.TC+1, ui.FailTC-1)
continue
} else if err != nil {
out = out[:0]
Expand Down Expand Up @@ -484,7 +494,7 @@ var builtinTokenPatterns = []TokenPattern{
},
{
Placeholder: placeholderByPattern[pFilepath],
RE: `(/[a-zA-Z0-9-_.]+)+`,
RE: `(/[a-zA-Z-_.][a-zA-Z0-9-_.]*)+`,
mask: pFilepath,
},
{
Expand All @@ -511,10 +521,10 @@ var builtinTokenPatterns = []TokenPattern{
mask: pHash,
},
{
// RFC3339, RFC3339Nano, DateTime, DateOnly, TimeOnly, Go time with monotonic clock
// RFC3339, RFC3339Nano, DateTime, DateOnly, TimeOnly, Go time with optional monotonic clock
Placeholder: placeholderByPattern[pDatetime],
RE: fmt.Sprintf(`(%s)|(%s)|(%s)|(%s)`,
`\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d\.\d+ [+\-]\d\d\d\d [A-Z]+ m=[+\-]\d+\.\d+`,
`\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d\.\d+ [+\-]\d\d\d\d [A-Z]+( m=[+\-]\d+\.\d+)?`,
`\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d(\.\d+)?(Z|[\+\-]\d\d:\d\d)`,
`\d\d:\d\d:\d\d`,
`\d\d\d\d-\d\d-\d\d( \d\d:\d\d:\d\d)?`,
Expand Down
34 changes: 19 additions & 15 deletions plugin/action/hash/normalize/token_normalizer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,8 @@ func TestTokenNormalizerBuiltin(t *testing.T) {
"some 2025-01-13 20:58:04.019973588 +0000 UTC m=+1417512.275697914 here",
"some 2025-01-13 20:58:04.019973588 -0700 MST m=-123.456789012 here",
"some 2025-01-13 20:58:04.019973588 +0300 MSK m=+0.123456789 here",
"some 2025-01-13 20:58:04.019973588 -0700 MST here",
"some 2025-01-13 20:58:04.019973588 +0300 MSK here",
"some 2025-01-13T10:20:40Z here",
"some 2025-01-13T10:20:40.999999999Z here",
"some 2025-01-13T10:20:40-06:00 here",
Expand Down Expand Up @@ -470,9 +472,9 @@ func TestTokenNormalizerCustom(t *testing.T) {
},
},
inputs: []string{
`2006/01/02 15:04:05 error occurred, client: 10.125.172.251, upstream: "http://10.117.246.15:84/download", host: "mpm-youtube-downloader-38.name.com:84"`,
`2006/01/02 15:04:05 error occurred, client: 10.125.172.251, upstream: "http://10.117.246.15:84/download", host: "mpm-youtube-downloader-38.name.com:84", part/offset: 10117/2461584`,
},
want: "<nginx_datetime> error occurred, client: <ip>, upstream: <double_quoted>, host: <double_quoted>",
want: "<nginx_datetime> error occurred, client: <ip>, upstream: <double_quoted>, host: <double_quoted>, part/offset: <int>/<int>",
},
{
name: "empty_patterns",
Expand Down Expand Up @@ -515,19 +517,21 @@ func TestTokenNormalizerCustom(t *testing.T) {

func genBenchInput(count int) []byte {
var examples = []string{
"s1mple falsehood", // no match
"test@host1.host2.com", // email
"http://some.host.com/page1?a=1", // url
"hello-world-123.COM", // host
"7c1811ed-e98f-4c9c-a9f9-58c757ff494f", // uuid
"a94a8fe5ccb19ba61c4c0873d391e987982fbbd3", // sha1
"098f6bcd4621d373cade4e832627b4f6", // md5
"2025-01-13T10:20:40Z", // datetime
"1.2.3.4", // ip
"-1.2m5s", // duration
"0x13eb85e69dfbc0758b12acdaae36287d", // hex
"-4.56", // float
"123", // int
"48757ec9f04efe7faacec8722f3476339b125a6b6172b8a69ff3aa329e0bd0ff", // hash(sha256)
"a94a8fe5ccb19ba61c4c0873d391e987982fbbd3", // hash(sha1)
"098f6bcd4621d373cade4e832627b4f6", // hash(md5)
"s1mple falsehood", // no match
"test@host1.host2.com", // email
"http://some.host.com/page1?a=1", // url
"hello-world-123.COM", // host
"7c1811ed-e98f-4c9c-a9f9-58c757ff494f", // uuid
"/home/user/photos", // filepath
"2025-01-13T10:20:40Z", // datetime
"1.2.3.4", // ip
"-1.2m5s", // duration
"0x13eb85e69dfbc0758b12acdaae36287d", // hex
"-4.56", // float
"123", // int
"truE faLse",
}

Expand Down
Loading