From db8d37da78c4ab86db810b6d90f42482c4b82ffe Mon Sep 17 00:00:00 2001 From: LoukasPap Date: Sun, 22 Mar 2026 16:29:02 +0200 Subject: [PATCH 1/8] Add validation for extended and basic regex brace quantifiers --- src/sed/compiler.rs | 45 ++- src/sed/delimited_parser.rs | 293 +++++++++++++++++- tests/by-util/test_sed.rs | 117 +++++++ .../fixtures/sed/input/regex-quantifiers.txt | 5 + tests/fixtures/sed/input/regex_quantifiers | 4 + .../fixtures/sed/output/bre_quantifier_comma | 5 + .../sed/output/bre_quantifier_minimum_m | 1 + .../output/bre_quantifier_only_closing_brace | 1 + .../sed/output/ere_quantifier_comma_n | 5 + .../sed/output/ere_quantifier_exactly_m | 2 + .../fixtures/sed/output/ere_quantifier_m_to_n | 1 + .../sed/output/ere_quantifier_minimum_m | 4 + 12 files changed, 453 insertions(+), 30 deletions(-) create mode 100644 tests/fixtures/sed/input/regex-quantifiers.txt create mode 100644 tests/fixtures/sed/input/regex_quantifiers create mode 100644 tests/fixtures/sed/output/bre_quantifier_comma create mode 100644 tests/fixtures/sed/output/bre_quantifier_minimum_m create mode 100644 tests/fixtures/sed/output/bre_quantifier_only_closing_brace create mode 100644 tests/fixtures/sed/output/ere_quantifier_comma_n create mode 100644 tests/fixtures/sed/output/ere_quantifier_exactly_m create mode 100644 tests/fixtures/sed/output/ere_quantifier_m_to_n create mode 100644 tests/fixtures/sed/output/ere_quantifier_minimum_m diff --git a/src/sed/compiler.rs b/src/sed/compiler.rs index 82319761..bbbec6da 100644 --- a/src/sed/compiler.rs +++ b/src/sed/compiler.rs @@ -286,7 +286,6 @@ fn compile_sequence( let n_addr = compile_address_range(lines, line, &mut cmd, context)?; line.eat_spaces(); let mut cmd_spec = get_verified_cmd_spec(lines, line, n_addr, context.posix)?; - // Compile the command according to its specification. 
let mut cmd_mut = cmd.borrow_mut(); cmd_mut.code = line.current(); @@ -331,10 +330,8 @@ fn compile_address_range( let mut is_line0 = false; line.eat_spaces(); - if !line.eol() - && is_address_char(line.current()) - && let Ok(addr1) = compile_address(lines, line, context) - { + if !line.eol() && is_address_char(line.current()) { + let addr1 = compile_address(lines, line, context)?; is_line0 = matches!(addr1, Address::Line(0)); cmd.addr1 = Some(addr1); if is_line0 && context.posix { @@ -364,9 +361,8 @@ fn compile_address_range( } // Look for second address. - if !line.eol() - && let Ok(addr2) = compile_address(lines, line, context) - { + if !line.eol() { + let addr2 = compile_address(lines, line, context)?; // Set step_n to the number specified in the (required numeric) address. let step_n = if is_step_match || is_step_end { match addr2 { @@ -449,7 +445,7 @@ fn compile_address( // The next character is an arbitrary delimiter line.advance(); } - let re = parse_regex(lines, line)?; + let re = parse_regex(lines, line, context.regex_extended)?; // Skip over delimiter line.advance(); @@ -554,6 +550,14 @@ fn bre_to_ere(pattern: &str) -> String { chars.next(); result.push(')'); // Group end } + Some('{') => { + chars.next(); + result.push('{'); // Group end + } + Some('}') => { + chars.next(); + result.push('}'); // Group end + } Some(v) if v.is_ascii_digit() => { // Back-reference. In sed BREs these are single-digit // (\1-\9) whereas fancy_regex supports multi-digit @@ -624,7 +628,7 @@ fn compile_regex( // Convert basic to extended regular expression if needed. let pattern = if context.regex_extended { - pattern + &pattern.replace("{,}", "*") } else { &bre_to_ere(pattern) }; @@ -633,7 +637,7 @@ fn compile_regex( let pattern = if icase { format!("(?i){pattern}") } else { - pattern.to_string() + pattern.clone() }; // Compile into engine. 
@@ -776,8 +780,7 @@ fn compile_subst_command( ); } - let pattern = parse_regex(lines, line)?; - + let pattern = parse_regex(lines, line, context.regex_extended)?; let mut subst = Box::new(Substitution::default()); subst.replacement = compile_replacement(lines, line)?; @@ -807,7 +810,6 @@ fn compile_subst_command( ), ); } - cmd.data = CommandData::Substitution(subst); parse_command_ending(lines, line, cmd)?; @@ -1559,6 +1561,21 @@ mod tests { assert!(!regex.is_match(&mut IOChunk::new_from_str("ABC")).unwrap()); } + #[test] + fn test_compile_re_extended() { + let (lines, chars) = make_providers("acaa\nbbb\nccc"); + let mut ctx = ctx(); + ctx.regex_extended = true; + let regex = compile_regex(&lines, &chars, "cc{,}", &ctx, false) + .unwrap() + .expect("regex should be present"); + assert!( + regex + .is_match(&mut IOChunk::new_from_str("acaa\nccc")) + .unwrap() + ); + } + #[test] fn test_compile_re_case_insensitive() { let (lines, chars) = dummy_providers(); diff --git a/src/sed/delimited_parser.rs b/src/sed/delimited_parser.rs index 06d73aea..cf03a7c8 100644 --- a/src/sed/delimited_parser.rs +++ b/src/sed/delimited_parser.rs @@ -312,11 +312,15 @@ fn scan_delimiter(lines: &ScriptLineProvider, line: &mut ScriptCharProvider) -> /// Parse the regular expression delimited by the current line /// character and return it as a string. -/// On return the line is on the closing delimiter. -pub fn parse_regex(lines: &ScriptLineProvider, line: &mut ScriptCharProvider) -> UResult { +/// On return, the line is on the closing delimiter. +/// If extended_mode is false, quantifiers like {m,n} are treated as literals. 
+pub fn parse_regex( + lines: &ScriptLineProvider, + line: &mut ScriptCharProvider, + extended_mode: bool, +) -> UResult { let delimiter = scan_delimiter(lines, line)?; let mut result = String::new(); - while !line.eol() { match line.current() { '[' if delimiter != '[' => { @@ -335,6 +339,20 @@ pub fn parse_regex(lines: &ScriptLineProvider, line: &mut ScriptCharProvider) -> line.advance(); continue; } + if line.current() == '{' && !extended_mode { + validate_quantifier_structure(lines, line, delimiter, true)?; + let quantifier = validate_quantifier_numbers(lines, line)?; + result.push('\\'); + result.push('{'); + result.push_str(&quantifier); + continue; + } + if line.current() == '}' { + result.push('\\'); + result.push('}'); + line.advance(); + continue; + } if let Some(decoded) = parse_char_escape(line) { result.push(decoded); } else { @@ -345,6 +363,19 @@ pub fn parse_regex(lines: &ScriptLineProvider, line: &mut ScriptCharProvider) -> } continue; } + '{' if extended_mode => { + validate_quantifier_structure(lines, line, delimiter, false)?; + let quantifier = validate_quantifier_numbers(lines, line)?; + result.push('{'); + result.push_str(&quantifier); + continue; + } + '}' => { + result.push('}'); + line.advance(); + continue; + } + c if c == delimiter => return Ok(result), c => result.push(c), } @@ -353,6 +384,170 @@ pub fn parse_regex(lines: &ScriptLineProvider, line: &mut ScriptCharProvider) -> compilation_error(lines, line, "unterminated regular expression") } +// Check for closing brace and the structure/content. 
+fn validate_quantifier_structure( + lines: &ScriptLineProvider, + line: &mut ScriptCharProvider, + delimiter: char, + is_bre: bool, +) -> UResult { + let invalid_content_error_msg = "Invalid content of \\{\\}"; + let mut advances = 0; + let mut found_closing_brace = false; + let mut seen_comma = false; + let mut invalid_content_detected = false; + + line.advance(); + + while !line.eol() && line.current() != delimiter { + if is_bre { + // In BRE mode, look for \} + if line.current() == '\\' { + line.advance(); + advances += 1; + if !line.eol() && line.current() == '}' { + // Empty quantifier {} is not valid + if advances == 1 { + invalid_content_detected = true; + } + found_closing_brace = true; + break; + } else { + invalid_content_detected = true; + } + } else { + // Only digits and comma allowed + if line.current() == ',' { + if seen_comma { + invalid_content_detected = true; + } + seen_comma = true; + } else if !line.current().is_ascii_digit() { + invalid_content_detected = true; + } + line.advance(); + advances += 1; + } + } else { + // In ERE mode, look for } + if line.current() == '}' { + if advances == 0 { + invalid_content_detected = true; + } + found_closing_brace = true; + break; + } else { + if line.current() == ',' { + if seen_comma { + invalid_content_detected = true; + } + seen_comma = true; + } else if !line.current().is_ascii_digit() { + invalid_content_detected = true; + } + line.advance(); + advances += 1; + } + } + } + + if !found_closing_brace { + return compilation_error(lines, line, "Unmatched \\{"); + } + + if invalid_content_detected { + return compilation_error(lines, line, invalid_content_error_msg); + } + + line.retreat(advances + 1); + Ok(advances) +} + +// Performs validations on m and/or n values of the quantifier +// and returns the valid content as a string (without braces). 
+fn validate_quantifier_numbers( + lines: &ScriptLineProvider, + line: &mut ScriptCharProvider, +) -> UResult { + line.advance(); + + // Handle {,} (zero or more) special case + if line.current() == ',' { + line.advance(); + if line.current() == '}' { + return Ok(",".to_string()); + } + + // Continue to parse n value + let mut result = String::new(); + result.push('0'); + result.push(','); + while line.current() != '}' && line.current() != '\\' { + result.push(line.current()); + line.advance(); + } + return Ok(result); + } + // Parse m value + let mut m = String::new(); + while line.current() != ',' && line.current() != '}' && line.current() != '\\' { + m.push(line.current()); + line.advance(); + } + let m_val: u32 = match m.parse() { + Ok(val) => { + if val > 255 { + return compilation_error(lines, line, "Regular expression too big"); + } + val + } + //never happens due to previous validation, but needed to satisfy the type checker + Err(_) => return compilation_error(lines, line, "Invalid content of \\{\\}"), + }; + + // Parse n if comma is present + let mut n = String::new(); + let has_comma = line.current() == ','; + if has_comma { + line.advance(); + while line.current() != '}' && line.current() != '\\' { + n.push(line.current()); + line.advance(); + } + } + let n_val: Option = if n.is_empty() { + None + } else { + match n.parse::() { + Ok(val) => { + if val > 255 { + return compilation_error(lines, line, "Regular expression too big"); + } + Some(val) + } + Err(_) => return compilation_error(lines, line, "Invalid content of \\{\\}"), + } + }; + + // Validate m <= n if both present + if let Some(n_val) = n_val + && m_val > n_val + { + return compilation_error(lines, line, "Invalid content of \\{\\}"); + } + + // Valid quantifier content (without braces) + let mut result = m.clone(); + if has_comma { + result.push(','); + if !n.is_empty() { + result.push_str(&n); + } + } + // line.advance(); + Ok(result) +} + /// Parse the transliteration string delimited by the 
current line /// character and return it as a string. /// On return the line is on the closing delimiter. @@ -756,7 +951,7 @@ mod tests { #[test] fn test_simple_regex() { let (lines, mut line) = make_providers("/abc/"); - let parsed = parse_regex(&lines, &mut line).unwrap(); + let parsed = parse_regex(&lines, &mut line, false).unwrap(); assert_eq!(parsed, "abc"); assert_eq!(line.current(), '/'); } @@ -764,7 +959,7 @@ mod tests { #[test] fn test_regex_with_escaped_delimiter() { let (lines, mut line) = make_providers("/ab\\/c/"); - let parsed = parse_regex(&lines, &mut line).unwrap(); + let parsed = parse_regex(&lines, &mut line, false).unwrap(); assert_eq!(parsed, "ab/c"); assert_eq!(line.current(), '/'); } @@ -772,7 +967,7 @@ mod tests { #[test] fn test_regex_with_capture() { let (lines, mut line) = make_providers(r"/\(.\)/c/"); - let parsed = parse_regex(&lines, &mut line).unwrap(); + let parsed = parse_regex(&lines, &mut line, false).unwrap(); assert_eq!(parsed, r"\(.\)"); assert_eq!(line.current(), '/'); } @@ -780,29 +975,95 @@ mod tests { #[test] fn test_regex_with_escape_sequence() { let (lines, mut line) = make_providers("/ab\\n/"); - let parsed = parse_regex(&lines, &mut line).unwrap(); + let parsed = parse_regex(&lines, &mut line, false).unwrap(); assert_eq!(parsed, "ab\n"); assert_eq!(line.current(), '/'); } + #[test] + fn test_extended_regex_quantifier_with_ere() { + let (lines, mut line) = make_providers("/a{2,3}/p"); + let parsed = parse_regex(&lines, &mut line, true).unwrap(); + assert_eq!(parsed, "a{2,3}"); + assert_eq!(line.current(), '/'); + } + + #[test] + fn test_extended_regex_with_zero_or_more() { + let (lines, mut line) = make_providers("/a{,}/p"); + let parsed = parse_regex(&lines, &mut line, true).unwrap(); + assert_eq!(parsed, "a{,}"); + assert_eq!(line.current(), '/'); + } + + #[test] + fn test_extended_regex_literal() { + let (lines, mut line) = make_providers("/a{,5}/p"); + let parsed = parse_regex(&lines, &mut line, true).unwrap(); + 
assert_eq!(parsed, "a{0,5}"); + assert_eq!(line.current(), '/'); + } + + #[test] + fn test_extended_regex_with_unmatched_brace_quantifier() { + let (lines, mut line) = make_providers("/a{2,3/p"); + let err = parse_regex(&lines, &mut line, true).unwrap_err(); + assert!(err.to_string().contains("Unmatched \\{")); + } + + #[test] + fn test_extended_regex_with_empty_quantifier() { + let (lines, mut line) = make_providers("/a{}/p"); + let err = parse_regex(&lines, &mut line, true).unwrap_err(); + assert!(err.to_string().contains("Invalid content of \\{\\}")); + } + + #[test] + fn test_extended_regex_with_whitespace_quantifier() { + let (lines, mut line) = make_providers("/a{}/p"); + let err = parse_regex(&lines, &mut line, true).unwrap_err(); + assert!(err.to_string().contains("Invalid content of \\{\\}")); + } + + #[test] + fn test_extended_regex_with_invalid_m() { + let (lines, mut line) = make_providers("/a{2d,3}/p"); + let err = parse_regex(&lines, &mut line, true).unwrap_err(); + assert!(err.to_string().contains("Invalid content of \\{\\}")); + } + + #[test] + fn test_extended_regex_with_invalid_n() { + let (lines, mut line) = make_providers("/a{2,-3}/p"); + let err = parse_regex(&lines, &mut line, true).unwrap_err(); + assert!(err.to_string().contains("Invalid content of \\{\\}")); + } + + #[test] + fn test_extended_regex_with_m_gt_n() { + let (lines, mut line) = make_providers("/a{3,2}/p"); + let err = parse_regex(&lines, &mut line, true).unwrap_err(); + assert!(err.to_string().contains("Invalid content of \\{\\}")); + } + #[test] fn errors_on_unterminated_regex() { let (lines, mut line) = make_providers("/unterminated"); - let err = parse_regex(&lines, &mut line).unwrap_err(); + let err = parse_regex(&lines, &mut line, false).unwrap_err(); assert!(err.to_string().contains("unterminated regular expression")); } #[test] fn errors_on_esc_at_re_eol() { let (lines, mut line) = make_providers("/foo\\"); - let err = parse_regex(&lines, &mut line).unwrap_err(); + let 
err = parse_regex(&lines, &mut line, false).unwrap_err(); assert!(err.to_string().contains("unterminated regular expression")); } #[test] fn errors_on_backslash_delimiter() { let (lines, mut line) = make_providers("\\bad"); - let err = parse_regex(&lines, &mut line).unwrap_err(); + let err = parse_regex(&lines, &mut line, false).unwrap_err(); assert!( err.to_string() .contains("\\ cannot be used as a string delimiter") @@ -812,7 +1073,7 @@ mod tests { #[test] fn test_regex_with_character_class() { let (lines, mut line) = make_providers("/[a-z]/"); - let parsed = parse_regex(&lines, &mut line).unwrap(); + let parsed = parse_regex(&lines, &mut line, false).unwrap(); assert_eq!(parsed, "[a-z]"); assert_eq!(line.current(), '/'); } @@ -820,7 +1081,7 @@ mod tests { #[test] fn test_regex_with_bracket_delimiter() { let (lines, mut line) = make_providers("[abc["); - let parsed = parse_regex(&lines, &mut line).unwrap(); + let parsed = parse_regex(&lines, &mut line, false).unwrap(); assert_eq!(parsed, "abc"); assert_eq!(line.current(), '['); } @@ -828,7 +1089,7 @@ mod tests { #[test] fn test_bracket_regex_with_bracket_delimiter() { let (lines, mut line) = make_providers("[a\\[0-9]bc["); - let parsed = parse_regex(&lines, &mut line).unwrap(); + let parsed = parse_regex(&lines, &mut line, false).unwrap(); assert_eq!(parsed, "a[0-9]bc"); assert_eq!(line.current(), '['); } @@ -836,7 +1097,7 @@ mod tests { #[test] fn test_regex_with_escaped_bracket_in_character_class() { let (lines, mut line) = make_providers("/[a\\]z]/"); - let parsed = parse_regex(&lines, &mut line).unwrap(); + let parsed = parse_regex(&lines, &mut line, false).unwrap(); assert_eq!(parsed, "[a\\]z]"); assert_eq!(line.current(), '/'); } @@ -844,7 +1105,7 @@ mod tests { #[test] fn test_regex_with_delimiter_inside_character_class() { let (lines, mut line) = make_providers("/[a/c]/"); - let parsed = parse_regex(&lines, &mut line).unwrap(); + let parsed = parse_regex(&lines, &mut line, false).unwrap(); 
assert_eq!(parsed, "[a/c]"); assert_eq!(line.current(), '/'); } @@ -852,7 +1113,7 @@ mod tests { #[test] fn test_regex_with_escaped_paren_and_backslash() { let (lines, mut line) = make_providers("/\\(\\\\/"); - let parsed = parse_regex(&lines, &mut line).unwrap(); + let parsed = parse_regex(&lines, &mut line, false).unwrap(); assert_eq!(parsed, "\\(\\\\"); assert_eq!(line.current(), '/'); } diff --git a/tests/by-util/test_sed.rs b/tests/by-util/test_sed.rs index b0baf479..5710d039 100644 --- a/tests/by-util/test_sed.rs +++ b/tests/by-util/test_sed.rs @@ -196,6 +196,7 @@ macro_rules! check_output_posix { // Input files const LINES1: &str = "input/lines1"; const LINES2: &str = "input/lines2"; +const REGEX_QUANTIFIERS: &str = "input/regex-quantifiers.txt"; const NO_NEW_LINE: &str = "input/no-new-line.txt"; //////////////////////////////////////////////////////////// @@ -275,6 +276,122 @@ check_output!(addr_range_step_zero, ["-n", "10~0p", LINES1]); check_output!(addr_range_end_multiple, ["-n", "/l1_2/,~10p", LINES1]); //////////////////////////////////////////////////////////// + +// Quantifiers: {m,n} +// m and n are considered to be the first and second numbers in the interval, respectively. 
+check_output!( + ere_quantifier_exactly_m, + ["-n", "-E", "-e", "/l{2}/p", REGEX_QUANTIFIERS] +); +check_output!( + ere_quantifier_minimum_m, + ["-n", "-E", "-e", "/l{1,}/p", REGEX_QUANTIFIERS] +); +check_output!( + ere_quantifier_m_to_n, + ["-n", "-E", "-e", "/l{3,4}/p", REGEX_QUANTIFIERS] +); +check_output!( + ere_quantifier_comma_n, + ["-n", "-E", "-e", "/l{,4}/p", REGEX_QUANTIFIERS] +); + +check_output!( + bre_quantifier_minimum_m, + ["-n", "-e", "/l\\{3,\\}/p", REGEX_QUANTIFIERS] +); + +check_output!( + bre_quantifier_comma, + ["-n", "-e", "/l\\{,\\}/p", REGEX_QUANTIFIERS] +); + +check_output!( + bre_quantifier_only_closing_brace, + ["-n", "-e", "/l\\}/p", REGEX_QUANTIFIERS] +); + +#[test] +fn test_ere_quantifier_n_gt_m() { + new_ucmd!() + .args(&["-E", "-e", "/l{3,2}/p", REGEX_QUANTIFIERS]) + .fails() + .code_is(1) + .stderr_contains("Invalid content of \\{\\}"); +} + +#[test] +fn test_ere_quantifier_negative_m() { + new_ucmd!() + .args(&["-E", "-e", "/l{-2,4}/p", REGEX_QUANTIFIERS]) + .fails() + .code_is(1) + .stderr_contains("Invalid content of \\{\\}"); +} + +#[test] +fn test_ere_quantifier_invalid_m() { + new_ucmd!() + .args(&["-E", "-e", "/l{d,}/p", REGEX_QUANTIFIERS]) + .fails() + .code_is(1) + .stderr_contains("Invalid content of \\{\\}"); +} + +#[test] +fn test_ere_quantifier_m_too_big() { + new_ucmd!() + .args(&["-E", "-e", "/l{300,}/p", REGEX_QUANTIFIERS]) + .fails() + .code_is(1) + .stderr_contains("Regular expression too big"); +} + +#[test] +fn test_ere_quantifier_empty() { + new_ucmd!() + .args(&["-E", "-e", "/l{}/p", REGEX_QUANTIFIERS]) + .fails() + .code_is(1) + .stderr_contains("Invalid content of \\{\\}"); +} + +#[test] +fn test_ere_quantifier_whitespace() { + new_ucmd!() + .args(&["-E", "-e", "/l{ }/p", REGEX_QUANTIFIERS]) + .fails() + .code_is(1) + .stderr_contains("Invalid content of \\{\\}"); +} + +#[test] +fn test_ere_quantifier_unmatched_brace() { + new_ucmd!() + .args(&["-E", "-e", "/l{,/p", REGEX_QUANTIFIERS]) + .fails() + 
.code_is(1) + .stderr_contains("Unmatched \\{"); +} + +#[test] +fn test_ere_quantifier_unmatched_brace_2() { + new_ucmd!() + .args(&["-E", "-e", "/l{m,n/p", REGEX_QUANTIFIERS]) + .fails() + .code_is(1) + .stderr_contains("Unmatched \\{"); +} + +#[test] +fn test_bre_quantifier_unmatched_brace() { + new_ucmd!() + .args(&["-e", "/l\\{1,2}/p", REGEX_QUANTIFIERS]) + .fails() + .code_is(1) + .stderr_contains("Unmatched \\{"); +} + // Substitution: s check_output!(subst_any, ["-e", r"s/./X/g", LINES1]); check_output!(subst_any_global, ["-e", r"s,.,X,g", LINES1]); diff --git a/tests/fixtures/sed/input/regex-quantifiers.txt b/tests/fixtures/sed/input/regex-quantifiers.txt new file mode 100644 index 00000000..4d77d3fc --- /dev/null +++ b/tests/fixtures/sed/input/regex-quantifiers.txt @@ -0,0 +1,5 @@ +Hello World +Helo World +Helllllo World +Heo Word +Heo Worl}d diff --git a/tests/fixtures/sed/input/regex_quantifiers b/tests/fixtures/sed/input/regex_quantifiers new file mode 100644 index 00000000..e90a59e0 --- /dev/null +++ b/tests/fixtures/sed/input/regex_quantifiers @@ -0,0 +1,4 @@ +Hello World +Helo World +Helllllo World +Heo Word diff --git a/tests/fixtures/sed/output/bre_quantifier_comma b/tests/fixtures/sed/output/bre_quantifier_comma new file mode 100644 index 00000000..4d77d3fc --- /dev/null +++ b/tests/fixtures/sed/output/bre_quantifier_comma @@ -0,0 +1,5 @@ +Hello World +Helo World +Helllllo World +Heo Word +Heo Worl}d diff --git a/tests/fixtures/sed/output/bre_quantifier_minimum_m b/tests/fixtures/sed/output/bre_quantifier_minimum_m new file mode 100644 index 00000000..f4982934 --- /dev/null +++ b/tests/fixtures/sed/output/bre_quantifier_minimum_m @@ -0,0 +1 @@ +Helllllo World diff --git a/tests/fixtures/sed/output/bre_quantifier_only_closing_brace b/tests/fixtures/sed/output/bre_quantifier_only_closing_brace new file mode 100644 index 00000000..5164ebcc --- /dev/null +++ b/tests/fixtures/sed/output/bre_quantifier_only_closing_brace @@ -0,0 +1 @@ +Heo Worl}d diff 
--git a/tests/fixtures/sed/output/ere_quantifier_comma_n b/tests/fixtures/sed/output/ere_quantifier_comma_n new file mode 100644 index 00000000..4d77d3fc --- /dev/null +++ b/tests/fixtures/sed/output/ere_quantifier_comma_n @@ -0,0 +1,5 @@ +Hello World +Helo World +Helllllo World +Heo Word +Heo Worl}d diff --git a/tests/fixtures/sed/output/ere_quantifier_exactly_m b/tests/fixtures/sed/output/ere_quantifier_exactly_m new file mode 100644 index 00000000..ac78241d --- /dev/null +++ b/tests/fixtures/sed/output/ere_quantifier_exactly_m @@ -0,0 +1,2 @@ +Hello World +Helllllo World diff --git a/tests/fixtures/sed/output/ere_quantifier_m_to_n b/tests/fixtures/sed/output/ere_quantifier_m_to_n new file mode 100644 index 00000000..f4982934 --- /dev/null +++ b/tests/fixtures/sed/output/ere_quantifier_m_to_n @@ -0,0 +1 @@ +Helllllo World diff --git a/tests/fixtures/sed/output/ere_quantifier_minimum_m b/tests/fixtures/sed/output/ere_quantifier_minimum_m new file mode 100644 index 00000000..10e03522 --- /dev/null +++ b/tests/fixtures/sed/output/ere_quantifier_minimum_m @@ -0,0 +1,4 @@ +Hello World +Helo World +Helllllo World +Heo Worl}d From f49863fdebafc8676b60b41b933bc5ab5ed730a8 Mon Sep 17 00:00:00 2001 From: LoukasPap Date: Tue, 31 Mar 2026 15:33:29 +0300 Subject: [PATCH 2/8] Add unit test for brace quantifier bre_to_ere --- src/sed/compiler.rs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/sed/compiler.rs b/src/sed/compiler.rs index bbbec6da..6076c3ee 100644 --- a/src/sed/compiler.rs +++ b/src/sed/compiler.rs @@ -529,7 +529,7 @@ fn parse_command_ending( } /// Convert a primitive BRE pattern to a safe ERE-compatible pattern string. -/// - Replaces `\(` and `\)` with `(` and `)`. +/// - Replaces `\(`, `\)`, `\{` and `\}` with `(`, `)`, `{` and `}`. /// - Puts single-digit back-references in non-capturing groups.. /// - Escapes ERE-only metacharacters: `+ ? { } | ( )`. /// - Leaves all other characters as-is. 
@@ -552,11 +552,11 @@ fn bre_to_ere(pattern: &str) -> String { } Some('{') => { chars.next(); - result.push('{'); // Group end + result.push('{'); // Brace quantifier start } Some('}') => { chars.next(); - result.push('}'); // Group end + result.push('}'); // Brace quantifier end } Some(v) if v.is_ascii_digit() => { // Back-reference. In sed BREs these are single-digit @@ -2213,6 +2213,11 @@ mod tests { assert_eq!(bre_to_ere(r"a\(b\)c"), "a(b)c"); } + #[test] + fn test_bre_brace_quantifier_translation() { + assert_eq!(bre_to_ere(r"\{1,4\}"), "{1,4}"); + } + #[test] fn test_ere_metacharacters_escaped() { assert_eq!(bre_to_ere(r"a+b?c{1}|(d)"), r"a\+b\?c\{1\}\|\(d\)"); From a22e6a42a096b2bdd03bfcd856e7a8d60ff52a0a Mon Sep 17 00:00:00 2001 From: LoukasPap Date: Tue, 31 Mar 2026 17:08:22 +0300 Subject: [PATCH 3/8] Align extended mode param semantics in `parse_regex` and `validate_quantifier_structure` and rename to `is_extended_mode` --- src/sed/delimited_parser.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/sed/delimited_parser.rs b/src/sed/delimited_parser.rs index cf03a7c8..d4aa7c6a 100644 --- a/src/sed/delimited_parser.rs +++ b/src/sed/delimited_parser.rs @@ -317,7 +317,7 @@ fn scan_delimiter(lines: &ScriptLineProvider, line: &mut ScriptCharProvider) -> pub fn parse_regex( lines: &ScriptLineProvider, line: &mut ScriptCharProvider, - extended_mode: bool, + is_extended_mode: bool, ) -> UResult { let delimiter = scan_delimiter(lines, line)?; let mut result = String::new(); @@ -339,8 +339,8 @@ pub fn parse_regex( line.advance(); continue; } - if line.current() == '{' && !extended_mode { - validate_quantifier_structure(lines, line, delimiter, true)?; + if line.current() == '{' && !is_extended_mode { + validate_quantifier_structure(lines, line, delimiter, false)?; let quantifier = validate_quantifier_numbers(lines, line)?; result.push('\\'); result.push('{'); @@ -363,8 +363,8 @@ pub fn parse_regex( } continue; } - '{' if 
extended_mode => { - validate_quantifier_structure(lines, line, delimiter, false)?; + '{' if is_extended_mode => { + validate_quantifier_structure(lines, line, delimiter, true)?; let quantifier = validate_quantifier_numbers(lines, line)?; result.push('{'); result.push_str(&quantifier); @@ -389,7 +389,7 @@ fn validate_quantifier_structure( lines: &ScriptLineProvider, line: &mut ScriptCharProvider, delimiter: char, - is_bre: bool, + is_extended_mode: bool, ) -> UResult { let invalid_content_error_msg = "Invalid content of \\{\\}"; let mut advances = 0; @@ -400,7 +400,7 @@ fn validate_quantifier_structure( line.advance(); while !line.eol() && line.current() != delimiter { - if is_bre { + if !is_extended_mode { // In BRE mode, look for \} if line.current() == '\\' { line.advance(); From a5f0616072179791c60839782d1c0a4226e8ecbe Mon Sep 17 00:00:00 2001 From: LoukasPap Date: Tue, 31 Mar 2026 17:09:01 +0300 Subject: [PATCH 4/8] Update parse_regex unit tests --- src/sed/delimited_parser.rs | 53 +++++++++++-------------------------- 1 file changed, 16 insertions(+), 37 deletions(-) diff --git a/src/sed/delimited_parser.rs b/src/sed/delimited_parser.rs index d4aa7c6a..33f5f26a 100644 --- a/src/sed/delimited_parser.rs +++ b/src/sed/delimited_parser.rs @@ -981,67 +981,46 @@ mod tests { } #[test] - fn test_extended_regex_quantifier_with_ere() { - let (lines, mut line) = make_providers("/a{2,3}/p"); - let parsed = parse_regex(&lines, &mut line, true).unwrap(); - assert_eq!(parsed, "a{2,3}"); - assert_eq!(line.current(), '/'); - } - - #[test] - fn test_extended_regex_with_zero_or_more() { - let (lines, mut line) = make_providers("/a{,}/p"); + fn test_basic_regex_quantifier() { + let (lines, mut line) = make_providers("/a\\{2,3\\}/p"); let parsed = parse_regex(&lines, &mut line, true).unwrap(); - assert_eq!(parsed, "a{,}"); + assert_eq!(parsed, "a\\{2,3\\}"); assert_eq!(line.current(), '/'); } - #[test] - fn test_extended_regex_literal() { - let (lines, mut line) = 
make_providers("/a{,5}/p"); - let parsed = parse_regex(&lines, &mut line, true).unwrap(); - assert_eq!(parsed, "a{0,5}"); - assert_eq!(line.current(), '/'); - } #[test] - fn test_extended_regex_with_unmatched_brace_quantifier() { + fn test_basic_regex_with_unmatched_brace_quantifier() { let (lines, mut line) = make_providers("/a{2,3/p"); let err = parse_regex(&lines, &mut line, true).unwrap_err(); assert!(err.to_string().contains("Unmatched \\{")); } #[test] - fn test_extended_regex_with_empty_quantifier() { - let (lines, mut line) = make_providers("/a{}/p"); - let err = parse_regex(&lines, &mut line, true).unwrap_err(); - assert!(err.to_string().contains("Invalid content of \\{\\}")); - } - - #[test] - fn test_extended_regex_with_whitespace_quantifier() { - let (lines, mut line) = make_providers("/a{}/p"); + fn test_basic_regex_with_invalid_content() { + let (lines, mut line) = make_providers("/a{2d,3}/p"); let err = parse_regex(&lines, &mut line, true).unwrap_err(); assert!(err.to_string().contains("Invalid content of \\{\\}")); } #[test] - fn test_extended_regex_with_invalid_m() { - let (lines, mut line) = make_providers("/a{2d,3}/p"); - let err = parse_regex(&lines, &mut line, true).unwrap_err(); - assert!(err.to_string().contains("Invalid content of \\{\\}")); + fn test_extended_regex_quantifier() { + let (lines, mut line) = make_providers("/a{2,3}/p"); + let parsed = parse_regex(&lines, &mut line, true).unwrap(); + assert_eq!(parsed, "a{2,3}"); + assert_eq!(line.current(), '/'); } #[test] - fn test_extended_regex_with_invalid_n() { - let (lines, mut line) = make_providers("/a{2,-3}/p"); + fn test_extended_regex_with_unmatched_brace_quantifier() { + let (lines, mut line) = make_providers("/a{2,3/p"); let err = parse_regex(&lines, &mut line, true).unwrap_err(); - assert!(err.to_string().contains("Invalid content of \\{\\}")); + assert!(err.to_string().contains("Unmatched \\{")); } #[test] - fn test_extended_regex_with_m_gt_n() { - let (lines, mut line) = 
make_providers("/a{3,2}/p"); + fn test_extended_regex_with_invalid_m() { + let (lines, mut line) = make_providers("/a{2d,3}/p"); let err = parse_regex(&lines, &mut line, true).unwrap_err(); assert!(err.to_string().contains("Invalid content of \\{\\}")); } From a302d8a07b8a22b98706f1dc2f52d119270d7dd2 Mon Sep 17 00:00:00 2001 From: LoukasPap Date: Tue, 31 Mar 2026 17:09:41 +0300 Subject: [PATCH 5/8] Add unit tests for BRE and ERE quantifier validation functions --- src/sed/delimited_parser.rs | 129 ++++++++++++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) diff --git a/src/sed/delimited_parser.rs b/src/sed/delimited_parser.rs index 33f5f26a..e2a7de64 100644 --- a/src/sed/delimited_parser.rs +++ b/src/sed/delimited_parser.rs @@ -1097,6 +1097,135 @@ mod tests { assert_eq!(line.current(), '/'); } + // validate_quantifier_structure + //BRE tests + #[test] + fn test_validate_quantifier_structure_bre_valid() { + let (lines, mut line) = make_providers("{2,3\\}"); + let result = validate_quantifier_structure(&lines, &mut line, '/', false).unwrap(); + assert_eq!(result, 4); + assert_eq!(line.current(), '{'); // Line should be back on the opening brace + } + + #[test] + fn test_validate_quantifier_structure_bre_with_unmatched_brace() { + let (lines, mut line) = make_providers("{2,3"); + let err = validate_quantifier_structure(&lines, &mut line, '/', false).unwrap_err(); + assert!(err.to_string().contains("Unmatched \\{")); + } + + #[test] + fn test_validate_quantifier_structure_bre_with_empty_content() { + let (lines, mut line) = make_providers("{\\}"); + let err = validate_quantifier_structure(&lines, &mut line, '/', false).unwrap_err(); + assert!(err.to_string().contains("Invalid content of \\{\\}")); + } + + #[test] + fn test_validate_quantifier_structure_bre_with_invalid_char() { + let (lines, mut line) = make_providers("{2d,3\\}"); + let err = validate_quantifier_structure(&lines, &mut line, '/', false).unwrap_err(); + 
assert!(err.to_string().contains("Invalid content of \\{\\}")); + } + + #[test] + fn test_validate_quantifier_structure_bre_with_double_comma() { + let (lines, mut line) = make_providers("{2,3,\\}"); + let err = validate_quantifier_structure(&lines, &mut line, '/', false).unwrap_err(); + assert!(err.to_string().contains("Invalid content of \\{\\}")); + } + + // ERE tests + #[test] + fn test_validate_quantifier_structure_ere_valid() { + let (lines, mut line) = make_providers("{2,3}"); + let result = validate_quantifier_structure(&lines, &mut line, '/', true).unwrap(); + assert_eq!(result, 3); + assert_eq!(line.current(), '{'); // Line should be back on the opening brace + } + + #[test] + fn test_validate_quantifier_structure_ere_with_unmatched_brace() { + let (lines, mut line) = make_providers("{2,3"); + let err = validate_quantifier_structure(&lines, &mut line, '/', true).unwrap_err(); + assert!(err.to_string().contains("Unmatched \\{")); + } + + #[test] + fn test_validate_quantifier_structure_ere_with_empty_content() { + let (lines, mut line) = make_providers("{}"); + let err = validate_quantifier_structure(&lines, &mut line, '/', true).unwrap_err(); + assert!(err.to_string().contains("Invalid content of \\{\\}")); + } + + #[test] + fn test_validate_quantifier_structure_ere_with_invalid_char() { + let (lines, mut line) = make_providers("{2d,3}"); + let err = validate_quantifier_structure(&lines, &mut line, '/', true).unwrap_err(); + assert!(err.to_string().contains("Invalid content of \\{\\}")); + } + + #[test] + fn test_validate_quantifier_structure_ere_with_double_comma() { + let (lines, mut line) = make_providers("{2,3,}"); + let err = validate_quantifier_structure(&lines, &mut line, '/', true).unwrap_err(); + assert!(err.to_string().contains("Invalid content of \\{\\}")); + } + + // validate_quantifier_numbers + #[test] + fn test_validate_quantifier_numbers_with_m() { + let (lines, mut line) = make_providers("{2}"); + let result = 
validate_quantifier_numbers(&lines, &mut line).unwrap(); + assert_eq!(result, "2"); + assert_eq!(line.current(), '}'); + } + + #[test] + fn test_validate_quantifier_numbers_with_single_comma() { + let (lines, mut line) = make_providers("{,}"); + let result = validate_quantifier_numbers(&lines, &mut line).unwrap(); + assert_eq!(result, ","); + assert_eq!(line.current(), '}'); + } + + #[test] + fn test_validate_quantifier_numbers_with_comma_n() { + let (lines, mut line) = make_providers("{,3}"); + let result = validate_quantifier_numbers(&lines, &mut line).unwrap(); + assert_eq!(result, "0,3"); + assert_eq!(line.current(), '}'); + } + + #[test] + fn test_validate_quantifier_numbers_valid() { + let (lines, mut line) = make_providers("{2,3}"); + let result = validate_quantifier_numbers(&lines, &mut line).unwrap(); + assert_eq!(result, "2,3"); + assert_eq!(line.current(), '}'); + } + + #[test] + fn test_validate_quantifier_numbers_with_m_too_big() { + let (lines, mut line) = make_providers("{256}"); + let err = validate_quantifier_numbers(&lines, &mut line).unwrap_err(); + assert!(err.to_string().contains("Regular expression too big")); + } + + #[test] + fn test_validate_quantifier_numbers_with_n_too_big() { + let (lines, mut line) = make_providers("{2,256}"); + let err = validate_quantifier_numbers(&lines, &mut line).unwrap_err(); + assert!(err.to_string().contains("Regular expression too big")); + } + + #[test] + fn test_validate_quantifier_numbers_with_m_gt_n() { + let (lines, mut line) = make_providers("{3,2}"); + let err = validate_quantifier_numbers(&lines, &mut line).unwrap_err(); + assert!(err.to_string().contains("Invalid content of \\{\\}")); + } + // parse_transliteration #[test] fn test_simple_transliteration() { From 8cd1bbdd54e1c29ede5bdd218cbb25479f9befa1 Mon Sep 17 00:00:00 2001 From: LoukasPap Date: Tue, 31 Mar 2026 17:13:11 +0300 Subject: [PATCH 6/8] Fix format --- src/sed/delimited_parser.rs | 39 ++++++++++++++++++------------------- 1 file changed, 
19 insertions(+), 20 deletions(-) diff --git a/src/sed/delimited_parser.rs b/src/sed/delimited_parser.rs index e2a7de64..f37694d2 100644 --- a/src/sed/delimited_parser.rs +++ b/src/sed/delimited_parser.rs @@ -400,23 +400,15 @@ fn validate_quantifier_structure( line.advance(); while !line.eol() && line.current() != delimiter { - if !is_extended_mode { - // In BRE mode, look for \} - if line.current() == '\\' { - line.advance(); - advances += 1; - if !line.eol() && line.current() == '}' { - // Empty quantifier {} is not valid - if advances == 1 { - invalid_content_detected = true; - } - found_closing_brace = true; - break; - } else { + if is_extended_mode { + // In ERE mode, look for } + if line.current() == '}' { + if advances == 0 { invalid_content_detected = true; } + found_closing_brace = true; + break; } else { - // Only digits and comma allowed if line.current() == ',' { if seen_comma { invalid_content_detected = true; @@ -429,14 +421,22 @@ fn validate_quantifier_structure( advances += 1; } } else { - // In ERE mode, look for } - if line.current() == '}' { - if advances == 0 { + // In BRE mode, look for \} + if line.current() == '\\' { + line.advance(); + advances += 1; + if !line.eol() && line.current() == '}' { + // Empty quantifier {} is not valid + if advances == 1 { + invalid_content_detected = true; + } + found_closing_brace = true; + break; + } else { invalid_content_detected = true; } - found_closing_brace = true; - break; } else { + // Only digits and comma allowed if line.current() == ',' { if seen_comma { invalid_content_detected = true; @@ -988,7 +988,6 @@ mod tests { assert_eq!(line.current(), '/'); } - #[test] fn test_basic_regex_with_unmatched_brace_quantifier() { let (lines, mut line) = make_providers("/a{2,3/p"); From 92be995857f59e697d2b16815a4420653fff043e Mon Sep 17 00:00:00 2001 From: LoukasPap Date: Fri, 17 Apr 2026 16:50:32 +0300 Subject: [PATCH 7/8] Refactor regex handling to introduce RegexMode enum and make parse_regex function more 
readable --- src/sed/compiler.rs | 29 ++- src/sed/delimited_parser.rs | 184 ++++++++++-------- src/sed/script_char_provider.rs | 5 + tests/by-util/test_sed.rs | 110 +++++++---- .../fixtures/sed/input/regex-quantifiers.txt | 5 - 5 files changed, 206 insertions(+), 127 deletions(-) delete mode 100644 tests/fixtures/sed/input/regex-quantifiers.txt diff --git a/src/sed/compiler.rs b/src/sed/compiler.rs index 6076c3ee..95d7dc9a 100644 --- a/src/sed/compiler.rs +++ b/src/sed/compiler.rs @@ -12,7 +12,9 @@ use crate::sed::command::{ Address, Command, CommandData, ProcessingContext, ReplacementPart, ReplacementTemplate, Substitution, Transliteration, }; -use crate::sed::delimited_parser::{parse_char_escape, parse_regex, parse_transliteration}; +use crate::sed::delimited_parser::{ + RegexMode, parse_char_escape, parse_regex, parse_transliteration, +}; use crate::sed::error_handling::{ScriptLocation, compilation_error, semantic_error}; use crate::sed::fast_regex::Regex; use crate::sed::named_writer::NamedWriter; @@ -445,7 +447,12 @@ fn compile_address( // The next character is an arbitrary delimiter line.advance(); } - let re = parse_regex(lines, line, context.regex_extended)?; + let regex_mode = if context.regex_extended { + RegexMode::Extended + } else { + RegexMode::Basic + }; + let re = parse_regex(lines, line, regex_mode)?; // Skip over delimiter line.advance(); @@ -780,7 +787,12 @@ fn compile_subst_command( ); } - let pattern = parse_regex(lines, line, context.regex_extended)?; + let regex_mode = if context.regex_extended { + RegexMode::Extended + } else { + RegexMode::Basic + }; + let pattern = parse_regex(lines, line, regex_mode)?; let mut subst = Box::new(Substitution::default()); subst.replacement = compile_replacement(lines, line)?; @@ -1806,6 +1818,17 @@ mod tests { } } + #[test] + fn test_compile_address_range_error_propagation() { + let (lines, mut chars) = make_providers("1,/abc"); + let mut cmd = Rc::new(RefCell::new(Command::default())); + let result = 
compile_address_range(&lines, &mut chars, &mut cmd, &ctx()); + + assert!(result.is_err()); + let msg = result.unwrap_err().to_string(); + assert!(msg.contains("unterminated regular expression")); + } + // compile_sequence fn empty_line() -> ScriptCharProvider { ScriptCharProvider::new("") diff --git a/src/sed/delimited_parser.rs b/src/sed/delimited_parser.rs index f37694d2..3a7b86ce 100644 --- a/src/sed/delimited_parser.rs +++ b/src/sed/delimited_parser.rs @@ -15,6 +15,14 @@ use crate::sed::script_line_provider::ScriptLineProvider; use std::char; use uucore::error::UResult; +/// Defines whether regex patterns use Basic Regular Expression (BRE) or +/// Extended Regular Expression (ERE) syntax. +#[derive(Copy, Clone, Debug)] +pub enum RegexMode { + Basic, + Extended, +} + /// Return true if c is a valid octal digit fn is_ascii_octal_digit(c: char) -> bool { matches!(c, '0'..='7') @@ -313,11 +321,12 @@ fn scan_delimiter(lines: &ScriptLineProvider, line: &mut ScriptCharProvider) -> /// Parse the regular expression delimited by the current line /// character and return it as a string. /// On return, the line is on the closing delimiter. -/// If extended_mode is false, quantifiers like {m,n} are treated as literals. +/// In Basic mode, quantifiers like {m,n} must be escaped (\{m,n\}). +/// In Extended mode, quantifiers like {m,n} don't require escaping. 
pub fn parse_regex( lines: &ScriptLineProvider, line: &mut ScriptCharProvider, - is_extended_mode: bool, + regex_mode: RegexMode, ) -> UResult { let delimiter = scan_delimiter(lines, line)?; let mut result = String::new(); @@ -339,8 +348,8 @@ pub fn parse_regex( line.advance(); continue; } - if line.current() == '{' && !is_extended_mode { - validate_quantifier_structure(lines, line, delimiter, false)?; + if line.current() == '{' && matches!(regex_mode, RegexMode::Basic) { + validate_quantifier_structure(lines, line, delimiter, RegexMode::Basic)?; let quantifier = validate_quantifier_numbers(lines, line)?; result.push('\\'); result.push('{'); @@ -363,8 +372,8 @@ pub fn parse_regex( } continue; } - '{' if is_extended_mode => { - validate_quantifier_structure(lines, line, delimiter, true)?; + '{' if matches!(regex_mode, RegexMode::Extended) => { + validate_quantifier_structure(lines, line, delimiter, RegexMode::Extended)?; let quantifier = validate_quantifier_numbers(lines, line)?; result.push('{'); result.push_str(&quantifier); @@ -389,64 +398,67 @@ fn validate_quantifier_structure( lines: &ScriptLineProvider, line: &mut ScriptCharProvider, delimiter: char, - is_extended_mode: bool, + regex_mode: RegexMode, ) -> UResult { let invalid_content_error_msg = "Invalid content of \\{\\}"; - let mut advances = 0; let mut found_closing_brace = false; let mut seen_comma = false; let mut invalid_content_detected = false; - + let mut is_quantifier_empty = true; + let initial_pos = line.get_pos(); line.advance(); while !line.eol() && line.current() != delimiter { - if is_extended_mode { - // In ERE mode, look for } - if line.current() == '}' { - if advances == 0 { - invalid_content_detected = true; - } - found_closing_brace = true; - break; - } else { - if line.current() == ',' { - if seen_comma { - invalid_content_detected = true; - } - seen_comma = true; - } else if !line.current().is_ascii_digit() { - invalid_content_detected = true; - } - line.advance(); - advances += 1; - } 
- } else { - // In BRE mode, look for \} - if line.current() == '\\' { - line.advance(); - advances += 1; - if !line.eol() && line.current() == '}' { + match regex_mode { + RegexMode::Extended => { + // In ERE mode, look for } + if line.current() == '}' { // Empty quantifier {} is not valid - if advances == 1 { + if is_quantifier_empty { invalid_content_detected = true; } found_closing_brace = true; break; } else { - invalid_content_detected = true; + // Entering means there is no } immediately after the { + is_quantifier_empty = false; + // Only digits and one comma allowed + if line.current() == ',' { + if seen_comma { + invalid_content_detected = true; + } + seen_comma = true; + } else if !line.current().is_ascii_digit() { + invalid_content_detected = true; + } + line.advance(); } - } else { - // Only digits and comma allowed - if line.current() == ',' { - if seen_comma { + } + RegexMode::Basic => { + // In BRE mode, look for \} + if line.current() == '\\' { + line.advance(); + if !line.eol() && line.current() == '}' { + if is_quantifier_empty { + invalid_content_detected = true; + } + found_closing_brace = true; + break; + } else { invalid_content_detected = true; } - seen_comma = true; - } else if !line.current().is_ascii_digit() { - invalid_content_detected = true; + } else { + is_quantifier_empty = false; + if line.current() == ',' { + if seen_comma { + invalid_content_detected = true; + } + seen_comma = true; + } else if !line.current().is_ascii_digit() { + invalid_content_detected = true; + } + line.advance(); } - line.advance(); - advances += 1; } } } @@ -459,8 +471,8 @@ fn validate_quantifier_structure( return compilation_error(lines, line, invalid_content_error_msg); } - line.retreat(advances + 1); - Ok(advances) + line.set_position(initial_pos); + Ok(initial_pos) } // Peforms validations on m and/or n values of the quantifier @@ -951,7 +963,7 @@ mod tests { #[test] fn test_simple_regex() { let (lines, mut line) = make_providers("/abc/"); - let parsed = 
parse_regex(&lines, &mut line, false).unwrap(); + let parsed = parse_regex(&lines, &mut line, RegexMode::Basic).unwrap(); assert_eq!(parsed, "abc"); assert_eq!(line.current(), '/'); } @@ -959,7 +971,7 @@ mod tests { #[test] fn test_regex_with_escaped_delimiter() { let (lines, mut line) = make_providers("/ab\\/c/"); - let parsed = parse_regex(&lines, &mut line, false).unwrap(); + let parsed = parse_regex(&lines, &mut line, RegexMode::Basic).unwrap(); assert_eq!(parsed, "ab/c"); assert_eq!(line.current(), '/'); } @@ -967,7 +979,7 @@ mod tests { #[test] fn test_regex_with_capture() { let (lines, mut line) = make_providers(r"/\(.\)/c/"); - let parsed = parse_regex(&lines, &mut line, false).unwrap(); + let parsed = parse_regex(&lines, &mut line, RegexMode::Basic).unwrap(); assert_eq!(parsed, r"\(.\)"); assert_eq!(line.current(), '/'); } @@ -975,7 +987,7 @@ mod tests { #[test] fn test_regex_with_escape_sequence() { let (lines, mut line) = make_providers("/ab\\n/"); - let parsed = parse_regex(&lines, &mut line, false).unwrap(); + let parsed = parse_regex(&lines, &mut line, RegexMode::Basic).unwrap(); assert_eq!(parsed, "ab\n"); assert_eq!(line.current(), '/'); } @@ -983,29 +995,29 @@ mod tests { #[test] fn test_basic_regex_quantifier() { let (lines, mut line) = make_providers("/a\\{2,3\\}/p"); - let parsed = parse_regex(&lines, &mut line, true).unwrap(); + let parsed = parse_regex(&lines, &mut line, RegexMode::Basic).unwrap(); assert_eq!(parsed, "a\\{2,3\\}"); assert_eq!(line.current(), '/'); } #[test] fn test_basic_regex_with_unmatched_brace_quantifier() { - let (lines, mut line) = make_providers("/a{2,3/p"); - let err = parse_regex(&lines, &mut line, true).unwrap_err(); + let (lines, mut line) = make_providers("/a\\{2,3/p"); + let err = parse_regex(&lines, &mut line, RegexMode::Basic).unwrap_err(); assert!(err.to_string().contains("Unmatched \\{")); } #[test] fn test_basic_regex_with_invalid_content() { - let (lines, mut line) = make_providers("/a{2d,3}/p"); - let err = 
parse_regex(&lines, &mut line, true).unwrap_err(); + let (lines, mut line) = make_providers("/a\\{2d,3\\}/p"); + let err = parse_regex(&lines, &mut line, RegexMode::Basic).unwrap_err(); assert!(err.to_string().contains("Invalid content of \\{\\}")); } #[test] fn test_extended_regex_quantifier() { let (lines, mut line) = make_providers("/a{2,3}/p"); - let parsed = parse_regex(&lines, &mut line, true).unwrap(); + let parsed = parse_regex(&lines, &mut line, RegexMode::Extended).unwrap(); assert_eq!(parsed, "a{2,3}"); assert_eq!(line.current(), '/'); } @@ -1013,35 +1025,35 @@ mod tests { #[test] fn test_extended_regex_with_unmatched_brace_quantifier() { let (lines, mut line) = make_providers("/a{2,3/p"); - let err = parse_regex(&lines, &mut line, true).unwrap_err(); + let err = parse_regex(&lines, &mut line, RegexMode::Extended).unwrap_err(); assert!(err.to_string().contains("Unmatched \\{")); } #[test] fn test_extended_regex_with_invalid_m() { let (lines, mut line) = make_providers("/a{2d,3}/p"); - let err = parse_regex(&lines, &mut line, true).unwrap_err(); + let err = parse_regex(&lines, &mut line, RegexMode::Extended).unwrap_err(); assert!(err.to_string().contains("Invalid content of \\{\\}")); } #[test] fn errors_on_unterminated_regex() { let (lines, mut line) = make_providers("/unterminated"); - let err = parse_regex(&lines, &mut line, false).unwrap_err(); + let err = parse_regex(&lines, &mut line, RegexMode::Basic).unwrap_err(); assert!(err.to_string().contains("unterminated regular expression")); } #[test] fn errors_on_esc_at_re_eol() { let (lines, mut line) = make_providers("/foo\\"); - let err = parse_regex(&lines, &mut line, false).unwrap_err(); + let err = parse_regex(&lines, &mut line, RegexMode::Basic).unwrap_err(); assert!(err.to_string().contains("unterminated regular expression")); } #[test] fn errors_on_backslash_delimiter() { let (lines, mut line) = make_providers("\\bad"); - let err = parse_regex(&lines, &mut line, false).unwrap_err(); + let err = 
parse_regex(&lines, &mut line, RegexMode::Basic).unwrap_err(); assert!( err.to_string() .contains("\\ cannot be used as a string delimiter") @@ -1051,7 +1063,7 @@ mod tests { #[test] fn test_regex_with_character_class() { let (lines, mut line) = make_providers("/[a-z]/"); - let parsed = parse_regex(&lines, &mut line, false).unwrap(); + let parsed = parse_regex(&lines, &mut line, RegexMode::Basic).unwrap(); assert_eq!(parsed, "[a-z]"); assert_eq!(line.current(), '/'); } @@ -1059,7 +1071,7 @@ mod tests { #[test] fn test_regex_with_bracket_delimiter() { let (lines, mut line) = make_providers("[abc["); - let parsed = parse_regex(&lines, &mut line, false).unwrap(); + let parsed = parse_regex(&lines, &mut line, RegexMode::Basic).unwrap(); assert_eq!(parsed, "abc"); assert_eq!(line.current(), '['); } @@ -1067,7 +1079,7 @@ mod tests { #[test] fn test_bracket_regex_with_bracket_delimiter() { let (lines, mut line) = make_providers("[a\\[0-9]bc["); - let parsed = parse_regex(&lines, &mut line, false).unwrap(); + let parsed = parse_regex(&lines, &mut line, RegexMode::Basic).unwrap(); assert_eq!(parsed, "a[0-9]bc"); assert_eq!(line.current(), '['); } @@ -1075,7 +1087,7 @@ mod tests { #[test] fn test_regex_with_escaped_bracket_in_character_class() { let (lines, mut line) = make_providers("/[a\\]z]/"); - let parsed = parse_regex(&lines, &mut line, false).unwrap(); + let parsed = parse_regex(&lines, &mut line, RegexMode::Basic).unwrap(); assert_eq!(parsed, "[a\\]z]"); assert_eq!(line.current(), '/'); } @@ -1083,7 +1095,7 @@ mod tests { #[test] fn test_regex_with_delimiter_inside_character_class() { let (lines, mut line) = make_providers("/[a/c]/"); - let parsed = parse_regex(&lines, &mut line, false).unwrap(); + let parsed = parse_regex(&lines, &mut line, RegexMode::Basic).unwrap(); assert_eq!(parsed, "[a/c]"); assert_eq!(line.current(), '/'); } @@ -1091,7 +1103,7 @@ mod tests { #[test] fn test_regex_with_escaped_paren_and_backslash() { let (lines, mut line) = 
make_providers("/\\(\\\\/"); - let parsed = parse_regex(&lines, &mut line, false).unwrap(); + let parsed = parse_regex(&lines, &mut line, RegexMode::Basic).unwrap(); assert_eq!(parsed, "\\(\\\\"); assert_eq!(line.current(), '/'); } @@ -1101,36 +1113,41 @@ mod tests { #[test] fn test_validate_quantifier_structure_bre_valid() { let (lines, mut line) = make_providers("{2,3\\}"); - let result = validate_quantifier_structure(&lines, &mut line, '/', false).unwrap(); - assert_eq!(result, 4); + let result = + validate_quantifier_structure(&lines, &mut line, '/', RegexMode::Basic).unwrap(); + assert_eq!(result, 0); assert_eq!(line.current(), '{'); // Line should be back on the opening brace } #[test] fn test_validate_quantifier_structure_bre_with_unmatched_brace() { let (lines, mut line) = make_providers("{2,3"); - let err = validate_quantifier_structure(&lines, &mut line, '/', false).unwrap_err(); + let err = + validate_quantifier_structure(&lines, &mut line, '/', RegexMode::Basic).unwrap_err(); assert!(err.to_string().contains("Unmatched \\{")); } #[test] fn test_validate_quantifier_structure_bre_with_empty_content() { let (lines, mut line) = make_providers("{\\}"); - let err = validate_quantifier_structure(&lines, &mut line, '/', false).unwrap_err(); + let err = + validate_quantifier_structure(&lines, &mut line, '/', RegexMode::Basic).unwrap_err(); assert!(err.to_string().contains("Invalid content of \\{\\}")); } #[test] fn test_validate_quantifier_structure_bre_with_invalid_char() { let (lines, mut line) = make_providers("{2d,3\\}"); - let err = validate_quantifier_structure(&lines, &mut line, '/', false).unwrap_err(); + let err = + validate_quantifier_structure(&lines, &mut line, '/', RegexMode::Basic).unwrap_err(); assert!(err.to_string().contains("Invalid content of \\{\\}")); } #[test] fn test_validate_quantifier_structure_bre_with_double_comma() { let (lines, mut line) = make_providers("{2,3,\\}"); - let err = validate_quantifier_structure(&lines, &mut line, '/', 
false).unwrap_err(); + let err = + validate_quantifier_structure(&lines, &mut line, '/', RegexMode::Basic).unwrap_err(); assert!(err.to_string().contains("Invalid content of \\{\\}")); } @@ -1138,36 +1155,41 @@ mod tests { #[test] fn test_validate_quantifier_structure_ere_valid() { let (lines, mut line) = make_providers("{2,3}"); - let result = validate_quantifier_structure(&lines, &mut line, '/', true).unwrap(); - assert_eq!(result, 3); + let result = + validate_quantifier_structure(&lines, &mut line, '/', RegexMode::Extended).unwrap(); + assert_eq!(result, 0); assert_eq!(line.current(), '{'); // Line should be back on the opening brace } #[test] fn test_validate_quantifier_structure_ere_with_unmatched_brace() { let (lines, mut line) = make_providers("{2,3"); - let err = validate_quantifier_structure(&lines, &mut line, '/', true).unwrap_err(); + let err = + validate_quantifier_structure(&lines, &mut line, '/', RegexMode::Extended).unwrap_err(); assert!(err.to_string().contains("Unmatched \\{")); } #[test] fn test_validate_quantifier_structure_ere_with_empty_content() { let (lines, mut line) = make_providers("{}"); - let err = validate_quantifier_structure(&lines, &mut line, '/', true).unwrap_err(); + let err = + validate_quantifier_structure(&lines, &mut line, '/', RegexMode::Extended).unwrap_err(); assert!(err.to_string().contains("Invalid content of \\{\\}")); } #[test] fn test_validate_quantifier_structure_ere_with_invalid_char() { let (lines, mut line) = make_providers("{2d,3}"); - let err = validate_quantifier_structure(&lines, &mut line, '/', true).unwrap_err(); + let err = + validate_quantifier_structure(&lines, &mut line, '/', RegexMode::Extended).unwrap_err(); assert!(err.to_string().contains("Invalid content of \\{\\}")); } #[test] fn test_validate_quantifier_structure_ere_with_double_comma() { let (lines, mut line) = make_providers("{2,3,}"); - let err = validate_quantifier_structure(&lines, &mut line, '/', true).unwrap_err(); + let err = + 
validate_quantifier_structure(&lines, &mut line, '/', RegexMode::Extended).unwrap_err(); assert!(err.to_string().contains("Invalid content of \\{\\}")); } diff --git a/src/sed/script_char_provider.rs b/src/sed/script_char_provider.rs index a3e3a85f..4e4ecd16 100644 --- a/src/sed/script_char_provider.rs +++ b/src/sed/script_char_provider.rs @@ -34,6 +34,11 @@ impl ScriptCharProvider { self.pos = self.pos.saturating_sub(n); } + /// Sets new current position. + pub fn set_position(&mut self, pos: usize) { + self.pos = pos; + } + /// Returns the current character. Panics if out of bounds. pub fn current(&self) -> char { self.line[self.pos] diff --git a/tests/by-util/test_sed.rs b/tests/by-util/test_sed.rs index 5710d039..8386ba97 100644 --- a/tests/by-util/test_sed.rs +++ b/tests/by-util/test_sed.rs @@ -196,7 +196,6 @@ macro_rules! check_output_posix { // Input files const LINES1: &str = "input/lines1"; const LINES2: &str = "input/lines2"; -const REGEX_QUANTIFIERS: &str = "input/regex-quantifiers.txt"; const NO_NEW_LINE: &str = "input/no-new-line.txt"; //////////////////////////////////////////////////////////// @@ -279,42 +278,77 @@ check_output!(addr_range_end_multiple, ["-n", "/l1_2/,~10p", LINES1]); // Quantifiers: {m,n} // m and n are considered to be the first and second numbers in the interval, respectively. 
-check_output!( - ere_quantifier_exactly_m, - ["-n", "-E", "-e", "/l{2}/p", REGEX_QUANTIFIERS] -); -check_output!( - ere_quantifier_minimum_m, - ["-n", "-E", "-e", "/l{1,}/p", REGEX_QUANTIFIERS] -); -check_output!( - ere_quantifier_m_to_n, - ["-n", "-E", "-e", "/l{3,4}/p", REGEX_QUANTIFIERS] -); -check_output!( - ere_quantifier_comma_n, - ["-n", "-E", "-e", "/l{,4}/p", REGEX_QUANTIFIERS] -); -check_output!( - bre_quantifier_minimum_m, - ["-n", "-e", "/l\\{3,\\}/p", REGEX_QUANTIFIERS] -); +const REGEX_QUANTIFIERS_INPUT: &str = + "Hello World\nHelo World\nHelllllo World\nHeo Word\nHeo Worl}d\n"; -check_output!( - bre_quantifier_comma, - ["-n", "-e", "/l\\{,\\}/p", REGEX_QUANTIFIERS] -); +#[test] +fn ere_quantifier_exactly_m() { + new_ucmd!() + .args(&["-n", "-E", "-e", "/l{2}/p"]) + .pipe_in(REGEX_QUANTIFIERS_INPUT) + .succeeds() + .stdout_is("Hello World\nHelllllo World\n"); +} -check_output!( - bre_quantifier_only_closing_brace, - ["-n", "-e", "/l\\}/p", REGEX_QUANTIFIERS] -); +#[test] +fn ere_quantifier_minimum_m() { + new_ucmd!() + .args(&["-n", "-E", "-e", "/l{1,}/p"]) + .pipe_in(REGEX_QUANTIFIERS_INPUT) + .succeeds() + .stdout_is("Hello World\nHelo World\nHelllllo World\nHeo Worl}d\n"); +} + +#[test] +fn ere_quantifier_m_to_n() { + new_ucmd!() + .args(&["-n", "-E", "-e", "/l{3,4}/p"]) + .pipe_in(REGEX_QUANTIFIERS_INPUT) + .succeeds() + .stdout_is("Helllllo World\n"); +} + +#[test] +fn ere_quantifier_comma_n() { + new_ucmd!() + .args(&["-n", "-E", "-e", "/l{,4}/p"]) + .pipe_in(REGEX_QUANTIFIERS_INPUT) + .succeeds() + .stdout_is(REGEX_QUANTIFIERS_INPUT); +} + +#[test] +fn bre_quantifier_minimum_m() { + new_ucmd!() + .args(&["-n", "-e", "/l\\{3,\\}/p"]) + .pipe_in(REGEX_QUANTIFIERS_INPUT) + .succeeds() + .stdout_is("Helllllo World\n"); +} + +#[test] +fn bre_quantifier_comma() { + new_ucmd!() + .args(&["-n", "-e", "/l\\{,\\}/p"]) + .pipe_in(REGEX_QUANTIFIERS_INPUT) + .succeeds() + .stdout_is(REGEX_QUANTIFIERS_INPUT); +} + +#[test] +fn 
bre_quantifier_only_closing_brace() { + new_ucmd!() + .args(&["-n", "-e", "/l\\}/p"]) + .pipe_in(REGEX_QUANTIFIERS_INPUT) + .succeeds() + .stdout_is("Heo Worl}d\n"); +} #[test] fn test_ere_quantifier_n_gt_m() { new_ucmd!() - .args(&["-E", "-e", "/l{3,2}/p", REGEX_QUANTIFIERS]) + .args(&["-E", "-e", "/l{3,2}/p"]) .fails() .code_is(1) .stderr_contains("Invalid content of \\{\\}"); @@ -323,7 +357,7 @@ fn test_ere_quantifier_n_gt_m() { #[test] fn test_ere_quantifier_negative_m() { new_ucmd!() - .args(&["-E", "-e", "/l{-2,4}/p", REGEX_QUANTIFIERS]) + .args(&["-E", "-e", "/l{-2,4}/p"]) .fails() .code_is(1) .stderr_contains("Invalid content of \\{\\}"); @@ -332,7 +366,7 @@ fn test_ere_quantifier_negative_m() { #[test] fn test_ere_quantifier_invalid_m() { new_ucmd!() - .args(&["-E", "-e", "/l{d,}/p", REGEX_QUANTIFIERS]) + .args(&["-E", "-e", "/l{d,}/p"]) .fails() .code_is(1) .stderr_contains("Invalid content of \\{\\}"); @@ -341,7 +375,7 @@ fn test_ere_quantifier_invalid_m() { #[test] fn test_ere_quantifier_m_too_big() { new_ucmd!() - .args(&["-E", "-e", "/l{300,}/p", REGEX_QUANTIFIERS]) + .args(&["-E", "-e", "/l{300,}/p"]) .fails() .code_is(1) .stderr_contains("Regular expression too big"); @@ -350,7 +384,7 @@ fn test_ere_quantifier_m_too_big() { #[test] fn test_ere_quantifier_empty() { new_ucmd!() - .args(&["-E", "-e", "/l{}/p", REGEX_QUANTIFIERS]) + .args(&["-E", "-e", "/l{}/p"]) .fails() .code_is(1) .stderr_contains("Invalid content of \\{\\}"); @@ -359,7 +393,7 @@ fn test_ere_quantifier_empty() { #[test] fn test_ere_quantifier_whitespace() { new_ucmd!() - .args(&["-E", "-e", "/l{ }/p", REGEX_QUANTIFIERS]) + .args(&["-E", "-e", "/l{ }/p"]) .fails() .code_is(1) .stderr_contains("Invalid content of \\{\\}"); @@ -368,7 +402,7 @@ fn test_ere_quantifier_whitespace() { #[test] fn test_ere_quantifier_unmatched_brace() { new_ucmd!() - .args(&["-E", "-e", "/l{,/p", REGEX_QUANTIFIERS]) + .args(&["-E", "-e", "/l{,/p"]) .fails() .code_is(1) .stderr_contains("Unmatched \\{"); @@ 
-377,7 +411,7 @@ fn test_ere_quantifier_unmatched_brace() { #[test] fn test_ere_quantifier_unmatched_brace_2() { new_ucmd!() - .args(&["-E", "-e", "/l{m,n/p", REGEX_QUANTIFIERS]) + .args(&["-E", "-e", "/l{m,n/p"]) .fails() .code_is(1) .stderr_contains("Unmatched \\{"); @@ -386,7 +420,7 @@ fn test_ere_quantifier_unmatched_brace_2() { #[test] fn test_bre_quantifier_unmatched_brace() { new_ucmd!() - .args(&["-e", "/l\\{1,2}/p", REGEX_QUANTIFIERS]) + .args(&["-e", "/l\\{1,2}/p"]) .fails() .code_is(1) .stderr_contains("Unmatched \\{"); diff --git a/tests/fixtures/sed/input/regex-quantifiers.txt b/tests/fixtures/sed/input/regex-quantifiers.txt deleted file mode 100644 index 4d77d3fc..00000000 --- a/tests/fixtures/sed/input/regex-quantifiers.txt +++ /dev/null @@ -1,5 +0,0 @@ -Hello World -Helo World -Helllllo World -Heo Word -Heo Worl}d From 1d745c7df07c9e2ee2237606f2e2f0876247d315 Mon Sep 17 00:00:00 2001 From: LoukasPap Date: Thu, 23 Apr 2026 15:54:58 +0300 Subject: [PATCH 8/8] Remove obsolete file --- tests/fixtures/sed/input/regex_quantifiers | 4 ---- 1 file changed, 4 deletions(-) delete mode 100644 tests/fixtures/sed/input/regex_quantifiers diff --git a/tests/fixtures/sed/input/regex_quantifiers b/tests/fixtures/sed/input/regex_quantifiers deleted file mode 100644 index e90a59e0..00000000 --- a/tests/fixtures/sed/input/regex_quantifiers +++ /dev/null @@ -1,4 +0,0 @@ -Hello World -Helo World -Helllllo World -Heo Word