Fix panics with whitespace in extended mode by being more strict

Instead of ignoring space in all the bump/peek methods (as proposed in pull request #349), have an explicit `ignore_space` method that can be used in places where space/comments should be allowed. This makes parsing a bit stricter than before as well.
2025-04-12 07:34:07 +00:00 · 2017-04-07 18:58:54 +10:00 · 2017-04-07 18:58:54 +10:00 · 6f32d2f683
commit 6f32d2f683
parent 204e409488
1 changed files with 131 additions and 43 deletions
--- a/regex-syntax/src/parser.rs
+++ b/regex-syntax/src/parser.rs
@ -104,7 +104,11 @@ impl Parser {
    // Starts at the beginning of the input and consumes until either the end
    // of input or an error.
    fn parse_expr(mut self) -> Result<Expr> {
-        while !self.eof() {
+        loop {
+            self.ignore_space();
+            if self.eof() {
+                break;
+            }
            let build_expr = match self.cur() {
                '\\' => try!(self.parse_escape()),
                '|' => { let e = try!(self.alternate()); self.bump(); e }
@ -177,7 +181,7 @@ impl Parser {
            return Err(self.err(ErrorKind::UnexpectedEscapeEof));
        }
        let c = self.cur();
-        if is_punct(c) {
+        if is_punct(c) || (self.flags.ignore_space && c.is_whitespace()) {
            let c = self.bump();
            return Ok(try!(self.lit(c)));
        }
@ -234,6 +238,7 @@ impl Parser {
        let chari = self.chari;
        let mut name: CaptureName = None;
        self.bump();
+        self.ignore_space();
        if self.bump_if("?P<") {
            let n = try!(self.parse_group_name());
            if self.names.iter().any(|n2| n2 == &n) {
@ -370,13 +375,16 @@ impl Parser {
            return Err(self.err(ErrorKind::RepeaterUnexpectedExpr(e)));
        }
        self.bump();
-        let min = try!(self.parse_decimal(|c| c != ',' && c != '}'));
+        self.ignore_space();
+        let min = try!(self.parse_decimal());
        let mut max_opt = Some(min);
+        self.ignore_space();
        if self.bump_if(',') {
+            self.ignore_space();
            if self.peek_is('}') {
                max_opt = None;
            } else {
-                let max = try!(self.parse_decimal(|c| c != '}'));
+                let max = try!(self.parse_decimal());
                if min > max {
                    // e.g., a{2,1}
                    return Err(self.err(ErrorKind::InvalidRepeatRange {
@ -387,6 +395,7 @@ impl Parser {
                max_opt = Some(max);
            }
        }
+        self.ignore_space();
        if !self.bump_if('}') {
            Err(self.err(ErrorKind::UnclosedRepeat))
        } else {
@ -423,8 +432,8 @@ impl Parser {
    //
    // Start: `1`
    // End:   `,` (where `until == ','`)
-    fn parse_decimal<B: Bumpable>(&mut self, until: B) -> Result<u32> {
-        match self.bump_get(until) {
+    fn parse_decimal(&mut self) -> Result<u32> {
+        match self.bump_get(|c| is_ascii_word(c) || c.is_whitespace()) {
            // e.g., a{}
            None => Err(self.err(ErrorKind::MissingBase10)),
            Some(n) => {
@ -472,6 +481,7 @@ impl Parser {
    // Start: `{`
    // End:   `b`
    fn parse_hex(&mut self) -> Result<Build> {
+        self.ignore_space();
        if self.bump_if('{') {
            self.parse_hex_many_digits()
        } else {
@ -486,9 +496,11 @@ impl Parser {
    fn parse_hex_many_digits(&mut self) -> Result<Build> {
        use std::char;

-        let s = self.bump_get(|c| c != '}').unwrap_or("".into());
+        self.ignore_space();
+        let s = self.bump_get(is_ascii_word).unwrap_or("".into());
        let n = try!(u32::from_str_radix(&s, 16)
                         .map_err(|_| self.err(ErrorKind::InvalidBase16(s))));
+        self.ignore_space();
        if !self.bump_if('}') {
            // e.g., a\x{d
            return Err(self.err(ErrorKind::UnclosedHex));
@ -530,12 +542,16 @@ impl Parser {
    // End:   `+`
    fn parse_class(&mut self) -> Result<Build> {
        self.bump();
+        self.ignore_space();
        let negated = self.bump_if('^');
+        self.ignore_space();
        let mut class = CharClass::empty();
        while self.bump_if('-') {
+            self.ignore_space();
            class.ranges.push(ClassRange::one('-'));
        }
        loop {
+            self.ignore_space();
            if self.eof() {
                // e.g., [a
                return Err(self.err(ErrorKind::UnexpectedClassEof));
@ -631,11 +647,13 @@ impl Parser {
    // End:   `]`
    fn parse_class_range(&mut self, class: &mut CharClass, start: char)
                        -> Result<()> {
+        self.ignore_space();
        if !self.bump_if('-') {
            // Not a range, so just push a singleton range.
            class.ranges.push(ClassRange::one(start));
            return Ok(());
        }
+        self.ignore_space();
        if self.eof() {
            // e.g., [a-
            return Err(self.err(ErrorKind::UnexpectedClassEof));
@ -730,9 +748,12 @@ impl Parser {
    //
    // `negate` is true when the class name is used with `\P`.
    fn parse_unicode_class(&mut self, neg: bool) -> Result<CharClass> {
+        self.ignore_space();
        let name =
            if self.bump_if('{') {
-                let n = self.bump_get(|c| c != '}').unwrap_or("".into());
+                self.ignore_space();
+                let n = self.bump_get(is_ascii_word).unwrap_or("".into());
+                self.ignore_space();
                if n.is_empty() || !self.bump_if('}') {
                    // e.g., \p{Greek
                    return Err(self.err(ErrorKind::UnclosedUnicodeName));
@ -796,7 +817,31 @@ impl Parser {
 // Auxiliary helper methods.
 impl Parser {
    fn chars(&self) -> Chars {
-        Chars::new(&self.chars[self.chari..], self.flags.ignore_space)
+        Chars::new(&self.chars[self.chari..])
+    }
+
+    fn ignore_space(&mut self) {
+        if !self.flags.ignore_space {
+            return;
+        }
+        while !self.eof() {
+            match self.cur() {
+                '#' => {
+                    self.bump();
+                    while !self.eof() {
+                        match self.bump() {
+                            '\n' => break,
+                            _ => continue,
+                        }
+                    }
+                },
+                c => if !c.is_whitespace() {
+                    return;
+                } else {
+                    self.bump();
+                }
+            }
+        }
    }

    fn bump(&mut self) -> char {
@ -924,48 +969,22 @@ impl Parser {
 struct Chars<'a> {
    chars: &'a [char],
    cur: usize,
-    ignore_space: bool,
 }

 impl<'a> Iterator for Chars<'a> {
    type Item = char;
    fn next(&mut self) -> Option<char> {
-        if !self.ignore_space {
-            let x = self.c();
-            self.advance();
-            return x;
-        }
-        while let Some(c) = self.c() {
-            self.advance();
-            match c {
-                '\\' => return match self.c() {
-                            Some('#') => {self.advance(); Some('#')}
-                            _ => Some('\\')
-                        },
-                '#'  => loop {
-                            match self.c() {
-                                Some(c) => {
-                                    self.advance();
-                                    if c == '\n' {
-                                        break;
-                                    }
-                                },
-                                None => return None
-                            }
-                        },
-                _    => if !c.is_whitespace() {return Some(c);}
-            }
-        }
-        None
+        let x = self.c();
+        self.advance();
+        return x;
    }
 }

 impl<'a> Chars<'a> {
-    fn new(chars: &[char], ignore_space: bool) -> Chars {
+    fn new(chars: &[char]) -> Chars {
        Chars {
            chars: chars,
            cur: 0,
-            ignore_space: ignore_space,
        }
    }

@ -1221,6 +1240,13 @@ fn is_valid_capture_char(c: char) -> bool {
    || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
 }

+fn is_ascii_word(c: char) -> bool {
+    match c {
+        'a' ... 'z' | 'A' ... 'Z' | '_' | '0' ... '9' => true,
+        _ => false,
+    }
+}
+
 /// Returns true if the give character has significance in a regex.
 pub fn is_punct(c: char) -> bool {
    match c {
@ -2368,10 +2394,49 @@ mod tests {
    }

    #[test]
-    fn ignore_space_escape() {
-        assert_eq!(p(r"(?x)\ d"), Expr::Class(class(PERLD)));
-        assert_eq!(p(r"(?x)\
-                     D"), Expr::Class(class(PERLD).negate()));
+    fn ignore_space_escape_octal() {
+        assert_eq!(p(r"(?x)\12 3"), Expr::Concat(vec![
+            lit('\n'),
+            lit('3'),
+        ]));
+    }
+
+    #[test]
+    fn ignore_space_escape_hex() {
+        assert_eq!(p(r"(?x)\x { 53 }"), lit('S'));
+        assert_eq!(p(r"(?x)\x # comment
+{ # comment
+    53 # comment
+} # comment"), lit('S'));
+    }
+
+    #[test]
+    fn ignore_space_escape_hex2() {
+        assert_eq!(p(r"(?x)\x 53"), lit('S'));
+        assert_eq!(p(r"(?x)\x # comment
+        53 # comment"), lit('S'));
+    }
+
+    #[test]
+    fn ignore_space_escape_unicode_name() {
+        assert_eq!(p(r"(?x)\p # comment
+{ # comment
+    Yi # comment
+} # comment"), Expr::Class(class(YI)));
+    }
+
+    #[test]
+    fn ignore_space_repeat_counted() {
+        assert_eq!(p("(?x)a # comment
+{ # comment
+    5 # comment
+    , # comment
+    10 # comment
+}"), Expr::Repeat {
+            e: b(lit('a')),
+            r: Repeater::Range { min: 5, max: Some(10) },
+            greedy: true,
+        });
    }

    #[test]
@ -2424,6 +2489,14 @@ mod tests {
        ]));
    }

+    #[test]
+    fn ignore_space_escape_space() {
+        assert_eq!(p(r"(?x)a\  # hi there"), Expr::Concat(vec![
+            lit('a'),
+            lit(' '),
+        ]));
+    }
+
    // Test every single possible error case.

    macro_rules! test_err {
@ -2815,4 +2888,19 @@ mod tests {
        test_err!("(?P<a>.)(?P<a>.)", 14,
                  ErrorKind::DuplicateCaptureName("a".into()));
    }
+
+    #[test]
+    fn error_ignore_space_escape_hex() {
+        test_err!(r"(?x)\x{ 5 3 }", 10, ErrorKind::UnclosedHex);
+    }
+
+    #[test]
+    fn error_ignore_space_escape_hex2() {
+        test_err!(r"(?x)\x 5 3", 9, ErrorKind::InvalidBase16("5 ".into()));
+    }
+
+    #[test]
+    fn error_ignore_space_escape_unicode_name() {
+        test_err!(r"(?x)\p{Y i}", 9, ErrorKind::UnclosedUnicodeName);
+    }
 }