Support \u and \x escapes in pref files (bug 307438). r=cbiesinger, sr=darin

2024-10-08 19:04:45 +00:00 · 2005-09-29 18:09:06 +00:00 · 2005-09-29 18:09:06 +00:00 · 9e0319346f
commit 9e0319346f
parent 2fa023ccc5
2 changed files with 118 additions and 5 deletions
--- a/modules/libpref/src/prefread.cpp
+++ b/modules/libpref/src/prefread.cpp
@ -38,6 +38,8 @@
 #include <string.h>
 #include <ctype.h>
 #include "prefread.h"
+#include "nsString.h"
+#include "nsUTF8Utils.h"

 #ifdef TEST_PREFREAD
 #include <stdio.h>
@ -60,12 +62,18 @@ enum {
    PREF_PARSE_COMMENT_BLOCK,
    PREF_PARSE_COMMENT_BLOCK_MAYBE_END,
    PREF_PARSE_ESC_SEQUENCE,
+    PREF_PARSE_HEX_ESCAPE,
+    PREF_PARSE_UTF16_LOW_SURROGATE,
    PREF_PARSE_UNTIL_OPEN_PAREN,
    PREF_PARSE_UNTIL_CLOSE_PAREN,
    PREF_PARSE_UNTIL_SEMICOLON,
    PREF_PARSE_UNTIL_EOL
 };

+#define UTF16_ESC_NUM_DIGITS    4
+#define HEX_ESC_NUM_DIGITS      2
+#define BITS_PER_HEX_DIGIT      4
+
 static const char kUserPref[] = "user_pref";
 static const char kPref[] = "pref";
 static const char kTrue[] = "true";
@ -75,9 +83,10 @@ static const char kFalse[] = "false";
 * pref_GrowBuf
 * 
 * this function will increase the size of the buffer owned
- * by the given pref parse state.  the only requirement is
- * that it increase the buffer by at least one byte, but we
- * use a simple doubling algorithm.
+ * by the given pref parse state.  We currently use a simple
+ * doubling algorithm, but the only hard requirement is that
+ * it increase the buffer by at least the size of the ps->esctmp
+ * buffer used for escape processing (currently 6 bytes).
 * 
 * this buffer is used to store partial pref lines.  it is
 * freed when the parse state is destroyed.
@ -197,6 +206,7 @@ PREF_ParseBuf(PrefParseState *ps, const char *buf, int bufLen)
 {
    const char *end;
    char c;
+    char udigit;
    int state;

    state = ps->state;
@ -392,6 +402,7 @@ PREF_ParseBuf(PrefParseState *ps, const char *buf, int bufLen)
             * for us in the previous state */
            switch (c) {
            case '\"':
+            case '\'':
            case '\\':
                break;
            case 'r':
@ -400,10 +411,21 @@ PREF_ParseBuf(PrefParseState *ps, const char *buf, int bufLen)
            case 'n':
                c = '\n';
                break;
+            case 'x': /* hex escape -- always interpreted as Latin-1 */
+            case 'u': /* UTF16 escape */
+                ps->esctmp[0] = c;
+                ps->esclen = 1;
+                ps->utf16[0] = ps->utf16[1] = 0;
+                ps->sindex = (c == 'x' ) ?
+                                HEX_ESC_NUM_DIGITS :
+                                UTF16_ESC_NUM_DIGITS;
+                state = PREF_PARSE_HEX_ESCAPE;
+                continue;
            default:
                NS_WARNING("preserving unexpected JS escape sequence");
-                /* grow line buffer if necessary... */
-                if (ps->lbcur == ps->lbend && !pref_GrowBuf(ps))
+                /* Invalid escape sequence, so we have to write more than one
+                 * character here. Grow the line buffer if necessary... */
+                if ((ps->lbcur+1) == ps->lbend && !pref_GrowBuf(ps))
                    return PR_FALSE; /* out of memory */
                *ps->lbcur++ = '\\'; /* preserve the escape sequence */
                break;
@ -412,6 +434,93 @@ PREF_ParseBuf(PrefParseState *ps, const char *buf, int bufLen)
            state = PREF_PARSE_QUOTED_STRING;
            break;

+        /* parsing a hex (\xHH) or utf16 escape (\uHHHH) */
+        case PREF_PARSE_HEX_ESCAPE:
+            if ( c >= '0' && c <= '9' )
+                udigit = (c - '0');
+            else if ( c >= 'A' && c <= 'F' )
+                udigit = (c - 'A') + 10;
+            else if ( c >= 'a' && c <= 'f' )
+                udigit = (c - 'a') + 10;
+            else {
+                /* bad escape sequence found, write out broken escape as-is */
+                NS_WARNING("preserving invalid or incomplete hex escape");
+                *ps->lbcur++ = '\\';  /* original escape slash */
+                if ((ps->lbcur + ps->esclen) >= ps->lbend && !pref_GrowBuf(ps))
+                    return PR_FALSE;
+                for (int i = 0; i < ps->esclen; ++i)
+                    *ps->lbcur++ = ps->esctmp[i];
+
+                /* push the non-hex character back for re-parsing. */
+                /* (++buf at the top of the loop keeps this safe)  */
+                --buf;
+                state = PREF_PARSE_QUOTED_STRING;
+                continue;
+            }
+
+            /* have a digit */
+            ps->esctmp[ps->esclen++] = c; /* preserve it */
+            ps->utf16[1] <<= BITS_PER_HEX_DIGIT;
+            ps->utf16[1] |= udigit;
+            ps->sindex--;
+            if (ps->sindex == 0) {
+                /* have the full escape. Convert to UTF8 */
+                int utf16len = 0;
+                if (ps->utf16[0]) {
+                    /* already have a high surrogate, this is a two char seq */
+                    utf16len = 2;
+                }
+                else if (0xD800 == (0xFC00 & ps->utf16[1])) {
+                    /* a high surrogate, can't convert until we have the low */
+                    ps->utf16[0] = ps->utf16[1];
+                    ps->utf16[1] = 0;
+                    state = PREF_PARSE_UTF16_LOW_SURROGATE;
+                    break;
+                }
+                else {
+                    /* a single utf16 character */
+                    ps->utf16[0] = ps->utf16[1];
+                    utf16len = 1;
+                }
+
+                /* actual conversion */
+                /* make sure there's room, 6 bytes is max utf8 len (in */
+                /* theory; 4 bytes covers the actual utf16 range) */
+                if (ps->lbcur+6 >= ps->lbend && !pref_GrowBuf(ps))
+                    return PR_FALSE;
+
+                ConvertUTF16toUTF8 converter(ps->lbcur);
+                converter.write(ps->utf16, utf16len);
+                ps->lbcur += converter.Size();
+                state = PREF_PARSE_QUOTED_STRING;
+            }
+            break;
+
+        /* looking for beginning of utf16 low surrogate */
+        case PREF_PARSE_UTF16_LOW_SURROGATE:
+            if (ps->sindex == 0 && c == '\\') {
+                ++ps->sindex;
+            }
+            else if (ps->sindex == 1 && c == 'u') {
+                /* escape sequence is correct, now parse hex */
+                ps->sindex = UTF16_ESC_NUM_DIGITS;
+                ps->esctmp[0] = 'u';
+                ps->esclen = 1;
+                state = PREF_PARSE_HEX_ESCAPE;
+            }
+            else {
+                /* didn't find expected low surrogate. Ignore high surrogate
+                 * (it would just get converted to nothing anyway) and start
+                 * over with this character */
+                 --buf;
+                 if (ps->sindex == 1)
+                     state = PREF_PARSE_ESC_SEQUENCE;
+                 else
+                     state = PREF_PARSE_QUOTED_STRING;
+                 continue;
+            }
+            break;
+
        /* function open and close parsing */
        case PREF_PARSE_UNTIL_OPEN_PAREN:
            /* tolerate only whitespace and embedded comments */
--- a/modules/libpref/src/prefread.h
+++ b/modules/libpref/src/prefread.h
@ -72,6 +72,10 @@ typedef struct PrefParseState {
    int         nextstate;  /* sometimes used...             */
    const char *smatch;     /* string to match               */
    int         sindex;     /* next char of smatch to check  */
+                            /* also counts \x/\u hex digits  */
+    PRUnichar   utf16[2];   /* parsing UTF16  (\u) escape    */
+    int         esclen;     /* length in esctmp              */
+    char        esctmp[6];  /* raw escape to put back if err */
    char        quotechar;  /* char delimiter for quotations */
    char       *lb;         /* line buffer (only allocation) */
    char       *lbcur;      /* line buffer cursor            */