pjs/modules/libpref/src/prefread.cpp

/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is Mozilla.
 *
 * The Initial Developer of the Original Code is Darin Fisher.
 * Portions created by the Initial Developer are Copyright (C) 2003
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *   Darin Fisher <darin@meer.net>
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */

#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include "prefread.h"
#include "nsString.h"
#include "nsUTF8Utils.h"

/* When built as the standalone test program (TEST_PREFREAD defined) the
 * XPCOM debug header is not pulled in, so the diagnostics macros used by
 * this file are mapped onto printf instead.  NS_NOTREACHED simply forwards
 * to the same printf-based NS_WARNING. */
#ifdef TEST_PREFREAD
#include <stdio.h>
#define NS_WARNING(_s) printf(">>> " _s "!\n")
#define NS_NOTREACHED(_s) NS_WARNING(_s)
#else
#include "nsDebug.h" // for NS_WARNING
#endif

/* pref parser states */
enum {
    PREF_PARSE_INIT,
    PREF_PARSE_MATCH_STRING,
    PREF_PARSE_UNTIL_NAME,
    PREF_PARSE_QUOTED_STRING,
    PREF_PARSE_UNTIL_COMMA,
    PREF_PARSE_UNTIL_VALUE,
    PREF_PARSE_INT_VALUE,
    PREF_PARSE_COMMENT_MAYBE_START,
    PREF_PARSE_COMMENT_BLOCK,
    PREF_PARSE_COMMENT_BLOCK_MAYBE_END,
    PREF_PARSE_ESC_SEQUENCE,
    PREF_PARSE_HEX_ESCAPE,
    PREF_PARSE_UTF16_LOW_SURROGATE,
    PREF_PARSE_UNTIL_OPEN_PAREN,
    PREF_PARSE_UNTIL_CLOSE_PAREN,
    PREF_PARSE_UNTIL_SEMICOLON,
    PREF_PARSE_UNTIL_EOL
};

#define UTF16_ESC_NUM_DIGITS    4
#define HEX_ESC_NUM_DIGITS      2
#define BITS_PER_HEX_DIGIT      4

static const char kUserPref[] = "user_pref";
static const char kPref[] = "pref";
static const char kTrue[] = "true";
static const char kFalse[] = "false";

/**
 * pref_GrowBuf
 *
 * this function will increase the size of the buffer owned
 * by the given pref parse state.  We currently use a simple
 * doubling algorithm, but the only hard requirement is that
 * it increase the buffer by at least the size of the ps->esctmp
 * buffer used for escape processing (currently 6 bytes).
 *
 * this buffer is used to store partial pref lines.  it is
 * freed when the parse state is destroyed.
 *
 * @param ps
 *        parse state instance
 *
 * this function updates all pointers that reference an
 * address within lb since realloc may relocate the buffer.
 * ps->vb is only relocated when it actually points into the
 * line buffer; a NULL vb (no value seen yet) is left alone.
 *
 * @return PR_FALSE if insufficient memory.  In that case the
 *         parse state is left untouched: the old buffer is still
 *         owned by ps and is released by PREF_FinalizeParseState.
 */
static PRBool
pref_GrowBuf(PrefParseState *ps)
{
    int bufLen, curPos, valPos;
    char *newBuf;
    PRBool vbInBuf;

    bufLen = ps->lbend - ps->lb;
    curPos = ps->lbcur - ps->lb;

    /* only compute an offset for vb when it refers to the line buffer;
     * subtracting from a NULL vb would be undefined pointer arithmetic */
    vbInBuf = (ps->lb && ps->vb &&
               ps->vb >= ps->lb && ps->vb <= ps->lbend);
    valPos = vbInBuf ? (int) (ps->vb - ps->lb) : 0;

    if (bufLen == 0)
        bufLen = 128;  /* default buffer size */
    else
        bufLen <<= 1;  /* double buffer size */

#ifdef TEST_PREFREAD
    fprintf(stderr, ">>> realloc(%d)\n", bufLen);
#endif

    /* realloc into a temporary: assigning straight to ps->lb would lose
     * (and leak) the old block when the allocation fails */
    newBuf = (char*) realloc(ps->lb, bufLen);
    if (!newBuf)
        return PR_FALSE;

    ps->lb    = newBuf;
    ps->lbcur = newBuf + curPos;
    ps->lbend = newBuf + bufLen;
    if (vbInBuf)
        ps->vb = newBuf + valPos;

    return PR_TRUE;
}

/**
 * pref_DoCallback
 *
 * this function is called when a complete pref name-value pair has
 * been extracted from the input data.  It converts the raw value text
 * referenced by ps->vb according to ps->vtype and hands the pref name
 * (the start of the line buffer, ps->lb) and the value to ps->reader.
 *
 * @param ps
 *        parse state instance
 *
 * @return PR_FALSE to indicate a fatal error (currently only an integer
 *         value that consists of a bare sign character).
 */
static PRBool
pref_DoCallback(PrefParseState *ps)
{
    PrefValue  value;

    /* the union is passed to the reader by value; give it a defined
     * content even if vtype is not one of the handled types */
    memset(&value, 0, sizeof(value));

    switch (ps->vtype) {
    case PREF_STRING:
        value.stringVal = ps->vb;
        break;
    case PREF_INT:
        /* the parser accepts a leading sign, so "+" or "-" on its own
         * can reach us; reject it rather than silently reading 0 */
        if ((ps->vb[0] == '-' || ps->vb[0] == '+') && ps->vb[1] == '\0') {
            NS_WARNING("malformed integer value");
            return PR_FALSE;
        }
        value.intVal = atoi(ps->vb);
        break;
    case PREF_BOOL:
        /* address comparison is intentional: for booleans the parser
         * points vb at the kTrue or kFalse table itself */
        value.boolVal = (ps->vb == kTrue);
        break;
    default:
        break;
    }
    (*ps->reader)(ps->closure, ps->lb, value, ps->vtype, ps->fdefault);
    return PR_TRUE;
}

/**
 * PREF_InitParseState
 *
 * Prepares a parse state for first use.  Every field is zeroed (so the
 * parser starts in state 0 with no line buffer allocated) and the reader
 * callback plus its opaque closure argument are recorded.
 *
 * @param ps
 *        parse state instance to initialize
 * @param reader
 *        callback invoked for each parsed pref
 * @param closure
 *        passed through unchanged as the reader's first argument
 */
void
PREF_InitParseState(PrefParseState *ps, PrefReader reader, void *closure)
{
    memset(ps, 0, sizeof(PrefParseState));

    ps->closure = closure;
    ps->reader  = reader;
}

/**
 * PREF_FinalizeParseState
 *
 * Releases the line buffer owned by the parse state.  The state itself
 * is not freed and its fields are not reset.
 *
 * @param ps
 *        parse state instance
 */
void
PREF_FinalizeParseState(PrefParseState *ps)
{
    /* free() is a no-op on NULL, so a parser that never had to allocate
     * a line buffer needs no special casing here */
    free(ps->lb);
}

/**
 * PREF_ParseBuf
 *
 * Runs bufLen bytes starting at buf through the pref-file state machine.
 * The current state is loaded from ps->state on entry and written back on
 * success, so a file may be fed in arbitrary chunks over several calls.
 * The reader callback is invoked (via pref_DoCallback) each time the ';'
 * terminating a complete pref statement is consumed.
 *
 * @param ps
 *        parse state instance
 * @param buf
 *        input bytes; need not be null-terminated
 * @param bufLen
 *        number of bytes available at buf
 *
 * @return PR_FALSE if the input is malformed, the line buffer cannot be
 *         grown, or pref_DoCallback reports a fatal error.  ps->state is
 *         not updated on failure.
 *
 * Pseudo-BNF
 * ----------
 * function      = LJUNK function-name JUNK function-args
 * function-name = "user_pref" | "pref"
 * function-args = "(" JUNK pref-name JUNK "," JUNK pref-value JUNK ")" JUNK ";"
 * pref-name     = quoted-string
 * pref-value    = quoted-string | "true" | "false" | integer-value
 * JUNK          = *(WS | comment-block | comment-line)
 * LJUNK         = *(WS | comment-block | comment-line | bcomment-line)
 * WS            = SP | HT | LF | VT | FF | CR
 * SP            = <US-ASCII SP, space (32)>
 * HT            = <US-ASCII HT, horizontal-tab (9)>
 * LF            = <US-ASCII LF, linefeed (10)>
 * VT            = <US-ASCII VT, vertical-tab (11)>
 * FF            = <US-ASCII FF, form-feed (12)>
 * CR            = <US-ASCII CR, carriage return (13)>
 * comment-block = <C/C++ style comment block>
 * comment-line  = <C++ style comment line>
 * bcomment-line = <bourne-shell style comment line>
 */
PRBool
PREF_ParseBuf(PrefParseState *ps, const char *buf, int bufLen)
{
    const char *end;
    char c;
    char udigit;
    int state;

    /* NOTE: every <ctype.h> call below casts c to unsigned char.  Plain
     * char may be signed, and passing a negative value (any byte >= 0x80)
     * to isspace/isdigit is undefined behavior. */

    state = ps->state;
    for (end = buf + bufLen; buf != end; ++buf) {
        c = *buf;
        switch (state) {
        /* initial state */
        case PREF_PARSE_INIT:
            if (ps->lbcur != ps->lb) { /* reset state */
                ps->lbcur = ps->lb;
                ps->vb    = NULL;
                ps->vtype = PREF_INVALID;
                ps->fdefault = PR_FALSE;
            }
            switch (c) {
            case '/':       /* begin comment block or line? */
                state = PREF_PARSE_COMMENT_MAYBE_START;
                break;
            case '#':       /* accept shell style comments */
                state = PREF_PARSE_UNTIL_EOL;
                break;
            case 'u':       /* indicating user_pref */
            case 'p':       /* indicating pref */
                ps->smatch = (c == 'u' ? kUserPref : kPref);
                ps->sindex = 1;
                ps->nextstate = PREF_PARSE_UNTIL_OPEN_PAREN;
                state = PREF_PARSE_MATCH_STRING;
                break;
            /* else skip char */
            }
            break;

        /* string matching */
        case PREF_PARSE_MATCH_STRING:
            if (c == ps->smatch[ps->sindex++]) {
                /* if we've matched all characters, then move to next state. */
                if (ps->smatch[ps->sindex] == '\0') {
                    state = ps->nextstate;
                    ps->nextstate = PREF_PARSE_INIT; /* reset next state */
                }
                /* else wait for next char */
            }
            else {
                NS_WARNING("malformed pref file");
                return PR_FALSE;
            }
            break;

        /* quoted string parsing */
        case PREF_PARSE_QUOTED_STRING:
            /* we assume that the initial quote has already been consumed */
            if (ps->lbcur == ps->lbend && !pref_GrowBuf(ps))
                return PR_FALSE; /* out of memory */
            if (c == '\\')
                state = PREF_PARSE_ESC_SEQUENCE;
            else if (c == ps->quotechar) {
                *ps->lbcur++ = '\0';
                state = ps->nextstate;
                ps->nextstate = PREF_PARSE_INIT; /* reset next state */
            }
            else
                *ps->lbcur++ = c;
            break;

        /* name parsing */
        case PREF_PARSE_UNTIL_NAME:
            if (c == '\"' || c == '\'') {
                /* smatch still points at the keyword table that matched,
                 * so an address compare tells pref() from user_pref() */
                ps->fdefault = (ps->smatch == kPref);
                ps->quotechar = c;
                ps->nextstate = PREF_PARSE_UNTIL_COMMA; /* return here when done */
                state = PREF_PARSE_QUOTED_STRING;
            }
            else if (c == '/') {       /* allow embedded comment */
                ps->nextstate = state; /* return here when done with comment */
                state = PREF_PARSE_COMMENT_MAYBE_START;
            }
            else if (!isspace((unsigned char) c)) {
                NS_WARNING("malformed pref file");
                return PR_FALSE;
            }
            break;

        /* parse until we find a comma separating name and value */
        case PREF_PARSE_UNTIL_COMMA:
            if (c == ',') {
                /* the value text is stored right after the name's
                 * terminator in the line buffer */
                ps->vb = ps->lbcur;
                state = PREF_PARSE_UNTIL_VALUE;
            }
            else if (c == '/') {       /* allow embedded comment */
                ps->nextstate = state; /* return here when done with comment */
                state = PREF_PARSE_COMMENT_MAYBE_START;
            }
            else if (!isspace((unsigned char) c)) {
                NS_WARNING("malformed pref file");
                return PR_FALSE;
            }
            break;

        /* value parsing */
        case PREF_PARSE_UNTIL_VALUE:
            /* the pref value type is unknown.  so, we scan for the first
             * character of the value, and determine the type from that. */
            if (c == '\"' || c == '\'') {
                ps->vtype = PREF_STRING;
                ps->quotechar = c;
                ps->nextstate = PREF_PARSE_UNTIL_CLOSE_PAREN;
                state = PREF_PARSE_QUOTED_STRING;
            }
            else if (c == 't' || c == 'f') {
                ps->vb = (char *) (c == 't' ? kTrue : kFalse);
                ps->vtype = PREF_BOOL;
                ps->smatch = ps->vb;
                ps->sindex = 1;
                ps->nextstate = PREF_PARSE_UNTIL_CLOSE_PAREN;
                state = PREF_PARSE_MATCH_STRING;
            }
            else if (isdigit((unsigned char) c) || (c == '-') || (c == '+')) {
                ps->vtype = PREF_INT;
                /* write c to line buffer... */
                if (ps->lbcur == ps->lbend && !pref_GrowBuf(ps))
                    return PR_FALSE; /* out of memory */
                *ps->lbcur++ = c;
                state = PREF_PARSE_INT_VALUE;
            }
            else if (c == '/') {       /* allow embedded comment */
                ps->nextstate = state; /* return here when done with comment */
                state = PREF_PARSE_COMMENT_MAYBE_START;
            }
            else if (!isspace((unsigned char) c)) {
                NS_WARNING("malformed pref file");
                return PR_FALSE;
            }
            break;
        case PREF_PARSE_INT_VALUE:
            /* grow line buffer if necessary... */
            if (ps->lbcur == ps->lbend && !pref_GrowBuf(ps))
                return PR_FALSE; /* out of memory */
            if (isdigit((unsigned char) c))
                *ps->lbcur++ = c;
            else {
                *ps->lbcur++ = '\0'; /* stomp null terminator; we are done. */
                if (c == ')')
                    state = PREF_PARSE_UNTIL_SEMICOLON;
                else if (c == '/') { /* allow embedded comment */
                    ps->nextstate = PREF_PARSE_UNTIL_CLOSE_PAREN;
                    state = PREF_PARSE_COMMENT_MAYBE_START;
                }
                else if (isspace((unsigned char) c))
                    state = PREF_PARSE_UNTIL_CLOSE_PAREN;
                else {
                    NS_WARNING("malformed pref file");
                    return PR_FALSE;
                }
            }
            break;

        /* comment parsing */
        case PREF_PARSE_COMMENT_MAYBE_START:
            switch (c) {
            case '*': /* comment block */
                state = PREF_PARSE_COMMENT_BLOCK;
                break;
            case '/': /* comment line */
                state = PREF_PARSE_UNTIL_EOL;
                break;
            default:
                /* pref file is malformed */
                NS_WARNING("malformed pref file");
                return PR_FALSE;
            }
            break;
        case PREF_PARSE_COMMENT_BLOCK:
            if (c == '*')
                state = PREF_PARSE_COMMENT_BLOCK_MAYBE_END;
            break;
        case PREF_PARSE_COMMENT_BLOCK_MAYBE_END:
            switch (c) {
            case '/':
                state = ps->nextstate;
                ps->nextstate = PREF_PARSE_INIT;
                break;
            case '*':       /* stay in this state */
                break;
            default:
                state = PREF_PARSE_COMMENT_BLOCK;
            }
            break;

        /* string escape sequence parsing */
        case PREF_PARSE_ESC_SEQUENCE:
            /* not necessary to resize buffer here since we should be writing
             * only one character and the resize check would have been done
             * for us in the previous state */
            switch (c) {
            case '\"':
            case '\'':
            case '\\':
                break;
            case 'r':
                c = '\r';
                break;
            case 'n':
                c = '\n';
                break;
            case 'x': /* hex escape -- always interpreted as Latin-1 */
            case 'u': /* UTF16 escape */
                ps->esctmp[0] = c;
                ps->esclen = 1;
                ps->utf16[0] = ps->utf16[1] = 0;
                ps->sindex = (c == 'x' ) ?
                                HEX_ESC_NUM_DIGITS :
                                UTF16_ESC_NUM_DIGITS;
                state = PREF_PARSE_HEX_ESCAPE;
                continue;
            default:
                NS_WARNING("preserving unexpected JS escape sequence");
                /* Invalid escape sequence so we do have to write more than
                 * one character. Grow line buffer if necessary... */
                if ((ps->lbcur+1) == ps->lbend && !pref_GrowBuf(ps))
                    return PR_FALSE; /* out of memory */
                *ps->lbcur++ = '\\'; /* preserve the escape sequence */
                break;
            }
            *ps->lbcur++ = c;
            state = PREF_PARSE_QUOTED_STRING;
            break;

        /* parsing a hex (\xHH) or utf16 escape (\uHHHH) */
        case PREF_PARSE_HEX_ESCAPE:
            if ( c >= '0' && c <= '9' )
                udigit = (c - '0');
            else if ( c >= 'A' && c <= 'F' )
                udigit = (c - 'A') + 10;
            else if ( c >= 'a' && c <= 'f' )
                udigit = (c - 'a') + 10;
            else {
                /* bad escape sequence found, write out broken escape as-is */
                NS_WARNING("preserving invalid or incomplete hex escape");
                /* room for this one byte was reserved by the capacity check
                 * in PREF_PARSE_QUOTED_STRING before the escape began */
                *ps->lbcur++ = '\\';  /* original escape slash */
                if ((ps->lbcur + ps->esclen) >= ps->lbend && !pref_GrowBuf(ps))
                    return PR_FALSE;
                for (int i = 0; i < ps->esclen; ++i)
                    *ps->lbcur++ = ps->esctmp[i];

                /* push the non-hex character back for re-parsing. */
                /* (++buf at the top of the loop keeps this safe)  */
                --buf;
                state = PREF_PARSE_QUOTED_STRING;
                continue;
            }

            /* have a digit */
            ps->esctmp[ps->esclen++] = c; /* preserve it */
            ps->utf16[1] <<= BITS_PER_HEX_DIGIT;
            ps->utf16[1] |= udigit;
            ps->sindex--;
            if (ps->sindex == 0) {
                /* have the full escape. Convert to UTF8 */
                int utf16len = 0;
                if (ps->utf16[0]) {
                    /* already have a high surrogate, this is a two char seq */
                    utf16len = 2;
                }
                else if (0xD800 == (0xFC00 & ps->utf16[1])) {
                    /* a high surrogate, can't convert until we have the low */
                    ps->utf16[0] = ps->utf16[1];
                    ps->utf16[1] = 0;
                    state = PREF_PARSE_UTF16_LOW_SURROGATE;
                    break;
                }
                else {
                    /* a single utf16 character */
                    ps->utf16[0] = ps->utf16[1];
                    utf16len = 1;
                }

                /* actual conversion */
                /* make sure there's room, 6 bytes is max utf8 len (in */
                /* theory; 4 bytes covers the actual utf16 range) */
                if (ps->lbcur+6 >= ps->lbend && !pref_GrowBuf(ps))
                    return PR_FALSE;

                ConvertUTF16toUTF8 converter(ps->lbcur);
                converter.write(ps->utf16, utf16len);
                ps->lbcur += converter.Size();
                state = PREF_PARSE_QUOTED_STRING;
            }
            break;

        /* looking for beginning of utf16 low surrogate */
        case PREF_PARSE_UTF16_LOW_SURROGATE:
            /* sindex is 0 on entry (the preceding escape consumed all its
             * digits) and is reused here to count the "\u" prefix */
            if (ps->sindex == 0 && c == '\\') {
                ++ps->sindex;
            }
            else if (ps->sindex == 1 && c == 'u') {
                /* escape sequence is correct, now parse hex */
                ps->sindex = UTF16_ESC_NUM_DIGITS;
                ps->esctmp[0] = 'u';
                ps->esclen = 1;
                state = PREF_PARSE_HEX_ESCAPE;
            }
            else {
                /* didn't find expected low surrogate. Ignore high surrogate
                 * (it would just get converted to nothing anyway) and start
                 * over with this character */
                 --buf;
                 if (ps->sindex == 1)
                     state = PREF_PARSE_ESC_SEQUENCE;
                 else
                     state = PREF_PARSE_QUOTED_STRING;
                 continue;
            }
            break;

        /* function open and close parsing */
        case PREF_PARSE_UNTIL_OPEN_PAREN:
            /* tolerate only whitespace and embedded comments */
            if (c == '(')
                state = PREF_PARSE_UNTIL_NAME;
            else if (c == '/') {
                ps->nextstate = state; /* return here when done with comment */
                state = PREF_PARSE_COMMENT_MAYBE_START;
            }
            else if (!isspace((unsigned char) c)) {
                NS_WARNING("malformed pref file");
                return PR_FALSE;
            }
            break;
        case PREF_PARSE_UNTIL_CLOSE_PAREN:
            /* tolerate only whitespace and embedded comments  */
            if (c == ')')
                state = PREF_PARSE_UNTIL_SEMICOLON;
            else if (c == '/') {
                ps->nextstate = state; /* return here when done with comment */
                state = PREF_PARSE_COMMENT_MAYBE_START;
            }
            else if (!isspace((unsigned char) c)) {
                NS_WARNING("malformed pref file");
                return PR_FALSE;
            }
            break;

        /* function terminator ';' parsing */
        case PREF_PARSE_UNTIL_SEMICOLON:
            /* tolerate only whitespace and embedded comments */
            if (c == ';') {
                if (!pref_DoCallback(ps))
                    return PR_FALSE;
                state = PREF_PARSE_INIT;
            }
            else if (c == '/') {
                ps->nextstate = state; /* return here when done with comment */
                state = PREF_PARSE_COMMENT_MAYBE_START;
            }
            else if (!isspace((unsigned char) c)) {
                NS_WARNING("malformed pref file");
                return PR_FALSE;
            }
            break;

        /* eol parsing */
        case PREF_PARSE_UNTIL_EOL:
            /* need to handle mac, unix, or dos line endings.
             * PREF_PARSE_INIT will eat the next \n in case
             * we have \r\n. */
            if (c == '\r' || c == '\n' || c == 0x1A) {
                state = ps->nextstate;
                ps->nextstate = PREF_PARSE_INIT; /* reset next state */
            }
            break;
        }
    }
    ps->state = state;
    return PR_TRUE;
}

#ifdef TEST_PREFREAD

/**
 * pref_reader
 *
 * Reader callback for the standalone test program: echoes each parsed
 * pref to stdout in pref-file syntax.  For a type other than string, int
 * or bool only the leading part of the statement is printed.
 *
 * @param closure
 *        unused
 * @param pref
 *        pref name
 * @param val
 *        pref value; the member to read is selected by type
 * @param type
 *        type of the value
 * @param defPref
 *        true for pref(), false for user_pref()
 */
static void
pref_reader(void       *closure,
            const char *pref,
            PrefValue   val,
            PrefType    type,
            PRBool      defPref)
{
    const char *prefix = defPref ? "" : "user_";

    printf("%spref(\"%s\", ", prefix, pref);

    if (type == PREF_STRING) {
        printf("\"%s\");\n", val.stringVal);
    }
    else if (type == PREF_INT) {
        printf("%i);\n", val.intVal);
    }
    else if (type == PREF_BOOL) {
        const char *text = (val.boolVal == PR_FALSE) ? "false" : "true";
        printf("%s);\n", text);
    }
}

/**
 * Test driver: parses the pref file named by argv[1] in 4096-byte chunks
 * and echoes every pref through pref_reader.
 *
 * @return 0 on success, -1 on a usage error, if the file cannot be opened
 *         or read, or if the parser rejects the input.
 */
int
main(int argc, char **argv)
{
    PrefParseState ps;
    char buf[4096];     /* i/o buffer */
    FILE *fp;
    size_t n;           /* fread returns size_t, not int */
    int rv = 0;

    /* argc < 2 also covers an empty argument vector */
    if (argc < 2) {
        printf("usage: prefread file.js\n");
        return -1;
    }

    fp = fopen(argv[1], "r");
    if (!fp) {
        printf("failed to open file\n");
        return -1;
    }

    PREF_InitParseState(&ps, pref_reader, NULL);

    while ((n = fread(buf, 1, sizeof(buf), fp)) > 0) {
        /* stop at the first failure: PREF_ParseBuf does not store its
         * state back on error, so feeding it further chunks would resume
         * from a stale state */
        if (!PREF_ParseBuf(&ps, buf, (int) n)) {
            printf("failed to parse file\n");
            rv = -1;
            break;
        }
    }

    if (rv == 0 && ferror(fp)) {
        printf("failed to read file\n");
        rv = -1;
    }

    PREF_FinalizeParseState(&ps);

    fclose(fp);
    return rv;
}

#endif /* TEST_PREFREAD */