Files
wmic/Samba/source/lib/appweb/ejs-2.0/exml/exmlParser.c
T

753 lines
16 KiB
C
Raw Normal View History

2019-02-16 00:16:52 +01:00
/*
* exml.c -- A simple SAX style XML parser
*/
/********************************* Description ********************************/
/*
* This is a recursive descent parser for XML text files. It is a one-pass
* simple parser that invokes a user supplied callback for key tokens in the
* XML file. The user supplies a read function so that XML files can be parsed
* from disk or in-memory.
*/
/********************************** Includes **********************************/
#include "exml.h"
/****************************** Forward Declarations **************************/
/* MOB -- FIX */
#if BLD_FEATURE_EXML || 1
/* Recursive parse driver; called with the parser state to start in */
static int parseNext(Exml *xp, int state);
/* Lexer: returns the next token, leaving any token text in xp->tokBuf */
static ExmlToken getToken(Exml *xp, int state);
/* Returns the next input character, refilling xp->inBuf through readFn */
static int getNextChar(Exml *xp);
/* Consumes input up to and including the pattern str */
static int scanFor(Exml *xp, char *str);
/* Pushes one look-ahead character back onto xp->inBuf */
static int putLastChar(Exml *xp, int c);
/* Formats a parse error message into xp->errMsg */
static void error(Exml *xp, char *fmt, ...);
/* Strips trailing white space from xp->tokBuf and null terminates it */
static void trimToken(Exml *xp);
/************************************ Code ************************************/
/*
 * Create a parser instance.
 *
 * ctx          Memory context passed to the MPR allocator for the parser.
 * initialSize  Initial size of the token buffer.
 * maxSize      Maximum size the token buffer may grow to.
 *
 * Returns the new parser, or 0 if the parser or either of its buffers could
 * not be allocated. The input buffer is always created with EXML_BUFSIZE for
 * both its initial and maximum size.
 */
Exml *exmlOpen(MprCtx ctx, int initialSize, int maxSize)
{
    Exml    *xp;

    xp = mprAllocTypeZeroed(ctx, Exml);
    if (xp == 0) {
        return 0;
    }

    xp->inBuf = mprCreateBuf(xp, EXML_BUFSIZE, EXML_BUFSIZE);
    xp->tokBuf = mprCreateBuf(xp, initialSize, maxSize);
    if (xp->inBuf == 0 || xp->tokBuf == 0) {
        /*
         * Both buffers are created with xp as their context, so freeing xp
         * is presumed to release whichever one was created -- same
         * assumption exmlClose already makes.
         */
        mprFree(xp);
        return 0;
    }
    return xp;
}
/******************************************************************************/
/*
 * Dispose of a parser obtained from exmlOpen. The parser object is handed to
 * mprFree; nothing else is released explicitly here.
 */
void exmlClose(Exml *xp)
{
    mprAssert(xp);

    mprFree(xp);
}
/******************************************************************************/
/*
 * Register the callback that parseNext invokes for each parse event
 * (new element, attribute, element data, comment, CDATA, end of element).
 */
void exmlSetParserHandler(Exml *xp, ExmlHandler h)
{
    mprAssert(xp);

    xp->handler = h;
}
/******************************************************************************/
/*
 * Define where the parser reads its input from.
 *
 * s    Read function called by getNextChar whenever the input buffer is empty.
 * arg  Opaque value passed back to s on every call.
 */
void exmlSetInputStream(Exml *xp, ExmlInputStream s, void *arg)
{
    mprAssert(xp);

    xp->inputArg = arg;
    xp->readFn = s;
}
/******************************************************************************/
/*
 * Store a caller-defined argument on the parser. It can be read back with
 * exmlGetParseArg.
 */
void exmlSetParseArg(Exml *xp, void *parseArg)
{
    mprAssert(xp);

    xp->parseArg = parseArg;
}
/******************************************************************************/
/*
 * Get the parse arg previously stored with exmlSetParseArg
 */
void *exmlGetParseArg(Exml *xp)
{
    void    *arg;

    mprAssert(xp);

    arg = xp->parseArg;
    return arg;
}
/******************************************************************************/
/*
 * Parse an XML file. Return 0 for success, -1 for error.
 *
 * The whole document is consumed by a single parseNext call started in the
 * EXML_BEGIN state; its result is returned unchanged.
 */
int exmlParse(Exml *xp)
{
    int     rc;

    mprAssert(xp);

    rc = parseNext(xp, EXML_BEGIN);
    return rc;
}
/******************************************************************************/
/*
 * XML parser. This is a recursive descent parser: every construct that
 * follows a "<" is handled by a nested call started in EXML_AFTER_LS, which
 * runs until that construct (comment, CDATA section, processing instruction,
 * solo element or element with its closing tag) has been consumed.
 *
 * Returns a negative value on error and 1 once a nested construct has been
 * fully consumed. At the top level (EXML_BEGIN) end of input returns whatever
 * rc holds at that point: 0 if nothing was parsed, otherwise the result of
 * the last nested call.
 *
 * The element name (tname) and the most recent attribute name (aname) are
 * private copies owned by this invocation. They are released on every way
 * out of the function, so the handler must not keep the pointers it is given.
 */
static int parseNext(Exml *xp, int state)
{
    ExmlHandler     handler;
    ExmlToken       token;
    MprBuf          *tokBuf;
    char            *tname, *aname;
    int             rc;

    mprAssert(state >= 0);

    tokBuf = xp->tokBuf;
    handler = xp->handler;
    tname = aname = 0;
    rc = 0;

    /*
     * In this parse loop, the state is never assigned EOF or ERR. In
     * such cases we always return EOF or ERR.
     */
    while (1) {

        token = getToken(xp, state);

        if (token == TOKEN_TOO_BIG) {
            error(xp, "XML token is too big");
            goto err;
        }

        switch (state) {
        case EXML_BEGIN:        /* ------------------------------------------ */
            /*
             * Expect to get an element, comment or processing instruction
             */
            switch (token) {
            case TOKEN_EOF:
                goto exit;

            case TOKEN_LS:
                /*
                 * Recurse to handle the new element, comment etc.
                 */
                rc = parseNext(xp, EXML_AFTER_LS);
                if (rc < 0) {
                    goto exit;
                }
                break;

            default:
                error(xp, "Syntax error");
                goto err;
            }
            break;

        case EXML_AFTER_LS:     /* ------------------------------------------ */
            switch (token) {
            case TOKEN_COMMENT:
                state = EXML_COMMENT;
                rc = (*handler)(xp, state, "!--", 0, mprGetBufStart(tokBuf));
                if (rc < 0) {
                    goto err;
                }
                rc = 1;
                goto exit;

            case TOKEN_CDATA:
                /*
                 * CDATA is reported with the same "!--" tag name as a
                 * comment; only the state passed to the handler differs.
                 */
                state = EXML_CDATA;
                rc = (*handler)(xp, state, "!--", 0, mprGetBufStart(tokBuf));
                if (rc < 0) {
                    goto err;
                }
                rc = 1;
                goto exit;

            case TOKEN_INSTRUCTIONS:
                /* Just ignore processing instructions */
                rc = 1;
                goto exit;

            case TOKEN_TEXT:
                state = EXML_NEW_ELT;
                tname = mprStrdup(xp, mprGetBufStart(tokBuf));
                if (tname == 0) {
                    rc = MPR_ERR_MEMORY;
                    goto exit;
                }
                rc = (*handler)(xp, state, tname, 0, 0);
                if (rc < 0) {
                    goto err;
                }
                break;

            default:
                error(xp, "Syntax error");
                goto err;
            }
            break;

        case EXML_NEW_ELT:      /* ------------------------------------------ */
            /*
             * We have seen the opening "<element" for a new element and have
             * not yet seen the terminating ">" of the opening element.
             */
            switch (token) {
            case TOKEN_TEXT:
                /*
                 * Must be an attribute name. Release the name of the previous
                 * attribute first so an element with many attributes does not
                 * accumulate one allocation per attribute.
                 */
                mprFree(aname);
                aname = mprStrdup(xp, mprGetBufStart(tokBuf));
                if (aname == 0) {
                    rc = MPR_ERR_MEMORY;
                    goto exit;
                }
                token = getToken(xp, state);
                if (token != TOKEN_EQ) {
                    error(xp, "Missing assignment for attribute \"%s\"", aname);
                    goto err;
                }

                token = getToken(xp, state);
                if (token != TOKEN_TEXT) {
                    error(xp, "Missing value for attribute \"%s\"", aname);
                    goto err;
                }
                state = EXML_NEW_ATT;
                rc = (*handler)(xp, state, tname, aname,
                    mprGetBufStart(tokBuf));
                if (rc < 0) {
                    goto err;
                }
                state = EXML_NEW_ELT;
                break;

            case TOKEN_GR:
                /*
                 * This is ">" the termination of the opening element
                 */
                if (*tname == '\0') {
                    error(xp, "Missing element name");
                    goto err;
                }

                /*
                 * Tell the user that the opening element is now complete
                 */
                state = EXML_ELT_DEFINED;
                rc = (*handler)(xp, state, tname, 0, 0);
                if (rc < 0) {
                    goto err;
                }
                state = EXML_ELT_DATA;
                break;

            case TOKEN_SLASH_GR:
                /*
                 * If we see a "/>" then this is a solo element
                 */
                if (*tname == '\0') {
                    error(xp, "Missing element name");
                    goto err;
                }
                state = EXML_SOLO_ELT_DEFINED;
                rc = (*handler)(xp, state, tname, 0, 0);
                if (rc < 0) {
                    goto err;
                }
                rc = 1;
                goto exit;

            default:
                error(xp, "Syntax error");
                goto err;
            }
            break;

        case EXML_ELT_DATA:     /* -------------------------------------- */
            /*
             * We have seen the full opening element "<name ...>" and now
             * await data or another element.
             */
            if (token == TOKEN_LS) {
                /*
                 * Recurse to handle the new element, comment etc.
                 */
                rc = parseNext(xp, EXML_AFTER_LS);
                if (rc < 0) {
                    goto exit;
                }
                break;

            } else if (token == TOKEN_LS_SLASH) {
                state = EXML_END_ELT;
                break;

            } else if (token != TOKEN_TEXT) {
                /*
                 * Includes end of input inside an element. Report it like
                 * the other states do rather than failing silently.
                 */
                error(xp, "Syntax error");
                goto err;
            }
            if (mprGetBufLength(tokBuf) > 0) {
                /*
                 * Pass the data between the element to the user
                 */
                rc = (*handler)(xp, state, tname, 0, mprGetBufStart(tokBuf));
                if (rc < 0) {
                    goto err;
                }
            }
            break;

        case EXML_END_ELT:      /* -------------------------------------- */
            if (token != TOKEN_TEXT) {
                error(xp, "Missing closing element name for \"%s\"", tname);
                goto err;
            }
            /*
             * The closing element name must match the opening element name
             */
            if (strcmp(tname, mprGetBufStart(tokBuf)) != 0) {
                error(xp,
                    "Closing element name \"%s\" does not match on line %d "
                    "opening name \"%s\"",
                    mprGetBufStart(tokBuf), xp->lineNumber, tname);
                goto err;
            }
            rc = (*handler)(xp, state, tname, 0, 0);
            if (rc < 0) {
                goto err;
            }
            if (getToken(xp, state) != TOKEN_GR) {
                error(xp, "Syntax error");
                goto err;
            }
            /*
             * Leave through the common exit so the element and attribute
             * name copies are released.
             */
            rc = 1;
            goto exit;

        case EXML_EOF:          /* ---------------------------------------------- */
            goto exit;

        case EXML_ERR:          /* ---------------------------------------------- */
        default:
            goto err;
        }
    }
    mprAssert(0);

err:
    rc = -1;

exit:
    mprFree(tname);
    mprFree(aname);
    return rc;
}
/******************************************************************************/
/*
 * Lexical analyser for XML. Return the next token reading input as required.
 * It uses a one character look ahead and push back mechanism (see
 * putLastChar). Text token identifiers are left in the tokBuf parser buffer
 * on exit. This lexer has special cases for the states EXML_ELT_DATA where we
 * have an optimized read of element data, and EXML_AFTER_LS where we
 * distinguish between element names, processing instructions and comments.
 *
 * Returns TOKEN_EOF whenever the input runs out, including part way through
 * a token (any text gathered so far is then not reported), TOKEN_TOO_BIG when
 * the token buffer refuses another character, and TOKEN_ERR for a malformed
 * or empty token.
 */
static ExmlToken getToken(Exml *xp, int state)
{
MprBuf *tokBuf, *inBuf;
uchar *cp;
int c, rc;
tokBuf = xp->tokBuf;
inBuf = xp->inBuf;
mprAssert(state >= 0);
if ((c = getNextChar(xp)) < 0) {
return TOKEN_EOF;
}
/* Each call starts with an empty token buffer */
mprFlushBuf(tokBuf);
/*
 * Special case parsing for names and for element data. We do this for
 * performance so we can return to the caller the largest token possible
 */
if (state == EXML_ELT_DATA) {
/*
 * Read all the data up to the start of the closing element "<" or the
 * start of a sub-element.
 */
#if UNUSED
while (isspace(c)) {
if ((c = getNextChar(xp)) < 0) {
return TOKEN_EOF;
}
}
#endif
if (c == '<') {
if ((c = getNextChar(xp)) < 0) {
return TOKEN_EOF;
}
if (c == '/') {
return TOKEN_LS_SLASH;
}
/* Not a closing tag: give the character after "<" back to the input */
putLastChar(xp, c);
return TOKEN_LS;
}
do {
if (mprPutCharToBuf(tokBuf, c) < 0) {
return TOKEN_TOO_BIG;
}
if ((c = getNextChar(xp)) < 0) {
return TOKEN_EOF;
}
} while (c != '<');
/*
 * Put back the last look-ahead character
 */
putLastChar(xp, c);
/*
 * If all white space, then zero the token buffer. The token is still
 * TOKEN_TEXT; the caller tells the two cases apart by the buffer length.
 * NOTE(review): this scan stops at a zero byte but no mprAddNullToBuf
 * call precedes it -- presumably mprPutCharToBuf keeps the buffer
 * terminated; confirm.
 */
for (cp = tokBuf->start; *cp; cp++) {
if (!isspace(*cp)) {
return TOKEN_TEXT;
}
}
mprFlushBuf(tokBuf);
return TOKEN_TEXT;
}
while (1) {
switch (c) {
/* White space between tokens is skipped: fetch the next character below */
case ' ':
case '\n':
case '\t':
case '\r':
break;
case '<':
if ((c = getNextChar(xp)) < 0) {
return TOKEN_EOF;
}
if (c == '/') {
return TOKEN_LS_SLASH;
}
putLastChar(xp, c);
return TOKEN_LS;
case '=':
return TOKEN_EQ;
case '>':
return TOKEN_GR;
case '/':
/* Only "/>" is accepted here; the character after "/" is consumed either way */
if ((c = getNextChar(xp)) < 0) {
return TOKEN_EOF;
}
if (c == '>') {
return TOKEN_SLASH_GR;
}
return TOKEN_ERR;
case '\"':
case '\'':
/* Remember which quote opened the string so only the same one closes it */
xp->quoteChar = c;
/* Fall through */
default:
/*
 * We handle element names, attribute names and attribute values
 * here. We do NOT handle data between elements here. Read the
 * token. Stop on white space or a closing element ">"
 */
if (xp->quoteChar) {
/*
 * Quoted string: copy everything up to the matching quote. Neither
 * quote is stored. quoteChar is only cleared once the closing quote
 * is seen, so it stays set if we return early from inside the loop.
 */
if ((c = getNextChar(xp)) < 0) {
return TOKEN_EOF;
}
while (c != xp->quoteChar) {
if (mprPutCharToBuf(tokBuf, c) < 0) {
return TOKEN_TOO_BIG;
}
if ((c = getNextChar(xp)) < 0) {
return TOKEN_EOF;
}
}
xp->quoteChar = 0;
} else {
/*
 * Unquoted token: ends at white space, ">", "/" or "=". The
 * delimiter is pushed back so the next call sees it.
 */
while (!isspace(c) && c != '>' && c != '/' && c != '=') {
if (mprPutCharToBuf(tokBuf, c) < 0) {
return TOKEN_TOO_BIG;
}
if ((c = getNextChar(xp)) < 0) {
return TOKEN_EOF;
}
}
putLastChar(xp, c);
}
/* An empty token is an error; this includes an empty quoted string */
if (mprGetBufLength(tokBuf) <= 0) {
return TOKEN_ERR;
}
mprAddNullToBuf(tokBuf);
if (state == EXML_AFTER_LS) {
/*
 * If we are just inside an element "<", then analyze what we
 * have to see if we have an element name, instruction or
 * comment. Tokbuf will hold "?" for instructions or "!--"
 * for comments.
 */
if (mprLookAtNextCharInBuf(tokBuf) == '?') {
/* Just ignore processing instructions */
rc = scanFor(xp, "?>");
if (rc < 0) {
return TOKEN_TOO_BIG;
} else if (rc == 0) {
return TOKEN_ERR;
}
return TOKEN_INSTRUCTIONS;
} else if (mprLookAtNextCharInBuf(tokBuf) == '!') {
/*
 * A token starting with "!" is either a CDATA section or is treated
 * as a comment. The CDATA test is an exact match on the whole token,
 * so it only succeeds when "![CDATA[" is directly followed by one of
 * the unquoted-token delimiters above. In both cases the lead-in is
 * discarded and the body up to the terminator is gathered by scanFor.
 */
if (strcmp((char*) tokBuf->start, "![CDATA[") == 0) {
mprFlushBuf(tokBuf);
#if UNUSED
c = mprLookAtNextCharInBuf(inBuf);
while (isspace(c)) {
if ((c = getNextChar(xp)) < 0) {
return TOKEN_EOF;
}
c = mprLookAtNextCharInBuf(inBuf);
}
#endif
rc = scanFor(xp, "]]>");
if (rc < 0) {
return TOKEN_TOO_BIG;
} else if (rc == 0) {
return TOKEN_ERR;
}
return TOKEN_CDATA;
} else {
mprFlushBuf(tokBuf);
#if UNUSED
c = mprLookAtNextCharInBuf(inBuf);
while (isspace(c)) {
if ((c = getNextChar(xp)) < 0) {
return TOKEN_EOF;
}
c = mprLookAtNextCharInBuf(inBuf);
}
#endif
rc = scanFor(xp, "-->");
if (rc < 0) {
return TOKEN_TOO_BIG;
} else if (rc == 0) {
return TOKEN_ERR;
}
return TOKEN_COMMENT;
}
}
}
trimToken(xp);
return TOKEN_TEXT;
}
/* Only the white space cases reach here: advance to the next character */
if ((c = getNextChar(xp)) < 0) {
return TOKEN_EOF;
}
}
/* Should never get here */
mprAssert(0);
return TOKEN_ERR;
}
/******************************************************************************/
/*
 * Scan for a pattern. Eat input up to and including the pattern. Return 1 if
 * the pattern was found, return 0 if the input ended first. Return < 0 if a
 * character could not be added to the token buffer.
 *
 * Every character read is appended to the token buffer. On success the
 * pattern itself is removed from the end of the buffer again and the
 * remaining text is trimmed and null terminated.
 *
 * When a partial match fails, matching resumes from the longest prefix of the
 * pattern that is still a suffix of the input read so far, instead of
 * starting over. Without this a terminator preceded by a repeat of its first
 * character (for example "--->" when looking for "-->", or "]]]>" when
 * looking for "]]>") would be missed.
 */
static int scanFor(Exml *xp, char *str)
{
    MprBuf  *tokBuf;
    int     c, len, matched, k;

    mprAssert(str);

    tokBuf = xp->tokBuf;
    len = (int) strlen(str);
    matched = 0;

    while (matched < len) {
        if ((c = getNextChar(xp)) < 0) {
            return 0;
        }
        if (tokBuf) {
            if (mprPutCharToBuf(tokBuf, c) < 0) {
                return -1;
            }
        }
        if (c == str[matched]) {
            matched++;
            continue;
        }

        /*
         * Mismatch after "matched" good characters. The input just read ends
         * with str[0..matched) followed by c. Find the longest k <= matched
         * such that the first k pattern characters equal the last k input
         * characters. Patterns are only a few characters long so a direct
         * search is fine.
         */
        for (k = matched; k > 0; k--) {
            if (str[k - 1] == c &&
                    strncmp(str, str + matched - (k - 1),
                        (size_t) (k - 1)) == 0) {
                break;
            }
        }
        matched = k;
    }

    /*
     * Remove the pattern from the tokBuf
     */
    if (tokBuf) {
        mprAdjustBufEnd(tokBuf, -len);
        trimToken(xp);
    }
    return 1;
}
/******************************************************************************/
/*
 * Get another character. We read and buffer blocks of data if we need more
 * data to parse.
 *
 * Returns the character as a non-negative value, or -1 when the read
 * function supplies no more data (it returned <= 0). The line counter is
 * advanced for every newline returned.
 */
static int getNextChar(Exml *xp)
{
    MprBuf  *inBuf;
    char    c;
    int     l;

    inBuf = xp->inBuf;
    if (mprGetBufLength(inBuf) <= 0) {
        /*
         * Flush to reset the start/end pointers to the start of the buffer
         * so we can do a maximal read
         */
        mprFlushBuf(inBuf);
        l = (xp->readFn)(xp, xp->inputArg, mprGetBufStart(inBuf),
            mprGetBufLinearSpace(inBuf));
        if (l <= 0) {
            return -1;
        }
        mprAdjustBufEnd(inBuf, l);
    }

    /* The buffer is known to hold at least one character at this point */
    c = mprGetCharFromBuf(inBuf);
    if (c == '\n') {
        xp->lineNumber++;
    }

    /*
     * Return the byte as an unsigned value. Where plain char is signed, a
     * byte of 0x80 or above would otherwise come back negative and every
     * caller would mistake it for end of input. It also keeps the value in
     * the range isspace() accepts.
     */
    return (unsigned char) c;
}
/******************************************************************************/
/*
 * Push a look-ahead character back onto the front of the input buffer so the
 * next getNextChar call returns it again. Returns 0 on success, -1 if the
 * character could not be inserted. The line count is wound back when a
 * newline is returned to the input.
 */
static int putLastChar(Exml *xp, int c)
{
    int     rc;

    rc = mprInsertCharToBuf(xp->inBuf, (char) c);
    if (rc >= 0) {
        if (c == '\n') {
            xp->lineNumber--;
        }
        return 0;
    }

    mprAssert(0);
    return -1;
}
/******************************************************************************/
/*
 * Record a parse error. The printf style message is formatted, prefixed with
 * "XML error: " and followed by the current line number, and the result
 * replaces any message already stored in xp->errMsg.
 */
static void error(Exml *xp, char *fmt, ...)
{
    va_list     ap;
    char        *detail;

    mprAssert(fmt);

    va_start(ap, fmt);
    mprAllocVsprintf(MPR_LOC_ARGS(xp), &detail, MPR_MAX_STRING, fmt, ap);
    va_end(ap);

    /*
     * TODO: add the failing line text and a pointer to which column
     */

    /* Drop the previous message before storing the new one */
    mprFree(xp->errMsg);
    mprAllocSprintf(MPR_LOC_ARGS(xp), &xp->errMsg, MPR_MAX_STRING,
        "XML error: %s\nAt line %d\n", detail, xp->lineNumber);

    mprFree(detail);
}
/******************************************************************************/
/*
 * Strip white space from the end of the token buffer, then null terminate
 * what is left so it can be used as a C string.
 */
static void trimToken(Exml *xp)
{
    MprBuf  *buf;

    buf = xp->tokBuf;
    for (;;) {
        if (!isspace(mprLookAtLastCharInBuf(buf))) {
            break;
        }
        /* Shorten the buffer by the trailing white space character */
        mprAdjustBufEnd(buf, -1);
    }
    mprAddNullToBuf(buf);
}
/******************************************************************************/
/*
 * Return the message stored by the most recent parse error, or an empty
 * string if no message has been stored. Never returns a null pointer.
 */
const char *exmlGetErrorMsg(Exml *xp)
{
    return (xp->errMsg != 0) ? xp->errMsg : "";
}
/******************************************************************************/
/*
 * Return the parser's current line counter. The counter is advanced by
 * getNextChar for each newline read and wound back by putLastChar when a
 * newline is pushed back.
 */
int exmlGetLineNumber(Exml *xp)
{
    int     line;

    line = xp->lineNumber;
    return line;
}
/******************************************************************************/
#else
/*
 * Empty function compiled in place of the parser when it is disabled.
 * Presumably here so the file still defines a symbol in that configuration.
 */
void exmlParserDummy()
{
}
#endif /* BLD_FEATURE_EXML */
/*
* Local variables:
* tab-width: 4
* c-basic-offset: 4
* End:
* vim:tw=78
* vim600: sw=4 ts=4 fdm=marker
* vim<600: sw=4 ts=4
*/