mirror of
https://github.com/darlinghq/darling-libxml2.git
synced 2025-04-02 21:42:06 +00:00
1184 lines
29 KiB
C
1184 lines
29 KiB
C
/*
|
|
* parser.c : an XML 1.0 non-verifying parser
|
|
*
|
|
* See Copyright for the status of this software.
|
|
*
|
|
* $Id$
|
|
*/
|
|
|
|
#include <config.h>
|
|
#include <stdio.h>
|
|
#include <ctype.h>
|
|
#include <string.h> /* for memset() only */
|
|
#include <malloc.h>
|
|
#include <sys/stat.h>
|
|
#ifdef HAVE_FCNTL_H
|
|
#include <fcntl.h>
|
|
#endif
|
|
#ifdef HAVE_UNISTD_H
|
|
#include <unistd.h>
|
|
#endif
|
|
#ifdef HAVE_ZLIB_H
|
|
#include <zlib.h>
|
|
#endif
|
|
|
|
#include "xml_tree.h"
|
|
#include "xml_parser.h"
|
|
#include "xml_entities.h"
|
|
|
|
/*
|
|
* A few macros needed to help building the parser.
|
|
*/
|
|
|
|
#ifdef UNICODE
/*
 * UNICODE version of the macros. Incomplete now TODO !!!!
 *
 * Every macro evaluates its argument several times: never pass an
 * expression with side effects.
 */

/* Char: tab, LF, CR, or any code from 0x20 up except 0xFFFE and 0xFFFF. */
#define IS_CHAR(c) \
    (((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d) || \
     (((c) >= 0x20) && ((c) != 0xFFFE) && ((c) != 0xFFFF)))

/*
 * Advance p over space, tab, LF, CR and 0x3000.
 * The expansion is a complete statement (it carries its own ';').
 */
#define SKIP_BLANKS(p) \
    while ((*(p) == 0x20) || (*(p) == 0x09) || (*(p) == 0xa) || \
           (*(p) == 0xd) || (*(p) == 0x3000)) (p)++;

/*
 * I'm too lazy to complete this one TODO !!!!
 * Only the Latin-1 part is covered. A former clause testing
 * ((c) >= 0xaa) && ((c) <= 0x5b) could never be true and was dropped;
 * the accepted set is unchanged.
 */
#define IS_BASECHAR(c) \
    ((((c) >= 0x41) && ((c) <= 0x5a)) || \
     (((c) >= 0x61) && ((c) <= 0x7a)) || \
     (((c) >= 0xc0) && ((c) <= 0xd6)) || \
     (((c) >= 0xd8) && ((c) <= 0xf6)) || \
     (((c) >= 0xf8) && ((c) <= 0xff)) || \
     ((c) == 0xba))

/* I'm too lazy to complete this one TODO !!!! (ASCII digits only) */
#define IS_DIGIT(c) (((c) >= 0x30) && ((c) <= 0x39))

/* I'm too lazy to complete this one TODO !!!! (never matches) */
#define IS_COMBINING(c) 0

#define IS_IGNORABLE(c) \
    ((((c) >= 0x200c) && ((c) <= 0x200f)) || \
     (((c) >= 0x202a) && ((c) <= 0x202e)) || \
     (((c) >= 0x206a) && ((c) <= 0x206f)) || \
     ((c) == 0xfeff))

/* The "||" after the 0x3005 test was missing, which broke every use. */
#define IS_EXTENDER(c) \
    (((c) == 0xb7) || ((c) == 0x2d0) || ((c) == 0x2d1) || \
     ((c) == 0x387) || ((c) == 0x640) || ((c) == 0xe46) || \
     ((c) == 0xec6) || ((c) == 0x3005) || \
     (((c) >= 0x3031) && ((c) <= 0x3035)) || \
     (((c) >= 0x309b) && ((c) <= 0x309e)) || \
     (((c) >= 0x30fc) && ((c) <= 0x30fe)) || \
     (((c) >= 0xff70) && ((c) <= 0xff9e)) || \
     ((c) == 0xff9f))

#define IS_IDEOGRAPHIC(c) \
    ((((c) >= 0x4e00) && ((c) <= 0x9fa5)) || \
     (((c) >= 0xf900) && ((c) <= 0xfa2d)) || \
     (((c) >= 0x3021) && ((c) <= 0x3029)) || \
     ((c) == 0x3007))

#define IS_LETTER(c) (IS_BASECHAR(c) || IS_IDEOGRAPHIC(c))

/* Blank: space, tab, LF or CR (CR was previously left out). */
#define IS_BLANK(c) \
    (((c) == 0x20) || ((c) == 0x09) || ((c) == 0xa) || ((c) == 0x0d))
#else
/*
 * 8bits / ASCII version of the macros.
 *
 * Every macro evaluates its argument several times: never pass an
 * expression with side effects.
 */

/* Char: tab, LF, CR, or any code from 0x20 up. */
#define IS_CHAR(c) \
    (((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d) || ((c) >= 0x20))

/*
 * A former clause testing ((c) >= 0xaa) && ((c) <= 0x5b) could never be
 * true and was dropped; the accepted set is unchanged.
 */
#define IS_BASECHAR(c) \
    ((((c) >= 0x41) && ((c) <= 0x5a)) || \
     (((c) >= 0x61) && ((c) <= 0x7a)) || \
     (((c) >= 0xc0) && ((c) <= 0xd6)) || \
     (((c) >= 0xd8) && ((c) <= 0xf6)) || \
     (((c) >= 0xf8) && ((c) <= 0xff)) || \
     ((c) == 0xba))

#define IS_DIGIT(c) (((c) >= 0x30) && ((c) <= 0x39))

#define IS_LETTER(c) IS_BASECHAR(c)

#define IS_COMBINING(c) 0

#define IS_IGNORABLE(c) 0

#define IS_EXTENDER(c) ((c) == 0xb7)

/* Blank: space, tab, LF or CR (CR was previously left out). */
#define IS_BLANK(c) \
    (((c) == 0x20) || ((c) == 0x09) || ((c) == 0xa) || ((c) == 0x0d))
#endif
|
|
|
|
|
|
/*
 * Advance p over one end-of-line sequence, if p points at one:
 * CR, CR LF, LF, or LF CR.
 *
 * The previous version compared against 0x13 and 0x10, which are the
 * decimal codes of CR (13) and LF (10) mistakenly written in hex, and
 * could swallow two line ends in a row because its two tests were not
 * exclusive. Wrapped in do/while(0) so it behaves as one statement.
 * p is evaluated several times: pass a plain lvalue.
 */
#define SKIP_EOL(p) \
    do { \
        if (*(p) == 0x0d) { \
            (p)++; \
            if (*(p) == 0x0a) (p)++; \
        } else if (*(p) == 0x0a) { \
            (p)++; \
            if (*(p) == 0x0d) (p)++; \
        } \
    } while (0)
|
|
|
|
/*
 * Cursor helpers. Each one expands to a single statement and must be
 * followed by ';' at the call site. The argument is evaluated several
 * times, so pass a plain lvalue such as ctxt->cur.
 */

/*
 * Advance p while it points at a blank. The UNICODE section may already
 * provide its own SKIP_BLANKS; do not redefine it in that case.
 */
#ifndef SKIP_BLANKS
#define SKIP_BLANKS(p) \
    do { while (IS_BLANK(*(p))) (p)++; } while (0)
#endif

/* Advance p up to the next '>' or to the first non-Char (end of input). */
#define MOVETO_ENDTAG(p) \
    do { while (IS_CHAR(*(p)) && (*(p) != '>')) (p)++; } while (0)

/* Advance p up to the next '<' or to the first non-Char (end of input). */
#define MOVETO_STARTTAG(p) \
    do { while (IS_CHAR(*(p)) && (*(p) != '<')) (p)++; } while (0)
|
|
|
|
/*
|
|
* Forward declaration for recursive behaviour.
|
|
*/
|
|
xmlNodePtr xmlParseElement(xmlParserCtxtPtr ctxt);
|
|
|
|
/*
|
|
* xmlHandleData : this routine represents the specific application
|
|
* behaviour when reading a piece of text.
|
|
*
|
|
* For example in WebDav, any piece made only of blanks is eliminated
|
|
*/
|
|
|
|
/*
 * Application hook applied to every piece of text read by the parser.
 *
 * in: heap-allocated text, may be NULL.
 *
 * Returns in unchanged as soon as one valid character that is not a
 * blank is found. If only blanks precede the first non-Char (normally
 * the terminator), the buffer is freed and NULL is returned.
 */
CHAR *xmlHandleData(CHAR *in) {
    CHAR *scan;

    if (in == NULL)
        return(NULL);

    for (scan = in; IS_CHAR(*scan); scan++) {
        if (!IS_BLANK(*scan))
            return(in);
    }

    /* Nothing but blanks: the caller does not want this piece. */
    free(in);
    return(NULL);
}
|
|
|
|
/*
|
|
* xmlStrndup : a strdup for array of CHAR's
|
|
*/
|
|
|
|
CHAR *xmlStrndup(const CHAR *cur, int len) {
|
|
CHAR *ret = malloc((len + 1) * sizeof(CHAR));
|
|
|
|
if (ret == NULL) {
|
|
fprintf(stderr, "malloc of %d byte failed\n",
|
|
(len + 1) * sizeof(CHAR));
|
|
return(NULL);
|
|
}
|
|
memcpy(ret, cur, len * sizeof(CHAR));
|
|
ret[len] = 0;
|
|
return(ret);
|
|
}
|
|
|
|
/*
|
|
* xmlStrdup : a strdup for CHAR's
|
|
*/
|
|
|
|
/*
 * Duplicate a CHAR string.
 *
 * The copy stops at the first element that is not IS_CHAR. That is the
 * terminating zero, but also any other control code rejected by
 * IS_CHAR.
 *
 * Returns the copy (to be released with free()), or NULL when cur is
 * NULL or the allocation fails.
 */
CHAR *xmlStrdup(const CHAR *cur) {
    const CHAR *end;

    if (cur == NULL)
        return(NULL);

    end = cur;
    while (IS_CHAR(*end))
        end++;
    return(xmlStrndup(cur, (int) (end - cur)));
}
|
|
|
|
/*
|
|
* xmlStrcmp : a strcmp for CHAR's
|
|
*/
|
|
|
|
/*
 * Compare two zero-terminated CHAR strings.
 *
 * Returns the difference between the first pair of elements that
 * differ, or 0 when the strings are equal. A NULL string sorts before
 * any non-NULL one; two NULLs are equal.
 *
 * The previous do/while form advanced both pointers before testing for
 * the terminator, so comparing two empty strings read one element past
 * the end of each.
 */
int xmlStrcmp(const CHAR *str1, const CHAR *str2) {
    int diff;

    if (str1 == str2) return(0);
    if (str1 == NULL) return(-1);
    if (str2 == NULL) return(1);

    for (;;) {
        diff = *str1 - *str2;
        /* Stop on a difference, or on the shared terminator. */
        if ((diff != 0) || (*str1 == 0))
            return(diff);
        str1++;
        str2++;
    }
}
|
|
|
|
/*
|
|
* xmlStrncmp : a strncmp for CHAR's
|
|
*/
|
|
|
|
/*
 * Compare at most len elements of two zero-terminated CHAR strings.
 *
 * Returns 0 when len <= 0 or when the compared prefixes are equal,
 * otherwise the difference between the first pair of elements that
 * differ. A NULL string sorts before any non-NULL one.
 *
 * The previous do/while form could read one element past the
 * terminator when both strings ended before len was exhausted.
 */
int xmlStrncmp(const CHAR *str1, const CHAR *str2, int len) {
    int diff;

    if (len <= 0) return(0);
    if (str1 == str2) return(0);
    if (str1 == NULL) return(-1);
    if (str2 == NULL) return(1);

    while (len-- > 0) {
        diff = *str1 - *str2;
        /* Stop on a difference, or on the shared terminator. */
        if ((diff != 0) || (*str1 == 0))
            return(diff);
        str1++;
        str2++;
    }
    return(0);
}
|
|
|
|
/*
|
|
* xmlStrchr : a strchr for CHAR's
|
|
*/
|
|
|
|
/*
 * Find the first occurrence of val in the zero-terminated string str.
 *
 * Returns a pointer to the matching element, or NULL when val does not
 * occur before the terminator. The terminator itself is never matched.
 */
CHAR *xmlStrchr(const CHAR *str, CHAR val) {
    const CHAR *p;

    for (p = str; *p != 0; p++) {
        if (*p == val)
            return((CHAR *) p);
    }
    return(NULL);
}
|
|
|
|
/*
|
|
* xmlParseName : parse an XML name.
|
|
*/
|
|
|
|
/*
 * Parse a name at the current position.
 *
 * Name ::= (Letter | '_') (NameChar)*
 *
 * Returns a newly allocated copy of the name and leaves ctxt->cur on
 * the first character that is not part of it. Returns NULL without
 * moving when the current character cannot start a name.
 */
CHAR *xmlParseName(xmlParserCtxtPtr ctxt) {
    const CHAR *first = ctxt->cur;

    if (!IS_LETTER(first[0]) && (first[0] != '_'))
        return(NULL);
    ctxt->cur++;

    /* Consume name characters until something else shows up. */
    for (;;) {
        CHAR c = ctxt->cur[0];

        if (!(IS_LETTER(c) || IS_DIGIT(c) ||
              (c == '.') || (c == '-') || (c == '_') || (c == ':') ||
              IS_COMBINING(c) || IS_IGNORABLE(c) || IS_EXTENDER(c)))
            break;
        ctxt->cur++;
    }

    return(xmlStrndup(first, ctxt->cur - first));
}
|
|
|
|
/*
|
|
* Parse and return a string between quotes or doublequotes
|
|
*/
|
|
/*
 * Parse a string enclosed in single or double quotes.
 *
 * Returns a newly allocated copy of the text between the quotes and
 * leaves ctxt->cur just after the closing quote.
 * Returns NULL without moving when the current character is not a
 * quote. Returns NULL after reporting on stderr when the closing quote
 * is missing; ctxt->cur is then left on the first non-Char.
 */
CHAR *xmlParseQuotedString(xmlParserCtxtPtr ctxt) {
    const CHAR *body;
    CHAR *ret;
    CHAR quote = ctxt->cur[0];

    if ((quote != '"') && (quote != '\''))
        return(NULL);

    ctxt->cur++;
    body = ctxt->cur;
    while (IS_CHAR(ctxt->cur[0]) && (ctxt->cur[0] != quote))
        ctxt->cur++;

    if (ctxt->cur[0] != quote) {
        if (quote == '"')
            fprintf(stderr, "String not closed \"%.50s\n", body);
        else
            fprintf(stderr, "String not closed '%.50s\n", body);
        return(NULL);
    }

    ret = xmlStrndup(body, ctxt->cur - body);
    ctxt->cur++;    /* step over the closing quote */
    return(ret);
}
|
|
|
|
/*
|
|
* Skip an XML (SGML) comment <!-- .... -->
|
|
*
|
|
* TODO !!!! Save the comment in the tree !!!
|
|
*/
|
|
/*
 * Skip a comment starting at the current position.
 *
 * Does nothing unless ctxt->cur points at "<!--". Otherwise ctxt->cur
 * ends up just after the first "-->". When no terminator is found a
 * message is printed on stderr and ctxt->cur is left just after the
 * opening "<!--" (historical recovery behaviour, kept as is).
 *
 * The previous version stepped two positions past "<!--" without
 * checking for the end of the input, so a truncated comment could move
 * the cursor beyond the terminator.
 */
void xmlParserSkipComment(xmlParserCtxtPtr ctxt) {
    const CHAR *start;
    const CHAR *p;

    /*
     * An extra check may avoid errors and isn't that costly !
     */
    if ((ctxt->cur[0] != '<') || (ctxt->cur[1] != '!') ||
        (ctxt->cur[2] != '-') || (ctxt->cur[3] != '-')) return;

    start = ctxt->cur + 4;

    /*
     * Look for "-->". The tests short-circuit, so p[1] and p[2] are
     * only read when the preceding element is '-', i.e. still inside
     * the terminated input.
     */
    p = start;
    while (IS_CHAR(p[0]) &&
           ((p[0] != '-') || (p[1] != '-') || (p[2] != '>')))
        p++;

    if (!IS_CHAR(p[0])) {
        fprintf(stderr, "Comment not terminated <!--%.50s\n", start);
        ctxt->cur = start; /* !!! We shouldn't really try to recover !!! */
    } else {
        ctxt->cur = p + 3;
    }
}
|
|
|
|
/*
|
|
* xmlParseNamespace: parse specific '<?namespace ...' constructs.
|
|
*/
|
|
|
|
void xmlParseNamespace(xmlParserCtxtPtr ctxt) {
|
|
CHAR *href = NULL;
|
|
CHAR *AS = NULL;
|
|
int garbage = 0;
|
|
|
|
/*
|
|
* We just skipped "namespace" or "xml:namespace"
|
|
*/
|
|
SKIP_BLANKS(ctxt->cur);
|
|
|
|
while (IS_CHAR(ctxt->cur[0]) && (ctxt->cur[0] != '>')) {
|
|
/*
|
|
* We can have "ns" or "prefix" attributes
|
|
* Old encoding as 'href' or 'AS' attributes is still supported
|
|
*/
|
|
if ((ctxt->cur[0] == 'n') && (ctxt->cur[1] == 's')) {
|
|
garbage = 0;
|
|
ctxt->cur += 2;
|
|
SKIP_BLANKS(ctxt->cur);
|
|
|
|
if (ctxt->cur[0] != '=') continue;
|
|
ctxt->cur++;
|
|
SKIP_BLANKS(ctxt->cur);
|
|
|
|
href = xmlParseQuotedString(ctxt);
|
|
SKIP_BLANKS(ctxt->cur);
|
|
} else if ((ctxt->cur[0] == 'h') && (ctxt->cur[1] == 'r') &&
|
|
(ctxt->cur[2] == 'e') && (ctxt->cur[3] == 'f')) {
|
|
garbage = 0;
|
|
ctxt->cur += 4;
|
|
SKIP_BLANKS(ctxt->cur);
|
|
|
|
if (ctxt->cur[0] != '=') continue;
|
|
ctxt->cur++;
|
|
SKIP_BLANKS(ctxt->cur);
|
|
|
|
href = xmlParseQuotedString(ctxt);
|
|
SKIP_BLANKS(ctxt->cur);
|
|
} else if ((ctxt->cur[0] == 'p') && (ctxt->cur[1] == 'r') &&
|
|
(ctxt->cur[2] == 'e') && (ctxt->cur[3] == 'f') &&
|
|
(ctxt->cur[4] == 'i') && (ctxt->cur[5] == 'x')) {
|
|
garbage = 0;
|
|
ctxt->cur += 6;
|
|
SKIP_BLANKS(ctxt->cur);
|
|
|
|
if (ctxt->cur[0] != '=') continue;
|
|
ctxt->cur++;
|
|
SKIP_BLANKS(ctxt->cur);
|
|
|
|
AS = xmlParseQuotedString(ctxt);
|
|
SKIP_BLANKS(ctxt->cur);
|
|
} else if ((ctxt->cur[0] == 'A') && (ctxt->cur[1] == 'S')) {
|
|
garbage = 0;
|
|
ctxt->cur += 2;
|
|
SKIP_BLANKS(ctxt->cur);
|
|
|
|
if (ctxt->cur[0] != '=') continue;
|
|
ctxt->cur++;
|
|
SKIP_BLANKS(ctxt->cur);
|
|
|
|
AS = xmlParseQuotedString(ctxt);
|
|
SKIP_BLANKS(ctxt->cur);
|
|
} else if ((ctxt->cur[0] == '?') && (ctxt->cur[1] == '>')) {
|
|
garbage = 0;
|
|
ctxt->cur ++;
|
|
} else {
|
|
/*
|
|
* Found garbage when parsing the namespace
|
|
*/
|
|
if (!garbage) fprintf(stderr,
|
|
"\nxmlParseNamespace found garbage: ");
|
|
fprintf(stderr, "%c", ctxt->cur[0]);
|
|
ctxt->cur++;
|
|
}
|
|
}
|
|
|
|
MOVETO_ENDTAG(ctxt->cur);
|
|
ctxt->cur++;
|
|
|
|
/*
|
|
* Register the DTD.
|
|
*/
|
|
if (href != NULL)
|
|
xmlNewDtd(ctxt->doc, href, AS);
|
|
|
|
if (AS != NULL) free(AS);
|
|
if (href != NULL) free(href);
|
|
}
|
|
|
|
/*
|
|
* xmlParsePI: parse an XML Processing Instruction.
|
|
*/
|
|
|
|
/*
 * Parse a Processing Instruction starting at the current position.
 *
 * Does nothing unless ctxt->cur points at "<?". The WebDav specific
 * '<?namespace ...' and '<?xml:namespace ...' constructs are handed to
 * xmlParseNamespace(); any other PI is reported on stderr and skipped
 * up to and including the next '>'.
 */
void xmlParsePI(xmlParserCtxtPtr ctxt) {
    const CHAR *cur;

    if ((ctxt->cur[0] != '<') || (ctxt->cur[1] != '?'))
        return;

    /*
     * this is a Processing Instruction.
     */
    ctxt->cur += 2;
    cur = ctxt->cur;

    /*
     * The comparisons short-circuit on the first mismatch, so they
     * never read beyond the terminator.
     */
    if ((cur[0] == 'n') && (cur[1] == 'a') &&
        (cur[2] == 'm') && (cur[3] == 'e') &&
        (cur[4] == 's') && (cur[5] == 'p') &&
        (cur[6] == 'a') && (cur[7] == 'c') &&
        (cur[8] == 'e')) {
        ctxt->cur += 9;
        xmlParseNamespace(ctxt);
    } else if ((cur[0] == 'x') && (cur[1] == 'm') &&
               (cur[2] == 'l') && (cur[3] == ':') &&
               (cur[4] == 'n') && (cur[5] == 'a') &&
               (cur[6] == 'm') && (cur[7] == 'e') &&
               (cur[8] == 's') && (cur[9] == 'p') &&
               (cur[10] == 'a') && (cur[11] == 'c') &&
               (cur[12] == 'e')) {
        ctxt->cur += 13;
        xmlParseNamespace(ctxt);
    } else {
        /*
         * Unknown PI, ignore it ! "%.30s" bounds the echo; the former
         * "%30s" was a minimum width and printed the whole remaining
         * input.
         */
        fprintf(stderr, "xmlParsePI : skipping unknown PI %.30s\n",
                ctxt->cur);
        MOVETO_ENDTAG(ctxt->cur);
        /* Do not step past the terminator when '>' is missing. */
        if (ctxt->cur[0] == '>')
            ctxt->cur++;
    }
}
|
|
|
|
/*
|
|
* xmlParseAttribute: parse an attribute and add it to the node.
|
|
*
|
|
* Attribute ::= Name Eq AttValue
|
|
*/
|
|
|
|
/*
 * Parse one attribute at the current position and attach it to node.
 *
 * Attribute ::= Name Eq AttValue
 *
 * The parser is lax: blanks are accepted around '=', and an attribute
 * may come without a value, in which case xmlNewProp() receives a NULL
 * value. Returns without moving when the current character cannot
 * start a name. The local copies of the name and value are freed after
 * xmlNewProp() has been called.
 */
void xmlParseAttribute(xmlParserCtxtPtr ctxt, xmlNodePtr node) {
    const CHAR *attrStart = ctxt->cur;
    CHAR *name;
    CHAR *value = NULL;

    if (!IS_LETTER(attrStart[0]) && (attrStart[0] != '_'))
        return;

    /* Same grammar as a plain Name: let xmlParseName do the scanning. */
    name = xmlParseName(ctxt);

    /*
     * We should have the equal, we are laxist here and allow attributes
     * without values and extra spaces.
     */
    SKIP_BLANKS(ctxt->cur);
    if (ctxt->cur[0] == '=') {
        ctxt->cur++;
        SKIP_BLANKS(ctxt->cur);
        if ((ctxt->cur[0] == '\'') || (ctxt->cur[0] == '"')) {
            value = xmlParseQuotedString(ctxt);
        } else {
            fprintf(stderr, "Quotes were expected for attribute value %.20s\n",
                    attrStart);
        }
    }

    /*
     * Add the attribute to the node.
     */
    if (name != NULL) {
        xmlNewProp(node, name, value);
        free(name);
    }
    if (value != NULL)
        free(value);
}
|
|
|
|
/*
|
|
* xmlParseStartTag: parse a start of tag.
|
|
*/
|
|
|
|
/*
 * Parse a start tag, up to but not including its closing '>' or "/>".
 *
 *   QName     ::= (NSPart ':')? LocalPart
 *   NSPart    ::= Name
 *   LocalPart ::= Name
 *   STag      ::= '<' QName (S Attribute)* S? '>'
 *
 * Returns the node built by xmlNewNode() with its attributes attached,
 * or NULL when the input does not start with '<' followed by a name, or
 * when a namespace prefix is not followed by a name. On the two latter
 * failures the '<' has already been consumed.
 *
 * NOTE(review): the local-name copy is handed to xmlNewNode() and not
 * freed here; whether xmlNewNode() copies it or takes ownership is not
 * visible from this file - confirm.
 */
xmlNodePtr xmlParseStartTag(xmlParserCtxtPtr ctxt) {
    const CHAR *begin;
    CHAR *prefix, *local;
    xmlDtdPtr dtd = NULL;
    xmlNodePtr node;

    if (ctxt->cur[0] != '<')
        return(NULL);
    ctxt->cur++;

    if (!IS_LETTER(ctxt->cur[0]) && (ctxt->cur[0] != '_'))
        return(NULL);
    begin = ctxt->cur;
    ctxt->cur++;

    /* First name part: ':' is excluded so that a prefix can be seen. */
    for (;;) {
        CHAR c = ctxt->cur[0];

        if (!(IS_LETTER(c) || IS_DIGIT(c) ||
              (c == '.') || (c == '-') || (c == '_') ||
              IS_COMBINING(c) || IS_IGNORABLE(c) || IS_EXTENDER(c)))
            break;
        ctxt->cur++;
    }

    if (ctxt->cur[0] != ':') {
        /* No prefix: what was scanned is the element name. */
        local = xmlStrndup(begin, ctxt->cur - begin);
    } else {
        prefix = xmlStrndup(begin, ctxt->cur - begin);

        ctxt->cur++; /* skip the colon */
        if (!IS_LETTER(ctxt->cur[0]) && (ctxt->cur[0] != '_')) {
            fprintf(stderr,
               "Start tag : no element name after namespace identifier %.20s\n",
                    begin);
            free(prefix);
            return(NULL);
        }
        begin = ctxt->cur;
        ctxt->cur++;

        /* Local part: further ':' are accepted as name characters. */
        for (;;) {
            CHAR c = ctxt->cur[0];

            if (!(IS_LETTER(c) || IS_DIGIT(c) ||
                  (c == '.') || (c == '-') || (c == '_') || (c == ':') ||
                  IS_COMBINING(c) || IS_IGNORABLE(c) || IS_EXTENDER(c)))
                break;
            ctxt->cur++;
        }
        local = xmlStrndup(begin, ctxt->cur - begin);

        /*
         * Search the DTD associated to the prefix.
         */
        dtd = xmlSearchDtd(ctxt->doc, prefix);
        if (dtd == NULL)
            fprintf(stderr, "Start tag : Couldn't find namespace %s\n", prefix);
        free(prefix);
    }

    node = xmlNewNode(dtd, local, NULL);

    /*
     * Now parse the attributes, it ends up with the ending
     *
     * (S Attribute)* S?
     */
    SKIP_BLANKS(ctxt->cur);
    while (IS_CHAR(ctxt->cur[0]) &&
           (ctxt->cur[0] != '>') &&
           ((ctxt->cur[0] != '/') || (ctxt->cur[1] != '>'))) {
        if (IS_LETTER(ctxt->cur[0]) || (ctxt->cur[0] == '_')) {
            xmlParseAttribute(ctxt, node);
        } else {
            /* We should warn TODO !!! */
            ctxt->cur++;
        }
        SKIP_BLANKS(ctxt->cur);
    }

    return(node);
}
|
|
|
|
/*
|
|
* xmlParseEndTag: parse an end of tag, note that the '</' part has
|
|
* already been read.
|
|
*/
|
|
|
|
/*
 * Parse an end tag; the leading "</" has already been consumed.
 *
 *   QName     ::= (NSPart ':')? LocalPart
 *   NSPart    ::= Name
 *   LocalPart ::= Name
 *   ETag      ::= '</' QName S? '>'
 *
 * dtdPtr: receives the DTD found for the namespace prefix, or NULL.
 * tagPtr: receives a newly allocated copy of the element name, to be
 *         freed by the caller, or NULL.
 *
 * Both outputs stay NULL when no name is found at the current position
 * or when a prefix is not followed by a name. A missing closing '>' is
 * reported on stderr and the cursor is left where it is.
 */
void xmlParseEndTag(xmlParserCtxtPtr ctxt, xmlDtdPtr *dtdPtr, CHAR **tagPtr) {
    const CHAR *begin;
    CHAR *prefix, *local;
    xmlDtdPtr dtd = NULL;

    *dtdPtr = NULL;
    *tagPtr = NULL;

    if (!IS_LETTER(ctxt->cur[0]) && (ctxt->cur[0] != '_'))
        return;
    begin = ctxt->cur;
    ctxt->cur++;

    /* First name part: ':' is excluded so that a prefix can be seen. */
    for (;;) {
        CHAR c = ctxt->cur[0];

        if (!(IS_LETTER(c) || IS_DIGIT(c) ||
              (c == '.') || (c == '-') || (c == '_') ||
              IS_COMBINING(c) || IS_IGNORABLE(c) || IS_EXTENDER(c)))
            break;
        ctxt->cur++;
    }

    if (ctxt->cur[0] != ':') {
        /* No prefix: what was scanned is the element name. */
        local = xmlStrndup(begin, ctxt->cur - begin);
    } else {
        prefix = xmlStrndup(begin, ctxt->cur - begin);

        ctxt->cur++; /* skip the colon */
        if (!IS_LETTER(ctxt->cur[0]) && (ctxt->cur[0] != '_')) {
            fprintf(stderr,
                "End tag : no element name after namespace identifier %.20s\n",
                    begin);
            free(prefix);
            return;
        }
        begin = ctxt->cur;
        ctxt->cur++;

        /* Local part: further ':' are accepted as name characters. */
        for (;;) {
            CHAR c = ctxt->cur[0];

            if (!(IS_LETTER(c) || IS_DIGIT(c) ||
                  (c == '.') || (c == '-') || (c == '_') || (c == ':') ||
                  IS_COMBINING(c) || IS_IGNORABLE(c) || IS_EXTENDER(c)))
                break;
            ctxt->cur++;
        }
        local = xmlStrndup(begin, ctxt->cur - begin);

        /*
         * Search the DTD associated to the prefix.
         */
        dtd = xmlSearchDtd(ctxt->doc, prefix);
        if (dtd == NULL)
            fprintf(stderr, "End tag : Couldn't find namespace %s\n", prefix);
        free(prefix);
    }

    *dtdPtr = dtd;
    *tagPtr = local;

    /*
     * We should definitely be at the ending "S? '>'" part
     */
    SKIP_BLANKS(ctxt->cur);
    if (IS_CHAR(ctxt->cur[0]) && (ctxt->cur[0] == '>')) {
        ctxt->cur++;
    } else {
        /*
         * Skipping to the next '>' is probably overkill, especially
         * when the '>' is just missing, so the cursor is not moved.
         * Otherwise add:
         *     MOVETO_ENDTAG(ctxt->cur);
         */
        fprintf(stderr, "End tag : expected '>', got %.20s\n", ctxt->cur);
    }
}
|
|
|
|
/*
|
|
* xmlParseCDSect: escaped pure raw content.
|
|
*/
|
|
/*
 * Parse the body of a CDATA section; the opening "<![CDATA[" has
 * already been consumed.
 *
 * Returns a newly allocated copy of the raw text found before the
 * closing "]]>" and leaves ctxt->cur just after that terminator.
 * Returns NULL after reporting on stderr when the section is not
 * terminated; ctxt->cur is then left on the first non-Char.
 *
 * The previous version kept the trailing "]]" in the returned text and
 * left the cursor on the final '>', which was then parsed as content.
 */
CHAR *xmlParseCDSect(xmlParserCtxtPtr ctxt) {
    const CHAR *base = ctxt->cur;
    const CHAR *p = base;

    /*
     * Look for "]]>". The tests short-circuit, so p[1] and p[2] are
     * only read when the preceding element is ']', i.e. still inside
     * the terminated input.
     */
    while (IS_CHAR(p[0]) &&
           ((p[0] != ']') || (p[1] != ']') || (p[2] != '>')))
        p++;

    if (!IS_CHAR(p[0])) {
        ctxt->cur = p;
        fprintf(stderr, "CData section not finished : %.20s\n", base);
        return(NULL);
    }

    ctxt->cur = p + 3;    /* resume after "]]>" */
    return(xmlStrndup(base, p - base));
}
|
|
|
|
/*
|
|
* xmlParseContent: a content is
|
|
* (element | PCData | Reference | CDSect | PI | Comment)
|
|
*
|
|
* element : starts by '<'
|
|
* PCData : any CHAR but '&' or '<'
|
|
* Reference : starts by '&'
|
|
* CDSect : starts by '<![CDATA['
|
|
* PI : starts by '<?'
|
|
*/
|
|
|
|
/*
 * Parse one item of element content:
 *   (element | PCData | Reference | CDSect | PI | Comment)
 *
 * node: the element being filled.
 *
 * Returns a new child node to be added to node by the caller, or NULL
 * when there is nothing to add: a PI or a comment was skipped, the text
 * was stored directly as the content of node, the text was only blanks,
 * or an error occurred.
 *
 * Changes from the previous version:
 *  - a PI is a complete item; falling through to the element branch
 *    afterwards consumed the '<' of a following end tag;
 *  - comments are skipped instead of being parsed as an element;
 *  - the result of xmlDecodeEntities() is checked before duplication.
 */
xmlNodePtr xmlParseContent(xmlParserCtxtPtr ctxt, xmlNodePtr node) {
    const CHAR *q;
    const CHAR *decoded;
    CHAR *data = NULL;
    xmlNodePtr ret = NULL;

    /*
     * First case : a Processing Instruction.
     */
    if ((ctxt->cur[0] == '<') && (ctxt->cur[1] == '?')) {
        xmlParsePI(ctxt);
        return(NULL);
    }

    /*
     * Second case : a comment, simply skipped.
     */
    if ((ctxt->cur[0] == '<') && (ctxt->cur[1] == '!') &&
        (ctxt->cur[2] == '-') && (ctxt->cur[3] == '-')) {
        xmlParserSkipComment(ctxt);
        return(NULL);
    }

    if ((ctxt->cur[0] == '<') && (ctxt->cur[1] == '!') &&
        (ctxt->cur[2] == '[') && (ctxt->cur[3] == 'C') &&
        (ctxt->cur[4] == 'D') && (ctxt->cur[5] == 'A') &&
        (ctxt->cur[6] == 'T') && (ctxt->cur[7] == 'A') &&
        (ctxt->cur[8] == '[')) {
        /*
         * Third case : a CDSection
         */
        ctxt->cur += 9;
        data = xmlParseCDSect(ctxt);
    } else if (ctxt->cur[0] == '<') {
        /*
         * Fourth case : a sub-element.
         */
        ret = xmlParseElement(ctxt);
    } else {
        /*
         * Last case, text. Note that References are handled directly.
         */
        q = ctxt->cur;
        while (IS_CHAR(ctxt->cur[0]) && (ctxt->cur[0] != '<'))
            ctxt->cur++;

        if (!IS_CHAR(ctxt->cur[0])) {
            fprintf(stderr, "Truncated content : %.50s\n", q);
            return(NULL);
        }

        /*
         * Do the Entities decoding...
         * NOTE(review): ownership of the buffer returned by
         * xmlDecodeEntities() is not visible from this file; it is
         * copied and not freed here, as before - confirm.
         */
        decoded = xmlDecodeEntities(ctxt->doc, q, ctxt->cur - q);
        if (decoded != NULL)
            data = xmlStrdup(decoded);
    }

    /*
     * Handle the data if any. If there is no child
     * add it as content, otherwise create a new node of type text.
     * xmlHandleData() accepts NULL and may free the data and return NULL.
     */
    data = xmlHandleData(data);
    if (data != NULL) {
        if (node->childs == NULL)
            xmlNodeSetContent(node, data);
        else
            ret = xmlNewText(data);
        free(data);
    }

    return(ret);
}
|
|
|
|
/*
|
|
* xmlParseElement: parse an XML element
|
|
*/
|
|
|
|
/*
 * Parse a complete element: start tag, content, end tag.
 *
 * Returns the node for the element with its children attached, or NULL
 * when the start tag cannot be parsed, is not closed, or the input ends
 * before the end tag. A mismatch between start and end tag (DTD or
 * name) is reported on stderr but the node is still returned.
 * The simplified closing form "</>" is accepted.
 *
 * Changes from the previous version: the end tag shown in diagnostics
 * is now the actual end tag (it used to repeat the start tag), names
 * are compared with xmlStrcmp() rather than strcmp(), and a missing end
 * tag name no longer hands a NULL pointer to the comparison.
 *
 * NOTE(review): on the error returns the node built so far is not
 * released; no node destructor is visible from this file.
 */
xmlNodePtr xmlParseElement(xmlParserCtxtPtr ctxt) {
    xmlNodePtr ret, child;
    const CHAR *openTag = ctxt->cur;
    const CHAR *closeTag;
    CHAR *endTag;
    xmlDtdPtr endDtd;

    ret = xmlParseStartTag(ctxt);
    if (ret == NULL)
        return(NULL);

    /*
     * Check for an Empty Element.
     */
    if ((ctxt->cur[0] == '/') && (ctxt->cur[1] == '>')) {
        ctxt->cur += 2;
        return(ret);
    }
    if (ctxt->cur[0] != '>') {
        fprintf(stderr, "Couldn't find end of Start Tag %.30s\n", openTag);
        return(NULL);
    }
    ctxt->cur++;

    /*
     * Parse the content of the element:
     * (element | PCData | Reference | CDSect | PI | Comment) *
     *
     * The loop stops upon detection of an end of tag '</'
     */
    while (IS_CHAR(ctxt->cur[0]) &&
           ((ctxt->cur[0] != '<') || (ctxt->cur[1] != '/'))) {
        child = xmlParseContent(ctxt, ret);
        if (child != NULL)
            xmlAddChild(ret, child);
    }
    if (!IS_CHAR(ctxt->cur[0])) {
        fprintf(stderr, "Premature end of data in tag %.30s\n", openTag);
        return(NULL);
    }

    /*
     * parse the end of tag : '</' has been detected.
     */
    closeTag = ctxt->cur;
    ctxt->cur += 2;
    if (ctxt->cur[0] == '>') {
        ctxt->cur++; /* simplified closing </> */
        return(ret);
    }

    xmlParseEndTag(ctxt, &endDtd, &endTag);

    /*
     * Check that the Name in the ETag is the same as in the STag.
     */
    if (endDtd != ret->dtd) {
        fprintf(stderr, "Start and End tags don't use the same DTD:\n");
        fprintf(stderr, "\t%.30s\n\t%.30s\n", openTag, closeTag);
    }
    /* endTag is NULL when xmlParseEndTag() found no name at all. */
    if ((endTag == NULL) || (ret->name == NULL) ||
        (xmlStrcmp(ret->name, endTag) != 0)) {
        fprintf(stderr, "Start and End tags don't use the same name:\n");
        fprintf(stderr, "\t%.30s\n\t%.30s\n", openTag, closeTag);
    }

    if (endTag != NULL)
        free(endTag);

    return(ret);
}
|
|
|
|
/*
|
|
* xmlParseXMLDecl: parse an XML declaration header
|
|
*/
|
|
|
|
/*
 * Parse the XML declaration and create the document.
 *
 * Precondition: ctxt->cur points at "<?xml" (or "<?XML").
 * ctxt->doc is created with the version found in a "version=" pseudo
 * attribute placed right after the blanks following "<?xml", or with
 * XML_DEFAULT_VERSION when it is absent or its value cannot be parsed.
 * The rest of the declaration is skipped up to and including the next
 * '>'; when that '>' is missing the cursor stops at the end of the
 * input instead of stepping past it.
 */
void xmlParseXMLDecl(xmlParserCtxtPtr ctxt) {
    CHAR *version = NULL;

    /*
     * We know that '<?xml' is here.
     */
    ctxt->cur += 5;

    /*
     * Parse the version info
     */
    SKIP_BLANKS(ctxt->cur);

    /*
     * We should have 'version=' here !
     */
    if ((ctxt->cur[0] == 'v') && (ctxt->cur[1] == 'e') &&
        (ctxt->cur[2] == 'r') && (ctxt->cur[3] == 's') &&
        (ctxt->cur[4] == 'i') && (ctxt->cur[5] == 'o') &&
        (ctxt->cur[6] == 'n') && (ctxt->cur[7] == '=')) {
        ctxt->cur += 8;
        version = xmlParseQuotedString(ctxt);
    }

    if (version != NULL) {
        ctxt->doc = xmlNewDoc(version);
        free(version);
    } else {
        ctxt->doc = xmlNewDoc(XML_DEFAULT_VERSION);
    }

    /*
     * We should check for Required Markup Declaration TODO !!!!
     */
    MOVETO_ENDTAG(ctxt->cur);
    if (ctxt->cur[0] == '>')
        ctxt->cur++;
}
|
|
|
|
/*
|
|
* xmlParseMisc: parse an XML Misc optional field.
|
|
* (Comment | PI | S)*
|
|
*/
|
|
|
|
/*
 * Parse the Misc part of the prolog: (Comment | PI | S)*
 *
 * Processing instructions go to xmlParsePI(), comments are skipped and
 * blanks are consumed, until something else is found.
 *
 * The comment test used to check cur[2] twice instead of cur[2] and
 * cur[3]; input such as "<!-x" then entered the loop while
 * xmlParserSkipComment() refused to move, which never terminated.
 */
void xmlParseMisc(xmlParserCtxtPtr ctxt) {
    for (;;) {
        const CHAR *cur = ctxt->cur;

        if ((cur[0] == '<') && (cur[1] == '?')) {
            xmlParsePI(ctxt);
        } else if ((cur[0] == '<') && (cur[1] == '!') &&
                   (cur[2] == '-') && (cur[3] == '-')) {
            xmlParserSkipComment(ctxt);
        } else if (IS_BLANK(cur[0])) {
            ctxt->cur++;
        } else {
            break;
        }
    }
}
|
|
|
|
/*
|
|
* xmlParseDocument : parse an XML document and build a tree.
|
|
*/
|
|
|
|
/*
 * Parse a whole document from ctxt->cur and build the tree in ctxt->doc.
 *
 * Sequence: leading blanks, optional XML declaration ("<?xml" or the
 * draft-era "<?XML"), Misc part of the prolog, then the root element,
 * stored in ctxt->doc->root (NULL when it cannot be parsed).
 *
 * Returns 0 once parsing has been attempted, -1 when the document
 * could not be created.
 *
 * Changes from the previous version: "<?xml:" is no longer taken for
 * the XML declaration, so a leading '<?xml:namespace ...' reaches
 * xmlParsePI() through xmlParseMisc(); a NULL ctxt->doc is no longer
 * dereferenced.
 */
int xmlParseDocument(xmlParserCtxtPtr ctxt) {
    const CHAR *cur;

    /*
     * We should check for encoding here and plug-in some
     * conversion code TODO !!!!
     */

    /*
     * Wipe out everything which is before the first '<'
     */
    SKIP_BLANKS(ctxt->cur);
    cur = ctxt->cur;

    /*
     * Check for the XMLDecl in the Prolog.
     * The first drafts were using <?XML and the final W3C REC
     * now use <?xml ...
     */
    if ((cur[0] == '<') && (cur[1] == '?') &&
        (((cur[2] == 'x') && (cur[3] == 'm') && (cur[4] == 'l')) ||
         ((cur[2] == 'X') && (cur[3] == 'M') && (cur[4] == 'L'))) &&
        (cur[5] != ':')) {
        xmlParseXMLDecl(ctxt);
        /* SKIP_EOL(cur); */
        SKIP_BLANKS(ctxt->cur);
    } else {
        ctxt->doc = xmlNewDoc(XML_DEFAULT_VERSION);
    }
    if (ctxt->doc == NULL)
        return(-1);

    /*
     * The Misc part of the Prolog
     * (Comment | PI | S) *
     */
    xmlParseMisc(ctxt);

    /*
     * Time to start parsing
     */
    ctxt->doc->root = xmlParseElement(ctxt);

    return(0);
}
|
|
|
|
/*
|
|
* xmlParseDoc : parse an XML in-memory document and build a tree.
|
|
*/
|
|
|
|
/*
 * Parse an in-memory, zero-terminated XML document and build a tree.
 *
 * cur: start of the document text.
 *
 * Returns the document left in the parser context by
 * xmlParseDocument(), or NULL when cur is NULL or the context cannot
 * be allocated. The temporary parser context is released before
 * returning.
 */
xmlDocPtr xmlParseDoc(CHAR *cur) {
    xmlParserCtxtPtr ctxt;
    xmlDocPtr doc;

    if (cur == NULL)
        return(NULL);

    ctxt = (xmlParserCtxtPtr) malloc(sizeof(xmlParserCtxt));
    if (ctxt == NULL) {
        perror("malloc");
        return(NULL);
    }

    /* Fresh context reading from the caller's buffer. */
    xmlInitParserCtxt(ctxt);
    ctxt->base = cur;
    ctxt->cur = cur;

    xmlParseDocument(ctxt);
    doc = ctxt->doc;

    /* The context only owns its node stack. */
    free(ctxt->nodes);
    free(ctxt);

    return(doc);
}
|
|
|
|
/*
|
|
* xmlParseFile : parse an XML file and build a tree.
|
|
*/
|
|
|
|
xmlDocPtr xmlParseFile(const char *filename) {
|
|
xmlDocPtr ret;
|
|
#ifdef HAVE_ZLIB_H
|
|
gzFile input;
|
|
#else
|
|
int input;
|
|
#endif
|
|
int res;
|
|
struct stat buf;
|
|
char *buffer;
|
|
xmlParserCtxtPtr ctxt;
|
|
|
|
res = stat(filename, &buf);
|
|
if (res < 0) return(NULL);
|
|
|
|
#ifdef HAVE_ZLIB_H
|
|
retry_bigger:
|
|
buffer = malloc((buf.st_size * 20) + 100);
|
|
#else
|
|
buffer = malloc(buf.st_size + 100);
|
|
#endif
|
|
if (buffer == NULL) {
|
|
perror("malloc");
|
|
return(NULL);
|
|
}
|
|
|
|
memset(buffer, 0, sizeof(buffer));
|
|
#ifdef HAVE_ZLIB_H
|
|
input = gzopen (filename, "r");
|
|
if (input == NULL) {
|
|
fprintf (stderr, "Cannot read file %s :\n", filename);
|
|
perror ("gzopen failed");
|
|
return(NULL);
|
|
}
|
|
#else
|
|
input = open (filename, O_RDONLY);
|
|
if (input < 0) {
|
|
fprintf (stderr, "Cannot read file %s :\n", filename);
|
|
perror ("open failed");
|
|
return(NULL);
|
|
}
|
|
#endif
|
|
#ifdef HAVE_ZLIB_H
|
|
res = gzread(input, buffer, 20 * buf.st_size);
|
|
#else
|
|
res = read(input, buffer, buf.st_size);
|
|
#endif
|
|
if (res < 0) {
|
|
fprintf (stderr, "Cannot read file %s :\n", filename);
|
|
#ifdef HAVE_ZLIB_H
|
|
perror ("gzread failed");
|
|
#else
|
|
perror ("read failed");
|
|
#endif
|
|
return(NULL);
|
|
}
|
|
#ifdef HAVE_ZLIB_H
|
|
gzclose(input);
|
|
if (res >= 20 * buf.st_size) {
|
|
free(buffer);
|
|
buf.st_size *= 2;
|
|
goto retry_bigger;
|
|
}
|
|
buf.st_size = res;
|
|
#else
|
|
close(input);
|
|
#endif
|
|
|
|
|
|
ctxt = (xmlParserCtxtPtr) malloc(sizeof(xmlParserCtxt));
|
|
if (ctxt == NULL) {
|
|
perror("malloc");
|
|
return(NULL);
|
|
}
|
|
buffer[buf.st_size] = '\0';
|
|
|
|
xmlInitParserCtxt(ctxt);
|
|
ctxt->filename = filename;
|
|
ctxt->base = buffer;
|
|
ctxt->cur = buffer;
|
|
|
|
xmlParseDocument(ctxt);
|
|
ret = ctxt->doc;
|
|
free(buffer);
|
|
free(ctxt->nodes);
|
|
free(ctxt);
|
|
|
|
return(ret);
|
|
}
|
|
|
|
/*
|
|
* xmlParseMemory : parse an XML memory block and build a tree.
|
|
*/
|
|
|
|
xmlDocPtr xmlParseMemory(char *buffer, int size) {
|
|
xmlDocPtr ret;
|
|
xmlParserCtxtPtr ctxt;
|
|
|
|
ctxt = (xmlParserCtxtPtr) malloc(sizeof(xmlParserCtxt));
|
|
if (ctxt == NULL) {
|
|
perror("malloc");
|
|
return(NULL);
|
|
}
|
|
|
|
buffer[size - 1] = '\0';
|
|
|
|
xmlInitParserCtxt(ctxt);
|
|
ctxt->base = buffer;
|
|
ctxt->cur = buffer;
|
|
|
|
xmlParseDocument(ctxt);
|
|
ret = ctxt->doc;
|
|
free(ctxt->nodes);
|
|
free(ctxt);
|
|
|
|
return(ret);
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Initialize parser context */
|
|
/*
 * Put a parser context in its initial state.
 *
 * Every field is reset (no file name, no buffer, line and column 1, no
 * document, depth 0) and a node stack of 10 entries is allocated and
 * filled with NULL. When that allocation fails a message is printed on
 * stderr, ctxt->nodes is NULL and ctxt->max_depth is 0.
 *
 * Previous field values are overwritten, not released.
 */
void xmlInitParserCtxt(xmlParserCtxtPtr ctxt)
{
    int i;

    ctxt->filename = NULL;
    ctxt->base = NULL;
    ctxt->cur = NULL;
    ctxt->line = 1;
    ctxt->col = 1;
    ctxt->doc = NULL;
    ctxt->depth = 0;
    ctxt->max_depth = 10;
    ctxt->nodes = (xmlNodePtr *) malloc(ctxt->max_depth * sizeof(xmlNodePtr));
    if (ctxt->nodes == NULL) {
        /* The product is a size_t: cast it to match the conversion. */
        fprintf(stderr, "malloc of %lu byte failed\n",
                (unsigned long) (ctxt->max_depth * sizeof(xmlNodePtr)));
        ctxt->max_depth = 0;
    } else {
        for (i = 0; i < ctxt->max_depth; i++)
            ctxt->nodes[i] = NULL;
    }
}
|
|
|
|
|
|
/*
|
|
* Clear (release owned resources) and reinitialize context
|
|
*/
|
|
/*
 * Bring a parser context back to its initial state.
 *
 * This only delegates to xmlInitParserCtxt(), which overwrites every
 * field and allocates a new node stack.
 * NOTE(review): the previous ctx->nodes array is not freed here, so
 * reusing an already initialized context appears to leak it - confirm
 * how callers are expected to use this.
 */
void xmlClearParserCtxt(xmlParserCtxtPtr ctx)
{
    xmlInitParserCtxt(ctx);
    return;
}
|
|
|
|
|
|
/*
|
|
* Setup the parser context to parse a new buffer; Clears any prior
|
|
* contents from the parser context. The buffer parameter must not be
|
|
* NULL, but the filename parameter can be NULL.
|
|
*/
|
|
/*
 * Prepare a parser context for parsing a new buffer.
 *
 * ctxt:     context to set up; it is first reset through
 *           xmlClearParserCtxt().
 * buffer:   text to parse; it becomes both the base and the current
 *           position of the context.
 * filename: name recorded in the context.
 */
void xmlSetupParserForBuffer(xmlParserCtxtPtr ctxt, const CHAR* buffer,
                             const char* filename)
{
    xmlClearParserCtxt(ctxt);

    ctxt->filename = filename;
    ctxt->base = buffer;
    ctxt->cur = buffer;
}
|
|
|
|
|
|
|
|
/*
 * Report an error message on stderr.
 *
 * ctx: parser context, currently unused.
 * msg: message to print, written as is with no newline added; a NULL
 *      message is ignored instead of being passed to fputs().
 */
void xmlReportError(xmlParserCtxtPtr ctx, const CHAR* msg)
{
    (void) ctx;

    if (msg == NULL)
        return;
    /* fputs() expects a plain char string. */
    fputs((const char *) msg, stderr);
}
|