diff --git a/ChangeLog b/ChangeLog index b10b534e..91a10f1b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +Mon Jul 5 18:45:31 CEST 1999 Daniel Veillard + + * parser.c, entities.c, valid.c: cleanup bug #1591 + * configure.in: cleanup bug #1592 + * HTMLparser.[ch], testHTML.c: started adding an HTML parser using + the same tree back-end. Hence gdome will be available for it. + * doc/Makefile.am: close bug #617 + Sat Jun 26 23:36:38 EDT 1999 Daniel Veillard * parser.c: alloctate a per parser context SAX interface block diff --git a/HTMLparser.c b/HTMLparser.c new file mode 100644 index 00000000..ce131997 --- /dev/null +++ b/HTMLparser.c @@ -0,0 +1,2372 @@ +/* + * HTMLparser.c : an HTML 4.0 non-verifying parser + * + * See Copyright for the status of this software. + * + * Daniel.Veillard@w3.org + */ + +#ifdef WIN32 +#define HAVE_FCNTL_H +#include +#else +#include +#endif +#include +#include +#include /* for memset() only */ +#include +#include +#ifdef HAVE_FCNTL_H +#include +#endif +#ifdef HAVE_UNISTD_H +#include +#endif +#ifdef HAVE_ZLIB_H +#include +#endif + +#include "tree.h" +#include "HTMLparser.h" +#include "entities.h" +#include "encoding.h" +#include "valid.h" +#include "parserInternals.h" + +#define DEBUG + +/************************************************************************ + * * + * The list of HTML elements and their properties * + * * + ************************************************************************/ + +typedef struct htmlElemDesc { + const CHAR *name; /* The tag name */ + int startTag; /* Whether the start tag can be implied */ + int endTag; /* Whether the end tag can be implied */ + int empty; /* Is this an empty element ? */ + int depr; /* Is this a deprecated element ? */ + int dtd; /* 1: only in Loose DTD, 2: only Frameset one */ + const char *desc; /* the description */ +} htmlElemDesc, *htmlElemDescPtr; + +/* + * Name Start Tag End Tag Empty Depr. DTD Description + */ +htmlElemDesc html40ElementTable[] = { +{ "A", 0, 0, 0, 0, 0, "anchor " }, +{ "ABBR", 0, 0, 0, 0, 0, "abbreviated form" }, +{ "ACRONYM", 0, 0, 0, 0, 0, "" }, +{ "ADDRESS", 0, 0, 0, 0, 0, "information on author " }, +{ "APPLET", 0, 0, 0, 1, 1, "Java applet " }, +{ "AREA", 0, 2, 1, 0, 0, "client-side image map area " }, +{ "B", 0, 0, 0, 0, 0, "bold text style" }, +{ "BASE", 0, 2, 1, 0, 0, "document base URI " }, +{ "BASEFONT", 0, 2, 1, 1, 1, "base font size " }, +{ "BDO", 0, 0, 0, 0, 0, "I18N BiDi over-ride " }, +{ "BIG", 0, 0, 0, 0, 0, "large text style" }, +{ "BLOCKQUOTE", 0, 0, 0, 0, 0, "long quotation " }, +{ "BODY", 1, 1, 0, 0, 0, "document body " }, +{ "BR", 0, 2, 1, 0, 0, "forced line break " }, +{ "BUTTON", 0, 0, 0, 0, 0, "push button " }, +{ "CAPTION", 0, 0, 0, 0, 0, "table caption " }, +{ "CENTER", 0, 0, 0, 1, 1, "shorthand for DIV align=center " }, +{ "CITE", 0, 0, 0, 0, 0, "citation" }, +{ "CODE", 0, 0, 0, 0, 0, "computer code fragment" }, +{ "COL", 0, 2, 1, 0, 0, "table column " }, +{ "COLGROUP", 0, 1, 0, 0, 0, "table column group " }, +{ "DD", 0, 1, 0, 0, 0, "definition description " }, +{ "DEL", 0, 0, 0, 0, 0, "deleted text " }, +{ "DFN", 0, 0, 0, 0, 0, "instance definition" }, +{ "DIR", 0, 0, 0, 1, 1, "directory list" }, +{ "DIV", 0, 0, 0, 0, 0, "generic language/style container"}, +{ "DL", 0, 0, 0, 0, 0, "definition list " }, +{ "DT", 0, 1, 0, 0, 0, "definition term " }, +{ "EM", 0, 0, 0, 0, 0, "emphasis" }, +{ "FIELDSET", 0, 0, 0, 0, 0, "form control group " }, +{ "FONT", 0, 0, 0, 1, 1, "local change to font " }, +{ "FORM", 0, 0, 0, 0, 0, "interactive form " }, +{ "FRAME", 0, 2, 1, 0, 2, "subwindow " }, +{ "FRAMESET", 0, 0, 0, 0, 2, "window subdivision" }, +{ "H1", 0, 0, 0, 0, 0, "heading " }, +{ "H2", 0, 0, 0, 0, 0, "heading " }, +{ "H3", 0, 0, 0, 0, 0, "heading " }, +{ "H4", 0, 0, 0, 0, 0, "heading " }, +{ "H5", 0, 0, 0, 0, 0, "heading " }, +{ "H6", 0, 0, 0, 0, 0, "heading " }, +{ "HEAD", 1, 1, 0, 0, 0, "document head " }, +{ "HR", 0, 2, 1, 0, 0, "horizontal rule " }, +{ "HTML", 1, 1, 0, 0, 0, "document root element " }, +{ "I", 0, 0, 0, 0, 0, "italic text style" }, +{ "IFRAME", 0, 0, 0, 0, 1, "inline subwindow " }, +{ "IMG", 0, 2, 1, 0, 0, "Embedded image " }, +{ "INPUT", 0, 2, 1, 0, 0, "form control " }, +{ "INS", 0, 0, 0, 0, 0, "inserted text" }, +{ "ISINDEX", 0, 2, 1, 1, 1, "single line prompt " }, +{ "KBD", 0, 0, 0, 0, 0, "text to be entered by the user" }, +{ "LABEL", 0, 0, 0, 0, 0, "form field label text " }, +{ "LEGEND", 0, 0, 0, 0, 0, "fieldset legend " }, +{ "LI", 0, 1, 0, 0, 0, "list item " }, +{ "LINK", 0, 2, 1, 0, 0, "a media-independent link " }, +{ "MAP", 0, 0, 0, 0, 0, "client-side image map " }, +{ "MENU", 0, 0, 0, 1, 1, "menu list " }, +{ "META", 0, 2, 1, 0, 0, "generic metainformation " }, +{ "NOFRAMES", 0, 0, 0, 0, 2, "alternate content container for non frame-based rendering " }, +{ "NOSCRIPT", 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " }, +{ "OBJECT", 0, 0, 0, 0, 0, "generic embedded object " }, +{ "OL", 0, 0, 0, 0, 0, "ordered list " }, +{ "OPTGROUP", 0, 0, 0, 0, 0, "option group " }, +{ "OPTION", 0, 1, 0, 0, 0, "selectable choice " }, +{ "P", 0, 1, 0, 0, 0, "paragraph " }, +{ "PARAM", 0, 2, 1, 0, 0, "named property value " }, +{ "PRE", 0, 0, 0, 0, 0, "preformatted text " }, +{ "Q", 0, 0, 0, 0, 0, "short inline quotation " }, +{ "S", 0, 0, 0, 1, 1, "strike-through text style" }, +{ "SAMP", 0, 0, 0, 0, 0, "sample program output, scripts, etc." }, +{ "SCRIPT", 0, 0, 0, 0, 0, "script statements " }, +{ "SELECT", 0, 0, 0, 0, 0, "option selector " }, +{ "SMALL", 0, 0, 0, 0, 0, "small text style" }, +{ "SPAN", 0, 0, 0, 0, 0, "generic language/style container " }, +{ "STRIKE", 0, 0, 0, 1, 1, "strike-through text" }, +{ "STRONG", 0, 0, 0, 0, 0, "strong emphasis" }, +{ "STYLE", 0, 0, 0, 0, 0, "style info " }, +{ "SUB", 0, 0, 0, 0, 0, "subscript" }, +{ "SUP", 0, 0, 0, 0, 0, "superscript " }, +{ "TABLE", 0, 0, 0, 0, 0, " " }, +{ "TBODY", 1, 1, 0, 0, 0, "table body " }, +{ "TD", 0, 1, 0, 0, 0, "table data cell" }, +{ "TEXTAREA", 0, 0, 0, 0, 0, "multi-line text field " }, +{ "TFOOT", 0, 1, 0, 0, 0, "table footer " }, +{ "TH", 0, 1, 0, 0, 0, "table header cell" }, +{ "THEAD", 0, 1, 0, 0, 0, "table header " }, +{ "TITLE", 0, 0, 0, 0, 0, "document title " }, +{ "TR", 0, 1, 0, 0, 0, "table row " }, +{ "TT", 0, 0, 0, 0, 0, "teletype or monospaced text style" }, +{ "U", 0, 0, 0, 1, 1, "underlined text style" }, +{ "UL", 0, 0, 0, 0, 0, "unordered list " }, +{ "VAR", 0, 0, 0, 0, 0, "instance of a variable or program argument" }, +}; + +/* + * start tags that imply the end of a current element + * any tag of each line implies the end of the current element if the type of + * that element is in the same line + */ +CHAR *htmlEquEnd[] = { +"DT", "DD", "LI", "OPTION", NULL, +"H1", "H2", "H3", "H4", "H5", "H6", NULL, +"OL", "MENU", "DIR", "ADDRESS", "PRE", "LISTING", "XMP", NULL, +NULL +}; +/* + * acording the HTML DTD, HR should be added to the 2nd line above, as it + * is not allowed within a H1, H2, H3, etc. But we should tolerate that case + * because many documents contain rules in headings... + */ + +/* + * start tags that imply the end of current element + */ +CHAR *htmlStartClose[] = { +"FORM", "FORM", "P", "HR", "H1", "H2", "H3", "H4", "H5", "H6", + "DL", "UL", "OL", "MENU", "DIR", "ADDRESS", "PRE", + "LISTING", "XMP", "HEAD", NULL, +"HEAD", "P", NULL, +"TITLE", "P", NULL, +"BODY", "HEAD", "STYLE", "LINK", "TITLE", "P", NULL, +"LI", "P", "H1", "H2", "H3", "H4", "H5", "H6", "DL", "ADDRESS", + "PRE", "LISTING", "XMP", "HEAD", NULL, +"HR", "P", "HEAD", NULL, +"H1", "P", "HEAD", NULL, +"H2", "P", "HEAD", NULL, +"H3", "P", "HEAD", NULL, +"H4", "P", "HEAD", NULL, +"H5", "P", "HEAD", NULL, +"H6", "P", "HEAD", NULL, +"DIR", "P", "HEAD", NULL, +"ADDRESS", "P", "HEAD", "UL", NULL, +"PRE", "P", "HEAD", "UL", NULL, +"LISTING", "P", "HEAD", NULL, +"XMP", "P", "HEAD", NULL, +"BLOCKQUOTE", "P", "HEAD", NULL, +"DL", "P", "DT", "MENU", "DIR", "ADDRESS", "PRE", "LISTING", + "XMP", "HEAD", NULL, +"DT", "P", "MENU", "DIR", "ADDRESS", "PRE", "LISTING", "XMP", "HEAD", NULL, +"DD", "P", "MENU", "DIR", "ADDRESS", "PRE", "LISTING", "XMP", "HEAD", NULL, +"UL", "P", "HEAD", "OL", "MENU", "DIR", "ADDRESS", "PRE", + "LISTING", "XMP", NULL, +"OL", "P", "HEAD", "UL", NULL, +"MENU", "P", "HEAD", "UL", NULL, +"P", "P", "HEAD", "H1", "H2", "H3", "H4", "H5", "H6", NULL, +"DIV", "P", "HEAD", NULL, +"NOSCRIPT", "P", "HEAD", NULL, +"CENTER", "FONT", "B", "I", "P", "HEAD", NULL, +"A", "A", NULL, +"CAPTION", "P", NULL, +"COLGROUP", "CAPTION", "COLGROUP", "COL", "P", NULL, +"COL", "CAPTION", "COL", "P", NULL, +"TABLE", "P", "HEAD", "H1", "H2", "H3", "H4", "H5", "H6", "PRE", + "LISTING", "XMP", "A", NULL, +"TH", "TH", "TD", NULL, +"TD", "TH", "TD", NULL, +"TR", "TH", "TD", "TR", "CAPTION", "COL", "COLGROUP", NULL, +"THEAD", "CAPTION", "COL", "COLGROUP", NULL, +"TFOOT", "TH", "TD", "TR", "CAPTION", "COL", "COLGROUP", "THEAD", + "TBODY", NULL, +"TBODY", "TH", "TD", "TR", "CAPTION", "COL", "COLGROUP", "THEAD", + "TFOOT", "TBODY", NULL, +"OPTGROUP", "OPTION", NULL, +"FIELDSET", "LEGEND", "P", "HEAD", "H1", "H2", "H3", "H4", "H5", "H6", + "PRE", "LISTING", "XMP", "A", NULL, +NULL +}; + +static CHAR** htmlStartCloseIndex[100]; +static int htmlStartCloseIndexinitialized = 0; + +/************************************************************************ + * * + * functions to handle HTML specific data * + * * + ************************************************************************/ + +/** + * htmlInitAutoClose: + * + * Initialize the htmlStartCloseIndex for fast lookup of closing tags names. + * + */ +void +htmlInitAutoClose(void) { + int index, i = 0; + + if (htmlStartCloseIndexinitialized) return; + + for (index = 0;index < 100;index ++) htmlStartCloseIndex[index] = NULL; + index = 0; + while ((htmlStartClose[i] != NULL) && (index < 100 - 1)) { + htmlStartCloseIndex[index++] = &htmlStartClose[i]; + while (htmlStartClose[i] != NULL) i++; + i++; + } +} + +/** + * htmlTagLookup: + * @tag: The tag name + * + * Lookup the HTML tag in the ElementTable + * + * Returns the related htmlElemDescPtr or NULL if not found. + */ +htmlElemDescPtr +htmlTagLookup(const CHAR *tag) { + int i = 0; + + for (i = 0; i < (sizeof(html40ElementTable) / + sizeof(html40ElementTable[0]));i++) { + if (!xmlStrcmp(tag, html40ElementTable[i].name)) + return(&html40ElementTable[i]); + } + return(NULL); +} + +/** + * htmlCheckAutoClose: + * @new: The new tag name + * @old: The old tag name + * + * Checks wether the new tag is one of the registered valid tags for closing old. + * Initialize the htmlStartCloseIndex for fast lookup of closing tags names. + * + * Returns 0 if no, 1 if yes. + */ +int +htmlCheckAutoClose(const CHAR *new, const CHAR *old) { + int i, index; + CHAR **close; + + if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose(); + + /* inefficient, but not a big deal */ + for (index = 0; index < 100;index++) { + close = htmlStartCloseIndex[index]; + if (close == NULL) return(0); + if (!xmlStrcmp(*close, new)) break; + } + + i = close - htmlStartClose; + i++; + while (htmlStartClose[i] != NULL) { + if (!xmlStrcmp(htmlStartClose[i], old)) { +#ifdef DEBUG + printf("htmlCheckAutoClose: %s closes %s\n", new, old); +#endif + return(1); + } + i++; + } + return(0); +} + +/** + * htmlAutoClose: + * @ctxt: an HTML parser context + * @new: The new tag name + * + * The HTmL DtD allows a tag to implicitely close other tags. + * The list is kept in htmlStartClose array. This function is + * called when a new tag has been detected and generates the + * appropriates closes if possible/needed. + */ +void +htmlAutoClose(htmlParserCtxtPtr ctxt, const CHAR *new) { + const CHAR *old; + + while ((ctxt->node != NULL) && + (htmlCheckAutoClose(new, ctxt->node->name))) { + if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) + ctxt->sax->endElement(ctxt->userData, ctxt->node->name); + } +} + + +/************************************************************************ + * * + * Parser stacks related functions and macros * + * * + ************************************************************************/ + +/* + * Generic function for accessing stacks in the Parser Context + */ + +#define PUSH_AND_POP(type, name) \ +int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \ + if (ctxt->name##Nr >= ctxt->name##Max) { \ + ctxt->name##Max *= 2; \ + ctxt->name##Tab = (void *) realloc(ctxt->name##Tab, \ + ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \ + if (ctxt->name##Tab == NULL) { \ + fprintf(stderr, "realloc failed !\n"); \ + exit(1); \ + } \ + } \ + ctxt->name##Tab[ctxt->name##Nr] = value; \ + ctxt->name = value; \ + return(ctxt->name##Nr++); \ +} \ +type html##name##Pop(htmlParserCtxtPtr ctxt) { \ + type ret; \ + if (ctxt->name##Nr <= 0) return(0); \ + ctxt->name##Nr--; \ + if (ctxt->name##Nr > 0) \ + ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \ + else \ + ctxt->name = NULL; \ + ret = ctxt->name##Tab[ctxt->name##Nr]; \ + ctxt->name##Tab[ctxt->name##Nr] = 0; \ + return(ret); \ +} \ + +PUSH_AND_POP(xmlNodePtr, node) + +/* + * Macros for accessing the content. Those should be used only by the parser, + * and not exported. + * + * Dirty macros, i.e. one need to make assumption on the context to use them + * + * CUR_PTR return the current pointer to the CHAR to be parsed. + * CUR returns the current CHAR value, i.e. a 8 bit value if compiled + * in ISO-Latin or UTF-8, and the current 16 bit value if compiled + * in UNICODE mode. This should be used internally by the parser + * only to compare to ASCII values otherwise it would break when + * running with UTF-8 encoding. + * NXT(n) returns the n'th next CHAR. Same as CUR is should be used only + * to compare on ASCII based substring. + * SKIP(n) Skip n CHAR, and must also be used only to skip ASCII defined + * strings within the parser. + * + * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding + * + * CURRENT Returns the current char value, with the full decoding of + * UTF-8 if we are using this mode. It returns an int. + * NEXT Skip to the next character, this does the proper decoding + * in UTF-8 mode. It also pop-up unfinished entities on the fly. + * It returns the pointer to the current CHAR. + * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly + */ + +#define CUR (*ctxt->input->cur) +#define SKIP(val) ctxt->input->cur += (val) +#define NXT(val) ctxt->input->cur[(val)] +#define CUR_PTR ctxt->input->cur + +#define SKIP_BLANKS \ + while (IS_BLANK(*(ctxt->input->cur))) NEXT + +#ifndef USE_UTF_8 +#define CURRENT (*ctxt->input->cur) +#define NEXT ((*ctxt->input->cur) ? \ + (((*(ctxt->input->cur) == '\n') ? \ + (ctxt->input->line++, ctxt->input->col = 1) : \ + (ctxt->input->col++)), ctxt->input->cur++) : \ + (ctxt->input->cur)) +#else +#endif + + +/************************************************************************ + * * + * Commodity functions to handle entities * + * * + ************************************************************************/ + +/* + * Macro used to grow the current buffer. + */ +#define growBuffer(buffer) { \ + buffer##_size *= 2; \ + buffer = (CHAR *) realloc(buffer, buffer##_size * sizeof(CHAR)); \ + if (buffer == NULL) { \ + perror("realloc failed"); \ + exit(1); \ + } \ +} + + +/** + * htmlDecodeEntities: + * @ctxt: the parser context + * @len: the len to decode (in bytes !), -1 for no size limit + * @end: an end marker CHAR, 0 if none + * @end2: an end marker CHAR, 0 if none + * @end3: an end marker CHAR, 0 if none + * + * Subtitute the HTML entitis by their value + * + * Returns A newly allocated string with the substitution done. The caller + * must deallocate it ! + */ +CHAR * +htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len, + CHAR end, CHAR end2, CHAR end3) { + CHAR *buffer = NULL; + int buffer_size = 0; + CHAR *out = NULL; + + CHAR *cur = NULL; + xmlEntityPtr ent; + const CHAR *start = CUR_PTR; + unsigned int max = (unsigned int) len; + + /* + * allocate a translation buffer. + */ + buffer_size = 1000; + buffer = (CHAR *) malloc(buffer_size * sizeof(CHAR)); + if (buffer == NULL) { + perror("xmlDecodeEntities: malloc failed"); + return(NULL); + } + out = buffer; + + /* + * Ok loop until we reach one of the ending char or a size limit. + */ + while ((CUR_PTR - start < max) && (CUR != end) && + (CUR != end2) && (CUR != end3)) { + + if (CUR == '&') { + if (NXT(1) == '#') { + int val = htmlParseCharRef(ctxt); + /* TODO: invalid for UTF-8 variable encoding !!! */ + *out++ = val; + } else { + ent = htmlParseEntityRef(ctxt); + if (ent != NULL) { + cur = ent->content; + while (*cur != 0) { + *out++ = *cur++; + if (out - buffer > buffer_size - 100) { + int index = out - buffer; + + growBuffer(buffer); + out = &buffer[index]; + } + } + } + } + } else { + /* TODO: invalid for UTF-8 , use COPY(out); */ + *out++ = CUR; + if (out - buffer > buffer_size - 100) { + int index = out - buffer; + + growBuffer(buffer); + out = &buffer[index]; + } + NEXT; + } + } + *out++ = 0; + return(buffer); +} + + +/************************************************************************ + * * + * Commodity functions to handle encodings * + * * + ************************************************************************/ + +/** + * htmlSwitchEncoding: + * @ctxt: the parser context + * @len: the len of @cur + * + * change the input functions when discovering the character encoding + * of a given entity. + * + */ +void +htmlSwitchEncoding(htmlParserCtxtPtr ctxt, xmlCharEncoding enc) +{ + switch (enc) { + case XML_CHAR_ENCODING_ERROR: + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, "encoding unknown\n"); + ctxt->wellFormed = 0; + break; + case XML_CHAR_ENCODING_NONE: + /* let's assume it's UTF-8 without the XML decl */ + return; + case XML_CHAR_ENCODING_UTF8: + /* default encoding, no conversion should be needed */ + return; + case XML_CHAR_ENCODING_UTF16LE: + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "char encoding UTF16 little endian not supported\n"); + break; + case XML_CHAR_ENCODING_UTF16BE: + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "char encoding UTF16 big endian not supported\n"); + break; + case XML_CHAR_ENCODING_UCS4LE: + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "char encoding USC4 little endian not supported\n"); + break; + case XML_CHAR_ENCODING_UCS4BE: + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "char encoding USC4 big endian not supported\n"); + break; + case XML_CHAR_ENCODING_EBCDIC: + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "char encoding EBCDIC not supported\n"); + break; + case XML_CHAR_ENCODING_UCS4_2143: + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "char encoding UCS4 2143 not supported\n"); + break; + case XML_CHAR_ENCODING_UCS4_3412: + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "char encoding UCS4 3412 not supported\n"); + break; + case XML_CHAR_ENCODING_UCS2: + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "char encoding UCS2 not supported\n"); + break; + case XML_CHAR_ENCODING_8859_1: + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "char encoding ISO_8859_1 ISO Latin 1 not supported\n"); + break; + case XML_CHAR_ENCODING_8859_2: + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "char encoding ISO_8859_2 ISO Latin 2 not supported\n"); + break; + case XML_CHAR_ENCODING_8859_3: + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "char encoding ISO_8859_3 not supported\n"); + break; + case XML_CHAR_ENCODING_8859_4: + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "char encoding ISO_8859_4 not supported\n"); + break; + case XML_CHAR_ENCODING_8859_5: + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "char encoding ISO_8859_5 not supported\n"); + break; + case XML_CHAR_ENCODING_8859_6: + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "char encoding ISO_8859_6 not supported\n"); + break; + case XML_CHAR_ENCODING_8859_7: + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "char encoding ISO_8859_7 not supported\n"); + break; + case XML_CHAR_ENCODING_8859_8: + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "char encoding ISO_8859_8 not supported\n"); + break; + case XML_CHAR_ENCODING_8859_9: + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "char encoding ISO_8859_9 not supported\n"); + break; + case XML_CHAR_ENCODING_2022_JP: + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "char encoding ISO-2022-JPnot supported\n"); + break; + case XML_CHAR_ENCODING_SHIFT_JIS: + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "char encoding Shift_JISnot supported\n"); + break; + case XML_CHAR_ENCODING_EUC_JP: + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "char encoding EUC-JPnot supported\n"); + break; + } +} + + +/************************************************************************ + * * + * Commodity functions, cleanup needed ? * + * * + ************************************************************************/ + +/** + * areBlanks: + * @ctxt: an HTML parser context + * @str: a CHAR * + * @len: the size of @str + * + * Is this a sequence of blank chars that one can ignore ? + * + * TODO: to be corrected accodingly to DTD information if available + * + * Returns 1 if ignorable 0 otherwise. + */ + +static int areBlanks(htmlParserCtxtPtr ctxt, const CHAR *str, int len) { + int i; + xmlNodePtr lastChild; + + for (i = 0;i < len;i++) + if (!(IS_BLANK(str[i]))) return(0); + + if (CUR != '<') return(0); + if (ctxt->node == NULL) return(0); + lastChild = xmlGetLastChild(ctxt->node); + if (lastChild == NULL) { + if (ctxt->node->content != NULL) return(0); + } else if (xmlNodeIsText(lastChild)) + return(0); + return(1); +} + +/** + * htmlHandleEntity: + * @ctxt: an HTML parser context + * @entity: an XML entity pointer. + * + * Default handling of an HTML entity, call the parser with the + * substitution string + */ + +void +htmlHandleEntity(htmlParserCtxtPtr ctxt, xmlEntityPtr entity) { + int len; + + if (entity->content == NULL) { + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, "htmlHandleEntity %s: content == NULL\n", + entity->name); + ctxt->wellFormed = 0; + return; + } + len = xmlStrlen(entity->content); + + /* + * Just handle the content as a set of chars. + */ + if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) + ctxt->sax->characters(ctxt->userData, entity->content, len); + +} + +/** + * htmlNewDoc: + * @URI: URI for the dtd, or NULL + * @ExternalID: the external ID of the DTD, or NULL + * + * Returns a new document + */ +htmlDocPtr +htmlNewDoc(const CHAR *URI, const CHAR *ExternalID) { + xmlDocPtr cur; + + /* + * Allocate a new document and fill the fields. + */ + cur = (xmlDocPtr) malloc(sizeof(xmlDoc)); + if (cur == NULL) { + fprintf(stderr, "xmlNewDoc : malloc failed\n"); + return(NULL); + } + + cur->type = XML_DOCUMENT_NODE; + cur->version = NULL; + if (ExternalID != NULL) cur->ID = xmlStrdup(ExternalID); + else cur->ID = NULL; + if (URI != NULL) cur->DTD = xmlStrdup(URI); + else cur->DTD = NULL; + cur->name = NULL; + cur->root = NULL; + cur->intSubset = NULL; + cur->extSubset = NULL; + cur->oldNs = NULL; + cur->encoding = NULL; + cur->standalone = 1; + cur->compression = 0; +#ifndef XML_WITHOUT_CORBA + cur->_private = NULL; + cur->vepv = NULL; +#endif + return(cur); +} + + +/************************************************************************ + * * + * The parser itself * + * Relates to http://www.w3.org/TR/html40 * + * * + ************************************************************************/ + +/************************************************************************ + * * + * The parser itself * + * * + ************************************************************************/ + +/** + * htmlParseHTMLName: + * @ctxt: an HTML parser context + * + * parse an HTML tag or attribute name, note that we convert it to uppercase + * since HTML names are not case-sensitive. + * + * Returns the Tag Name parsed or NULL + */ + +CHAR * +htmlParseHTMLName(htmlParserCtxtPtr ctxt) { + CHAR *ret = NULL; + int i = 0; + CHAR loc[100]; + + if (!IS_LETTER(CUR) && (CUR != '_') && + (CUR != ':')) return(NULL); + + while ((i < 100) && ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)))) { + if ((CUR >= 0x61) && (CUR <= 0x7a)) loc[i] = CUR - 0x20; + else loc[i] = CUR; + i++; + + NEXT; + } + + ret = xmlStrndup(loc, i); + + return(ret); +} + +/** + * htmlParseName: + * @ctxt: an HTML parser context + * + * parse an HTML name, this routine is case sensistive. + * + * Returns the Name parsed or NULL + */ + +CHAR * +htmlParseName(htmlParserCtxtPtr ctxt) { + const CHAR *q; + CHAR *ret = NULL; + + if (!IS_LETTER(CUR) && (CUR != '_') && + (CUR != ':')) return(NULL); + q = NEXT; + + while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) || + (CUR == '.') || (CUR == '-') || + (CUR == '_') || (CUR == ':') || + (IS_COMBINING(CUR)) || + (IS_EXTENDER(CUR))) + NEXT; + + ret = xmlStrndup(q, CUR_PTR - q); + + return(ret); +} + +/** + * htmlParseNmtoken: + * @ctxt: an HTML parser context + * + * parse an HTML Nmtoken. + * + * Returns the Nmtoken parsed or NULL + */ + +CHAR * +htmlParseNmtoken(htmlParserCtxtPtr ctxt) { + const CHAR *q; + CHAR *ret = NULL; + + q = NEXT; + + while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) || + (CUR == '.') || (CUR == '-') || + (CUR == '_') || (CUR == ':') || + (IS_COMBINING(CUR)) || + (IS_EXTENDER(CUR))) + NEXT; + + ret = xmlStrndup(q, CUR_PTR - q); + + return(ret); +} + +/** + * htmlParseEntityRef: + * @ctxt: an HTML parser context + * + * parse ENTITY references declarations + * + * [68] EntityRef ::= '&' Name ';' + * + * Returns the xmlEntityPtr if found, or NULL otherwise. + */ +xmlEntityPtr +htmlParseEntityRef(htmlParserCtxtPtr ctxt) { + const CHAR *q; /* !!!!!!!!!!! Unused !!!!!!!!!! */ + CHAR *name; + xmlEntityPtr ent = NULL; + + q = CUR_PTR; + if (CUR == '&') { + NEXT; + name = htmlParseName(ctxt); + if (name == NULL) { + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n"); + ctxt->wellFormed = 0; + } else { + if (CUR == ';') { + NEXT; + /* + * Ask first SAX for entity resolution, otherwise try the + * predefined set. + */ + if (ctxt->sax != NULL) { + if (ctxt->sax->getEntity != NULL) + ent = ctxt->sax->getEntity(ctxt->userData, name); + if (ent == NULL) + ent = xmlGetPredefinedEntity(name); + } + + /* + * Well Formedness Constraint if: + * - standalone + * or + * - no external subset and no external parameter entities + * referenced + * then + * the entity referenced must have been declared + * + * TODO: to be double checked !!! This is wrong ! + */ + if (ent == NULL) { + if (ctxt->sax != NULL) { + if (((ctxt->sax->isStandalone != NULL) && + ctxt->sax->isStandalone(ctxt->userData) == 1) || + (((ctxt->sax->hasInternalSubset == NULL) || + ctxt->sax->hasInternalSubset(ctxt->userData) == 0) && + ((ctxt->sax->hasExternalSubset == NULL) || + ctxt->sax->hasExternalSubset(ctxt->userData) == 0))) { + if (ctxt->sax->error != NULL) + ctxt->sax->error(ctxt->userData, + "Entity '%s' not defined\n", name); + ctxt->wellFormed = 0; + } + } else { + fprintf(stderr, "Entity '%s' not defined\n", name); + ctxt->wellFormed = 0; + } + } + + /* + * Well Formedness Constraint : + * The referenced entity must be a parsed entity. + */ + if (ent != NULL) { + switch (ent->type) { + case XML_INTERNAL_PARAMETER_ENTITY: + case XML_EXTERNAL_PARAMETER_ENTITY: + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "Attempt to reference the parameter entity '%s'\n", name); + ctxt->wellFormed = 0; + break; + + case XML_EXTERNAL_GENERAL_UNPARSED_ENTITY: + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "Attempt to reference unparsed entity '%s'\n", name); + ctxt->wellFormed = 0; + break; + } + } + + /* + * TODO: !!! + * Well Formedness Constraint : + * The referenced entity must not lead to recursion ! + */ + + + } else { + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "htmlParseEntityRef: expecting ';'\n"); + ctxt->wellFormed = 0; + } + free(name); + } + } + return(ent); +} + +/** + * htmlParseAttValue: + * @ctxt: an HTML parser context + * + * parse a value for an attribute + * Note: the parser won't do substitution of entities here, this + * will be handled later in xmlStringGetNodeList, unless it was + * asked for ctxt->replaceEntities != 0 + * + * [10] AttValue ::= '"' ([^<&"] | Reference)* '"' | + * "'" ([^<&'] | Reference)* "'" + * + * Returns the AttValue parsed or NULL. + */ + +CHAR * +htmlParseAttValue(htmlParserCtxtPtr ctxt) { + CHAR *ret = NULL; + + if (CUR == '"') { + NEXT; + if (ctxt->replaceEntities != 0) + ret = xmlDecodeEntities(ctxt, -1, XML_SUBSTITUTE_REF, '"', '<', 0); + else + ret = xmlDecodeEntities(ctxt, -1, XML_SUBSTITUTE_NONE, '"', '<', 0); + if (CUR == '<') { + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "Unescaped '<' not allowed in attributes values\n"); + ctxt->wellFormed = 0; + } + if (CUR != '"') { + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n"); + ctxt->wellFormed = 0; + } else + NEXT; + } else if (CUR == '\'') { + NEXT; + if (ctxt->replaceEntities != 0) + ret = xmlDecodeEntities(ctxt, -1, XML_SUBSTITUTE_REF, '\'', '<', 0); + else + ret = xmlDecodeEntities(ctxt, -1, XML_SUBSTITUTE_NONE, '\'', '<', 0); + if (CUR == '<') { + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "Unescaped '<' not allowed in attributes values\n"); + ctxt->wellFormed = 0; + } + if (CUR != '\'') { + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n"); + ctxt->wellFormed = 0; + } else + NEXT; + } else { + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, "AttValue: \" or ' expected\n"); + ctxt->wellFormed = 0; + } + + return(ret); +} + +/** + * htmlParseSystemLiteral: + * @ctxt: an HTML parser context + * + * parse an HTML Literal + * + * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") + * + * Returns the SystemLiteral parsed or NULL + */ + +CHAR * +htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) { + const CHAR *q; + CHAR *ret = NULL; + + if (CUR == '"') { + NEXT; + q = CUR_PTR; + while ((IS_CHAR(CUR)) && (CUR != '"')) + NEXT; + if (!IS_CHAR(CUR)) { + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n"); + ctxt->wellFormed = 0; + } else { + ret = xmlStrndup(q, CUR_PTR - q); + NEXT; + } + } else if (CUR == '\'') { + NEXT; + q = CUR_PTR; + while ((IS_CHAR(CUR)) && (CUR != '\'')) + NEXT; + if (!IS_CHAR(CUR)) { + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n"); + ctxt->wellFormed = 0; + } else { + ret = xmlStrndup(q, CUR_PTR - q); + NEXT; + } + } else { + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n"); + ctxt->wellFormed = 0; + } + + return(ret); +} + +/** + * htmlParsePubidLiteral: + * @ctxt: an HTML parser context + * + * parse an HTML public literal + * + * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" + * + * Returns the PubidLiteral parsed or NULL. + */ + +CHAR * +htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) { + const CHAR *q; + CHAR *ret = NULL; + /* + * Name ::= (Letter | '_') (NameChar)* + */ + if (CUR == '"') { + NEXT; + q = CUR_PTR; + while (IS_PUBIDCHAR(CUR)) NEXT; + if (CUR != '"') { + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n"); + ctxt->wellFormed = 0; + } else { + ret = xmlStrndup(q, CUR_PTR - q); + NEXT; + } + } else if (CUR == '\'') { + NEXT; + q = CUR_PTR; + while ((IS_LETTER(CUR)) && (CUR != '\'')) + NEXT; + if (!IS_LETTER(CUR)) { + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n"); + ctxt->wellFormed = 0; + } else { + ret = xmlStrndup(q, CUR_PTR - q); + NEXT; + } + } else { + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n"); + ctxt->wellFormed = 0; + } + + return(ret); +} + +/** + * htmlParseCharData: + * @ctxt: an HTML parser context + * @cdata: int indicating whether we are within a CDATA section + * + * parse a CharData section. + * if we are within a CDATA section ']]>' marks an end of section. + * + * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) + */ + +void +htmlParseCharData(htmlParserCtxtPtr ctxt, int cdata) { + const CHAR *q; + + q = CUR_PTR; + while ((IS_CHAR(CUR)) && (CUR != '<') && + (CUR != '&')) { + if ((CUR == ']') && (NXT(1) == ']') && + (NXT(2) == '>')) { + if (cdata) break; + else { + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "Sequence ']]>' not allowed in content\n"); + ctxt->wellFormed = 0; + } + } + NEXT; + } + if (q == CUR_PTR) return; + + /* + * Ok the segment [q CUR_PTR] is to be consumed as chars. + */ + if (ctxt->sax != NULL) { + if (areBlanks(ctxt, q, CUR_PTR - q)) { + if (ctxt->sax->ignorableWhitespace != NULL) + ctxt->sax->ignorableWhitespace(ctxt->userData, q, CUR_PTR - q); + } else { + if (ctxt->sax->characters != NULL) + ctxt->sax->characters(ctxt->userData, q, CUR_PTR - q); + } + } +} + +/** + * htmlParseExternalID: + * @ctxt: an HTML parser context + * @publicID: a CHAR** receiving PubidLiteral + * @strict: indicate whether we should restrict parsing to only + * production [75], see NOTE below + * + * Parse an External ID or a Public ID + * + * NOTE: Productions [75] and [83] interract badly since [75] can generate + * 'PUBLIC' S PubidLiteral S SystemLiteral + * + * [75] ExternalID ::= 'SYSTEM' S SystemLiteral + * | 'PUBLIC' S PubidLiteral S SystemLiteral + * + * [83] PublicID ::= 'PUBLIC' S PubidLiteral + * + * Returns the function returns SystemLiteral and in the second + * case publicID receives PubidLiteral, is strict is off + * it is possible to return NULL and have publicID set. + */ + +CHAR * +htmlParseExternalID(htmlParserCtxtPtr ctxt, CHAR **publicID, int strict) { + CHAR *URI = NULL; + + if ((CUR == 'S') && (NXT(1) == 'Y') && + (NXT(2) == 'S') && (NXT(3) == 'T') && + (NXT(4) == 'E') && (NXT(5) == 'M')) { + SKIP(6); + if (!IS_BLANK(CUR)) { + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "Space required after 'SYSTEM'\n"); + ctxt->wellFormed = 0; + } + SKIP_BLANKS; + URI = htmlParseSystemLiteral(ctxt); + if (URI == NULL) { + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "htmlParseExternalID: SYSTEM, no URI\n"); + ctxt->wellFormed = 0; + } + } else if ((CUR == 'P') && (NXT(1) == 'U') && + (NXT(2) == 'B') && (NXT(3) == 'L') && + (NXT(4) == 'I') && (NXT(5) == 'C')) { + SKIP(6); + if (!IS_BLANK(CUR)) { + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "Space required after 'PUBLIC'\n"); + ctxt->wellFormed = 0; + } + SKIP_BLANKS; + *publicID = htmlParsePubidLiteral(ctxt); + if (*publicID == NULL) { + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "htmlParseExternalID: PUBLIC, no Public Identifier\n"); + ctxt->wellFormed = 0; + } + if (strict) { + /* + * We don't handle [83] so "S SystemLiteral" is required. + */ + if (!IS_BLANK(CUR)) { + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "Space required after the Public Identifier\n"); + ctxt->wellFormed = 0; + } + } else { + /* + * We handle [83] so we return immediately, if + * "S SystemLiteral" is not detected. From a purely parsing + * point of view that's a nice mess. + */ + const CHAR *ptr = CUR_PTR; + if (!IS_BLANK(*ptr)) return(NULL); + + while (IS_BLANK(*ptr)) ptr++; + if ((*ptr != '\'') || (*ptr != '"')) return(NULL); + } + SKIP_BLANKS; + URI = htmlParseSystemLiteral(ctxt); + if (URI == NULL) { + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "htmlParseExternalID: PUBLIC, no URI\n"); + ctxt->wellFormed = 0; + } + } + return(URI); +} + +/** + * htmlParseComment: + * @ctxt: an HTML parser context + * @create: should we create a node, or just skip the content + * + * Parse an XML (SGML) comment + * + * [15] Comment ::= '' + */ +void +htmlParseComment(htmlParserCtxtPtr ctxt, int create) { + const CHAR *q, *start; + const CHAR *r; + CHAR *val; + + /* + * Check that there is a comment right here. + */ + if ((CUR != '<') || (NXT(1) != '!') || + (NXT(2) != '-') || (NXT(3) != '-')) return; + + SKIP(4); + start = q = CUR_PTR; + NEXT; + r = CUR_PTR; + NEXT; + while (IS_CHAR(CUR) && + ((CUR == ':') || (CUR != '>') || + (*r != '-') || (*q != '-'))) { + if ((*r == '-') && (*q == '-')) { + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "Comment must not contain '--' (double-hyphen)`\n"); + ctxt->wellFormed = 0; + } + NEXT;r++;q++; + } + if (!IS_CHAR(CUR)) { + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, "Comment not terminated \n