1998-04-13 20:24:54 +00:00
|
|
|
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
|
|
|
/*
|
|
|
|
* The contents of this file are subject to the Netscape Public License
|
|
|
|
* Version 1.0 (the "NPL"); you may not use this file except in
|
|
|
|
* compliance with the NPL. You may obtain a copy of the NPL at
|
|
|
|
* http://www.mozilla.org/NPL/
|
|
|
|
*
|
|
|
|
* Software distributed under the NPL is distributed on an "AS IS" basis,
|
|
|
|
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the NPL
|
|
|
|
* for the specific language governing rights and limitations under the
|
|
|
|
* NPL.
|
|
|
|
*
|
|
|
|
* The Initial Developer of this code under the NPL is Netscape
|
|
|
|
* Communications Corporation. Portions created by Netscape are
|
|
|
|
* Copyright (C) 1998 Netscape Communications Corporation. All Rights
|
|
|
|
* Reserved.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/**
|
|
|
|
* MODULE NOTES:
|
|
|
|
* @update gess 4/1/98
|
|
|
|
*
|
|
|
|
* This file contains the declarations for all the
|
|
|
|
* HTML specific token types that our HTML tokenizer
|
|
|
|
* delegate understands.
|
|
|
|
*
|
|
|
|
* If you want to add a new kind of token, this is
|
|
|
|
* the place to do it. You should also add a bit of glue
|
|
|
|
* code to the HTML tokenizer delegate class.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef HTMLTOKENS_H
|
|
|
|
#define HTMLTOKENS_H
|
|
|
|
|
|
|
|
#include "nsToken.h"
|
|
|
|
#include <iostream.h>
|
|
|
|
|
|
|
|
// If you define these to true then crlf sequences and whitespace come
|
|
|
|
// through the scanner as separate tokens.
|
|
|
|
#undef TOKENIZE_CRLF
|
|
|
|
#undef TOKENIZE_WHITESPACE
|
|
|
|
|
|
|
|
class CScanner;
|
|
|
|
|
|
|
|
/**
 * Token-type identifiers for the HTML specific tokens declared in this file.
 *
 * Numbering starts at 2000 (eToken_unknown) and every following enumerator
 * is one greater than its predecessor. eToken_last comes after the final
 * real token type.
 */
enum eHTMLTokenTypes {
  eToken_unknown=2000,

  eToken_start, eToken_end, eToken_comment, eToken_entity,
  eToken_whitespace, eToken_newline, eToken_text, eToken_attribute,
  eToken_script, eToken_style, eToken_skippedcontent, //used in cases like <SCRIPT> where we skip over script content.
  eToken_last
};
|
|
|
|
|
|
|
|
//*** This enum is used to define the known universe of HTML tags.
//*** The use of this table doesn't preclude us from using non-standard
//*** tags. It simply makes normal tag handling more efficient.
//*** Values are implicit: eHTMLTag_unknown is 0 and each later enumerator
//*** is one greater than the previous, so inserting or reordering entries
//*** renumbers everything after the change.
enum eHTMLTags
{
  eHTMLTag_unknown=0,   eHTMLTag_doctype,     eHTMLTag_a,         eHTMLTag_acronym,
  eHTMLTag_address,     eHTMLTag_applet,      eHTMLTag_area,      eHTMLTag_bold,
  eHTMLTag_base,        eHTMLTag_basefont,    eHTMLTag_bdo,       eHTMLTag_big,
  eHTMLTag_blink,       eHTMLTag_blockquote,  eHTMLTag_body,      eHTMLTag_br,
  eHTMLTag_button,      eHTMLTag_caption,     eHTMLTag_center,
  eHTMLTag_certificate, eHTMLTag_cite,
  eHTMLTag_code,        eHTMLTag_col,         eHTMLTag_colgroup,  eHTMLTag_dd,
  eHTMLTag_del,         eHTMLTag_dfn,         eHTMLTag_div,       eHTMLTag_dir,
  eHTMLTag_dl,          eHTMLTag_dt,          eHTMLTag_em,        eHTMLTag_embed,
  eHTMLTag_fieldset,    eHTMLTag_font,        eHTMLTag_footer,
  eHTMLTag_form,        eHTMLTag_frame,       eHTMLTag_frameset,
  eHTMLTag_h1,          eHTMLTag_h2,          eHTMLTag_h3,        eHTMLTag_h4,
  eHTMLTag_h5,          eHTMLTag_h6,          eHTMLTag_head,      eHTMLTag_header,
  eHTMLTag_hr,          eHTMLTag_html,        eHTMLTag_iframe,    eHTMLTag_ilayer,
  eHTMLTag_italic,      eHTMLTag_img,         eHTMLTag_ins,       eHTMLTag_input,
  eHTMLTag_isindex,     eHTMLTag_kbd,         eHTMLTag_keygen,
  eHTMLTag_label,       eHTMLTag_layer,       eHTMLTag_legend,    eHTMLTag_listitem,
  eHTMLTag_link,        eHTMLTag_listing,     eHTMLTag_map,       eHTMLTag_marquee,
  eHTMLTag_math,        eHTMLTag_menu,        eHTMLTag_meta,      eHTMLTag_newline,
  eHTMLTag_noembed,     eHTMLTag_noframes,    eHTMLTag_nolayer,   eHTMLTag_noscript,
  eHTMLTag_note,        eHTMLTag_object,      eHTMLTag_ol,
  eHTMLTag_option,      eHTMLTag_paragraph,   eHTMLTag_param,     eHTMLTag_plaintext,
  eHTMLTag_pre,         eHTMLTag_quotation,   eHTMLTag_strike,    eHTMLTag_samp,
  eHTMLTag_script,      eHTMLTag_select,      eHTMLTag_server,    eHTMLTag_small,
  eHTMLTag_spacer,      eHTMLTag_span,
  eHTMLTag_strong,      eHTMLTag_style,       eHTMLTag_sub,       eHTMLTag_sup,
  eHTMLTag_table,       eHTMLTag_tbody,       eHTMLTag_td,        eHTMLTag_tfoot,
  eHTMLTag_thead,       eHTMLTag_th,          eHTMLTag_tr,        eHTMLTag_textarea,
  eHTMLTag_title,       eHTMLTag_tt,          eHTMLTag_monofont,  eHTMLTag_u,
  eHTMLTag_ul,          eHTMLTag_var,         eHTMLTag_wbr,       eHTMLTag_whitespace,
  eHTMLTag_xmp,
  eHTMLTag_userdefined
};
|
|
|
|
|
|
|
|
//*** This enum is used to define the known universe of HTML attributes.
//*** The use of this table doesn't preclude us from using non-standard
//*** attributes. It simply makes normal tag handling more efficient.
//*** No enumerator has an explicit value, so numbering starts at 0
//*** (eHTMLAttr_abbrev) and increases by one per entry.
enum eHTMLAttributes {
  eHTMLAttr_abbrev,     eHTMLAttr_above,      eHTMLAttr_alt,      eHTMLAttr_array,
  eHTMLAttr_author,     eHTMLAttr_background, eHTMLAttr_banner,   eHTMLAttr_below,
  eHTMLAttr_bgsound,    eHTMLAttr_box,        eHTMLAttr_bt,       eHTMLAttr_class,
  eHTMLAttr_comment,    eHTMLAttr_credit,     eHTMLAttr_dir,      eHTMLAttr_figure,
  eHTMLAttr_footnote,   eHTMLAttr_height,     eHTMLAttr_id,       eHTMLAttr_lang,
  eHTMLAttr_math,       eHTMLAttr_name,       eHTMLAttr_nextid,   eHTMLAttr_nobreak,
  eHTMLAttr_note,       eHTMLAttr_option,     eHTMLAttr_overlay,  eHTMLAttr_person,
  eHTMLAttr_public,     eHTMLAttr_range,      eHTMLAttr_root,     eHTMLAttr_sgml,
  eHTMLAttr_sqrt,       eHTMLAttr_src,        eHTMLAttr_style,    eHTMLAttr_text,
  eHTMLAttr_title,      eHTMLAttr_wordbreak,  eHTMLAttr_width,    eHTMLAttr_xmp
};
|
|
|
|
|
|
|
|
PRInt32 ConsumeQuotedString(PRUnichar aChar,nsString& aString,CScanner* aScanner);
|
|
|
|
PRInt32 ConsumeAttributeText(PRUnichar aChar,nsString& aString,CScanner* aScanner);
|
|
|
|
PRInt32 FindEntityIndex(const char* aBuffer,PRInt32 aBufLen=-1);
|
|
|
|
eHTMLTags DetermineHTMLTagType(const nsString& aString);
|
|
|
|
eHTMLTokenTypes DetermineTokenType(const nsString& aString);
|
|
|
|
|
|
|
|
|
1998-04-15 17:27:56 +00:00
|
|
|
/** -----------------------------------------------------
|
|
|
|
* This declares the basic token type used in the html-
|
|
|
|
* parser.
|
|
|
|
*
|
|
|
|
* @update gess 3/25/98
|
|
|
|
*/ //---------------------------------------------------
|
1998-04-13 20:24:54 +00:00
|
|
|
class CHTMLToken : public CToken {
|
1998-04-15 17:27:56 +00:00
|
|
|
public:
|
1998-04-13 20:24:54 +00:00
|
|
|
CHTMLToken(const nsString& aString);
|
1998-04-15 17:27:56 +00:00
|
|
|
virtual eHTMLTags GetHTMLTag();
|
|
|
|
void SetHTMLTag(eHTMLTags aTagType);
|
|
|
|
protected:
|
1998-04-13 20:24:54 +00:00
|
|
|
eHTMLTags mTagType;
|
|
|
|
};
|
|
|
|
|
|
|
|
|
1998-04-15 17:27:56 +00:00
|
|
|
/** -----------------------------------------------------
|
|
|
|
* This declares start tokens, which always take the
|
|
|
|
* form <xxxx>. This class also knows how to consume
|
|
|
|
* related attributes.
|
|
|
|
*
|
|
|
|
* @update gess 3/25/98
|
|
|
|
*/ //---------------------------------------------------
|
1998-04-13 20:24:54 +00:00
|
|
|
class CStartToken: public CHTMLToken {
|
|
|
|
public:
|
|
|
|
CStartToken(const nsString& aString);
|
|
|
|
virtual PRInt32 Consume(PRUnichar aChar,CScanner* aScanner);
|
|
|
|
virtual eHTMLTags GetHTMLTag();
|
1998-04-15 17:27:56 +00:00
|
|
|
virtual const char* GetClassName(void);
|
|
|
|
virtual PRInt32 GetTokenType(void);
|
|
|
|
void SetAttributed(PRBool aValue);
|
|
|
|
PRBool IsAttributed(void);
|
1998-04-13 20:24:54 +00:00
|
|
|
virtual void DebugDumpSource(ostream& out);
|
|
|
|
|
|
|
|
protected:
|
1998-04-15 17:27:56 +00:00
|
|
|
PRBool mAttributed;
|
1998-04-13 20:24:54 +00:00
|
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
1998-04-15 17:27:56 +00:00
|
|
|
/** -----------------------------------------------------
|
|
|
|
* This declares end tokens, which always take the
|
|
|
|
* form </xxxx>. This class also knows how to consume
|
|
|
|
* related attributes.
|
|
|
|
*
|
|
|
|
* @update gess 3/25/98
|
|
|
|
*/ //---------------------------------------------------
|
1998-04-13 20:24:54 +00:00
|
|
|
class CEndToken: public CHTMLToken {
|
|
|
|
public:
|
|
|
|
CEndToken(const nsString& aString);
|
|
|
|
virtual PRInt32 Consume(PRUnichar aChar,CScanner* aScanner);
|
|
|
|
virtual eHTMLTags GetHTMLTag();
|
1998-04-15 17:27:56 +00:00
|
|
|
virtual const char* GetClassName(void);
|
|
|
|
virtual PRInt32 GetTokenType(void);
|
1998-04-13 20:24:54 +00:00
|
|
|
virtual void DebugDumpSource(ostream& out);
|
|
|
|
};
|
|
|
|
|
|
|
|
|
1998-04-15 17:27:56 +00:00
|
|
|
/** -----------------------------------------------------
|
|
|
|
* This declares comment tokens. Comments are usually
|
|
|
|
* thought of as tokens, but we treat them that way
|
|
|
|
* here so that the parser can have a consistent view
|
|
|
|
* of all tokens.
|
|
|
|
*
|
|
|
|
* @update gess 3/25/98
|
|
|
|
*/ //---------------------------------------------------
|
1998-04-13 20:24:54 +00:00
|
|
|
class CCommentToken: public CHTMLToken {
|
|
|
|
public:
|
|
|
|
CCommentToken(const nsString& aString);
|
|
|
|
virtual PRInt32 Consume(PRUnichar aChar,CScanner* aScanner);
|
1998-04-15 00:33:06 +00:00
|
|
|
virtual const char* GetClassName(void);
|
1998-04-15 17:27:56 +00:00
|
|
|
virtual PRInt32 GetTokenType(void);
|
1998-04-13 20:24:54 +00:00
|
|
|
char mLeadingChar;
|
|
|
|
};
|
|
|
|
|
|
|
|
|
1998-04-15 17:27:56 +00:00
|
|
|
/** -----------------------------------------------------
|
|
|
|
* This class declares entity tokens, which always take
|
|
|
|
* the form &xxxx;. This class also offers a few utility
|
|
|
|
* methods that allow you to easily reduce entities.
|
|
|
|
*
|
|
|
|
* @update gess 3/25/98
|
|
|
|
*/ //---------------------------------------------------
|
1998-04-13 20:24:54 +00:00
|
|
|
class CEntityToken : public CHTMLToken {
|
|
|
|
public:
|
|
|
|
CEntityToken(const nsString& aString);
|
1998-04-15 17:27:56 +00:00
|
|
|
virtual const char* GetClassName(void);
|
|
|
|
virtual PRInt32 GetTokenType(void);
|
1998-04-13 20:24:54 +00:00
|
|
|
PRInt32 TranslateToUnicode(void);
|
|
|
|
virtual PRInt32 Consume(PRUnichar aChar,CScanner* aScanner);
|
|
|
|
static PRInt32 ConsumeEntity(PRUnichar aChar,nsString& aString,CScanner* aScanner);
|
|
|
|
static PRInt32 TranslateToUnicode(nsString& aString);
|
|
|
|
static PRInt32 FindEntityIndex(const char* aBuffer,PRInt32 aBufLen=-1);
|
|
|
|
static PRBool VerifyEntityTable(void);
|
|
|
|
static PRInt32 ReduceEntities(nsString& aString);
|
|
|
|
virtual void DebugDumpSource(ostream& out);
|
|
|
|
|
|
|
|
private:
|
|
|
|
static PRInt32 mEntityTokenCount;
|
|
|
|
};
|
|
|
|
|
|
|
|
|
1998-04-15 17:27:56 +00:00
|
|
|
/** -----------------------------------------------------
 *  Whitespace tokens are used where whitespace can be
 *  detected as distinct from text. This allows us to
 *  easily skip leading/trailing whitespace when desired.
 *
 *  The class is only compiled when TOKENIZE_WHITESPACE is
 *  defined; this header #undefs that macro near the top,
 *  so as written the declaration is disabled.
 *
 *  @update  gess 3/25/98
 */ //---------------------------------------------------
#ifdef TOKENIZE_WHITESPACE
class CWhitespaceToken: public CHTMLToken {
  public:
                        CWhitespaceToken(const nsString& aString);

    virtual PRInt32     Consume(PRUnichar aChar,CScanner* aScanner);
    virtual const char* GetClassName(void);
    virtual PRInt32     GetTokenType(void);
};
#endif
|
|
|
|
|
1998-04-15 17:27:56 +00:00
|
|
|
/** -----------------------------------------------------
|
|
|
|
* Text tokens contain the normalized form of html text.
|
|
|
|
* These tokens are guaranteed not to contain entities,
|
|
|
|
* start or end tags, or newlines.
|
|
|
|
*
|
|
|
|
* @update gess 3/25/98
|
|
|
|
*/ //---------------------------------------------------
|
1998-04-13 20:24:54 +00:00
|
|
|
class CTextToken: public CHTMLToken {
|
|
|
|
public:
|
|
|
|
CTextToken(const nsString& aString);
|
|
|
|
virtual PRInt32 Consume(PRUnichar aChar,CScanner* aScanner);
|
1998-04-15 17:27:56 +00:00
|
|
|
virtual const char* GetClassName(void);
|
|
|
|
virtual PRInt32 GetTokenType(void);
|
1998-04-13 20:24:54 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
|
1998-04-15 17:27:56 +00:00
|
|
|
/** -----------------------------------------------------
|
|
|
|
* Attribute tokens are used to contain attribute key/value
|
|
|
|
* pairs whereever they may occur. Typically, they should
|
|
|
|
* occur only in start tokens. However, we may expand that
|
|
|
|
* ability when XML tokens become commonplace.
|
|
|
|
*
|
|
|
|
* @update gess 3/25/98
|
|
|
|
*/ //---------------------------------------------------
|
1998-04-13 20:24:54 +00:00
|
|
|
class CAttributeToken: public CHTMLToken {
|
|
|
|
public:
|
|
|
|
CAttributeToken(const nsString& aString);
|
|
|
|
virtual PRInt32 Consume(PRUnichar aChar,CScanner* aScanner);
|
1998-04-15 17:27:56 +00:00
|
|
|
virtual const char* GetClassName(void);
|
|
|
|
virtual PRInt32 GetTokenType(void);
|
1998-04-13 20:24:54 +00:00
|
|
|
virtual nsString& GetKey(void) {return mTextKey;}
|
|
|
|
virtual void DebugDumpToken(ostream& out);
|
|
|
|
virtual void DebugDumpSource(ostream& out);
|
|
|
|
PRBool mLastAttribute;
|
|
|
|
|
|
|
|
protected:
|
|
|
|
nsString mTextKey;
|
|
|
|
};
|
|
|
|
|
|
|
|
|
1998-04-15 17:27:56 +00:00
|
|
|
/** -----------------------------------------------------
 *  Newline tokens contain, you guessed it, newlines.
 *  They consume newline (CR/LF) either alone or in pairs.
 *
 *  The class is only compiled when TOKENIZE_CRLF is
 *  defined; this header #undefs that macro near the top,
 *  so as written the declaration is disabled.
 *
 *  @update  gess 3/25/98
 */ //---------------------------------------------------
#ifdef TOKENIZE_CRLF
class CNewlineToken: public CHTMLToken {
  public:
                        CNewlineToken(const nsString& aString);

    virtual PRInt32     Consume(PRUnichar aChar,CScanner* aScanner);
    virtual const char* GetClassName(void);
    virtual PRInt32     GetTokenType(void);
};
#endif
|
|
|
|
|
|
|
|
|
1998-04-15 17:27:56 +00:00
|
|
|
/** -----------------------------------------------------
|
|
|
|
* Script tokens contain sequences of javascript (or, gulp,
|
|
|
|
* any other script you care to send). We don't tokenize
|
|
|
|
* it here, nor validate it. We just wrap it up, and pass
|
|
|
|
* it along to the html parser, who sends it (later on)
|
|
|
|
* to the scripting engine.
|
|
|
|
*
|
|
|
|
* @update gess 3/25/98
|
|
|
|
*/ //---------------------------------------------------
|
1998-04-13 20:24:54 +00:00
|
|
|
class CScriptToken: public CHTMLToken {
|
|
|
|
public:
|
|
|
|
|
|
|
|
CScriptToken(const nsString& aString);
|
1998-04-15 17:27:56 +00:00
|
|
|
virtual const char* GetClassName(void);
|
|
|
|
virtual PRInt32 GetTokenType(void);
|
1998-04-13 20:24:54 +00:00
|
|
|
protected:
|
|
|
|
};
|
|
|
|
|
|
|
|
|
1998-04-15 17:27:56 +00:00
|
|
|
/** -----------------------------------------------------
|
|
|
|
* Style tokens contain sequences of css style. We don't
|
|
|
|
* tokenize it here, nor validate it. We just wrap it up,
|
|
|
|
* and pass it along to the html parser, who sends it
|
|
|
|
* (later on) to the style engine.
|
|
|
|
*
|
|
|
|
* @update gess 3/25/98
|
|
|
|
*/ //---------------------------------------------------
|
1998-04-13 20:24:54 +00:00
|
|
|
class CStyleToken: public CHTMLToken {
|
|
|
|
public:
|
|
|
|
CStyleToken(const nsString& aString);
|
1998-04-15 17:27:56 +00:00
|
|
|
virtual const char* GetClassName(void);
|
|
|
|
virtual PRInt32 GetTokenType(void);
|
1998-04-13 20:24:54 +00:00
|
|
|
protected:
|
|
|
|
};
|
|
|
|
|
|
|
|
|
1998-04-15 17:27:56 +00:00
|
|
|
/** -----------------------------------------------------
|
|
|
|
* This is a placeholder token, which is being deprecated.
|
|
|
|
* Don't bother paying attention to this.
|
|
|
|
*
|
|
|
|
* @update gess 3/25/98
|
|
|
|
*/ //---------------------------------------------------
|
1998-04-13 20:24:54 +00:00
|
|
|
class CSkippedContentToken: public CAttributeToken {
|
|
|
|
public:
|
|
|
|
CSkippedContentToken(const nsString& aString);
|
|
|
|
virtual PRInt32 Consume(PRUnichar aChar,CScanner* aScanner);
|
1998-04-15 17:27:56 +00:00
|
|
|
virtual const char* GetClassName(void);
|
|
|
|
virtual PRInt32 GetTokenType(void);
|
1998-04-13 20:24:54 +00:00
|
|
|
protected:
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|