gecko-dev/modules/libnls/headers/readers.h

/*
*****************************************************************************************
*                                                                                       *
* COPYRIGHT:                                                                            *
*   (C) Copyright Taligent, Inc.,  1997                                                 *
*   (C) Copyright International Business Machines Corporation,  1997                    *
*   Licensed Material - Program-Property of IBM - All Rights Reserved.                  *
*   US Government Users Restricted Rights - Use, duplication, or disclosure             *
*   restricted by GSA ADP Schedule Contract with IBM Corp.                              *
*                                                                                       *
*****************************************************************************************
*
* File READERS.H
*
* Contains support classes for the resource-bundle code
* This file contains a group of support classes that are used by the resource-bundle code.
* These classes are as follows:
*   UnicodeStreamReader - A small wrapper class around a stdio FILE* stream that allows it to
*     read either ASCII or Unicode data.
*   UnicodeStreamWriter - A small wrapper class around ostream that allows it to write Unicode data
*   ResourceFormatReader - A class that parses the low-level special characters in our resource
*     definition file format
*
* @author		Richard Gillam
*
* Modification History:
*
*	Date		Name		Description
*	3/4/97		aliu		Modified to support more efficient DataSink class as
*							an alternative to ostream objects.
*	3/13/97		aliu		Added getNextToken() and supporting methods to enable
*							tokenization and parsing of file from front to back.
*	3/18/97		aliu		Changed getNextToken() to getSingleToken() and wrote
*							a new getNextToken() which merges adjacent strings.
*	3/20/97		aliu		Removed obsolete classes to read tagged and comma-
*							delimited lists (now handled by getNextToken()), and
*							commented out unused classes UnicodeStreamWriter and
*							UnicodeDataSinkWriter.
*	3/25/97		aliu		Cleaned up code.
*
*****************************************************************************************
*/

#ifndef _READERS
#define _READERS

#ifndef _PTYPES
#include "ptypes.h"
#endif

//#include "datasink.h"
#include <stdio.h>
class UnicodeString;

// Status codes carried by the "short err" parameters of the reader classes
// declared in this header.  The numeric values are spelled out explicitly;
// they are the same values the implicit numbering would assign.
enum {
	kNoErr = 0,			// no error (also the default "err" argument of putback())
	kEofOnRead = 1,		// NOTE(review): presumably end of input hit while reading -- confirm
	kEofOnWrite = 2,	// NOTE(review): presumably end of output hit while writing -- confirm
	kItemNotFound = 3	// documented on getNextToken() as the code set once all tokens are returned
};

//========================================================================================
// UnicodeStreamReader
//========================================================================================
/** Wrapper around a stdio FILE* stream for reading Unicode data
 *  This class wraps a FILE* and allows us to read Unicode character data.  The stream
 *  may actually be in either ASCII or Unicode format, but this class always returns
 *  Unicode characters.  The caller can pass the following values for "format":
 *    kASCII - Incoming data is ASCII; zero-pad everything out to 16 bits to get Unicode
 *    kBigEndianUnicode - Incoming data is Unicode, and the most significant byte
 *      of each character comes first
 *    kLittleEndianUnicode - Incoming data is Unicode, and the least significant byte
 *      of each character comes first
 *    kAuto - Infer the character format from the incoming data.  This relies on the
 *      "official" Unicode text file format:  A file containing Unicode starts with the
 *      Unicode byte order mark ($FEFF).  If we read something else, the file is assumed to
 *      be ASCII.  If it's $FEFF or $FFFE, we know it's Unicode and can infer the byte
 *      ordering we need to use.
 *    kDefault - Incoming data is Unicode, and whatever byte ordering the system we're
 *      running on uses internally is the byte ordering we're using (used for memory streams).
 */

#ifdef NLS_MAC
#pragma export on
#endif

class UnicodeStreamReader {
	public:
		/**
		 * Encoding of the data held in the underlying stream.
		 */
		enum CharFormat {
			// Incoming data is ASCII; each character is zero-padded to 16 bits
			kASCII,
			// Incoming data is Unicode, most significant byte of each character first
			kBigEndianUnicode,
			// Incoming data is Unicode, least significant byte of each character first
			kLittleEndianUnicode,
			// Infer the format from a leading byte order mark ($FEFF / $FFFE);
			// anything else is treated as ASCII
			kAuto,
			// Incoming data is Unicode in the byte order of the system we are running on
			kDefault
		};

		/**
		 * Create a reader over "stream" whose contents are interpreted
		 * according to "format".
		 * NOTE(review): this header does not show whether the reader takes
		 * ownership of (or ever closes) the FILE* -- confirm against the
		 * implementation.
		 *
		 * @param stream	The stdio stream to read from.
		 * @param format	How the bytes in the stream are encoded.
		 */
                                    UnicodeStreamReader(    FILE*       stream,
                                                            CharFormat  format);
									~UnicodeStreamReader();

		/**
		 * NOTE(review): presumably repositions the reader at the start of the
		 * stream -- the behavior is not visible in this header; confirm.
		 */
		void						reset();

		/**
		 * Return the next character from the stream as a UniChar.
		 *
		 * @param err	Output parameter for a status code.
		 *				NOTE(review): presumably kNoErr on success and
		 *				kEofOnRead at end of input -- confirm.
		 */
		UniChar						get(short&	err);

		/**
		 * Hand a character back to the reader.
		 * Note that "err" is taken by value (defaulting to kNoErr), so unlike
		 * get() this method cannot report a status to the caller through it.
		 * NOTE(review): what the reader does with "err" is not visible in
		 * this header -- confirm.
		 *
		 * @param theChar	The character to put back.
		 * @param err		Status code accompanying the character.
		 */
		void						putback(UniChar		theChar,
											short		err = kNoErr);


		/**
		 * Byte orderings.  This enum is declared in the public section.
		 */
		enum Endian {
			kBig,
			kLittle,
			kUnknown
		};

	protected:
		// Class-wide (static) byte-order value; defined out of line.
		// NOTE(review): presumably the byte order of the host, filled in by
		// determineEndianism() -- confirm.
		static Endian				fgEndian;

	private:
		// NOTE(review): presumably detects the host byte order and records it
		// in fgEndian -- confirm against the implementation.
		static void					determineEndianism();

		// The stdio stream handed to the constructor.
		FILE*						fStream;
		// The character format handed to the constructor.
		// NOTE(review): may be narrowed to a concrete format when kAuto is
		// requested -- confirm.
		CharFormat					fFormat;
		// Storage for a single UniChar.
		// NOTE(review): presumably the character given to putback() -- confirm.
		UniChar						fPutback;
};

//========================================================================================
// ResourceFormatReader
//========================================================================================
/**
 * Class for reading information from a file in our resource-definition format.
 * This takes care of interpreting (and when necessary disregarding) the extra stuff
 * we allow people to put into the file to make it human-readable.  The special characters
 * we allow in resource files are as follows:
 *   / * Begins a comment, which is terminated by * / (The spaces in these tokens aren't
 *		really there; I inserted them to keep the C++ compiler from seeing them as
 *		comment delimiters itself; this is standard C++/Java comment syntax).  These
 *		comments do not nest.
 *   // Begins a comment that terminates at the end of the line.
 *   "  begins and ends a quoted string.  Within a quoted string characters that would
 *      otherwise have special meaning (except for backslash escape sequences) don't.
 *   \  Begins an escape sequence.  The following escape sequences are possible:
 *      \n     Line feed
 *      \t     Tab
 *      \x##   ASCII (Latin1) character.  May be followed by one or two hex digits that
 *             specify the actual character value (if there are no hex digits, or if
 *             the value would be 0, the \x sequence is ignored)
 *      \u#### Unicode character.  May be followed by up to four hex digits that specify
 *             the actual character value (if there are no hex digits, or if the value
 *             would be 0, the \u sequence is ignored)
 *      \      Backslash before any other character deprives that character of a special
 *             meaning, if it had a special meaning.  Thus, \\ represents a backslash,
 *             and \" can be used to put a quote into a quoted string.
 * In addition, whitespace characters (spaces, tabs, line feeds, carriage returns, and
 * Unicode paragraph separators) are ignored, unless they occur within quoted strings.
 * Adjacent string literals are merged together, with a single intervening space, unless
 * both are quoted strings, in which case no space is added between them.
 */
class ResourceFormatReader {
public:
	/**
	 * Create a reader over "stream", whose contents are encoded as described
	 * by "format".  Both arguments have the same types as the arguments of
	 * the UnicodeStreamReader constructor; this class holds a
	 * UnicodeStreamReader by value (fReader).
	 *
	 * @param stream	The stdio stream containing the resource definitions.
	 * @param format	The character encoding of the stream.
	 */
                                ResourceFormatReader(   FILE*                           stream,
                                                        UnicodeStreamReader::CharFormat format);
								~ResourceFormatReader();

	/**
	 * The types of tokens which may be returned by getNextToken.
	 * Numeric values: kString = 0, kOpenBrace = 1, kCloseBrace = 2,
	 * kComma = 3, kEOF = 4, kError = 5.  kTokenTypeCount is explicitly 4,
	 * so it has the same numeric value as kEOF.
	 */
	enum ETokenType
	{
		kString,				// A string token, such as "MonthNames"
		kOpenBrace,				// An opening brace character
		kCloseBrace,			// A closing brace character
		kComma,					// A comma

		kEOF,					// End of the file has been reached successfully
		kError,					// An error, such as an unterminated quoted string
		kTokenTypeCount = 4		// Number of "real" token types (kString..kComma)
	};

	/**
	 * Read and return the next token from the stream.  If the token is
	 * of type kString, fill in the stringToken parameter with the
	 * token.  If the token is kError, then the err parameter will contain
	 * the specific error.  This will be kItemNotFound at the end of file,
	 * indicating that all tokens have been returned.  This method will
	 * never return kString twice in a row; instead, multiple adjacent string
	 * tokens will be merged into one, with a single intervening space, unless
	 * both tokens are quoted strings, in which case no intervening space is
	 * added.
	 *
	 * NOTE(review): ETokenType also declares kEOF ("end of the file has been
	 * reached successfully"), while this comment describes end of input as
	 * kError with err == kItemNotFound.  Confirm against the implementation
	 * which of the two is actually returned.
	 *
	 * @param stringToken	Fill in parameter to receive value of string
	 *						token, if the return value is kString.
	 * @param err			Fill in parameter to receive error code,
	 *						if the return value is kError.  After the
	 *						last token is returned, this will be set to
	 *						kItemNotFound, and kError will be returned.
	 *						Any other value indicates an abnormal error.
	 * @return				The type of the next token.  This will be either
	 *						kString, kOpenBrace, kCloseBrace, kComma, or
	 *						kError.  (This comment previously added "It will
	 *						never be kNull", but ETokenType declares no kNull
	 *						enumerator; presumably kEOF was meant -- confirm.)
	 */
	ETokenType					getNextToken(	UnicodeString&	stringToken,
												short&			err);

	/**
	 * Reset to the start of the input stream.  After calling this method,
	 * the next call to getNextToken() will return the first token in the
	 * stream (if there is one).
	 */
	void						reset();

protected:
	/**
	 * Retrieve the next character, ignoring comments.  If skipwhite is true,
	 * whitespace is skipped as well.
	 *
	 * @param skipwhite	Whether whitespace should be skipped.
	 * @param err		Output parameter for a status code.
	 */
	UniChar						getNextChar(t_bool skipwhite, short& err);

	/**
	 * NOTE(review): presumably reads one string token whose first character
	 * is initialChar into stringToken and returns its token type -- the
	 * behavior is not visible in this header; confirm.
	 */
	ETokenType					getStringToken(UniChar			initialChar,
											   UnicodeString&	stringToken,
											   short&			err);

	// NOTE(review): presumably consumes the rest of a "//" comment, up to the
	// end of the line -- confirm.
	void						seekUntilNewline(short&	err);

	// NOTE(review): presumably consumes input up to the terminator of a
	// slash-asterisk comment -- confirm.
	void						seekUntilEndOfComment(short& err);

	// NOTE(review): presumably reads the characters following a backslash and
	// returns the character the escape sequence stands for -- confirm.
	UniChar						convertEscapeSequence(short& err);

	// Character classification helpers.  They are static, so they do not
	// depend on the state of any particular reader.
	static t_bool					isWhitespace(UniChar c);

	static t_bool					isNewline(UniChar c);

	static t_bool					isHexDigit(UniChar c);

	// Special characters we recognize during processing
	// (declared here without initializers; the values are defined out of line)
	static const UniChar		kOPENBRACE;
	static const UniChar		kCLOSEBRACE;
	static const UniChar		kCOMMA;
	static const UniChar		kQUOTE;
	static const UniChar		kESCAPE;
	static const UniChar		kSLASH;
	static const UniChar		kASTERISK;
	static const UniChar		kSPACE;

	// The character-level reader, held by value.
	UnicodeStreamReader			fReader;
};

#ifdef NLS_MAC
#pragma export off
#endif

#endif