mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-10-31 06:05:44 +00:00
251 lines
9.7 KiB
C++
251 lines
9.7 KiB
C++
/*
|
|
*****************************************************************************************
|
|
* *
|
|
* COPYRIGHT: *
|
|
* (C) Copyright Taligent, Inc., 1997 *
|
|
* (C) Copyright International Business Machines Corporation, 1997 *
|
|
* Licensed Material - Program-Property of IBM - All Rights Reserved. *
|
|
* US Government Users Restricted Rights - Use, duplication, or disclosure *
|
|
* restricted by GSA ADP Schedule Contract with IBM Corp. *
|
|
* *
|
|
*****************************************************************************************
|
|
*
|
|
* File READERS.H
|
|
*
|
|
* Contains support classes for the resource-bundle code
|
|
* This file contains a group of support classes that are used by the resource-bundle code.
|
|
* These classes are as follows:
|
|
* UnicodeStreamReader - A small wrapper class around a stdio FILE* stream that allows it to read either ASCII
|
|
* or Unicode data.
|
|
* UnicodeStreamWriter - A small wrapper class around ostream that allows it to write Unicode data
|
|
* ResourceFormatReader - A class that parses the low-level special characters in our resource
|
|
* definition file format
|
|
*
|
|
* @author Richard Gillam
|
|
*
|
|
* Modification History:
|
|
*
|
|
* Date Name Description
|
|
* 3/4/97 aliu Modified to support more efficient DataSink class as
|
|
* an alternative to ostream objects.
|
|
* 3/13/97 aliu Added getNextToken() and supporting methods to enable
|
|
* tokenization and parsing of file from front to back.
|
|
* 3/18/97 aliu Changed getNextToken() to getSingleToken() and wrote
|
|
* a new getNextToken() which merges adjacent strings.
|
|
* 3/20/97 aliu Removed obsolete classes to read tagged and comma-
|
|
* delimited lists (now handled by getNextToken()), and
|
|
* commented out unused classes UnicodeStreamWriter and
|
|
* UnicodeDataSinkWriter.
|
|
* 3/25/97 aliu Cleaned up code.
|
|
*
|
|
*****************************************************************************************
|
|
*/
|
|
|
|
#ifndef _READERS
|
|
#define _READERS
|
|
|
|
#ifndef _PTYPES
|
|
#include "ptypes.h"
|
|
#endif
|
|
|
|
//#include "datasink.h"
|
|
#include <stdio.h>
|
|
class UnicodeString;
|
|
|
|
/**
 * Status codes used with the "short err" parameters of the reader classes
 * declared in this header (for example, kNoErr is the default value of the
 * err argument of UnicodeStreamReader::putback()).
 *
 * The enumerators take consecutive values starting at zero.
 */
enum {
    kNoErr = 0,     // No error
    kEofOnRead,     // presumably: end of file hit while reading -- confirm in implementation
    kEofOnWrite,    // presumably: end of file hit while writing -- confirm in implementation
    kItemNotFound   // Reported by ResourceFormatReader::getNextToken() once all tokens are consumed
};
|
|
|
|
//========================================================================================
|
|
// UnicodeStreamReader
|
|
//========================================================================================
|
|
/** Wrapper around a stdio stream (FILE*) for reading Unicode data
|
|
* This class wraps a stdio FILE* stream and allows us to read Unicode character data. The stream
|
|
* may actually be in either ASCII or Unicode format, but this class always returns
|
|
* Unicode characters. The caller can pass the following values for "format":
|
|
* kASCII - Incoming data is ASCII; zero-pad everything out to 16 bits to get Unicode
|
|
* kBigEndianUnicode - Incoming data is Unicode, and the most significant byte
|
|
* of each character comes first
|
|
* kLittleEndianUnicode - Incoming data is Unicode, and the least significant byte
|
|
* of each character comes first
|
|
* kAuto - Infer the character format from the incoming data. This relies on the
|
|
* "official" Unicode text file format: A file containing Unicode starts with the
|
|
* Unicode byte order mark ($FEFF). If we read something else, the file is assumed to
|
|
* be ASCII. If it's $FEFF or $FFFE, we know it's Unicode and can infer the byte
|
|
* ordering we need to use.
|
|
* kDefault - Incoming data is Unicode, and whatever byte ordering the system we're
|
|
* running on uses internally is the byte ordering we're using (used for memory streams).
|
|
*/
|
|
|
|
#ifdef NLS_MAC
|
|
#pragma export on
|
|
#endif
|
|
|
|
class UnicodeStreamReader {
|
|
public:
|
|
enum CharFormat {
|
|
kASCII,
|
|
kBigEndianUnicode,
|
|
kLittleEndianUnicode,
|
|
kAuto,
|
|
kDefault
|
|
};
|
|
|
|
UnicodeStreamReader( FILE* stream,
|
|
CharFormat format);
|
|
~UnicodeStreamReader();
|
|
|
|
void reset();
|
|
|
|
UniChar get(short& err);
|
|
void putback(UniChar theChar,
|
|
short err = kNoErr);
|
|
|
|
|
|
enum Endian {
|
|
kBig,
|
|
kLittle,
|
|
kUnknown
|
|
};
|
|
|
|
protected:
|
|
static Endian fgEndian;
|
|
|
|
private:
|
|
static void determineEndianism();
|
|
|
|
FILE* fStream;
|
|
CharFormat fFormat;
|
|
UniChar fPutback;
|
|
};
|
|
|
|
//========================================================================================
|
|
// ResourceFormatReader
|
|
//========================================================================================
|
|
/**
|
|
* Class for reading information from a file in our resource-definition format.
|
|
* This takes care of interpreting (and when necessary disregarding) the extra stuff
|
|
* we allow people to put into the file to make it human-readable. The special characters
|
|
* we allow in resource files are as follows:
|
|
* / * Begins a comment, which is terminated by * / (The spaces in these tokens aren't
|
|
* really there; I inserted them to keep the C++ compiler from seeing them as
|
|
* comment delimiters itself; this is standard C++/Java comment syntax). These
|
|
* comments do not nest.
|
|
* // Begins a comment that terminates at the end of the line.
|
|
* " begins and ends a quoted string. Within a quoted string characters that would
|
|
* otherwise have special meaning (except for backslash escape sequences) don't.
|
|
* \ Begins an escape sequence. The following escape sequences are possible:
|
|
* \n Line feed
|
|
* \t Tab
|
|
* \x## ASCII (Latin1) character. May be followed by one or two hex digits that
|
|
* specify the actual character value (if there are no hex digits, or if
|
|
* the value would be 0, the \x sequence is ignored)
|
|
* \u#### Unicode character. May be followed by up to four hex digits that specify
|
|
* the actual character value (if there are no hex digits, or if the value
|
|
* would be 0, the \u sequence is ignored)
|
|
* \ Backslash before any other character deprives that character of a special
|
|
* meaning, if it had a special meaning. Thus, \\ represents a backslash,
|
|
* and \" can be used to put a quote into a quoted string.
|
|
* In addition, whitespace characters (spaces, tabs, line feeds, carriage returns, and
|
|
* Unicode paragraph separators) are ignored, unless they occur within quoted strings.
|
|
* Adjacent string literals are merged together, with a single intervening space, unless
|
|
* both are quoted strings, in which case no space is added between them.
|
|
*/
|
|
class ResourceFormatReader {
|
|
public:
|
|
ResourceFormatReader( FILE* stream,
|
|
UnicodeStreamReader::CharFormat format);
|
|
~ResourceFormatReader();
|
|
|
|
/**
|
|
* The types of tokens which may be returned by getNextToken.
|
|
*/
|
|
enum ETokenType
|
|
{
|
|
kString, // A string token, such as "MonthNames"
|
|
kOpenBrace, // An opening brace character
|
|
kCloseBrace, // A closing brace character
|
|
kComma, // A comma
|
|
|
|
kEOF, // End of the file has been reached successfully
|
|
kError, // An error, such an unterminated quoted string
|
|
kTokenTypeCount = 4 // Number of "real" token types
|
|
};
|
|
|
|
/**
|
|
* Read and return the next token from the stream. If the token is
|
|
* of type kString, fill in the stringToken parameter with the
|
|
* token. If the token is kError, then the err parameter will contain
|
|
* the specific error. This will be kItemNotFound at the end of file,
|
|
* indicating that all tokens have been returned. This method will
|
|
* never return kString twice in a row; instead, multiple adjacent string
|
|
* tokens will be merged into one, with a single intervening space, unless
|
|
* both token are quoted strings, in which case no intervening space is
|
|
* added.
|
|
*
|
|
* @param stringToken Fill in parameter to receive value of string
|
|
* token, if the return value is kString.
|
|
* @param err Fill in parameter to receive error code,
|
|
* if the return value is kError. After the
|
|
* last token is returned, this will be set to
|
|
* kItemNotFound, and kError will be returned.
|
|
* Any other value indicates an abnormal error.
|
|
* @return The type of the next token. This will be either
|
|
* kString, kOpenBrace, kCloseBrace, kComma, or
|
|
* kError. It will never be kNull.
|
|
*/
|
|
ETokenType getNextToken( UnicodeString& stringToken,
|
|
short& err);
|
|
|
|
/**
|
|
* Reset to the start of the input stream. After calling this method,
|
|
* the next call to getNextToken() will return the first token in the
|
|
* stream (if there is one).
|
|
*/
|
|
void reset();
|
|
|
|
protected:
|
|
/**
|
|
* Retrieve the next character, ignoring comments. If skipwhite is true,
|
|
* whitespace is skipped as well.
|
|
*/
|
|
UniChar getNextChar(t_bool skipwhite, short& err);
|
|
|
|
ETokenType getStringToken(UniChar initialChar,
|
|
UnicodeString& stringToken,
|
|
short& err);
|
|
|
|
void seekUntilNewline(short& err);
|
|
|
|
void seekUntilEndOfComment(short& err);
|
|
|
|
UniChar convertEscapeSequence(short& err);
|
|
|
|
static t_bool isWhitespace(UniChar c);
|
|
|
|
static t_bool isNewline(UniChar c);
|
|
|
|
static t_bool isHexDigit(UniChar c);
|
|
|
|
// Special characters we recognize during processing
|
|
static const UniChar kOPENBRACE;
|
|
static const UniChar kCLOSEBRACE;
|
|
static const UniChar kCOMMA;
|
|
static const UniChar kQUOTE;
|
|
static const UniChar kESCAPE;
|
|
static const UniChar kSLASH;
|
|
static const UniChar kASTERISK;
|
|
static const UniChar kSPACE;
|
|
|
|
UnicodeStreamReader fReader;
|
|
};
|
|
|
|
#ifdef NLS_MAC
|
|
#pragma export off
|
|
#endif
|
|
|
|
#endif
|