gecko-dev/xpcom/ds/nsStringTokenizer.cpp
1999-07-15 06:40:44 +00:00

552 lines
13 KiB
C++
Raw Blame History

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/*
* The contents of this file are subject to the Netscape Public License
* Version 1.0 (the "NPL"); you may not use this file except in
* compliance with the NPL. You may obtain a copy of the NPL at
* http://www.mozilla.org/NPL/
*
* Software distributed under the NPL is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the NPL
* for the specific language governing rights and limitations under the
* NPL.
*
* The Initial Developer of this code under the NPL is Netscape
* Communications Corporation. Portions created by Netscape are
* Copyright (C) 1998 Netscape Communications Corporation. All Rights
* Reserved.
*/
#include "nsStringTokenizer.h"
/**
 * Builds a tokenizer that uses the given separators and has no token
 * spec yet. The four delimiter strings start out empty, no buffer is
 * attached, the read offset is 0, words 0..3 of both character bitmaps
 * are cleared, and the char spec is eGivenChars.
 *
 * @param aFieldSep   initial contents of mFieldSeparator
 * @param aRecordSep  initial contents of mRecordSeparator
 */
nsStringTokenizer::nsStringTokenizer(const char* aFieldSep,const char* aRecordSep)
  : mDataStartDelimiter(""),
    mDataEndDelimiter(""),
    mSubstrStartDelimiter(""),
    mSubstrEndDelimiter(""),
    mFieldSeparator(aFieldSep),
    mRecordSeparator(aRecordSep)
{
  //start with both bitmaps empty (words 0..3 of each)
  PRInt32 theWord;
  for(theWord=0;theWord<4;theWord++) {
    mValidChars[theWord]=0;
    mInvalidChars[theWord]=0;
  }
  mCharSpec=eGivenChars;
  mOffset=0;
  mBuffer=0;
}
/**
 * Destructor. The body is empty: no explicit cleanup is performed here,
 * and in particular the attached buffer pointer is not deleted.
 */
nsStringTokenizer::~nsStringTokenizer()
{
}
/**
 * Attaches the string that later tokenizing calls will read from.
 * Only the address of aBuffer is remembered; the string is not copied.
 * The current read offset is left as it is.
 *
 * @update gess7/10/99
 * @param  aBuffer is the string to be tokenized
 */
void nsStringTokenizer::SetBuffer(nsString& aBuffer) {
  nsString* theTarget=&aBuffer;
  mBuffer=theTarget;
}
/**
 * Adds a token specifier to this tokenizer by handing it to
 * ExpandDataSpecifier. A null specifier is ignored.
 * (Original author's note: ultimately this should be callable any
 * number of times so that multiple token types can be described.)
 *
 * @update gess7/10/99
 * @param  aTokenSpec is the specifier string, or null
 */
void nsStringTokenizer::AddTokenSpec(const char* aTokenSpec) {
  if(!aTokenSpec) {
    return; //nothing to add
  }
  ExpandDataSpecifier(aTokenSpec);
}
/**
 * Tells whether a given char counts as token data under the current
 * char spec:
 *   eGivenChars  -- PR_TRUE only if the char's bit is set in mValidChars
 *                   (32 chars per word); chars that fall beyond the end
 *                   of the bitmap are reported as not valid.
 *   eAllChars    -- always PR_TRUE.
 *   eExceptChars -- always PR_FALSE; mInvalidChars is not consulted here.
 *
 * @update gess7/10/99
 * @param  aChar is the char to be tested
 * @return PR_TRUE if aChar is valid data
 */
inline PRBool nsStringTokenizer::IsValidDataChar(PRUnichar aChar) {
  PRBool result=PR_FALSE;
  switch(mCharSpec) {
    case eGivenChars:
      {
        const PRInt32 theWordCount=PRInt32(sizeof(mValidChars)/sizeof(mValidChars[0]));
        const PRInt32 theWordNum=aChar/32;
        //a PRUnichar can be far larger than the bitmap; never index past it
        if(theWordNum<theWordCount) {
          const PRInt32 theBitNum=aChar%32;
          //shift as unsigned so that bit 31 does not overflow a signed int
          const PRInt32 theMask=PRInt32(1u<<theBitNum);
          //test against zero, not >0: the mask for bit 31 is negative
          result=PRBool(0!=(mValidChars[theWordNum]&theMask));
        }
      }
      break;
    case eAllChars:
      result=PR_TRUE;
      break;
    case eExceptChars:
      break;
  }
  return result;
}
/**
 * Turns on the bit for every char in the inclusive range aStart..aStop
 * in a bitmap that stores 32 chars per PRInt32 word.
 *
 * Chars above 127 are ignored so that the write can never run past a
 * four-word bitmap. The bitmaps passed in from this file are the
 * tokenizer's mValidChars/mInvalidChars, whose words 0..3 are the ones
 * the constructor clears.
 * NOTE(review): confirm the declared array size in the header. The [3]
 * in the signature is not enforced by the compiler (the parameter is a
 * plain pointer).
 *
 * @param array   bitmap to be updated
 * @param aStart  first char of the range
 * @param aStop   last char of the range; nothing happens if aStop<aStart
 */
inline void SetChars(PRInt32 array[3],PRUnichar aStart,PRUnichar aStop){
  const PRInt32 kLastChar=127;  //4 words * 32 bits - 1
  PRInt32 theLast=aStop;
  if(theLast>kLastChar) {
    theLast=kLastChar;
  }
  PRInt32 theChar;
  for(theChar=aStart;theChar<=theLast;theChar++){
    const PRInt32 theWordNum=theChar/32;
    const PRInt32 theBitNum=theChar%32;
    //shift as unsigned so that bit 31 does not overflow a signed int
    array[theWordNum]|=PRInt32(1u<<theBitNum);
  }
}
/**
 * Turns off the bit for every char in the inclusive range aStart..aStop
 * in a bitmap that stores 32 chars per PRInt32 word.
 *
 * Chars above 127 are ignored so that the write can never run past a
 * four-word bitmap.
 * NOTE(review): the four-word size is taken from the constructor, which
 * clears words 0..3 of the tokenizer's bitmaps; confirm against the
 * header. The [3] in the signature is not enforced by the compiler.
 *
 * @param array   bitmap to be updated
 * @param aStart  first char of the range
 * @param aStop   last char of the range; nothing happens if aStop<aStart
 */
inline void ClearChars(PRInt32 array[3],PRUnichar aStart,PRUnichar aStop){
  const PRInt32 kLastChar=127;  //4 words * 32 bits - 1
  PRInt32 theLast=aStop;
  if(theLast>kLastChar) {
    theLast=kLastChar;
  }
  PRInt32 theChar;
  for(theChar=aStart;theChar<=theLast;theChar++){
    const PRInt32 theWordNum=theChar/32;
    const PRInt32 theBitNum=theChar%32;
    //shift as unsigned so that bit 31 does not overflow a signed int
    array[theWordNum]&=~PRInt32(1u<<theBitNum);
  }
}
/**
 * This method constructs the legal charset and data delimiter pairs
 * from a specifier string. Specifier rules are:
 *   abc    -- allows a set of characters
 *   [a-z]  -- allows all chars in given range
 *   [*-*]  -- allows all characters
 *   ^abc   -- disallows a set of characters (the list runs up to the
 *             next '[' or the end of the string)
 *   [a^z]  -- disallows all characters in given range
 *   [a*b]  -- specifies a delimiter pair for the entire token
 *   [a+b]  -- specifies a delimiter pair for substrings in the token
 *
 * The scan never reads beyond the terminating NUL: a bracketed
 * specifier that is cut short (fewer than three chars after the '[')
 * ends the scan, and the closing char of a bracketed specifier is only
 * stepped over if it is actually present. The closing char itself is
 * not checked to be ']'.
 *
 * NOTE(review): spec chars are plain `char`; where char is signed, a
 * value with the high bit set converts to a large PRUnichar when passed
 * to SetChars -- confirm that specs are 7-bit ASCII.
 *
 * @update gess7/10/99
 * @param  aDataSpec is the specifier string; null is ignored
 */
void nsStringTokenizer::ExpandDataSpecifier(const char* aDataSpec) {
  if(!aDataSpec) {
    return;
  }
  PRInt32 theIndex=0;
  while(aDataSpec[theIndex]) {
    const char theChar=aDataSpec[theIndex];
    switch(theChar) {
      case '[':
        {
          //need '[', first char, operator and second char; the || chain
          //stops at the first NUL so nothing past the terminator is read
          if(!aDataSpec[theIndex+1] || !aDataSpec[theIndex+2] || !aDataSpec[theIndex+3]) {
            return; //truncated specifier; nothing more can be parsed
          }
          const char theStart=aDataSpec[theIndex+1];
          const char theEnd=aDataSpec[theIndex+3];
          switch(aDataSpec[theIndex+2]){
            case '-': //a range of valid chars, or [*-*] for everything
              if(('*'==theStart) && (theStart==theEnd)) {
                mCharSpec=eAllChars;
              }
              else {
                SetChars(mValidChars,theStart,theEnd);
              }
              break;
            case '^': //specify a range of invalid chars
              SetChars(mInvalidChars,theStart,theEnd);
              break;
            case '*': //this char signals a delimiter pair
              mDataStartDelimiter+=theStart;
              mDataEndDelimiter+=theEnd;
              break;
            case '+': //this char signals a delimiter pair for substrings
              mSubstrStartDelimiter+=theStart;
              mSubstrEndDelimiter+=theEnd;
              break;
            default:
              break;
          }
          theIndex+=4;
          if(aDataSpec[theIndex]) {
            theIndex++; //step over the closing char
          }
        }
        break;
      case '^'://they've given us a list (not a range) of invalid chars
        theIndex++;
        //stop ON the NUL or the '[' so the outer loop sees it next
        while(aDataSpec[theIndex] && ('['!=aDataSpec[theIndex])) {
          SetChars(mInvalidChars,aDataSpec[theIndex],aDataSpec[theIndex]);
          theIndex++;
        }
        break;
      default:
        SetChars(mValidChars,theChar,theChar);
        theIndex++;
        break;
    }//switch
  }
  /* DEBUG CODE TO SHOW STRING OF GIVEN CHARSET
  CAutoString temp;
  for(PRInt32 theChar=0;theChar<128;theChar++){
  if(IsValidDataChar(theChar))
  temp+=theChar;
  }
  PRInt32 x=10;
  */
}
/**
 * Classifies a char. The checks are made in this order and the first
 * match wins:
 *   1. first char of the record separator -> eRecordSeparator
 *   2. first char of the field separator  -> eFieldSeparator
 *   3. first char of the data start or
 *      data end delimiter                 -> eDataDelimiter
 *   4. IsValidDataChar(ch)                -> eDataChar
 *   otherwise                             -> eUnknown
 *
 * @param  ch is the char to be classified
 * @return the char type
 */
nsStringTokenizer::eCharTypes nsStringTokenizer::DetermineCharType(PRUnichar ch) {
  if(mRecordSeparator[0]==ch) {
    return eRecordSeparator;
  }
  if(mFieldSeparator[0]==ch) {
    return eFieldSeparator;
  }
  if((mDataStartDelimiter[0]==ch) || (mDataEndDelimiter[0]==ch)) {
    return eDataDelimiter;
  }
  if(IsValidDataChar(ch)) {
    return eDataChar;
  }
  return eUnknown;
}
/**
 * Rewinds the read offset to the start of the attached buffer.
 *
 * @update gess7/25/98
 * @return PR_TRUE if a buffer is attached, PR_FALSE otherwise
 */
PRBool nsStringTokenizer::FirstRecord(void){
  mOffset=0;
  const PRBool theHasBuffer=PRBool(0!=mBuffer);
  return theHasBuffer;
}
/**
 * Seeks to the next record: skips over record separator chars, then
 * skips ahead to valid data, then asks HasNextToken whether there is
 * anything to read.
 *
 * @update gess7/25/98
 * @return PR_TRUE if there IS a next record; PR_FALSE if no buffer is
 *         attached, if skipping the separators did not return NS_OK,
 *         or if no valid data was found
 */
PRBool nsStringTokenizer::NextRecord(void){
  if(!mBuffer) {
    return PR_FALSE;
  }
  if(NS_OK!=SkipOver(mRecordSeparator)) {
    return PR_FALSE;
  }
  if(!SkipToValidData()) {
    return PR_FALSE;
  }
  return HasNextToken();
}
/*
 * LAST MODS: gess 12Aug94
 * PARMS:     none
 * RETURNS:   YES if there is another field to be read.
 * PURPOSE:   Allows a client to ask the tokenizer to test for
 *            the presence of another field.
 * NOTES:     This call moves the read offset: chars that are skipped
 *            stay consumed, and a run of field separators is consumed
 *            before PR_TRUE is returned. A char that ends the scan in
 *            any other way is pushed back first.
 */
PRBool nsStringTokenizer::HasNextToken(void){
  if(!mBuffer) {
    return PR_FALSE;
  }
  //Scan forward until one of the following things occurs:
  // 1. You find a field separator
  // 2. You find a record separator
  // 3. You hit the end of the buffer
  // 4. You find a valid char.
  while(More()) {
    PRUnichar theChar;
    GetChar(theChar);
    const eCharTypes theType=DetermineCharType(theChar);
    if(eRecordSeparator==theType) {
      UnGetChar(theChar);
      return PR_FALSE;
    }
    if(eFieldSeparator==theType) {
      SkipOver(mFieldSeparator[0]);
      return PR_TRUE;
    }
    if(eDataDelimiter==theType) {
      UnGetChar(theChar);
      return PR_TRUE;
    }
    if((eUnknown!=theType) && (eDataChar!=theType)) {
      return PR_FALSE;
    }
    //ok to skip junk between delimiters: an unknown char that is a
    //substring start delimiter is passed over; every other unknown char
    //is handled exactly like a data char
    const PRBool theIsSkipped=PRBool((eUnknown==theType) && (-1<mSubstrStartDelimiter.Find(theChar)));
    if(!theIsSkipped && (kSpace<theChar)) {
      UnGetChar(theChar);
      return PR_TRUE;
    }
  }
  return PR_FALSE;
}
/**
 * LAST MODS: gess 4Jul94
 * PARMS:     aToken receives the token; chars are APPENDED to it.
 * RETURNS:   error code; 0 means all is well.
 * PURPOSE:   Gets the next field of data from the buffer.
 * NOTES:     If a data start delimiter has been specified, the first
 *            char is read and appended to the token as-is, unless it
 *            is the field separator, in which case the call returns
 *            at once. The rest is read up to a record or field
 *            separator char; the terminating char is added to the
 *            token only when a data end delimiter has been specified.
 *            On success, trailing field separator chars are skipped;
 *            the status of that skip is not reported.
 * WARNING:   You should have called HasNextToken prior
 *            to calling this method, so that you can
 *            fail gracefully if you encounter the end
 *            of your input stream (unexpectedly). If
 *            this method hits EOF, it returns an error.
 */
PRInt32 nsStringTokenizer::GetNextToken(nsString& aToken){
  PRInt32 result=0;
  if(mBuffer && More()) {
    if(mDataStartDelimiter.Length()) {
      PRUnichar theChar;
      result=GetChar(theChar); //More() was true, so a char is available
      if(mFieldSeparator[0]==theChar)
        return result;
      aToken+=theChar;
    }
    if(NS_OK==result) {
      //the token ends at either kind of separator
      nsAutoString terms(mRecordSeparator);
      terms+=mFieldSeparator;
      result=ReadUntil(aToken,terms,PRBool(0!=mDataEndDelimiter[0]));
      if(NS_OK==result) {
        SkipOver(mFieldSeparator[0]);
      }
    }
  }
  return result;
}
/*
 * This method gets called when the system wants to jump over any garbage
 * that may be in a string before valid data. Typically, this happens
 * before, in between and after valid data rows.
 *
 * Chars are consumed until one of these is found, and that char is
 * pushed back so the next read sees it:
 *   - a data char, when no data start delimiter has been specified
 *   - the data start delimiter
 *   - a field separator or a record separator
 *
 * LAST MODS: gess 11Aug94
 * RETURNS:   PR_TRUE if one of the chars above was found; PR_FALSE if
 *            no buffer is attached or the end of the buffer was reached.
 */
PRBool nsStringTokenizer::SkipToValidData(void){
  if(mBuffer) {
    PRUnichar ch;
    while(More()) {
      GetChar(ch); //More() was true, so this cannot hit EOF
      switch(DetermineCharType(ch)){
        case eDataChar:
          //bare data only counts when tokens are not delimited
          if(!mDataStartDelimiter[0]) {
            UnGetChar(ch);
            return PR_TRUE;
          }
          break;
        case eDataDelimiter:
          //an end delimiter seen out here is just more garbage
          if(ch==mDataStartDelimiter[0]) {
            UnGetChar(ch);
            return PR_TRUE;
          }
          break;
        case eFieldSeparator:
        case eRecordSeparator:
          UnGetChar(ch);
          return PR_TRUE;
        default:
          break;
      } //switch
    } //while
  }//if
  return PR_FALSE;
}
/**
 * Consumes a run of chars equal to aSkipChar. The first char that
 * differs is pushed back.
 *
 * @param  aSkipChar is the char to be skipped
 * @return NS_OK if no buffer is attached or a differing char was found;
 *         otherwise whatever GetChar returned when it failed
 */
PRInt32 nsStringTokenizer::SkipOver(PRUnichar aSkipChar) {
  if(!mBuffer) {
    return NS_OK;
  }
  PRUnichar theNext=0;
  PRInt32 theStatus;
  for(theStatus=GetChar(theNext);NS_OK==theStatus;theStatus=GetChar(theNext)) {
    if(aSkipChar!=theNext) {
      UnGetChar(theNext);
      break;
    }
  }
  return theStatus;
}
/**
 * Consumes a run of chars that all occur in aString. The first char
 * that is not found in aString is pushed back.
 *
 * @param  aString holds the set of chars to be skipped
 * @return NS_OK if no buffer is attached or a char outside the set was
 *         found; otherwise whatever GetChar returned when it failed
 */
PRInt32 nsStringTokenizer::SkipOver(nsString& aString) {
  if(!mBuffer) {
    return NS_OK;
  }
  PRUnichar theNext=0;
  PRInt32 theStatus;
  for(theStatus=GetChar(theNext);NS_OK==theStatus;theStatus=GetChar(theNext)) {
    const PRInt32 thePos=aString.Find(theNext);
    if(-1==thePos) {
      UnGetChar(theNext);
      break;
    }
  }
  return theStatus;
}
/**
 * Reads chars into aString until one of the chars in aTermSet is seen.
 *
 *  - A char found in mSubstrStartDelimiter is appended, and then
 *    everything up to and including the end delimiter at the same
 *    index of mSubstrEndDelimiter is appended too.
 *  - Any other char is appended only while it passes IsValidDataChar.
 *    After the first char that fails, plain chars are dropped for the
 *    rest of the call; substrings are still copied.
 *
 * @param  aString      receives the chars (appended)
 * @param  aTermSet     set of chars that end the read
 * @param  addTerminal  PR_TRUE to append the terminating char;
 *                      PR_FALSE to push it back instead
 * @return NS_OK if a terminator was found or no buffer is attached;
 *         otherwise the error from GetChar (end of buffer reached)
 */
PRInt32 nsStringTokenizer::ReadUntil(nsString& aString,nsString& aTermSet,PRBool addTerminal){
  PRInt32 result=NS_OK;
  PRUnichar theChar=0;
  //must start out true: the flag is tested before it is ever assigned
  PRBool theCharIsValid=PR_TRUE;
  if(mBuffer) {
    while(NS_OK==result) {
      result=GetChar(theChar);
      if(NS_OK!=result) {
        break; //ran out of input
      }
      if(kNotFound<aTermSet.Find(theChar)) {
        if(addTerminal)
          aString+=theChar;
        else UnGetChar(theChar);
        break;
      }
      const PRInt32 pos=mSubstrStartDelimiter.Find(theChar);
      if(kNotFound<pos) {
        //copy the delimited substring, closing delimiter included
        aString+=theChar;
        result=ReadUntil(aString,mSubstrEndDelimiter[pos],PR_TRUE);
      }
      else if(theCharIsValid){
        if(IsValidDataChar(theChar)){
          aString+=theChar;
        }
        else theCharIsValid=PR_FALSE;
      }
    } //while
  }//if
  return result;
}
/**
 * Reads chars into aString until aTerminalChar is seen. Every char
 * before the terminator is appended without any validity test.
 *
 * @param  aString        receives the chars (appended)
 * @param  aTerminalChar  char that ends the read
 * @param  addTerminal    PR_TRUE to append the terminating char;
 *                        PR_FALSE to push it back instead
 * @return NS_OK if the terminator was found or no buffer is attached;
 *         otherwise whatever GetChar returned when it failed
 */
PRInt32 nsStringTokenizer::ReadUntil(nsString& aString,PRUnichar aTerminalChar,PRBool addTerminal){
  if(!mBuffer) {
    return NS_OK;
  }
  PRUnichar theNext=0;
  PRInt32 theStatus;
  while(NS_OK==(theStatus=GetChar(theNext))) {
    if(aTerminalChar==theNext) {
      if(addTerminal) {
        aString+=theNext;
      }
      else {
        UnGetChar(theNext);
      }
      break;
    }
    aString+=theNext;
  }
  return theStatus;
}
/**
 * Tells whether there are unread chars left.
 *
 * @return PR_TRUE if a buffer is attached and the read offset is still
 *         below its length
 */
PRBool nsStringTokenizer::More(void){
  if(mBuffer && (mOffset<mBuffer->Length())) {
    return PR_TRUE;
  }
  return PR_FALSE;
}
/**
 * Fetches the char at the read offset and advances the offset by one.
 *
 * @param  aChar receives the char; left untouched when kEOF is returned
 * @return 0 if a char was read; kEOF if no buffer is attached or the
 *         offset has reached the end of the buffer
 */
PRInt32 nsStringTokenizer::GetChar(PRUnichar& aChar){
  if(!mBuffer || !(mOffset<mBuffer->Length())) {
    return kEOF;
  }
  aChar=(*mBuffer)[mOffset++];
  return 0;
}
/**
 * Pushes one char back by moving the read offset back one position;
 * does nothing when the offset is already 0. The buffer itself is not
 * modified.
 *
 * @param  aChar is not used
 */
void nsStringTokenizer::UnGetChar(PRUnichar aChar){
  if(0<mOffset) {
    --mOffset;
  }
}
/*
 * Call this method if you want the tokenizer to iterate your string
 * and automatically call you back with each token.
 *
 * The functor is called as aFunctor(token, recordNum, tokenNum); both
 * numbers start at 0, and tokenNum restarts at 0 for every record.
 *
 * The buffer that was attached before the call, and the read offset
 * that went with it, are both put back before returning, so a scan
 * that was in progress on that buffer can be resumed afterwards.
 * NOTE(review): the saved offset is held in a PRInt32 -- confirm that
 * this matches the declared type of mOffset in the header.
 *
 * @parm    aBuffer is the string to be tokenized
 * @parm    aFunctor is the object you want me to notify
 * @update  gess 07/10/99
 * RETURNS: 0 always; the status of the individual GetNextToken calls
 *          is not reported
 */
PRInt32 nsStringTokenizer::Iterate(nsString& aBuffer,ITokenizeFunctor& aFunctor) {
  PRInt32 result=0;
  PRInt32 theRecordNum=-1;
  //remember the caller's scan position as well as its buffer
  nsString* theOldBuffer=mBuffer;
  PRInt32 theOldOffset=mOffset;
  mBuffer=&aBuffer;
  FirstRecord();
  while(HasNextToken()){
    theRecordNum++;
    PRInt32 theTokenNum=-1;
    while(HasNextToken()){
      theTokenNum++;
      nsAutoString theString;
      GetNextToken(theString);
      aFunctor(theString,theRecordNum,theTokenNum);
    }
    NextRecord();
  }
  mBuffer=theOldBuffer;
  mOffset=theOldOffset;
  return result;
}