gecko-dev/xpcom/ds/nsStringTokenizer.h
1999-07-15 06:40:44 +00:00

147 lines
5.7 KiB
C++

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/*
* The contents of this file are subject to the Netscape Public License
* Version 1.0 (the "NPL"); you may not use this file except in
* compliance with the NPL. You may obtain a copy of the NPL at
* http://www.mozilla.org/NPL/
*
* Software distributed under the NPL is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the NPL
* for the specific language governing rights and limitations under the
* NPL.
*
* The Initial Developer of this code under the NPL is Netscape
* Communications Corporation. Portions created by Netscape are
* Copyright (C) 1998 Netscape Communications Corporation. All Rights
* Reserved.
*/
/**
* MODULE NOTES:
* @update gess 4/1/98
*
* This class knows how to read delimited data from a string.
* Here are the 2 things you need to know to use this class effectively:
*
* ================================================
* How To Setup The Tokenizer
* ================================================
*
* The input charset can be either constrained or unconstrained. Constrained means
* that you've chosen to allow only certain chars into your tokens. Unconstrained
* means that any char (other than a delimiter) is legal in your tokens.
* If you want unconstrained input, use [*-*] in your dataspec. To constrain your token
* charset, you set ranges or single chars in the dataspec like this:
* "abc[0-9]" -- which allows the digits 0-9 and the letters a, b, c
*
* Dataspecifier rules:
* abc -- allows a set of characters
* [a-z] -- allows all chars in given range
* [*-*] -- allows all characters
* ^abc -- disallows a set of characters //NOT_YET_IMPLEMENTED
* [a^z] -- disallows all characters in given range //NOT_YET_IMPLEMENTED
* [a*b] -- specifies a delimiter pair for the entire token
* [a+b] -- specifies a delimiter pair for substrings in the token
*
* One other note: there is an optional argument called allowQuoting, which tells
* the tokenizer whether to allow quoted strings within your fields. If you set this
* to TRUE, then we allow nested quoted strings, which themselves can contain any data.
* It's considered an error to set allowQuoting=TRUE and use a quote as a token or record delimiter.
*
* The other thing you need to set up for the tokenizer to work correctly is the set of delimiters.
* They separate fields and records, and the two kinds must be different. You can also have more than one kind
* of delimiter for each. The distinction between tokens and records allows the caller
* to deal with multi-line text files (where \n is the record separator). Again, you don't have
* to have a record separator if it doesn't make sense in the context of your input dataset.
*
*
* ================================================
* How To Iterate Tokens
* ================================================
*
* There are 2 ways to iterate tokens, either manually or automatically.
* The manual method requires that you call a set of methods in the right order,
* but gives you slightly more control. Here's the calling pattern:
*
* {
* nsString theBuffer("xxxxxxx");
* nsStringTokenizer tok(...);
* tok.SetBuffer(theBuffer);
* tok.FirstRecord();
* while(tok.HasNextToken()){
* while(tok.HasNextToken()){
* nsAutoString theToken;
* tok.GetNextToken(theToken);
* //do something with your token here...
* } //while
* tok.NextRecord();
* } //while
* }
*
* The automatic method handles all the iteration for you. You provide a callback functor
* and you'll get called once for each token per record. To use that technique, you need
* to define an object that provides the ITokenizeFunctor interface (1 method). Then
* call the tokenizer method Iterate(...). Voila.
*
*/
#ifndef nsStringTokenizer_
#define nsStringTokenizer_
#include "nsString.h"
class ITokenizeFunctor {
public:
virtual operator ()(nsString& aToken,PRInt32 aRecordCount,PRInt32 aTokenCount)=0;
};
/**
 * Reads delimited data out of an nsString, token by token and record by record.
 *
 * Tokens can be pulled manually (SetBuffer / FirstRecord / HasNextToken /
 * GetNextToken / NextRecord) or pushed to a callback via Iterate(). See the
 * module notes at the top of this file for the calling patterns and the
 * data-specifier syntax.
 */
class nsStringTokenizer {
public:
  /**
   * @param aFieldSep  character(s) separating fields; defaults to ","
   * @param aRecordSep character(s) separating records; defaults to "\n"
   */
  nsStringTokenizer(const char* aFieldSep = ",", const char* aRecordSep = "\n");
  ~nsStringTokenizer();

  //----------------------------------------------------------------
  // Manual iteration: call these if you want to walk the tokens yourself.
  //----------------------------------------------------------------

  // Selects the string to be tokenized.
  // NOTE(review): mBuffer is a raw pointer, so aBuffer presumably has to
  // outlive the tokenizer's use of it -- confirm in the implementation.
  void    SetBuffer(nsString& aBuffer);

  // Adds a data specifier (see "Dataspecifier rules" in the module notes).
  void    AddTokenSpec(const char* aTokenSpec = "");

  // Record-level navigation; see the calling pattern in the module notes.
  PRBool  FirstRecord(void);
  PRBool  NextRecord(void);

  // Token-level navigation within a record.
  PRBool  HasNextToken(void);
  PRInt32 GetNextToken(nsString& aToken);

  //----------------------------------------------------------------
  // Automatic iteration: call this one (exclusively) if you want to be
  // called back for each token instead of iterating yourself.
  //----------------------------------------------------------------
  PRInt32 Iterate(nsString& aBuffer, ITokenizeFunctor& aFunctor);

protected:
  // Classification of a single input character.
  enum eCharTypes {
    eUnknown,
    eDataChar,
    eFieldSeparator,
    eDataDelimiter,
    eRecordSeparator
  };

  // How the token charset is specified.
  enum eCharSpec {
    eGivenChars,
    eAllChars,
    eExceptChars
  };

  // Low-level scanning helpers that operate on the current buffer.
  PRInt32 SkipOver(nsString& aSkipSet);
  PRInt32 SkipOver(PRUnichar aSkipChar);
  PRInt32 ReadUntil(nsString& aString, nsString& aTermSet, PRBool aState);
  PRInt32 ReadUntil(nsString& aString, PRUnichar aChar, PRBool aState);
  PRBool  More(void);
  PRInt32 GetChar(PRUnichar& aChar);
  void    UnGetChar(PRUnichar aChar);
  PRBool  SkipToValidData(void);

  // Data-specifier handling and character classification.
  void           ExpandDataSpecifier(const char* aDataSpec);
  inline PRBool  IsValidDataChar(PRUnichar aChar);
  eCharTypes     DetermineCharType(PRUnichar aChar);

  // Allowed / disallowed character sets.
  // NOTE(review): presumably bitmaps filled in from the data specifier -- confirm in the .cpp.
  PRInt32   mValidChars[4];
  PRInt32   mInvalidChars[4];

  // Delimiter pairs for whole tokens and for substrings inside a token.
  nsString  mDataStartDelimiter;
  nsString  mDataEndDelimiter;
  nsString  mSubstrStartDelimiter;
  nsString  mSubstrEndDelimiter;

  // Separator character sets.
  nsString  mFieldSeparator;
  nsString  mRecordSeparator;

  // Scanning state.
  PRInt32   mOffset;
  eCharSpec mCharSpec;
  nsString* mBuffer;
};
#endif