/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/*
 * The contents of this file are subject to the Netscape Public
 * License Version 1.1 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.mozilla.org/NPL/
 *
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 *
 * The Original Code is mozilla.org code.
 *
 * The Initial Developer of the Original Code is Netscape
 * Communications Corporation. Portions created by Netscape are
 * Copyright (C) 1998 Netscape Communications Corporation. All
 * Rights Reserved.
 *
 * Contributor(s):
 */

/**
 * MODULE NOTES:
 * @update gess 4/1/98
 *
 * This class knows how to read delimited data from a string.
 * Here are the 2 things you need to know to use this class effectively:
 *
 * ================================================
 * How To Setup The Tokenizer
 * ================================================
 *
 * The input charset can be either constrained or unconstrained. Constrained means
 * that you've chosen to allow only certain chars into your tokens. Unconstrained
 * means that any char (other than delimiters) is legal in your tokens.
 * If you want unconstrained input, use [*-*] in your dataspec. To constrain your token
 * charset, set ranges or single chars in the dataspec like this:
 * "abc[0-9]" -- which allows the digits 0-9 and the letters a, b, and c
 *
 * Dataspecifier rules (see the example after this list):
 * abc -- allows a set of characters
 * [a-z] -- allows all chars in the given range
 * [*-*] -- allows all characters
 * ^abc -- disallows a set of characters //NOT_YET_IMPLEMENTED
 * [a^z] -- disallows all characters in the given range //NOT_YET_IMPLEMENTED
 * [a*b] -- specifies a delimiter pair for the entire token
 * [a+b] -- specifies a delimiter pair for substrings in the token
 *
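 * For example, a minimal setup sketch (this assumes the dataspec is supplied
 * via AddTokenSpec(), declared below; treat the calls as illustrative):
 *
 *   nsStringTokenizer tok(",","\n");  // field separator "," record separator "\n"
 *   tok.AddTokenSpec("abc[0-9]");     // constrained: a,b,c and the digits 0-9
 *   //...or, for unconstrained input:
 *   //tok.AddTokenSpec("[*-*]");
 *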
 * One other note: there is an optional argument called allowQuoting, which tells
 * the tokenizer whether to allow quoted strings within your fields. If you set this
 * to TRUE, then we allow nested quoted strings, which themselves can contain any data.
 * It's considered an error to set allowQuoting=TRUE and use a quote as a token or record delimiter.
 *
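 * For instance, with allowQuoting=TRUE and "," as the field separator, the
 * record below should yield three tokens, since the quoted field may contain
 * the separator (an illustration of the rule above, not verified output):
 *
 *   one,"two, still two",three   ==>  [one] [two, still two] [three]
 *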
 * The other thing you need to set up for the tokenizer to work correctly is the delimiters.
 * They separate fields and records, and must be different from each other. You can also
 * have more than one kind of delimiter for each. The distinction between tokens and records
 * allows the caller to deal with multi-line text files (where \n is the record separator).
 * Again, you don't have to have a record separator if it doesn't make sense in the context
 * of your input dataset.
 *
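 * A sketch of the delimiter setup using the constructor declared below (","
 * and "\n" are also its declared defaults):
 *
 *   nsString theBuffer("a,b,c\nd,e,f");  // two records, three fields each
 *   nsStringTokenizer tok(",","\n");     // field separator and record separator
 *   tok.SetBuffer(theBuffer);
 *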
 * ================================================
 * How To Iterate Tokens
 * ================================================
 *
 * There are 2 ways to iterate tokens, either manually or automatically.
 * The manual method requires that you call a set of methods in the right order,
 * but gives you slightly more control. Here's the calling pattern:
 *
 * {
 *   nsString theBuffer("xxxxxxx");
 *   nsStringTokenizer tok(...);
 *   tok.SetBuffer(theBuffer);
 *   PRBool theRecordIsValid=tok.FirstRecord();
 *   while(theRecordIsValid){
 *     while(tok.HasNextToken()){
 *       nsAutoString theToken;
 *       tok.GetNextToken(theToken);
 *       //do something with your token here...
 *     } //while
 *     theRecordIsValid=tok.NextRecord();
 *   } //while
 * }
 *
 * The automatic method handles all the iteration for you. You provide a callback functor
 * and you'll get called once for each token per record. To use that technique, you need
 * to define an object that provides the ITokenizeFunctor interface (1 method). Then
 * call the tokenizer method Iterate(...). Voila.
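 *
 * For example, a minimal sketch of the automatic method (the functor class
 * name is hypothetical; only ITokenizeFunctor and Iterate() come from this
 * header):
 *
 * class SampleFunctor : public ITokenizeFunctor {
 * public:
 *   virtual PRInt32 operator()(nsString& aToken,PRInt32 aRecordCount,PRInt32 aTokenCount) {
 *     //do something with aToken here...
 *     return 0;
 *   }
 * };
 *
 * {
 *   nsString theBuffer("a,b,c\nd,e,f");
 *   nsStringTokenizer tok(",","\n");
 *   SampleFunctor theFunctor;
 *   tok.Iterate(theBuffer,theFunctor);
 * }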
 *
 */

#ifndef nsStringTokenizer_
#define nsStringTokenizer_

#include "nsString.h"

class ITokenizeFunctor {
public:
  //Called once for each token in each record; the PRInt32 return type is
  //assumed here, mirroring the convention used by Iterate() below.
  virtual PRInt32 operator()(nsString& aToken,PRInt32 aRecordCount,PRInt32 aTokenCount)=0;
};

class nsStringTokenizer {
public:
  nsStringTokenizer(const char* aFieldSep=",",const char* aRecordSep="\n");
  ~nsStringTokenizer();

  //Call these methods if you want to iterate the tokens yourself
  void    SetBuffer(nsString& aBuffer);
  void    AddTokenSpec(const char* aTokenSpec="");
  PRBool  FirstRecord(void);
  PRBool  NextRecord(void);
  PRBool  HasNextToken(void);
  PRInt32 GetNextToken(nsString& aToken);

  //Call this one (exclusively) if you want to be called back iteratively
  PRInt32 Iterate(nsString& aBuffer,ITokenizeFunctor& aFunctor);

protected:

  enum eCharTypes {eUnknown,eDataChar,eFieldSeparator,eDataDelimiter,eRecordSeparator};
  enum eCharSpec  {eGivenChars,eAllChars,eExceptChars};

  PRInt32 SkipOver(nsString& aSkipSet);
  PRInt32 SkipOver(PRUnichar aSkipChar);
  PRInt32 ReadUntil(nsString& aString,nsString& aTermSet,PRBool aState);
  PRInt32 ReadUntil(nsString& aString,PRUnichar aChar,PRBool aState);
  PRBool  More(void);
  PRInt32 GetChar(PRUnichar& aChar);
  void    UnGetChar(PRUnichar aChar);
  PRBool  SkipToValidData(void);
  void    ExpandDataSpecifier(const char* aDataSpec);
  inline PRBool IsValidDataChar(PRUnichar aChar);
  eCharTypes DetermineCharType(PRUnichar aChar);

  PRInt32   mValidChars[4];
  PRInt32   mInvalidChars[4];
  nsString  mDataStartDelimiter;
  nsString  mDataEndDelimiter;
  nsString  mSubstrStartDelimiter;
  nsString  mSubstrEndDelimiter;
  nsString  mFieldSeparator;
  nsString  mRecordSeparator;
  PRInt32   mOffset;
  eCharSpec mCharSpec;
  nsString* mBuffer;

};

#endif