gecko-dev/xpcom/string/obsolete/nsString2.h
shanjian%netscape.com a6b06cd95b #134053 utf8 conversion problem in nsString.h
Change conversion to handle surrogates
r=yokoyama, sr=scc
2002-10-08 02:10:52 +00:00

721 lines
23 KiB
C++

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
* Version: NPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Netscape Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/NPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1998
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Rick Gessner <rickg@netscape.com> (original author)
* Scott Collins <scc@mozilla.org>
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the NPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the NPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
/*
* nsString2.h --- rickg's original strings of 2-byte chars, |nsString|
* and |nsAutoString|; these classes will be replaced by the new
* shared-buffer string (see bug #53065)
*/
#ifndef nsString2_h__
#define nsString2_h__
#include "prtypes.h"
#include "nscore.h"
#include <stdio.h>
#ifndef nsAString_h__
#include "nsAString.h"
#endif
#ifndef nsAFlatString_h___
#include "nsAFlatString.h"
#endif
#ifndef nsLiteralString_h__
#include "nsLiteralString.h"
#endif
#ifndef nsDependentSubstring_h__
#include "nsDependentSubstring.h"
#endif
#ifndef nsPromiseFlatString_h__
#include "nsPromiseFlatString.h"
#endif
#ifndef nsXPIDLString_h__
#include "nsXPIDLString.h"
#endif
#include "nsStr.h"
class UTF8traits
{
public:
static PRBool isASCII(char c) { return (c & 0x80) == 0x00; }
static PRBool isInSeq(char c) { return (c & 0xC0) == 0x80; }
static PRBool is2byte(char c) { return (c & 0xE0) == 0xC0; }
static PRBool is3byte(char c) { return (c & 0xF0) == 0xE0; }
static PRBool is4byte(char c) { return (c & 0xF8) == 0xF0; }
static PRBool is5byte(char c) { return (c & 0xFC) == 0xF8; }
static PRBool is6byte(char c) { return (c & 0xFE) == 0xFC; }
};
#ifdef STANDALONE_MI_STRING_TESTS
class nsAFlatString { public: virtual ~nsAString() { } };
#endif
class nsISizeOfHandler;
class nsCString;
class NS_COM nsString :
public nsAFlatString,
public nsStr {
public:
friend class nsCString;
friend class nsLinebreakConverter;
friend void ToLowerCase( nsString& );
friend void ToUpperCase( nsString& );
protected:
virtual const nsBufferHandle<PRUnichar>* GetFlatBufferHandle() const;
virtual const PRUnichar* GetReadableFragment( nsReadableFragment<PRUnichar>&, nsFragmentRequest, PRUint32 ) const;
virtual PRUnichar* GetWritableFragment( nsWritableFragment<PRUnichar>&, nsFragmentRequest, PRUint32 );
public:
virtual const PRUnichar* get() const;
public:
/**
* Default constructor.
*/
nsString();
/**
* This is our copy constructor
* @param reference to another nsString
*/
nsString(const nsString& aString);
explicit nsString(const nsAString&);
explicit nsString(const PRUnichar*);
nsString(const PRUnichar*, PRInt32);
/**
* Destructor
*
*/
virtual ~nsString();
/**
* Retrieve the length of this string
* @return string length
*/
virtual PRUint32 Length() const { return mLength; }
/**
* Call this method if you want to force a different string length
* @update gess7/30/98
* @param aLength -- contains new length for mStr
* @return
*/
void SetLength(PRUint32 aLength);
/**
* Sets the new length of the string.
* @param aLength is new string length.
* @return nada
*/
void SetCapacity(PRUint32 aLength);
/**********************************************************************
Getters/Setters...
*********************************************************************/
/**
* Set nth character.
*/
PRBool SetCharAt(PRUnichar aChar,PRUint32 anIndex);
/**********************************************************************
Lexomorphic transforms...
*********************************************************************/
/**
* This method is used to remove all occurances of the
* characters found in aSet from this string.
*
* @param aSet -- characters to be cut from this
* @return *this
*/
void StripChars( const char* aSet );
void StripChar( PRUnichar aChar, PRInt32 anOffset=0 );
/**
* This method strips whitespace throughout the string
*
* @return this
*/
void StripWhitespace();
/**
* swaps occurence of 1 string for another
*
* @return this
*/
void ReplaceChar( PRUnichar anOldChar, PRUnichar aNewChar );
void ReplaceChar( const char* aSet, PRUnichar aNewChar );
void ReplaceSubstring( const nsString& aTarget, const nsString& aNewValue );
void ReplaceSubstring( const PRUnichar* aTarget, const PRUnichar* aNewValue );
/**
* This method trims characters found in aTrimSet from
* either end of the underlying string.
*
* @param aTrimSet -- contains chars to be trimmed from
* both ends
* @param aEliminateLeading
* @param aEliminateTrailing
* @param aIgnoreQuotes
* @return this
*/
void Trim(const char* aSet,PRBool aEliminateLeading=PR_TRUE,PRBool aEliminateTrailing=PR_TRUE,PRBool aIgnoreQuotes=PR_FALSE);
/**
* This method strips whitespace from string.
* You can control whether whitespace is yanked from
* start and end of string as well.
*
* @param aEliminateLeading controls stripping of leading ws
* @param aEliminateTrailing controls stripping of trailing ws
* @return this
*/
void CompressWhitespace( PRBool aEliminateLeading=PR_TRUE,PRBool aEliminateTrailing=PR_TRUE);
/**********************************************************************
string conversion methods...
*********************************************************************/
/**
* Copies data from internal buffer onto given char* buffer
* NOTE: This only copies as many chars as will fit in given buffer (clips)
* @param aBuf is the buffer where data is stored
* @param aBuflength is the max # of chars to move to buffer
* @return ptr to given buffer
*/
char* ToCString(char* aBuf,PRUint32 aBufLength,PRUint32 anOffset=0) const;
/**
* Perform string to float conversion.
* @param aErrorCode will contain error if one occurs
* @return float rep of string value
*/
float ToFloat(PRInt32* aErrorCode) const;
/**
* Perform string to int conversion.
* @param aErrorCode will contain error if one occurs
* @param aRadix tells us which radix to assume; kAutoDetect tells us to determine the radix for you.
* @return int rep of string value, and possible (out) error code
*/
PRInt32 ToInteger(PRInt32* aErrorCode,PRUint32 aRadix=kRadix10) const;
/**********************************************************************
String manipulation methods...
*********************************************************************/
/**
* assign given string to this string
* @param aStr: buffer to be assigned to this
* @param aCount is the length of the given str (or -1) if you want me to determine its length
* NOTE: IFF you pass -1 as aCount, then your buffer must be null terminated.
* @return this
*/
nsString& operator=( const nsString& aString ) { Assign(aString); return *this; }
nsString& operator=( const nsAString& aReadable ) { Assign(aReadable); return *this; }
//nsString& operator=( const nsPromiseReadable<PRUnichar>& aReadable ) { Assign(aReadable); return *this; }
nsString& operator=( const PRUnichar* aPtr ) { Assign(aPtr); return *this; }
nsString& operator=( PRUnichar aChar ) { Assign(aChar); return *this; }
void AssignWithConversion(const char*);
void AssignWithConversion(const char*, PRInt32);
/*
* Appends n characters from given string to this,
* This version computes the length of your given string
*
* @param aString is the source to be appended to this
* @return number of chars copied
*/
void AppendInt(PRInt32, PRInt32=10); //radix=8,10 or 16
void AppendFloat(double);
void AppendWithConversion(const char*, PRInt32=-1);
virtual void do_AppendFromElement( PRUnichar );
//void InsertWithConversion(char);
void InsertWithConversion(const char*, PRUint32, PRInt32=-1);
// Takes ownership of aPtr, sets the current length to aLength if specified.
void Adopt( PRUnichar* aPtr, PRInt32 aLength = -1 );
/*
|Left|, |Mid|, and |Right| are annoying signatures that seem better almost
any _other_ way than they are now. Consider these alternatives
aWritable = aReadable.Left(17); // ...a member function that returns a |Substring|
aWritable = Left(aReadable, 17); // ...a global function that returns a |Substring|
Left(aReadable, 17, aWritable); // ...a global function that does the assignment
as opposed to the current signature
aReadable.Left(aWritable, 17); // ...a member function that does the assignment
or maybe just stamping them out in favor of |Substring|, they are just duplicate functionality
aWritable = Substring(aReadable, 0, 17);
*/
size_type Left( self_type&, size_type ) const;
size_type Mid( self_type&, PRUint32, PRUint32 ) const;
size_type Right( self_type&, size_type ) const;
/**********************************************************************
Searching methods...
*********************************************************************/
/**
* Search for given character within this string
*
* @param aChar is the character to search for
* @param anOffset tells us where in this string to start searching
(optional parameter)
* @param aCount tells us how far from the offset we are to search. Use
-1 to search the whole string. (optional parameter)
* @return offset in string, or -1 (kNotFound)
*/
PRInt32 FindChar(PRUnichar aChar, PRInt32 anOffset=0, PRInt32 aCount=-1) const;
/**
* Search for given substring within this string
*
* @param aString is substring to be sought in this
* @param aIgnoreCase selects case sensitivity
* @param anOffset tells us where in this string to start searching
* @param aCount tells us how far from the offset we are to search. Use
-1 to search the whole string.
* @return offset in string, or -1 (kNotFound)
*/
PRInt32 Find(const nsCString& aString,PRBool aIgnoreCase=PR_FALSE,PRInt32 anOffset=0,PRInt32 aCount=-1) const;
PRInt32 Find(const char* aString,PRBool aIgnoreCase=PR_FALSE,PRInt32 anOffset=0,PRInt32 aCount=-1) const;
PRInt32 Find(const nsAFlatString& aString, PRInt32 anOffset=0, PRInt32 aCount=-1) const;
PRInt32 Find(const PRUnichar* aString, PRInt32 anOffset=0, PRInt32 aCount=-1) const;
/**
* This method searches this string for the first character
* found in the given charset
* @param aString contains set of chars to be found
* @param anOffset tells us where to start searching in this
* @return -1 if not found, else the offset in this
*/
PRInt32 FindCharInSet(const char* aString,PRInt32 anOffset=0) const;
PRInt32 FindCharInSet(const PRUnichar* aString,PRInt32 anOffset=0) const;
/**
* This methods scans the string backwards, looking for the given string
* @param aString is substring to be sought in this
* @param aIgnoreCase tells us whether or not to do caseless compare
* @param anOffset tells us where in this strig to start searching (counting from left)
* @param aCount tells us how many iterations to make starting at the given offset
* @return offset in string, or -1 (kNotFound)
*/
PRInt32 RFind(const char* aCString,PRBool aIgnoreCase=PR_FALSE,PRInt32 anOffset=-1,PRInt32 aCount=-1) const;
PRInt32 RFind(const nsAFlatString& aString, PRInt32 anOffset=-1,PRInt32 aCount=-1) const;
PRInt32 RFind(const PRUnichar* aString,PRInt32 anOffset=-1,PRInt32 aCount=-1) const;
/**
* Search for given char within this string
*
* @param aString is substring to be sought in this
* @param anOffset tells us where in this strig to start searching (counting from left)
* @param aIgnoreCase selects case sensitivity
* @param aCount tells us how many iterations to make starting at the given offset
* @return find pos in string, or -1 (kNotFound)
*/
PRInt32 RFindChar(PRUnichar aChar,PRInt32 anOffset=-1,PRInt32 aCount=-1) const;
/**
* This method searches this string for the last character
* found in the given string
* @param aString contains set of chars to be found
* @param anOffset tells us where in this strig to start searching (counting from left)
* @return -1 if not found, else the offset in this
*/
PRInt32 RFindCharInSet(const PRUnichar* aString,PRInt32 anOffset=-1) const;
/**********************************************************************
Comparison methods...
*********************************************************************/
/**
* Compares a given string type to this string.
* @update gess 7/27/98
* @param S is the string to be compared
* @param aIgnoreCase tells us how to treat case
* @param aCount tells us how many chars to compare
* @return -1,0,1
*/
PRInt32 CompareWithConversion(const char* aString, PRBool aIgnoreCase=PR_FALSE, PRInt32 aCount=-1) const;
PRBool EqualsWithConversion(const char* aString,PRBool aIgnoreCase=PR_FALSE,PRInt32 aCount=-1) const;
PRBool EqualsIgnoreCase(const char* aString,PRInt32 aCount=-1) const;
/**
* Determine if given buffer is plain ascii
*
* @param aBuffer -- if null, then we test *this, otherwise we test given buffer
* @return TRUE if is all ascii chars or if strlen==0
*/
PRBool IsASCII(const PRUnichar* aBuffer=0);
/**
* Determine if given char is a valid space character
*
* @param aChar is character to be tested
* @return TRUE if is valid space char
*/
static PRBool IsSpace(PRUnichar ch);
#ifdef DEBUG
/**
* Retrieve the size of this string
* @return string length
*/
virtual void SizeOf(nsISizeOfHandler* aHandler, PRUint32* aResult) const;
#endif
private:
// NOT TO BE IMPLEMENTED
// these signatures help clients not accidentally call the wrong thing helped by C++ automatic integral promotion
void operator=( char );
void AssignWithConversion( const PRUnichar*, PRInt32=-1 );
void AppendWithConversion( const PRUnichar*, PRInt32=-1 );
void InsertWithConversion( const PRUnichar*, PRUint32, PRInt32=-1 );
};
inline
nsString::size_type
nsString::Left( nsAString& aResult, size_type aLengthToCopy ) const
{
return Mid(aResult, 0, aLengthToCopy);
}
inline
nsString::size_type
nsString::Right( self_type& aResult, size_type aLengthToCopy ) const
{
size_type myLength = Length();
aLengthToCopy = NS_MIN(myLength, aLengthToCopy);
return Mid(aResult, myLength-aLengthToCopy, aLengthToCopy);
}
// NS_DEF_STRING_COMPARISON_OPERATORS(nsString, PRUnichar)
// NS_DEF_DERIVED_STRING_OPERATOR_PLUS(nsString, PRUnichar)
/**************************************************************
Here comes the AutoString class which uses internal memory
(typically found on the stack) for its default buffer.
If the buffer needs to grow, it gets reallocated on the heap.
**************************************************************/
class NS_COM nsAutoString : public nsString {
public:
virtual ~nsAutoString() {}
nsAutoString();
nsAutoString(const nsAutoString& aString);
explicit nsAutoString(const nsAString& aString);
explicit nsAutoString(const nsString& aString);
explicit nsAutoString(const PRUnichar* aString);
nsAutoString(const PRUnichar* aString,PRInt32 aLength);
explicit nsAutoString(PRUnichar aChar);
explicit nsAutoString(const CBufDescriptor& aBuffer);
nsAutoString& operator=( const nsAutoString& aString ) { Assign(aString); return *this; }
private:
void operator=( char ); // NOT TO BE IMPLEMENTED
public:
nsAutoString& operator=( const nsAString& aReadable ) { Assign(aReadable); return *this; }
// nsAutoString& operator=( const nsPromiseReadable<PRUnichar>& aReadable ) { Assign(aReadable); return *this; }
nsAutoString& operator=( const PRUnichar* aPtr ) { Assign(aPtr); return *this; }
nsAutoString& operator=( PRUnichar aChar ) { Assign(aChar); return *this; }
#ifdef DEBUG
/**
* Retrieve the size of this string
* @return string length
*/
virtual void SizeOf(nsISizeOfHandler* aHandler, PRUint32* aResult) const;
#endif
PRUnichar mBuffer[kDefaultStringSize];
};
// NS_DEF_DERIVED_STRING_OPERATOR_PLUS(nsAutoString, PRUnichar)
class NS_COM NS_ConvertASCIItoUCS2
: public nsAutoString
/*
...
*/
{
public:
explicit
NS_ConvertASCIItoUCS2( const nsACString& aCString );
explicit
NS_ConvertASCIItoUCS2( const nsAFlatCString& aCString )
{
Init( aCString.get(), aCString.Length() );
}
explicit
NS_ConvertASCIItoUCS2( const char* aCString )
{
Init( aCString, ~PRUint32(0) /* MAXINT */ );
}
NS_ConvertASCIItoUCS2( const char* aCString, PRUint32 aLength )
{
Init( aCString, aLength );
}
#if 0
operator const nsDependentString() const
{
return nsDependentString(mUStr, mLength);
}
#endif
protected:
void Init( const char* aCString, PRUint32 aLength );
private:
// NOT TO BE IMPLEMENTED
NS_ConvertASCIItoUCS2( PRUnichar );
};
class NS_COM NS_ConvertUTF8toUCS2
: public nsAutoString
{
public:
explicit
NS_ConvertUTF8toUCS2( const nsACString& aCString )
{
Init( aCString );
}
explicit
NS_ConvertUTF8toUCS2( const char* aCString )
{
Init( nsDependentCString( aCString ) );
}
NS_ConvertUTF8toUCS2( const char* aCString, PRUint32 aLength )
{
Init( Substring( aCString, aCString + aLength ) );
}
protected:
void Init( const nsACString& aCString );
private:
NS_ConvertUTF8toUCS2( PRUnichar );
};
#define PLANE1_BASE 0x00010000
#define UCS2_REPLACEMENT_CHAR 0xfffd
class ConvertUTF8toUCS2
{
public:
typedef nsACString::char_type value_type;
typedef nsAString::char_type buffer_type;
ConvertUTF8toUCS2( buffer_type* aBuffer )
: mStart(aBuffer), mBuffer(aBuffer), mErrorEncountered(PR_FALSE) {}
size_t Length() const { return mBuffer - mStart; }
PRUint32 write( const value_type* start, PRUint32 N )
{
if ( mErrorEncountered )
return N;
// algorithm assumes utf8 units won't
// be spread across fragments
const value_type* p = start;
const value_type* end = start + N;
for ( ; p != end /* && *p */; )
{
char c = *p++;
if ( UTF8traits::isASCII(c) )
{
*mBuffer++ = buffer_type(c);
continue;
}
PRUint32 ucs4;
PRUint32 minUcs4;
PRInt32 state = 0;
if ( UTF8traits::is2byte(c) )
{
ucs4 = (PRUint32(c) << 6) & 0x000007C0L;
state = 1;
minUcs4 = 0x00000080;
}
else if ( UTF8traits::is3byte(c) )
{
ucs4 = (PRUint32(c) << 12) & 0x0000F000L;
state = 2;
minUcs4 = 0x00000800;
}
else if ( UTF8traits::is4byte(c) )
{
ucs4 = (PRUint32(c) << 18) & 0x001F0000L;
state = 3;
minUcs4 = 0x00010000;
}
else if ( UTF8traits::is5byte(c) )
{
ucs4 = (PRUint32(c) << 24) & 0x03000000L;
state = 4;
minUcs4 = 0x00200000;
}
else if ( UTF8traits::is6byte(c) )
{
ucs4 = (PRUint32(c) << 30) & 0x40000000L;
state = 5;
minUcs4 = 0x04000000;
}
else
{
NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
mErrorEncountered = PR_TRUE;
return N;
}
while ( state-- )
{
c = *p++;
if ( UTF8traits::isInSeq(c) )
{
PRInt32 shift = state * 6;
ucs4 |= (PRUint32(c) & 0x3F) << shift;
}
else
{
NS_ERROR("not a UTF8 string");
mErrorEncountered = PR_TRUE;
return N;
}
}
if ( ucs4 < minUcs4 )
{
// Overlong sequence
*mBuffer++ = UCS2_REPLACEMENT_CHAR;
}
else if ( ucs4 <= 0xD7FF )
{
*mBuffer++ = ucs4;
}
else if ( /* ucs4 >= 0xD800 && */ ucs4 <= 0xDFFF )
{
// Surrogates
*mBuffer++ = UCS2_REPLACEMENT_CHAR;
}
else if ( ucs4 == 0xFFFE || ucs4 == 0xFFFF )
{
// Prohibited characters
*mBuffer++ = UCS2_REPLACEMENT_CHAR;
}
else if ( ucs4 >= PLANE1_BASE )
{
if ( ucs4 >= 0x00110000 )
*mBuffer++ = UCS2_REPLACEMENT_CHAR;
else {
// surrogate, see unicode specification 3.7 for following math.
ucs4 -= PLANE1_BASE;
*mBuffer++ = (PRUnichar)(ucs4 >> 10) | 0xd800u;
*mBuffer++ = (PRUnichar)(ucs4 & 0x3ff) | 0xdc00u;
}
}
else
{
if ( ucs4 != 0xFEFF ) // ignore BOM
*mBuffer++ = ucs4;
}
}
return p - start;
}
private:
buffer_type* mStart;
buffer_type* mBuffer;
PRBool mErrorEncountered;
};
#endif /* !defined(nsString2_h__) */