mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-11-25 22:01:30 +00:00
bug 191542 : Add UTF-8 equivalent of |IsASCII|, IsUTF8. r=smontagu, sr=alecf
This commit is contained in:
parent
adb09c10c3
commit
a5ddc8b721
@ -191,9 +191,8 @@ nsresult nsTextToSubURI::convertURItoUnicode(const nsAFlatCString &aCharset,
|
||||
}
|
||||
|
||||
if (!isStatefulCharset && aIRI) {
|
||||
NS_ConvertUTF8toUCS2 ucs2(aURI);
|
||||
if (aURI.Equals(NS_ConvertUCS2toUTF8(ucs2))) {
|
||||
_retval.Assign(ucs2);
|
||||
if (IsUTF8(aURI)) {
|
||||
_retval.Assign(NS_ConvertUTF8toUCS2(aURI));
|
||||
return rv;
|
||||
}
|
||||
}
|
||||
|
@ -174,6 +174,26 @@ NS_COM PRBool IsASCII( const nsAString& aString );
|
||||
NS_COM PRBool IsASCII( const nsACString& aString );
|
||||
|
||||
|
||||
/**
|
||||
* Returns |PR_TRUE| if |aString| is a valid UTF-8 string.
|
||||
* XXX This is neither bullet-proof nor an all-purpose UTF-8 validator.
|
||||
* It is mainly written to replace, and is roughly equivalent to,
|
||||
*
|
||||
* str.Equals(NS_ConvertUCS2toUTF8(NS_ConvertUTF8toUCS2(str)))
|
||||
*
|
||||
* (see bug 191541)
|
||||
* As such, it does not check for non-UTF-8 7bit encodings such as
|
||||
* ISO-2022-JP and HZ. However, it filters out UTF-8 representation
|
||||
* of surrogate codepoints and non-characters ( 0xFFFE and 0xFFFF
|
||||
* in planes 0 through 16.) as well as overlong UTF-8 sequences.
|
||||
* Also note that it regards UTF-8 sequences corresponding to
|
||||
* codepoints above 0x10FFFF as invalid in accordance with
|
||||
* http://www.ietf.org/internet-drafts/draft-yergeau-rfc2279bis-04.txt
|
||||
*
|
||||
* @param aString an 8-bit wide string to scan
|
||||
*/
|
||||
NS_COM PRBool IsUTF8( const nsACString& aString );
|
||||
|
||||
|
||||
/**
|
||||
* Converts case in place in the argument string.
|
||||
|
@ -367,7 +367,102 @@ IsASCII( const nsACString& aString )
|
||||
return PR_TRUE;
|
||||
}
|
||||
|
||||
NS_COM
PRBool
IsUTF8( const nsACString& aString )
  {
    // Walks |aString| one fragment at a time with a small state machine and
    // returns PR_FALSE as soon as a byte is seen that cannot be part of an
    // accepted UTF-8 sequence.  All of the state below survives fragment
    // boundaries, so a multi-byte sequence may straddle two fragments.
    nsReadingIterator<char> end;
    aString.EndReading(end);

    PRInt32  pending      = 0;         // continuation bytes still owed by the current sequence
    PRBool   rejectLow    = PR_FALSE;  // next byte must be > |lowCeiling| (overlong forms)
    PRBool   rejectHigh   = PR_FALSE;  // next byte must be < |highFloor| (surrogates, > 0x10FFFF)
    PRBool   maybeNonchar = PR_FALSE;  // sequence so far still matches a non-character pattern
    PRUint16 lowCeiling   = 0;
    PRUint16 highFloor    = 0;

    PRUint32 chunkLen = 0;
    nsReadingIterator<char> cursor;
    aString.BeginReading(cursor);

    // One pass per fragment of |aString|.
    while ( cursor != end )
      {
        chunkLen = PRUint32(cursor.size_forward());
        const char* p = cursor.get();
        const char* chunkEnd = p + chunkLen;

        // One pass per character (or per tail of a character that began in
        // the previous fragment).
        while ( p < chunkEnd )
          {
            PRUint8 octet;

            if ( 0 == pending )
              {
                // Lead byte: classify it and arm the checks that apply to
                // the continuation bytes that follow.
                octet = *p++;

                if ( UTF8traits::isASCII(octet) )
                  continue;

                // 80-BF is a continuation byte with no lead byte; C0-C1 can
                // only start an overlong 2-byte form.
                if ( octet <= 0xC1 )
                  return PR_FALSE;

                if ( UTF8traits::is2byte(octet) )
                  {
                    pending = 1;
                  }
                else if ( UTF8traits::is3byte(octet) )
                  {
                    pending = 2;
                    switch ( octet )
                      {
                        case 0xE0:  // E0 [80-9F] xx is overlong
                          rejectLow  = PR_TRUE;
                          lowCeiling = 0x9F;
                          break;
                        case 0xED:  // ED [A0-BF] xx encodes a surrogate codepoint
                          rejectHigh = PR_TRUE;
                          highFloor  = 0xA0;
                          break;
                        case 0xEF:  // EF BF [BE-BF] is a non-character
                          maybeNonchar = PR_TRUE;
                          break;
                        default:
                          break;
                      }
                  }
                else if ( octet <= 0xF4 )  // XXX replace /w UTF8traits::is4byte when it's updated to exclude [F5-F7].(bug 199090)
                  {
                    pending = 3;
                    maybeNonchar = PR_TRUE;
                    switch ( octet )
                      {
                        case 0xF0:  // F0 [80-8F] xx xx is overlong
                          rejectLow  = PR_TRUE;
                          lowCeiling = 0x8F;
                          break;
                        case 0xF4:  // F4 [90-BF] xx xx lies beyond 0x10FFFF
                          rejectHigh = PR_TRUE;
                          highFloor  = 0x90;
                          break;
                        default:
                          break;
                      }
                  }
                else
                  {
                    return PR_FALSE;  // F5-FF never start an accepted sequence
                  }
              }

            // Consume as many owed continuation bytes as this fragment holds.
            while ( p < chunkEnd && pending )
              {
                octet = *p++;
                --pending;

                // Non-character patterns: EF BF [BE-BF], or for a 4-byte
                // sequence a second byte whose low nibble is F, then BF, then
                // [BE-BF].  |pending| (already decremented) tells which
                // position |octet| occupies, counted from the end.
                if ( maybeNonchar )
                  {
                    PRBool stillMatches;
                    switch ( pending )
                      {
                        case 0:
                          stillMatches = ( octet >= 0xBE );
                          break;
                        case 1:
                          stillMatches = ( octet == 0xBF );
                          break;
                        case 2:
                          stillMatches = ( 0x0F == (0x0F & octet) );
                          break;
                        default:
                          stillMatches = PR_TRUE;
                          break;
                      }
                    if ( !stillMatches )
                      maybeNonchar = PR_FALSE;
                  }

                if ( !UTF8traits::isInSeq(octet) )
                  return PR_FALSE;  // not a continuation byte
                if ( rejectLow && octet <= lowCeiling )
                  return PR_FALSE;  // overlong encoding
                if ( rejectHigh && highFloor <= octet )
                  return PR_FALSE;  // surrogate or above 0x10FFFF
                if ( maybeNonchar && !pending )
                  return PR_FALSE;  // whole sequence matched a non-character

                // The range restrictions only ever apply to the first
                // continuation byte.
                rejectLow = rejectHigh = PR_FALSE;
              }
          }
      }

    // Continuation bytes still owed at the end means a truncated sequence.
    return !pending;
  }
|
||||
|
||||
/**
|
||||
* A character sink for in-place case conversion.
|
||||
|
@ -174,6 +174,26 @@ NS_COM PRBool IsASCII( const nsAString& aString );
|
||||
NS_COM PRBool IsASCII( const nsACString& aString );
|
||||
|
||||
|
||||
/**
|
||||
* Returns |PR_TRUE| if |aString| is a valid UTF-8 string.
|
||||
* XXX This is neither bullet-proof nor an all-purpose UTF-8 validator.
|
||||
* It is mainly written to replace, and is roughly equivalent to,
|
||||
*
|
||||
* str.Equals(NS_ConvertUCS2toUTF8(NS_ConvertUTF8toUCS2(str)))
|
||||
*
|
||||
* (see bug 191541)
|
||||
* As such, it does not check for non-UTF-8 7bit encodings such as
|
||||
* ISO-2022-JP and HZ. However, it filters out UTF-8 representation
|
||||
* of surrogate codepoints and non-characters ( 0xFFFE and 0xFFFF
|
||||
* in planes 0 through 16.) as well as overlong UTF-8 sequences.
|
||||
* Also note that it regards UTF-8 sequences corresponding to
|
||||
* codepoints above 0x10FFFF as invalid in accordance with
|
||||
* http://www.ietf.org/internet-drafts/draft-yergeau-rfc2279bis-04.txt
|
||||
*
|
||||
* @param aString an 8-bit wide string to scan
|
||||
*/
|
||||
NS_COM PRBool IsUTF8( const nsACString& aString );
|
||||
|
||||
|
||||
/**
|
||||
* Converts case in place in the argument string.
|
||||
|
@ -367,7 +367,102 @@ IsASCII( const nsACString& aString )
|
||||
return PR_TRUE;
|
||||
}
|
||||
|
||||
NS_COM
PRBool
IsUTF8( const nsACString& aString )
  {
    // Walks |aString| one fragment at a time with a small state machine and
    // returns PR_FALSE as soon as a byte is seen that cannot be part of an
    // accepted UTF-8 sequence.  All of the state below survives fragment
    // boundaries, so a multi-byte sequence may straddle two fragments.
    nsReadingIterator<char> end;
    aString.EndReading(end);

    PRInt32  pending      = 0;         // continuation bytes still owed by the current sequence
    PRBool   rejectLow    = PR_FALSE;  // next byte must be > |lowCeiling| (overlong forms)
    PRBool   rejectHigh   = PR_FALSE;  // next byte must be < |highFloor| (surrogates, > 0x10FFFF)
    PRBool   maybeNonchar = PR_FALSE;  // sequence so far still matches a non-character pattern
    PRUint16 lowCeiling   = 0;
    PRUint16 highFloor    = 0;

    PRUint32 chunkLen = 0;
    nsReadingIterator<char> cursor;
    aString.BeginReading(cursor);

    // One pass per fragment of |aString|.
    while ( cursor != end )
      {
        chunkLen = PRUint32(cursor.size_forward());
        const char* p = cursor.get();
        const char* chunkEnd = p + chunkLen;

        // One pass per character (or per tail of a character that began in
        // the previous fragment).
        while ( p < chunkEnd )
          {
            PRUint8 octet;

            if ( 0 == pending )
              {
                // Lead byte: classify it and arm the checks that apply to
                // the continuation bytes that follow.
                octet = *p++;

                if ( UTF8traits::isASCII(octet) )
                  continue;

                // 80-BF is a continuation byte with no lead byte; C0-C1 can
                // only start an overlong 2-byte form.
                if ( octet <= 0xC1 )
                  return PR_FALSE;

                if ( UTF8traits::is2byte(octet) )
                  {
                    pending = 1;
                  }
                else if ( UTF8traits::is3byte(octet) )
                  {
                    pending = 2;
                    switch ( octet )
                      {
                        case 0xE0:  // E0 [80-9F] xx is overlong
                          rejectLow  = PR_TRUE;
                          lowCeiling = 0x9F;
                          break;
                        case 0xED:  // ED [A0-BF] xx encodes a surrogate codepoint
                          rejectHigh = PR_TRUE;
                          highFloor  = 0xA0;
                          break;
                        case 0xEF:  // EF BF [BE-BF] is a non-character
                          maybeNonchar = PR_TRUE;
                          break;
                        default:
                          break;
                      }
                  }
                else if ( octet <= 0xF4 )  // XXX replace /w UTF8traits::is4byte when it's updated to exclude [F5-F7].(bug 199090)
                  {
                    pending = 3;
                    maybeNonchar = PR_TRUE;
                    switch ( octet )
                      {
                        case 0xF0:  // F0 [80-8F] xx xx is overlong
                          rejectLow  = PR_TRUE;
                          lowCeiling = 0x8F;
                          break;
                        case 0xF4:  // F4 [90-BF] xx xx lies beyond 0x10FFFF
                          rejectHigh = PR_TRUE;
                          highFloor  = 0x90;
                          break;
                        default:
                          break;
                      }
                  }
                else
                  {
                    return PR_FALSE;  // F5-FF never start an accepted sequence
                  }
              }

            // Consume as many owed continuation bytes as this fragment holds.
            while ( p < chunkEnd && pending )
              {
                octet = *p++;
                --pending;

                // Non-character patterns: EF BF [BE-BF], or for a 4-byte
                // sequence a second byte whose low nibble is F, then BF, then
                // [BE-BF].  |pending| (already decremented) tells which
                // position |octet| occupies, counted from the end.
                if ( maybeNonchar )
                  {
                    PRBool stillMatches;
                    switch ( pending )
                      {
                        case 0:
                          stillMatches = ( octet >= 0xBE );
                          break;
                        case 1:
                          stillMatches = ( octet == 0xBF );
                          break;
                        case 2:
                          stillMatches = ( 0x0F == (0x0F & octet) );
                          break;
                        default:
                          stillMatches = PR_TRUE;
                          break;
                      }
                    if ( !stillMatches )
                      maybeNonchar = PR_FALSE;
                  }

                if ( !UTF8traits::isInSeq(octet) )
                  return PR_FALSE;  // not a continuation byte
                if ( rejectLow && octet <= lowCeiling )
                  return PR_FALSE;  // overlong encoding
                if ( rejectHigh && highFloor <= octet )
                  return PR_FALSE;  // surrogate or above 0x10FFFF
                if ( maybeNonchar && !pending )
                  return PR_FALSE;  // whole sequence matched a non-character

                // The range restrictions only ever apply to the first
                // continuation byte.
                rejectLow = rejectHigh = PR_FALSE;
              }
          }
      }

    // Continuation bytes still owed at the end means a truncated sequence.
    return !pending;
  }
|
||||
|
||||
/**
|
||||
* A character sink for in-place case conversion.
|
||||
|
Loading…
Reference in New Issue
Block a user