Treat all empty and incomplete sequences as encoding errors, and some other clean-up. Bug 381412, r=jshin, sr=dveditz, b1.9=jst

This commit is contained in:
smontagu@smontagu.org 2007-09-05 22:02:17 -07:00
parent 0b5f01f577
commit 18da58e54b
9 changed files with 134 additions and 14 deletions

View File

@ -79,6 +79,7 @@
// Constructs the HZ decoder in its initial state.
nsHZToUnicode::nsHZToUnicode()
  : nsBufferDecoderSupport(1),
    mHZState(HZ_STATE_ASCII), // per HZ spec, default to ASCII state
    mRunLength(0)             // no GB-encoded characters seen in the current run
{
}
// Overrides the ConvertNoBuff() defined in nsUCvCnSupport.cpp.
NS_IMETHODIMP nsHZToUnicode::ConvertNoBuff(
@ -102,8 +103,13 @@ NS_IMETHODIMP nsHZToUnicode::ConvertNoBuff(
}
if ( *aSrc & 0x80 ) // if it is a 8-bit byte
{
// The source is a 8-bit GBCode
*aDest = mUtil.GBKCharToUnicode(aSrc[0], aSrc[1]);
if (UINT8_IN_RANGE(0x81, aSrc[0], 0xFE) &&
UINT8_IN_RANGE(0x40, aSrc[1], 0xFE)) {
// The source is a 8-bit GBCode
*aDest = mUtil.GBKCharToUnicode(aSrc[0], aSrc[1]);
} else {
*aDest = UCS2_NO_MAPPING;
}
aSrc += 2;
i++;
iDestlen++;
@ -123,6 +129,7 @@ NS_IMETHODIMP nsHZToUnicode::ConvertNoBuff(
// we got a '~{'
// we are switching to HZ state
mHZState = HZ_STATE_GB;
mRunLength = 0;
aSrc += 2;
i++;
break;
@ -132,6 +139,12 @@ NS_IMETHODIMP nsHZToUnicode::ConvertNoBuff(
mHZState = HZ_STATE_ASCII;
aSrc += 2;
i++;
if (mRunLength == 0) {
*aDest = UCS2_NO_MAPPING;
iDestlen++;
aDest++;
}
mRunLength = 0;
break;
case HZLEAD1:
// we got a '~~', process like an ASCII, but no state change
@ -141,6 +154,7 @@ NS_IMETHODIMP nsHZToUnicode::ConvertNoBuff(
i++;
iDestlen++;
aDest++;
mRunLength++;
break;
case HZLEAD4:
// we got a "~\n", it means maintain double byte mode cross lines, ignore the '~' itself
@ -152,6 +166,9 @@ NS_IMETHODIMP nsHZToUnicode::ConvertNoBuff(
default:
// an undefined ESC sequence '~X' is an illegal combination: skip both bytes and emit UCS2_NO_MAPPING
aSrc += 2;
*aDest = UCS2_NO_MAPPING;
iDestlen++;
aDest++;
break;
};
continue;// go for next loop
@ -166,6 +183,7 @@ NS_IMETHODIMP nsHZToUnicode::ConvertNoBuff(
i++;
iDestlen++;
aDest++;
mRunLength++;
break;
case HZ_STATE_ASCII:
default:

View File

@ -70,6 +70,7 @@ protected:
private:
PRInt16 mHZState;
PRUint32 mRunLength; // length of a run of 8-bit GB-encoded characters
};

View File

@ -164,6 +164,7 @@ NS_IMETHODIMP nsISO2022CNToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen
case eState_ESC_24_29_A: // ESC $ ) A
if(SO == *src) {
mState = eState_GB2312_1980;
mRunLength = 0;
} else {
if(dest+5 >= destEnd)
goto error1;
@ -180,6 +181,12 @@ NS_IMETHODIMP nsISO2022CNToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen
case eState_GB2312_1980: // ESC $ ) A SO
if(SI == *src) { // Shift-In (SI)
mState = eState_ESC_24_29_A_SO_SI;
if (mRunLength == 0) {
if(dest+1 >= destEnd)
goto error1;
*dest++ = 0xFFFD;
}
mRunLength = 0;
} else if(ESC == *src) {
mState = eState_ESC;
} else {
@ -204,6 +211,7 @@ NS_IMETHODIMP nsISO2022CNToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen
aLen = destEnd - dest;
rv = GB2312_To_Unicode(gb, gbLen, dest, &aLen);
++mRunLength;
if(rv == NS_OK_UDEC_MOREOUTPUT) {
goto error1;
} else if(NS_FAILED(rv)) {
@ -223,6 +231,7 @@ NS_IMETHODIMP nsISO2022CNToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen
case eState_ESC_24_29_A_SO_SI: // ESC $ ) A SO SI
if(SO == *src) {
mState = eState_GB2312_1980;
mRunLength = 0;
} else if(ESC == *src) {
mState = eState_ESC;
} else {
@ -237,6 +246,7 @@ NS_IMETHODIMP nsISO2022CNToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen
case eState_ESC_24_29_G: // ESC $ ) G
if(SO == *src) {
mState = eState_CNS11643_1;
mRunLength = 0;
} else {
if(dest+5 >= destEnd)
goto error1;
@ -253,6 +263,12 @@ NS_IMETHODIMP nsISO2022CNToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen
case eState_CNS11643_1: // ESC $ ) G SO
if(SI == *src) { // Shift-In (SI)
mState = eState_ESC_24_29_G_SO_SI;
if (mRunLength == 0) {
if(dest+1 >= destEnd)
goto error1;
*dest++ = 0xFFFD;
}
mRunLength = 0;
} else if(ESC == *src) {
mState = eState_ESC;
} else {
@ -277,6 +293,7 @@ NS_IMETHODIMP nsISO2022CNToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen
aLen = destEnd - dest;
rv = EUCTW_To_Unicode(cns, cnsLen, dest, &aLen);
++mRunLength;
if(rv == NS_OK_UDEC_MOREOUTPUT) {
goto error1;
} else if(NS_FAILED(rv)) {
@ -296,6 +313,7 @@ NS_IMETHODIMP nsISO2022CNToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen
case eState_ESC_24_29_G_SO_SI: // ESC $ ) G SO SI
if(SO == *src) {
mState = eState_CNS11643_1;
mRunLength = 0;
} else if(ESC == *src) {
mState = eState_ESC;
} else {
@ -341,6 +359,7 @@ NS_IMETHODIMP nsISO2022CNToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen
case eState_ESC_24_2A_H_ESC: // ESC $ * H ESC
if(SS2 == *src) {
mState = eState_CNS11643_2;
mRunLength = 0;
} else if('$' == *src) {
mState = eState_ESC_24;
} else {
@ -360,6 +379,12 @@ NS_IMETHODIMP nsISO2022CNToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen
case eState_CNS11643_2: // ESC $ * H ESC SS2
if(SI == *src) { // Shift-In (SI)
mState = eState_ESC_24_2A_H_ESC_SS2_SI;
if (mRunLength == 0) {
if(dest+1 >= destEnd)
goto error1;
*dest++ = 0xFFFD;
}
mRunLength = 0;
} else if(ESC == *src) {
mState = eState_ESC_24_2A_H_ESC;
} else {
@ -386,6 +411,7 @@ NS_IMETHODIMP nsISO2022CNToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen
aLen = destEnd - dest;
rv = EUCTW_To_Unicode(cns, cnsLen, dest, &aLen);
++mRunLength;
if(rv == NS_OK_UDEC_MOREOUTPUT) {
goto error1;
} else if(NS_FAILED(rv)) {
@ -417,6 +443,7 @@ NS_IMETHODIMP nsISO2022CNToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen
case eState_ESC_24_2A_H_ESC_SS2_SI_ESC: // ESC $ * H ESC SS2 SI ESC
if(SS2 == *src) {
mState = eState_CNS11643_2;
mRunLength = 0;
} else if('$' == *src) {
mState = eState_ESC_24;
} else {
@ -463,6 +490,7 @@ NS_IMETHODIMP nsISO2022CNToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen
case eState_ESC_24_2B_I_ESC: // ESC $ + I ESC
if(SS3 == *src) {
mState = eState_CNS11643_3;
mRunLength = 0;
} else if('$' == *src) {
mState = eState_ESC_24;
} else {
@ -482,6 +510,12 @@ NS_IMETHODIMP nsISO2022CNToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen
case eState_CNS11643_3: // ESC $ + I ESC SS3
if(SI == *src) { // Shift-In (SI)
mState = eState_ESC_24_2B_I_ESC_SS3_SI;
if (mRunLength == 0) {
if(dest+1 >= destEnd)
goto error1;
*dest++ = 0xFFFD;
}
mRunLength = 0;
} else if(ESC == *src) {
mState = eState_ESC_24_2B_I_ESC;
} else {
@ -509,6 +543,7 @@ NS_IMETHODIMP nsISO2022CNToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen
aLen = destEnd - dest;
rv = EUCTW_To_Unicode(cns, cnsLen, dest, &aLen);
++mRunLength;
if(rv == NS_OK_UDEC_MOREOUTPUT) {
goto error1;
} else if(NS_FAILED(rv)) {
@ -540,6 +575,7 @@ NS_IMETHODIMP nsISO2022CNToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen
case eState_ESC_24_2B_I_ESC_SS3_SI_ESC: // ESC $ + I ESC SS3 SI ESC
if(SS3 == *src) {
mState = eState_CNS11643_3;
mRunLength = 0;
} else if('$' == *src) {
mState = eState_ESC_24;
} else {

View File

@ -56,7 +56,8 @@ class nsISO2022CNToUnicode : public nsBasicDecoderSupport
public:
nsISO2022CNToUnicode() :
mState(eState_ASCII),
mPlaneID(0) { }
mPlaneID(0),
mRunLength(0) { }
virtual ~nsISO2022CNToUnicode() {}
@ -74,6 +75,7 @@ public:
{
mState = eState_ASCII;
mPlaneID = 0;
mRunLength = 0;
return NS_OK;
}
@ -118,6 +120,9 @@ private:
// Plane number for CNS11643 code
int mPlaneID;
// Length of non-ASCII run
PRUint32 mRunLength;
// Decoder handler
nsCOMPtr<nsIUnicodeDecoder> mGB2312_Decoder;
nsCOMPtr<nsIUnicodeDecoder> mEUCTW_Decoder;

View File

@ -120,7 +120,7 @@ NS_IMETHODIMP nsShiftJISToUnicode::Convert(
{
case 0:
if(*src & 0x80 && *src != (unsigned char)0xa0)
if(*src & 0x80)
{
mData = SJIS_INDEX[*src & 0x7F];
if(mData < 0xE000 )
@ -130,15 +130,33 @@ NS_IMETHODIMP nsShiftJISToUnicode::Convert(
if( mData > 0xFF00)
{
if(0xFFFD == mData) {
// IE convert fd-ff as single byte and convert to
// U+f8f1 to U+f8f3
if((0xfd == *src) || (0xfe == *src) || (0xff == *src))
{
*dest++ = (PRUnichar) 0xf8f1 +
// IE-compatible handling of undefined codepoints:
// 0x80 --> U+0080
// 0xa0 --> U+F8F0
// 0xfd --> U+F8F1
// 0xfe --> U+F8F2
// 0xff --> U+F8F3
switch (*src) {
case 0x80:
*dest++ = (PRUnichar) *src;
break;
case 0xa0:
*dest++ = (PRUnichar) 0xf8f0;
break;
case 0xfd:
case 0xfe:
case 0xff:
*dest++ = (PRUnichar) 0xf8f1 +
(*src - (unsigned char)(0xfd));
if(dest >= destEnd)
goto error1;
break;
default:
*dest++ = 0x30FB;
}
if(dest >= destEnd)
goto error1;
} else {
*dest++ = mData; // JIS 0201
if(dest >= destEnd)
@ -517,10 +535,16 @@ NS_IMETHODIMP nsISO2022JPToUnicodeV2::Convert(
case mState_ESC_28: // ESC (
if( 'B' == *src) {
mState = mState_ASCII;
if (mRunLength == 0) {
goto error2;
}
mRunLength = 0;
} else if ('J' == *src) {
mState = mState_JISX0201_1976Roman;
mRunLength = 0;
} else if ('I' == *src) {
mState = mState_JISX0201_1976Kana;
mRunLength = 0;
} else {
if((dest+3) >= destEnd)
goto error1;
@ -536,10 +560,13 @@ NS_IMETHODIMP nsISO2022JPToUnicodeV2::Convert(
case mState_ESC_24: // ESC $
if( '@' == *src) {
mState = mState_JISX0208_1978;
mRunLength = 0;
} else if ('A' == *src) {
mState = mState_GB2312_1980;
mRunLength = 0;
} else if ('B' == *src) {
mState = mState_JISX0208_1983;
mRunLength = 0;
} else if ('(' == *src) {
mState = mState_ESC_24_28;
} else {
@ -557,8 +584,10 @@ NS_IMETHODIMP nsISO2022JPToUnicodeV2::Convert(
case mState_ESC_24_28: // ESC $ (
if( 'C' == *src) {
mState = mState_KSC5601_1987;
mRunLength = 0;
} else if ('D' == *src) {
mState = mState_JISX0212_1990;
mRunLength = 0;
} else {
if((dest+4) >= destEnd)
goto error1;
@ -583,6 +612,7 @@ NS_IMETHODIMP nsISO2022JPToUnicodeV2::Convert(
// we may need a if statement here for '\' and '~'
// to map them to Yen and Overbar
*dest++ = (PRUnichar) *src;
++mRunLength;
if(dest >= destEnd)
goto error1;
}
@ -595,6 +625,7 @@ NS_IMETHODIMP nsISO2022JPToUnicodeV2::Convert(
} else {
if((0x21 <= *src) && (*src <= 0x5F)) {
*dest++ = (0xFF61-0x0021) + *src;
++mRunLength;
} else {
goto error2;
}
@ -687,6 +718,7 @@ NS_IMETHODIMP nsISO2022JPToUnicodeV2::Convert(
// XXX We need to map from JIS X 0208 1983 to 1987
// in the next line before pass to *dest++
*dest++ = gJapaneseMap[mData+off];
++mRunLength;
}
mState = mState_JISX0208_1978;
if(dest >= destEnd)
@ -724,6 +756,7 @@ NS_IMETHODIMP nsISO2022JPToUnicodeV2::Convert(
mGB2312Decoder->Convert((const char *)gb, &gbLen,
&uni, &uniLen);
*dest++ = uni;
++mRunLength;
}
}
mState = mState_GB2312_1980;
@ -739,6 +772,7 @@ NS_IMETHODIMP nsISO2022JPToUnicodeV2::Convert(
goto error2;
} else {
*dest++ = gJapaneseMap[mData+off];
++mRunLength;
}
mState = mState_JISX0208_1983;
if(dest >= destEnd)
@ -776,6 +810,7 @@ NS_IMETHODIMP nsISO2022JPToUnicodeV2::Convert(
mEUCKRDecoder->Convert((const char *)ksc, &kscLen,
&uni, &uniLen);
*dest++ = uni;
++mRunLength;
}
}
mState = mState_KSC5601_1987;
@ -791,6 +826,7 @@ NS_IMETHODIMP nsISO2022JPToUnicodeV2::Convert(
goto error2;
} else {
*dest++ = gJapaneseMap[mData+off];
++mRunLength;
}
mState = mState_JISX0212_1990;
if(dest >= destEnd)
@ -824,6 +860,7 @@ NS_IMETHODIMP nsISO2022JPToUnicodeV2::Convert(
if((0x20 <= *src) && (*src <= 0x7F)) {
if (G2_ISO88591 == G2charset) {
*dest++ = *src | 0x80;
++mRunLength;
} else if (G2_ISO88597 == G2charset) {
if (!mISO88597Decoder) {
// creating a delegate converter (ISO-8859-7)
@ -845,6 +882,7 @@ NS_IMETHODIMP nsISO2022JPToUnicodeV2::Convert(
mISO88597Decoder->Convert((const char *)&gr, &grLen,
&uni, &uniLen);
*dest++ = uni;
++mRunLength;
}
} else {// G2charset is G2_unknown (not designated yet)
goto error2;
@ -864,6 +902,7 @@ NS_IMETHODIMP nsISO2022JPToUnicodeV2::Convert(
case mState_ERROR:
mState = mLastLegalState;
mRunLength = 0;
goto error2;
break;

View File

@ -123,6 +123,7 @@ public:
mState = mState_ASCII;
mLastLegalState = mState_ASCII;
mData = 0;
mRunLength = 0;
G2charset = G2_unknown;
mGB2312Decoder = nsnull;
mEUCKRDecoder = nsnull;
@ -148,6 +149,7 @@ public:
{
mState = mState_ASCII;
mLastLegalState = mState_ASCII;
mRunLength = 0;
setMapMode();
return NS_OK;
}
@ -176,6 +178,7 @@ private:
mState_ERROR
} mState, mLastLegalState;
PRInt32 mData;
PRInt32 mRunLength; // the length of a non-ASCII run
enum {
G2_unknown,
G2_ISO88591,

View File

@ -59,6 +59,7 @@ NS_IMETHODIMP nsISO2022KRToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen
}
else if(0x0e == *src) { // Shift-Out
mState = mState_KSX1001_1992;
mRunLength = 0;
}
else if(*src & 0x80) {
*dest++ = 0xFFFD;
@ -103,6 +104,12 @@ NS_IMETHODIMP nsISO2022KRToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen
mState = mLastLegalState;
if('C' == *src) {
mState = mState_ASCII;
if (mRunLength == 0) {
if(dest+1 >= destEnd)
goto error1;
*dest++ = 0xFFFD;
}
mRunLength = 0;
}
else {
if((dest+4) >= destEnd)
@ -122,11 +129,18 @@ NS_IMETHODIMP nsISO2022KRToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen
}
else if (0x0f == *src) { // Shift-In (SI)
mState = mState_ASCII;
if (mRunLength == 0) {
if(dest+1 >= destEnd)
goto error1;
*dest++ = 0xFFFD;
}
mRunLength = 0;
}
else if ((PRUint8) *src == 0x20 || (PRUint8) *src == 0x09) {
// Allow space and tab between SO and SI (i.e. in Hangul segment)
mState = mState_KSX1001_1992;
*dest++ = (PRUnichar) *src;
++mRunLength;
if(dest >= destEnd)
goto error1;
}
@ -164,6 +178,7 @@ NS_IMETHODIMP nsISO2022KRToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen
// Convert EUC-KR to unicode.
mEUCKRDecoder->Convert((const char *)ksx, &ksxLen, &uni, &uniLen);
*dest++ = uni;
++mRunLength;
}
if(dest >= destEnd)
goto error1;

View File

@ -50,6 +50,7 @@ public:
mLastLegalState = mState_ASCII;
mData = 0;
mEUCKRDecoder = nsnull;
mRunLength = 0;
}
virtual ~nsISO2022KRToUnicode()
@ -71,6 +72,7 @@ public:
{
mState = mState_ASCII;
mLastLegalState = mState_ASCII;
mRunLength = 0;
return NS_OK;
}
@ -87,6 +89,9 @@ private:
PRUint8 mData;
// Length of non-ASCII run
PRUint32 mRunLength;
nsIUnicodeDecoder *mEUCKRDecoder;
};
#endif // nsISO2022KRToUnicode_h__

View File

@ -787,8 +787,6 @@ PRIVATE PRBool uCheckAndScan4BytesGB18030(
(in[2] - 0x81)) * 10 ) + (in[3] - 0x30);
*inscanlen = 4;
if(data >= 0x00010000)
return PR_FALSE;
*out = (PRUint16) data;
*out = (data < 0x00010000) ? data : 0xFFFD;
return PR_TRUE;
}