Treat all empty and incomplete sequences as encoding errors, and some other clean-up. Bug 381412, r=jshin, sr=dveditz, b1.9=jst

2025-04-02 20:42:49 +00:00 · 2007-09-05 22:02:17 -07:00 · 2007-09-05 22:02:17 -07:00 · 18da58e54b
commit 18da58e54b
parent 0b5f01f577
9 changed files with 134 additions and 14 deletions
--- a/intl/uconv/ucvcn/nsHZToUnicode.cpp
+++ b/intl/uconv/ucvcn/nsHZToUnicode.cpp
@ -79,6 +79,7 @@
 nsHZToUnicode::nsHZToUnicode() : nsBufferDecoderSupport(1)
 {
  mHZState = HZ_STATE_ASCII;	// per HZ spec, default to ASCII state 
+  mRunLength = 0;
 }
 //Overwriting the ConvertNoBuff() in nsUCvCnSupport.cpp.
 NS_IMETHODIMP nsHZToUnicode::ConvertNoBuff(
@ -102,8 +103,13 @@ NS_IMETHODIMP nsHZToUnicode::ConvertNoBuff(
    }
    if ( *aSrc & 0x80 ) // if it is a 8-bit byte
    {
-      // The source is a 8-bit GBCode
-      *aDest = mUtil.GBKCharToUnicode(aSrc[0], aSrc[1]);
+      if (UINT8_IN_RANGE(0x81, aSrc[0], 0xFE) &&
+          UINT8_IN_RANGE(0x40, aSrc[1], 0xFE)) {
+        // The source is a 8-bit GBCode
+        *aDest = mUtil.GBKCharToUnicode(aSrc[0], aSrc[1]);
+      } else {
+        *aDest = UCS2_NO_MAPPING;
+      }
      aSrc += 2;
      i++;
      iDestlen++;
@ -123,6 +129,7 @@ NS_IMETHODIMP nsHZToUnicode::ConvertNoBuff(
          // we got a '~{'
          // we are switching to HZ state
          mHZState = HZ_STATE_GB;
+          mRunLength = 0;
          aSrc += 2;
          i++;
          break;
@ -132,6 +139,12 @@ NS_IMETHODIMP nsHZToUnicode::ConvertNoBuff(
          mHZState = HZ_STATE_ASCII;
          aSrc += 2;
          i++;
+          if (mRunLength == 0) {
+            *aDest = UCS2_NO_MAPPING;
+            iDestlen++;
+            aDest++;
+          }
+          mRunLength = 0;
          break;
        case HZLEAD1: 
          // we got a '~~', process like an ASCII, but no state change
@ -141,6 +154,7 @@ NS_IMETHODIMP nsHZToUnicode::ConvertNoBuff(
          i++;
          iDestlen++;
          aDest++;
+          mRunLength++;
          break;
        case HZLEAD4:	
          // we got a "~\n", it means maintain double byte mode cross lines, ignore the '~' itself
@ -152,6 +166,9 @@ NS_IMETHODIMP nsHZToUnicode::ConvertNoBuff(
        default:
          // undefined ESC sequence '~X' are ignored since this is a illegal combination 
          aSrc += 2;
+          *aDest = UCS2_NO_MAPPING;
+          iDestlen++;
+          aDest++;
          break;
      };
      continue;// go for next loop
@ -166,6 +183,7 @@ NS_IMETHODIMP nsHZToUnicode::ConvertNoBuff(
        i++;
        iDestlen++;
        aDest++;
+        mRunLength++;
        break;
      case HZ_STATE_ASCII:
      default:
--- a/intl/uconv/ucvcn/nsHZToUnicode.h
+++ b/intl/uconv/ucvcn/nsHZToUnicode.h
@ -70,6 +70,7 @@ protected:

 private:
  PRInt16 mHZState;
+  PRUint32 mRunLength; // length of the current GB run: reset at "~{", tested on the switch back to ASCII to flag an empty run

 };

--- a/intl/uconv/ucvcn/nsISO2022CNToUnicode.cpp
+++ b/intl/uconv/ucvcn/nsISO2022CNToUnicode.cpp
@ -164,6 +164,7 @@ NS_IMETHODIMP nsISO2022CNToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen
      case eState_ESC_24_29_A:  // ESC $ ) A
        if(SO == *src) {
           mState = eState_GB2312_1980;
+           mRunLength = 0;
        } else {
           if(dest+5 >= destEnd)
              goto error1;
@ -180,6 +181,12 @@ NS_IMETHODIMP nsISO2022CNToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen
      case eState_GB2312_1980:   // ESC $ ) A SO
        if(SI == *src) { // Shift-In (SI)
           mState = eState_ESC_24_29_A_SO_SI;
+           if (mRunLength == 0) {
+              if(dest+1 >= destEnd)
+                 goto error1;
+              *dest++ = 0xFFFD;
+           }
+           mRunLength = 0;
        } else if(ESC == *src) {
           mState = eState_ESC;
        } else {
@ -204,6 +211,7 @@ NS_IMETHODIMP nsISO2022CNToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen

           aLen = destEnd - dest;
           rv = GB2312_To_Unicode(gb, gbLen, dest, &aLen);
+           ++mRunLength;
           if(rv == NS_OK_UDEC_MOREOUTPUT) {
              goto error1;
           } else if(NS_FAILED(rv)) {
@ -223,6 +231,7 @@ NS_IMETHODIMP nsISO2022CNToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen
      case eState_ESC_24_29_A_SO_SI:  // ESC $ ) A SO SI
        if(SO == *src) {
           mState = eState_GB2312_1980;
+           mRunLength = 0;
        } else if(ESC == *src) {
           mState = eState_ESC;
        } else {
@ -237,6 +246,7 @@ NS_IMETHODIMP nsISO2022CNToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen
      case eState_ESC_24_29_G:   // ESC $ ) G
        if(SO == *src) {
           mState = eState_CNS11643_1;
+           mRunLength = 0;
        } else {
           if(dest+5 >= destEnd)
              goto error1;
@ -253,6 +263,12 @@ NS_IMETHODIMP nsISO2022CNToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen
      case eState_CNS11643_1:   // ESC $ ) G SO
        if(SI == *src) { // Shift-In (SI)
           mState = eState_ESC_24_29_G_SO_SI;
+           if (mRunLength == 0) {
+              if(dest+1 >= destEnd)
+                 goto error1;
+              *dest++ = 0xFFFD;
+           }
+           mRunLength = 0;
        } else if(ESC == *src) {
           mState = eState_ESC;
        } else {
@ -277,6 +293,7 @@ NS_IMETHODIMP nsISO2022CNToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen

           aLen = destEnd - dest;
           rv = EUCTW_To_Unicode(cns, cnsLen, dest, &aLen);
+           ++mRunLength;
           if(rv == NS_OK_UDEC_MOREOUTPUT) {
              goto error1;
           } else if(NS_FAILED(rv)) {
@ -296,6 +313,7 @@ NS_IMETHODIMP nsISO2022CNToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen
      case eState_ESC_24_29_G_SO_SI: // ESC $ ) G SO SI
        if(SO == *src) {
           mState = eState_CNS11643_1;
+           mRunLength = 0;
        } else if(ESC == *src) {
           mState = eState_ESC;
        } else {
@ -341,6 +359,7 @@ NS_IMETHODIMP nsISO2022CNToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen
      case eState_ESC_24_2A_H_ESC:  // ESC $ * H ESC
        if(SS2 == *src) {
           mState = eState_CNS11643_2;
+           mRunLength = 0;
        } else if('$' == *src) {
           mState = eState_ESC_24;
        } else {
@ -360,6 +379,12 @@ NS_IMETHODIMP nsISO2022CNToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen
      case eState_CNS11643_2:  // ESC $ * H ESC SS2
        if(SI == *src) { // Shift-In (SI)
           mState = eState_ESC_24_2A_H_ESC_SS2_SI;
+           if (mRunLength == 0) {
+              if(dest+1 >= destEnd)
+                 goto error1;
+              *dest++ = 0xFFFD;
+           }
+           mRunLength = 0;
        } else if(ESC == *src) {
           mState = eState_ESC_24_2A_H_ESC;
        } else {
@ -386,6 +411,7 @@ NS_IMETHODIMP nsISO2022CNToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen
 
           aLen = destEnd - dest;
           rv = EUCTW_To_Unicode(cns, cnsLen, dest, &aLen);
+           ++mRunLength;
           if(rv == NS_OK_UDEC_MOREOUTPUT) {
              goto error1;
           } else if(NS_FAILED(rv)) {
@ -417,6 +443,7 @@ NS_IMETHODIMP nsISO2022CNToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen
      case eState_ESC_24_2A_H_ESC_SS2_SI_ESC:  // ESC $ * H ESC SS2 SI ESC
        if(SS2 == *src) {
           mState = eState_CNS11643_2;
+           mRunLength = 0;
        } else if('$' == *src) {
           mState = eState_ESC_24;
        } else {
@ -463,6 +490,7 @@ NS_IMETHODIMP nsISO2022CNToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen
      case eState_ESC_24_2B_I_ESC:  // ESC $ + I ESC
        if(SS3 == *src) {
           mState = eState_CNS11643_3;
+           mRunLength = 0;
        } else if('$' == *src) {
           mState = eState_ESC_24;
        } else {
@ -482,6 +510,12 @@ NS_IMETHODIMP nsISO2022CNToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen
      case eState_CNS11643_3:   // ESC $ + I ESC SS3
        if(SI == *src) { // Shift-In (SI)
           mState = eState_ESC_24_2B_I_ESC_SS3_SI;
+           if (mRunLength == 0) {
+              if(dest+1 >= destEnd)
+                 goto error1;
+              *dest++ = 0xFFFD;
+           }
+           mRunLength = 0;
        } else if(ESC == *src) {
           mState = eState_ESC_24_2B_I_ESC;
        } else {
@ -509,6 +543,7 @@ NS_IMETHODIMP nsISO2022CNToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen

           aLen = destEnd - dest;
           rv = EUCTW_To_Unicode(cns, cnsLen, dest, &aLen);
+           ++mRunLength;
           if(rv == NS_OK_UDEC_MOREOUTPUT) {
              goto error1;
           } else if(NS_FAILED(rv)) {
@ -540,6 +575,7 @@ NS_IMETHODIMP nsISO2022CNToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen
      case eState_ESC_24_2B_I_ESC_SS3_SI_ESC:  // ESC $ + I ESC SS3 SI ESC
        if(SS3 == *src) {
           mState = eState_CNS11643_3;
+           mRunLength = 0;
        } else if('$' == *src) {
           mState = eState_ESC_24;
        } else {
--- a/intl/uconv/ucvcn/nsISO2022CNToUnicode.h
+++ b/intl/uconv/ucvcn/nsISO2022CNToUnicode.h
@ -56,7 +56,8 @@ class nsISO2022CNToUnicode : public nsBasicDecoderSupport
 public:
  nsISO2022CNToUnicode() : 
        mState(eState_ASCII), 
-        mPlaneID(0) { }
+        mPlaneID(0),
+        mRunLength(0) { }

  virtual ~nsISO2022CNToUnicode() {}

@ -74,6 +75,7 @@ public:
  {
    mState = eState_ASCII;
    mPlaneID = 0;
+    mRunLength = 0;

    return NS_OK;
  }
@ -118,6 +120,9 @@ private:
  // Plane number for CNS11643 code
  int mPlaneID;

+  // Length of non-ASCII run
+  PRUint32 mRunLength;
+
  // Decoder handler
  nsCOMPtr<nsIUnicodeDecoder> mGB2312_Decoder;
  nsCOMPtr<nsIUnicodeDecoder> mEUCTW_Decoder;
--- a/intl/uconv/ucvja/nsJapaneseToUnicode.cpp
+++ b/intl/uconv/ucvja/nsJapaneseToUnicode.cpp
@ -120,7 +120,7 @@ NS_IMETHODIMP nsShiftJISToUnicode::Convert(
       {

          case 0:
-          if(*src & 0x80 && *src != (unsigned char)0xa0)
+          if(*src & 0x80)
          {
            mData = SJIS_INDEX[*src & 0x7F];
            if(mData < 0xE000 )
@ -130,15 +130,33 @@ NS_IMETHODIMP nsShiftJISToUnicode::Convert(
               if( mData > 0xFF00)
               {
                 if(0xFFFD == mData) {
-                   // IE convert fd-ff as single byte and convert to
-                   // U+f8f1 to U+f8f3
-                   if((0xfd == *src) || (0xfe == *src) || (0xff == *src))
-                   {
-                     *dest++ = (PRUnichar) 0xf8f1 + 
+                   // IE-compatible handling of undefined codepoints:
+                   // 0x80 --> U+0080
+                   // 0xa0 --> U+F8F0
+                   // 0xfd --> U+F8F1
+                   // 0xfe --> U+F8F2
+                   // 0xff --> U+F8F3
+                   switch (*src) {
+                     case 0x80:
+                       *dest++ = (PRUnichar) *src;
+                       break;
+
+                     case 0xa0:
+                       *dest++ = (PRUnichar) 0xf8f0;
+                       break;
+
+                     case 0xfd:
+                     case 0xfe:
+                     case 0xff:
+                       *dest++ = (PRUnichar) 0xf8f1 + 
                                   (*src - (unsigned char)(0xfd));
-                     if(dest >= destEnd)
-                        goto error1;
+                       break;
+
+                     default:
+                       *dest++ = 0x30FB;
                   }
+                   if(dest >= destEnd)
+                     goto error1;
                 } else {
                   *dest++ = mData; // JIS 0201
                   if(dest >= destEnd)
@ -517,10 +535,16 @@ NS_IMETHODIMP nsISO2022JPToUnicodeV2::Convert(
          case mState_ESC_28: // ESC (
            if( 'B' == *src) {
              mState = mState_ASCII;
+              if (mRunLength == 0) {
+                goto error2;
+              }
+              mRunLength = 0;
            } else if ('J' == *src)  {
              mState = mState_JISX0201_1976Roman;
+              mRunLength = 0;
            } else if ('I' == *src)  {
              mState = mState_JISX0201_1976Kana;
+              mRunLength = 0;
            } else  {
              if((dest+3) >= destEnd)
                goto error1;
@ -536,10 +560,13 @@ NS_IMETHODIMP nsISO2022JPToUnicodeV2::Convert(
          case mState_ESC_24: // ESC $
            if( '@' == *src) {
              mState = mState_JISX0208_1978;
+              mRunLength = 0;
            } else if ('A' == *src)  {
              mState = mState_GB2312_1980;
+              mRunLength = 0;
            } else if ('B' == *src)  {
              mState = mState_JISX0208_1983;
+              mRunLength = 0;
            } else if ('(' == *src)  {
              mState = mState_ESC_24_28;
            } else  {
@ -557,8 +584,10 @@ NS_IMETHODIMP nsISO2022JPToUnicodeV2::Convert(
          case mState_ESC_24_28: // ESC $ (
            if( 'C' == *src) {
              mState = mState_KSC5601_1987;
+              mRunLength = 0;
            } else if ('D' == *src) {
              mState = mState_JISX0212_1990;
+              mRunLength = 0;
            } else  {
              if((dest+4) >= destEnd)
                goto error1;
@ -583,6 +612,7 @@ NS_IMETHODIMP nsISO2022JPToUnicodeV2::Convert(
              // we may need a if statement here for '\' and '~' 
              // to map them to Yen and Overbar
              *dest++ = (PRUnichar) *src;
+              ++mRunLength;
              if(dest >= destEnd)
                goto error1;
            }
@ -595,6 +625,7 @@ NS_IMETHODIMP nsISO2022JPToUnicodeV2::Convert(
            } else {
              if((0x21 <= *src) && (*src <= 0x5F)) {
                *dest++ = (0xFF61-0x0021) + *src;
+                ++mRunLength;
              } else {
                goto error2;
              }
@ -687,6 +718,7 @@ NS_IMETHODIMP nsISO2022JPToUnicodeV2::Convert(
               // XXX We need to map from JIS X 0208 1983 to 1987 
               // in the next line before pass to *dest++
               *dest++ = gJapaneseMap[mData+off];
+               ++mRunLength;
            }
            mState = mState_JISX0208_1978;
            if(dest >= destEnd)
@ -724,6 +756,7 @@ NS_IMETHODIMP nsISO2022JPToUnicodeV2::Convert(
                mGB2312Decoder->Convert((const char *)gb, &gbLen,
                                        &uni, &uniLen);
                *dest++ = uni;
+                ++mRunLength;
              }
            }
            mState = mState_GB2312_1980;
@ -739,6 +772,7 @@ NS_IMETHODIMP nsISO2022JPToUnicodeV2::Convert(
               goto error2;
            } else {
               *dest++ = gJapaneseMap[mData+off];
+               ++mRunLength;
            }
            mState = mState_JISX0208_1983;
            if(dest >= destEnd)
@ -776,6 +810,7 @@ NS_IMETHODIMP nsISO2022JPToUnicodeV2::Convert(
                mEUCKRDecoder->Convert((const char *)ksc, &kscLen,
                                       &uni, &uniLen);
                *dest++ = uni;
+                ++mRunLength;
              }
            }
            mState = mState_KSC5601_1987;
@ -791,6 +826,7 @@ NS_IMETHODIMP nsISO2022JPToUnicodeV2::Convert(
               goto error2;
            } else {
               *dest++ = gJapaneseMap[mData+off];
+               ++mRunLength;
            }
            mState = mState_JISX0212_1990;
            if(dest >= destEnd)
@ -824,6 +860,7 @@ NS_IMETHODIMP nsISO2022JPToUnicodeV2::Convert(
            if((0x20 <= *src) && (*src <= 0x7F)) {
              if (G2_ISO88591 == G2charset) {
                *dest++ = *src | 0x80;
+                ++mRunLength;
              } else if (G2_ISO88597 == G2charset) {
                if (!mISO88597Decoder) {
                  // creating a delegate converter (ISO-8859-7)
@ -845,6 +882,7 @@ NS_IMETHODIMP nsISO2022JPToUnicodeV2::Convert(
                  mISO88597Decoder->Convert((const char *)&gr, &grLen,
                                            &uni, &uniLen);
                  *dest++ = uni;
+                  ++mRunLength;
                }
              } else {// G2charset is G2_unknown (not designated yet)
                goto error2;
@ -864,6 +902,7 @@ NS_IMETHODIMP nsISO2022JPToUnicodeV2::Convert(

          case mState_ERROR:
             mState = mLastLegalState;
+             mRunLength = 0;
             goto error2;
          break;

--- a/intl/uconv/ucvja/nsJapaneseToUnicode.h
+++ b/intl/uconv/ucvja/nsJapaneseToUnicode.h
@ -123,6 +123,7 @@ public:
        mState = mState_ASCII;
        mLastLegalState = mState_ASCII;
        mData = 0;
+        mRunLength = 0;
        G2charset = G2_unknown;
        mGB2312Decoder = nsnull;
        mEUCKRDecoder = nsnull;
@ -148,6 +149,7 @@ public:
     {
        mState = mState_ASCII;
        mLastLegalState = mState_ASCII;
+        mRunLength = 0;
        setMapMode();
        return NS_OK;
     }
@ -176,6 +178,7 @@ private:
   mState_ERROR
 } mState, mLastLegalState;
 PRInt32 mData;
+ PRInt32 mRunLength; // the length of a non-ASCII run
 enum {
   G2_unknown,
   G2_ISO88591,
--- a/intl/uconv/ucvko/nsISO2022KRToUnicode.cpp
+++ b/intl/uconv/ucvko/nsISO2022KRToUnicode.cpp
@ -59,6 +59,7 @@ NS_IMETHODIMP nsISO2022KRToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen
        } 
        else if(0x0e == *src) { // Shift-Out 
          mState = mState_KSX1001_1992;
+          mRunLength = 0;
        } 
        else if(*src & 0x80) {
          *dest++ = 0xFFFD;
@ -103,6 +104,12 @@ NS_IMETHODIMP nsISO2022KRToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen
        mState = mLastLegalState;
        if('C' == *src) {
          mState = mState_ASCII;
+          if (mRunLength == 0) {
+            if(dest+1 >= destEnd)
+              goto error1;
+            *dest++ = 0xFFFD;
+          }
+          mRunLength = 0;
        } 
        else  {
          if((dest+4) >= destEnd)
@ -122,11 +129,18 @@ NS_IMETHODIMP nsISO2022KRToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen
        } 
        else if (0x0f == *src) { // Shift-In (SI)
          mState = mState_ASCII;
+          if (mRunLength == 0) {
+            if(dest+1 >= destEnd)
+              goto error1;
+            *dest++ = 0xFFFD;
+          }
+          mRunLength = 0;
        } 
        else if ((PRUint8) *src == 0x20 || (PRUint8) *src == 0x09) {
          // Allow space and tab between SO and SI (i.e. in Hangul segment)
          mState = mState_KSX1001_1992;
          *dest++ = (PRUnichar) *src;
+          ++mRunLength;
          if(dest >= destEnd)
          goto error1;
        } 
@ -164,6 +178,7 @@ NS_IMETHODIMP nsISO2022KRToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLen
            // Convert EUC-KR to unicode.
            mEUCKRDecoder->Convert((const char *)ksx, &ksxLen, &uni, &uniLen);
            *dest++ = uni;
+            ++mRunLength;
          }
          if(dest >= destEnd)
            goto error1;
--- a/intl/uconv/ucvko/nsISO2022KRToUnicode.h
+++ b/intl/uconv/ucvko/nsISO2022KRToUnicode.h
@ -50,6 +50,7 @@ public:
    mLastLegalState = mState_ASCII;
    mData = 0;
    mEUCKRDecoder = nsnull;
+    mRunLength = 0;
  }

  virtual ~nsISO2022KRToUnicode()
@ -71,6 +72,7 @@ public:
  {
    mState = mState_ASCII;
    mLastLegalState = mState_ASCII;
+    mRunLength = 0;
    return NS_OK;
  }

@ -87,6 +89,9 @@ private:

  PRUint8 mData;

+  // Length of non-ASCII run
+  PRUint32 mRunLength;
+
  nsIUnicodeDecoder *mEUCKRDecoder;
 };
 #endif // nsISO2022KRToUnicode_h__
--- a/intl/uconv/util/uscan.c
+++ b/intl/uconv/util/uscan.c
@ -787,8 +787,6 @@ PRIVATE PRBool uCheckAndScan4BytesGB18030(
    (in[2] - 0x81)) * 10 ) + (in[3] - 0x30);
  
  *inscanlen = 4;
-  if(data >= 0x00010000)  
-    return PR_FALSE;
-  *out = (PRUint16) data;
+  *out = (data < 0x00010000) ? data : 0xFFFD;
  return PR_TRUE;
 }