1. detect BOM, 2. Implement XML 1.0 Appendix F, 3. move hack from nsParser.cpp to nsScanner.cpp so we won't damage UCS2 data

2024-12-02 01:48:05 +00:00 · 1999-08-30 22:25:17 +00:00 · 1999-08-30 22:25:17 +00:00 · 49da4f2996
commit 49da4f2996
parent 50f4479ca0
4 changed files with 272 additions and 2 deletions
--- a/htmlparser/src/nsParser.cpp
+++ b/htmlparser/src/nsParser.cpp
@ -16,7 +16,8 @@
 * Reserved.
 */
  
-
+#define DEBUG_XMLENCODING // turns on the printf tracing guarded by #ifdef DEBUG_XMLENCODING
+#define XMLENCODING_PEEKBYTES 64 // cap on leading bytes copied when scanning the XML declaration

 #include "nsParser.h"
 #include "nsIContentSink.h" 
@ -1086,6 +1087,109 @@ nsresult nsParser::OnStartRequest(nsIURI* aURL, const char *aSourceType)
  return NS_OK;
 }

+// Charset names that detectByteOrderMark() assigns for UCS-2/UCS-4 byte patterns.
+#define UCS2_BE "X-ISO-10646-UCS-2-BE"
+#define UCS2_LE "X-ISO-10646-UCS-2-LE"
+#define UCS4_BE "X-ISO-10646-UCS-4-BE"
+#define UCS4_LE "X-ISO-10646-UCS-4-LE"
+#define UCS4_2143 "X-ISO-10646-UCS-4-2143"
+#define UCS4_3412 "X-ISO-10646-UCS-4-3412"
+
+// Guess the charset of a document from its leading bytes, per XML 1.0
+// Appendix F (http://www.w3.org/TR/1998/REC-xml-19980210#sec-guessing).
+// aBytes/aLen: raw undecoded input; fewer than 4 bytes yields no guess.
+// oCharset: receives the charset name, or "" if nothing was recognized.
+// oCharsetSource: kCharsetFromAutoDetection for a byte-pattern match,
+// kCharsetFromMetaTag when the name came from encoding="..." in <?xml ?>.
+// Returns PR_TRUE when oCharset was filled in.
+static PRBool detectByteOrderMark(const unsigned char* aBytes, PRInt32 aLen,
+   nsString& oCharset, nsCharsetSource& oCharsetSource)
+{
+ oCharsetSource= kCharsetFromAutoDetection;
+ oCharset = "";
+ // Every pattern below reads aBytes[0..3]; do not look past the caller's data.
+ if((0 == aBytes) || (aLen < 4))
+   return PR_FALSE;
+ switch(aBytes[0])
+ {
+   case 0x00:
+     if(0x00==aBytes[1]) {
+        if((0x00==aBytes[2]) && (0x3C==aBytes[3])) {
+           // 00 00 00 3C UCS-4, big-endian machine (1234 order)
+           oCharset = UCS4_BE;
+        } else if((0x3C==aBytes[2]) && (0x00==aBytes[3])) {
+           // 00 00 3C 00 UCS-4, unusual octet order (2143)
+           oCharset = UCS4_2143;
+        }
+     } else if(0x3C==aBytes[1]) {
+        if((0x00==aBytes[2]) && (0x00==aBytes[3])) {
+           // 00 3C 00 00 UCS-4, unusual octet order (3412)
+           oCharset = UCS4_3412;
+        } else if((0x00==aBytes[2]) && (0x3F==aBytes[3])) {
+           // 00 3C 00 3F UTF-16, big-endian, no Byte Order Mark
+           oCharset = UCS2_BE; // should change to UTF-16BE
+        }
+     }
+   break;
+   case 0x3C:
+     if(0x00==aBytes[1]) {
+        if((0x00==aBytes[2]) && (0x00==aBytes[3])) {
+           // 3C 00 00 00 UCS-4, little-endian machine (4321 order)
+           oCharset = UCS4_LE;
+        } else if((0x3F==aBytes[2]) && (0x00==aBytes[3])) {
+           // 3C 00 3F 00 UTF-16, little-endian, no Byte Order Mark
+           oCharset = UCS2_LE; // should change to UTF-16LE
+        }
+     } else if((aLen >= 13) &&
+               (0 == PL_strncmp("<?xml version", (char*)aBytes, 13 ))) {
+       // 3C 3F 78 6D "<?xm": 8-bit ASCII-compatible data; read encoding="..."
+       nsAutoString firstXbytes("");
+       firstXbytes.Append((const char*)aBytes, (PRInt32)
+                       ((aLen > XMLENCODING_PEEKBYTES)?
+                       XMLENCODING_PEEKBYTES:
+                       aLen));
+       PRInt32 xmlDeclEnd = firstXbytes.Find("?>", PR_FALSE, 13);
+       // 27 == strlen of: <?xml version="1" encoding=
+       if((kNotFound != xmlDeclEnd) &&(xmlDeclEnd > 27 )){
+           // Drop "?>" and what follows so the searches stay in the declaration.
+           firstXbytes.Cut(xmlDeclEnd, firstXbytes.Length()-xmlDeclEnd);
+           PRInt32 encStart = firstXbytes.Find("encoding", PR_FALSE,13);
+           if(kNotFound != encStart) {
+             // 8 == strlen("encoding"); locate the opening quote of the value
+             encStart = firstXbytes.FindCharInSet("\"'", encStart+8);
+             if(kNotFound != encStart) {
+                PRUnichar q = firstXbytes.CharAt(encStart);
+                PRInt32 encEnd = firstXbytes.FindChar(q, PR_FALSE, encStart+1);
+                if(kNotFound != encEnd) {
+                   PRInt32 count = encEnd - encStart -1;
+                   if(count >0) {
+                      firstXbytes.Mid(oCharset,(encStart+1), count);
+                      oCharsetSource= kCharsetFromMetaTag;
+                   }
+                }
+             }
+           }
+       }
+     }
+   break;
+   case 0xFE:
+     if(0xFF==aBytes[1]) {
+        // FE FF Byte Order Mark: UTF-16, big-endian
+        oCharset = UCS2_BE; // should change to UTF-16BE
+     }
+   break;
+   case 0xFF:
+     if(0xFE==aBytes[1]) {
+        // FF FE Byte Order Mark: UTF-16, little-endian
+        oCharset = UCS2_LE; // should change to UTF-16LE
+     }
+   break;
+   // EBCDIC (4C 6F A7 94) is deliberately not detected here.
+ }  // switch
+ return oCharset.Length() > 0;
+}
+
+
 /**
 *  
 *  
@ -1126,13 +1230,33 @@ nsresult nsParser::OnDataAvailable(nsIURI* aURL, nsIInputStream *pIStream, PRUin
  int       theStartPos=0;
  nsresult result=NS_OK;

+  PRBool needCheckFirst4Bytes = 
+          ((0 == sourceOffset) && (mCharsetSource<kCharsetFromAutoDetection));
  while ((theNumRead>0) && (aLength>theTotalRead) && (NS_OK==result)) {
    result = pIStream->Read(mParserContext->mTransferBuffer, aLength, &theNumRead);
    if((result == NS_OK) && (theNumRead>0)) {
+      if(needCheckFirst4Bytes && (theNumRead >= 4)) {
+         nsCharsetSource guessSource;
+         nsAutoString guess("");
+         
+         needCheckFirst4Bytes = PR_FALSE;
+         if(detectByteOrderMark((const unsigned char*)mParserContext->mTransferBuffer,
+                                theNumRead, guess, guessSource)) 
+         {
+#ifdef DEBUG_XMLENCODING
+            printf("xmlencoding detect- %s\n", guess.ToNewCString());
+#endif
+            this->SetDocumentCharset(guess, guessSource);
+			mParserContext->mScanner->SetDocumentCharset(guess, guessSource);
+         }
+      }
      theTotalRead+=theNumRead;
      if(mParserFilter)
         mParserFilter->RawBuffer(mParserContext->mTransferBuffer, &theNumRead);

+#if 0
+      // The following Hack have moved to nsScanner.cpp
+      // Remove that Hack if you feel this hack is not necessary
      // XXX Hack --- NULL character(s) is(are) seen in the middle of the buffer!!!
      // For now, I'm conditioning the raw buffer by removing the unwanted null chars.
      // Problem could be NECKO related
@ -1141,6 +1265,7 @@ nsresult nsParser::OnDataAvailable(nsIURI* aURL, nsIInputStream *pIStream, PRUin
        if(mParserContext->mTransferBuffer[i]==kNullCh)
          mParserContext->mTransferBuffer[i]=kSpace;
      }
+#endif 

 #ifdef  NS_DEBUG
      int index=0;
--- a/htmlparser/src/nsScanner.cpp
+++ b/htmlparser/src/nsScanner.cpp
@ -257,6 +257,16 @@ PRBool nsScanner::Append(const char* aBuffer, PRUint32 aLen){
 		  PRInt32 unicharLength = unicharBufLen;
 		  res = mUnicodeDecoder->Convert(unichars, 0, &unicharLength,aBuffer, 0, &srcLength );
      unichars[unicharLength]=0;  //add this since the unicode converters can't be trusted to do so.
+
+
+                  // Move the nsParser.cpp 00 -> space hack to here so
+                  // it won't break UCS2 file
+                  // Hack Start
+                  for(PRInt32 i=0;i<unicharLength;i++)
+                     if(0x0000 == unichars[i])
+                        unichars[i] = 0x0020;
+                  // Hack End
+
 		  mBuffer.Append(unichars, unicharLength);
 		  mTotalRead += unicharLength;
                  // if we failed, we consume one byte by replace it with U+FFFD
--- a/parser/htmlparser/src/nsParser.cpp
+++ b/parser/htmlparser/src/nsParser.cpp
@ -16,7 +16,8 @@
 * Reserved.
 */
  
-
+#define DEBUG_XMLENCODING // turns on the printf tracing guarded by #ifdef DEBUG_XMLENCODING
+#define XMLENCODING_PEEKBYTES 64 // cap on leading bytes copied when scanning the XML declaration

 #include "nsParser.h"
 #include "nsIContentSink.h" 
@ -1086,6 +1087,109 @@ nsresult nsParser::OnStartRequest(nsIURI* aURL, const char *aSourceType)
  return NS_OK;
 }

+// Charset names that detectByteOrderMark() assigns for UCS-2/UCS-4 byte patterns.
+#define UCS2_BE "X-ISO-10646-UCS-2-BE"
+#define UCS2_LE "X-ISO-10646-UCS-2-LE"
+#define UCS4_BE "X-ISO-10646-UCS-4-BE"
+#define UCS4_LE "X-ISO-10646-UCS-4-LE"
+#define UCS4_2143 "X-ISO-10646-UCS-4-2143"
+#define UCS4_3412 "X-ISO-10646-UCS-4-3412"
+
+// Guess the charset of a document from its leading bytes, per XML 1.0
+// Appendix F (http://www.w3.org/TR/1998/REC-xml-19980210#sec-guessing).
+// aBytes/aLen: raw undecoded input; fewer than 4 bytes yields no guess.
+// oCharset: receives the charset name, or "" if nothing was recognized.
+// oCharsetSource: kCharsetFromAutoDetection for a byte-pattern match,
+// kCharsetFromMetaTag when the name came from encoding="..." in <?xml ?>.
+// Returns PR_TRUE when oCharset was filled in.
+static PRBool detectByteOrderMark(const unsigned char* aBytes, PRInt32 aLen,
+   nsString& oCharset, nsCharsetSource& oCharsetSource)
+{
+ oCharsetSource= kCharsetFromAutoDetection;
+ oCharset = "";
+ // Every pattern below reads aBytes[0..3]; do not look past the caller's data.
+ if((0 == aBytes) || (aLen < 4))
+   return PR_FALSE;
+ switch(aBytes[0])
+ {
+   case 0x00:
+     if(0x00==aBytes[1]) {
+        if((0x00==aBytes[2]) && (0x3C==aBytes[3])) {
+           // 00 00 00 3C UCS-4, big-endian machine (1234 order)
+           oCharset = UCS4_BE;
+        } else if((0x3C==aBytes[2]) && (0x00==aBytes[3])) {
+           // 00 00 3C 00 UCS-4, unusual octet order (2143)
+           oCharset = UCS4_2143;
+        }
+     } else if(0x3C==aBytes[1]) {
+        if((0x00==aBytes[2]) && (0x00==aBytes[3])) {
+           // 00 3C 00 00 UCS-4, unusual octet order (3412)
+           oCharset = UCS4_3412;
+        } else if((0x00==aBytes[2]) && (0x3F==aBytes[3])) {
+           // 00 3C 00 3F UTF-16, big-endian, no Byte Order Mark
+           oCharset = UCS2_BE; // should change to UTF-16BE
+        }
+     }
+   break;
+   case 0x3C:
+     if(0x00==aBytes[1]) {
+        if((0x00==aBytes[2]) && (0x00==aBytes[3])) {
+           // 3C 00 00 00 UCS-4, little-endian machine (4321 order)
+           oCharset = UCS4_LE;
+        } else if((0x3F==aBytes[2]) && (0x00==aBytes[3])) {
+           // 3C 00 3F 00 UTF-16, little-endian, no Byte Order Mark
+           oCharset = UCS2_LE; // should change to UTF-16LE
+        }
+     } else if((aLen >= 13) &&
+               (0 == PL_strncmp("<?xml version", (char*)aBytes, 13 ))) {
+       // 3C 3F 78 6D "<?xm": 8-bit ASCII-compatible data; read encoding="..."
+       nsAutoString firstXbytes("");
+       firstXbytes.Append((const char*)aBytes, (PRInt32)
+                       ((aLen > XMLENCODING_PEEKBYTES)?
+                       XMLENCODING_PEEKBYTES:
+                       aLen));
+       PRInt32 xmlDeclEnd = firstXbytes.Find("?>", PR_FALSE, 13);
+       // 27 == strlen of: <?xml version="1" encoding=
+       if((kNotFound != xmlDeclEnd) &&(xmlDeclEnd > 27 )){
+           // Drop "?>" and what follows so the searches stay in the declaration.
+           firstXbytes.Cut(xmlDeclEnd, firstXbytes.Length()-xmlDeclEnd);
+           PRInt32 encStart = firstXbytes.Find("encoding", PR_FALSE,13);
+           if(kNotFound != encStart) {
+             // 8 == strlen("encoding"); locate the opening quote of the value
+             encStart = firstXbytes.FindCharInSet("\"'", encStart+8);
+             if(kNotFound != encStart) {
+                PRUnichar q = firstXbytes.CharAt(encStart);
+                PRInt32 encEnd = firstXbytes.FindChar(q, PR_FALSE, encStart+1);
+                if(kNotFound != encEnd) {
+                   PRInt32 count = encEnd - encStart -1;
+                   if(count >0) {
+                      firstXbytes.Mid(oCharset,(encStart+1), count);
+                      oCharsetSource= kCharsetFromMetaTag;
+                   }
+                }
+             }
+           }
+       }
+     }
+   break;
+   case 0xFE:
+     if(0xFF==aBytes[1]) {
+        // FE FF Byte Order Mark: UTF-16, big-endian
+        oCharset = UCS2_BE; // should change to UTF-16BE
+     }
+   break;
+   case 0xFF:
+     if(0xFE==aBytes[1]) {
+        // FF FE Byte Order Mark: UTF-16, little-endian
+        oCharset = UCS2_LE; // should change to UTF-16LE
+     }
+   break;
+   // EBCDIC (4C 6F A7 94) is deliberately not detected here.
+ }  // switch
+ return oCharset.Length() > 0;
+}
+
+
 /**
 *  
 *  
@ -1126,13 +1230,33 @@ nsresult nsParser::OnDataAvailable(nsIURI* aURL, nsIInputStream *pIStream, PRUin
  int       theStartPos=0;
  nsresult result=NS_OK;

+  PRBool needCheckFirst4Bytes = 
+          ((0 == sourceOffset) && (mCharsetSource<kCharsetFromAutoDetection));
  while ((theNumRead>0) && (aLength>theTotalRead) && (NS_OK==result)) {
    result = pIStream->Read(mParserContext->mTransferBuffer, aLength, &theNumRead);
    if((result == NS_OK) && (theNumRead>0)) {
+      if(needCheckFirst4Bytes && (theNumRead >= 4)) {
+         nsCharsetSource guessSource;
+         nsAutoString guess("");
+         
+         needCheckFirst4Bytes = PR_FALSE;
+         if(detectByteOrderMark((const unsigned char*)mParserContext->mTransferBuffer,
+                                theNumRead, guess, guessSource)) 
+         {
+#ifdef DEBUG_XMLENCODING
+            printf("xmlencoding detect- %s\n", guess.ToNewCString());
+#endif
+            this->SetDocumentCharset(guess, guessSource);
+			mParserContext->mScanner->SetDocumentCharset(guess, guessSource);
+         }
+      }
      theTotalRead+=theNumRead;
      if(mParserFilter)
         mParserFilter->RawBuffer(mParserContext->mTransferBuffer, &theNumRead);

+#if 0
+      // The following Hack have moved to nsScanner.cpp
+      // Remove that Hack if you feel this hack is not necessary
      // XXX Hack --- NULL character(s) is(are) seen in the middle of the buffer!!!
      // For now, I'm conditioning the raw buffer by removing the unwanted null chars.
      // Problem could be NECKO related
@ -1141,6 +1265,7 @@ nsresult nsParser::OnDataAvailable(nsIURI* aURL, nsIInputStream *pIStream, PRUin
        if(mParserContext->mTransferBuffer[i]==kNullCh)
          mParserContext->mTransferBuffer[i]=kSpace;
      }
+#endif 

 #ifdef  NS_DEBUG
      int index=0;
--- a/parser/htmlparser/src/nsScanner.cpp
+++ b/parser/htmlparser/src/nsScanner.cpp
@ -257,6 +257,16 @@ PRBool nsScanner::Append(const char* aBuffer, PRUint32 aLen){
 		  PRInt32 unicharLength = unicharBufLen;
 		  res = mUnicodeDecoder->Convert(unichars, 0, &unicharLength,aBuffer, 0, &srcLength );
      unichars[unicharLength]=0;  //add this since the unicode converters can't be trusted to do so.
+
+
+                  // Move the nsParser.cpp 00 -> space hack to here so
+                  // it won't break UCS2 file
+                  // Hack Start
+                  for(PRInt32 i=0;i<unicharLength;i++)
+                     if(0x0000 == unichars[i])
+                        unichars[i] = 0x0020;
+                  // Hack End
+
 		  mBuffer.Append(unichars, unicharLength);
 		  mTotalRead += unicharLength;
                  // if we failed, we consume one byte by replace it with U+FFFD