Bug 301797: UTF-8 decoder drops byte on encoding error. r=jshin, sr=bzbarsky, a=bsmedberg

2024-10-09 19:35:51 +00:00 · 2005-07-26 09:38:50 +00:00 · 2005-07-26 09:38:50 +00:00 · 99011ec3ca
commit 99011ec3ca
parent d954808c1c
1 changed file with 16 additions and 1 deletion
--- a/intl/uconv/src/nsUTF8ToUnicode.cpp
+++ b/intl/uconv/src/nsUTF8ToUnicode.cpp
@@ -76,11 +76,26 @@ nsUTF8ToUnicode::nsUTF8ToUnicode()
 //----------------------------------------------------------------------
 // Subclassing of nsTableDecoderSupport class [implementation]

+/**
+ * Reports in *aDestLength an upper bound, in UTF-16 code units, on the
+ *  output produced by decoding aSrcLength UTF-8 code units. aSrc is unused.
+ *
+ * Normally the output is no longer than the input: 1-byte, 2-byte and
+ *  3-byte UTF-8 sequences each decode to a single UTF-16 code unit, and
+ *  4-byte UTF-8 sequences decode to a surrogate pair (two code units).
+ *
+ * The extra code unit covers one edge case: if the previous buffer ended
+ *  with an incomplete multi-byte sequence and this buffer does not begin
+ *  with a valid continuation byte, we will return NS_ERROR_UNEXPECTED and
+ *  the caller may insert a replacement character in the output buffer
+ *  which corresponds to no character in the input buffer.
+ *  See bug 301797.
+ */
 NS_IMETHODIMP nsUTF8ToUnicode::GetMaxLength(const char * aSrc,
                                            PRInt32 aSrcLength,
                                            PRInt32 * aDestLength)
 {
-  *aDestLength = aSrcLength;
+  *aDestLength = aSrcLength + 1;
  return NS_OK;
 }