Bug 494099, HZ-GB-2312 converter reads beyond input buffer and omits characters at block boundaries. r=VYV03354@nifty.ne.jp

This commit is contained in:
Simon Montagu 2009-06-16 00:13:28 -07:00
parent 451e5accfb
commit 306b8cb3e6
3 changed files with 157 additions and 101 deletions

View File

@ -0,0 +1,67 @@
/* Test case for bug 494099
*
* Uses nsIConverterInputStream to decode an HZ-GB-2312 sample, reading it
* through converter buffers of different lengths.
*
* Sample text is:
* 问他谁是傻瓜了5分钟。但是，他谁不要求仍然是一个傻瓜永远。
* 我听见 我忘记; 我看见 我记住; 我做 我了解。
*/
// HZ-encoded input. "~{" switches the decoder to two-byte GB mode and "~}"
// switches it back to ASCII; everything outside those escapes is plain ASCII.
const sample = "~{NJK{K-JGI59OAK~}5~{7VVS!#5+JG#,K{K-2;R*GsHTH;JGR;8vI59OS@T6!#~} ~{NRL}<{~} ~{NRM|<G~}; ~{NR?4<{~} ~{NR<GW!~}; ~{NRWv~} ~{NRAK=b!#~}";
// The Unicode string that decoding `sample` must produce.
const expected = "\u95EE\u4ED6\u8C01\u662F\u50BB\u74DC\u4E865\u5206\u949F\u3002\u4F46\u662F\uFF0C\u4ED6\u8C01\u4E0D\u8981\u6C42\u4ECD\u7136\u662F\u4E00\u4E2A\u50BB\u74DC\u6C38\u8FDC\u3002 \u6211\u542C\u89C1 \u6211\u5FD8\u8BB0; \u6211\u770B\u89C1 \u6211\u8BB0\u4F4F; \u6211\u505A \u6211\u4E86\u89E3\u3002";
// Charset label used both in the data: URI and when initialising the converter.
const charset="HZ-GB-2312";
/**
 * Decodes `sample` through an nsIConverterInputStream created with the given
 * buffer length and checks that the result equals `expected`.
 *
 * On a mismatch, diagnostic details are written with dump() before the
 * final do_check_eq() assertion reports the failure.
 *
 * @param bufferLength  buffer size handed to the converter stream's init().
 * @throws "not line input stream" if the converter does not implement
 *         nsIUnicharLineInputStream.
 */
function testCase(bufferLength)
{
  var dataURI = "data:text/plain;charset=" + charset + "," + sample;

  var IOService = Components.Constructor("@mozilla.org/network/io-service;1",
                                         "nsIIOService");
  var ConverterInputStream =
    Components.Constructor("@mozilla.org/intl/converter-input-stream;1",
                           "nsIConverterInputStream",
                           "init");

  var ios = new IOService();
  var channel = ios.newChannel(dataURI, "", null);
  var testInputStream = channel.open();
  // 0xFFFD is the replacement character used for undecodable input.
  var testConverter = new ConverterInputStream(testInputStream,
                                               charset,
                                               bufferLength,
                                               0xFFFD);

  var outStr = "";
  try {
    // The instanceof check also makes the line-reading interface available
    // on the wrapper, so it must stay ahead of the readLine() calls.
    if (!(testConverter instanceof
          Components.interfaces.nsIUnicharLineInputStream))
      throw "not line input stream";

    var more;
    do {
      // read the line and check for eof
      var line = {};
      more = testConverter.readLine(line);
      outStr += line.value;
    } while (more);
  } finally {
    // Release the converter (and its underlying stream) even if reading threw.
    testConverter.close();
  }

  if (outStr != expected) {
    dump("Failed with bufferLength = " + bufferLength + "\n");
    if (outStr.length == expected.length) {
      // Same length: list every position whose code unit differs.
      // `i` is declared locally so it does not leak as an implicit global.
      for (var i = 0; i < outStr.length; ++i) {
        if (outStr.charCodeAt(i) != expected.charCodeAt(i)) {
          dump(i + ": " + outStr.charCodeAt(i).toString(16) + " != " + expected.charCodeAt(i).toString(16) + "\n");
        }
      }
    } else {
      // Different length: characters were dropped or added, so a
      // position-by-position comparison would not be meaningful.
      dump("Length mismatch: got " + outStr.length + ", expected " + expected.length + "\n");
    }
  }

  // escape the strings before comparing for better readability
  do_check_eq(escape(outStr), escape(expected));
}
/**
 * xpcshell entry point: runs the decode check once per buffer length.
 */
function run_test()
{
  // Two adjacent buffer lengths; presumably chosen so that the buffer edge
  // falls at different offsets inside the encoded sequences (verify against
  // the bug report).
  var bufferLengths = [32, 33];
  for (var idx = 0; idx < bufferLengths.length; ++idx) {
    testCase(bufferLengths[idx]);
  }
}

View File

@ -68,19 +68,23 @@
//----------------------------------------------------------------------
// Subclassing of nsTablesDecoderSupport class [implementation]
#define HZ_STATE_GB 1
#define HZ_STATE_ASCII 2
#define HZ_STATE_TILD 3
#define HZ_STATE_GB 1
#define HZ_STATE_ASCII 2
#define HZ_STATE_ODD_BYTE_FLAG 0x80
#define HZLEAD1 '~'
#define HZLEAD2 '{'
#define HZLEAD3 '}'
#define HZLEAD4 '\n'
#define HZ_ODD_BYTE_STATE (mHZState & (HZ_STATE_ODD_BYTE_FLAG))
#define HZ_ENCODING_STATE (mHZState & ~(HZ_STATE_ODD_BYTE_FLAG))
nsHZToUnicode::nsHZToUnicode() : nsBufferDecoderSupport(1)
{
mHZState = HZ_STATE_ASCII; // per HZ spec, default to ASCII state
mHZState = HZ_STATE_ASCII; // per HZ spec, default to ASCII state
mRunLength = 0;
mOddByte = 0;
}
//Overwriting the ConvertNoBuff() in nsUCvCnSupport.cpp.
NS_IMETHODIMP nsHZToUnicode::ConvertNoBuff(
const char* aSrc,
@ -91,114 +95,98 @@ NS_IMETHODIMP nsHZToUnicode::ConvertNoBuff(
PRInt32 i=0;
PRInt32 iSrcLength = *aSrcLength;
PRInt32 iDestlen = 0;
PRUint8 ch1, ch2;
nsresult res = NS_OK;
*aSrcLength=0;
for (i=0;i<iSrcLength;i++)
{
if ( iDestlen >= (*aDestLength) )
{
nsresult res = NS_OK;
char oddByte = mOddByte;
for (i=0; i<iSrcLength; i++) {
if (iDestlen >= (*aDestLength)) {
res = NS_OK_UDEC_MOREOUTPUT;
break;
}
if ( *aSrc & 0x80 ) // if it is a 8-bit byte
{
if (UINT8_IN_RANGE(0x81, aSrc[0], 0xFE) &&
UINT8_IN_RANGE(0x40, aSrc[1], 0xFE)) {
// The source is a 8-bit GBCode
*aDest = mUtil.GBKCharToUnicode(aSrc[0], aSrc[1]);
char srcByte = *aSrc++;
(*aSrcLength)++;
if (!HZ_ODD_BYTE_STATE) {
if (srcByte & 0x80 || srcByte == HZLEAD1 || HZ_ENCODING_STATE == HZ_STATE_GB) {
oddByte = srcByte;
mHZState |= HZ_STATE_ODD_BYTE_FLAG;
} else {
*aDest = UCS2_NO_MAPPING;
*aDest++ = CAST_CHAR_TO_UNICHAR(srcByte);
iDestlen++;
}
aSrc += 2;
i++;
iDestlen++;
aDest++;
*aSrcLength = i+1;
continue;
}
// otherwise, it is a 7-bit byte
// The source will be an ASCII or a 7-bit HZ code depending on ch1
ch1 = *aSrc;
ch2 = *(aSrc+1);
if (ch1 == HZLEAD1 ) // if it is lead by '~'
{
switch (ch2)
{
case HZLEAD2:
// we got a '~{'
// we are switching to HZ state
mHZState = HZ_STATE_GB;
mRunLength = 0;
aSrc += 2;
i++;
break;
case HZLEAD3:
// we got a '~}'
// we are switching to ASCII state
mHZState = HZ_STATE_ASCII;
aSrc += 2;
i++;
if (mRunLength == 0) {
*aDest = UCS2_NO_MAPPING;
} else {
if (oddByte & 0x80) { // if it is a 8-bit byte
if (UINT8_IN_RANGE(0x81, oddByte, 0xFE) &&
UINT8_IN_RANGE(0x40, srcByte, 0xFE)) {
// The source is a 8-bit GBCode
*aDest++ = mUtil.GBKCharToUnicode(oddByte, srcByte);
} else {
*aDest++ = UCS2_NO_MAPPING;
}
iDestlen++;
// otherwise, it is a 7-bit byte
// The source will be an ASCII or a 7-bit HZ code depending on oddByte
} else if (oddByte == HZLEAD1) { // if it is lead by '~'
switch (srcByte) {
case HZLEAD2:
// we got a '~{'
// we are switching to HZ state
mHZState = HZ_STATE_GB | HZ_ODD_BYTE_STATE;
mRunLength = 0;
break;
case HZLEAD3:
// we got a '~}'
// we are switching to ASCII state
mHZState = HZ_STATE_ASCII | HZ_ODD_BYTE_STATE;
if (mRunLength == 0) {
*aDest++ = UCS2_NO_MAPPING;
iDestlen++;
}
mRunLength = 0;
break;
case HZLEAD1:
// we got a '~~', process like an ASCII, but no state change
*aDest++ = CAST_CHAR_TO_UNICHAR(srcByte);
iDestlen++;
aDest++;
}
mRunLength = 0;
break;
case HZLEAD1:
// we got a '~~', process like an ASCII, but no state change
aSrc++;
*aDest = CAST_CHAR_TO_UNICHAR(*aSrc);
aSrc++;
i++;
iDestlen++;
aDest++;
mRunLength++;
break;
case HZLEAD4:
// we got a "~\n", it means maintain double byte mode cross lines, ignore the '~' itself
// mHZState = HZ_STATE_GB;
// I find that "~\n" should interpreted as line continuation without mode change
// It should not be interpreted as line continuation with double byte mode on
aSrc++;
break;
default:
// undefined ESC sequence '~X' are ignored since this is a illegal combination
aSrc += 2;
*aDest = UCS2_NO_MAPPING;
iDestlen++;
aDest++;
break;
};
continue;// go for next loop
}
// ch1 != '~'
switch (mHZState)
{
case HZ_STATE_GB:
// the following chars are HZ
*aDest = mUtil.GBKCharToUnicode(aSrc[0]|0x80, aSrc[1]|0x80);
aSrc += 2;
i++;
iDestlen++;
aDest++;
mRunLength++;
break;
case HZLEAD4:
// we got a "~\n", it means maintain double byte mode cross lines,
// ignore the '~' itself
// mHZState = HZ_STATE_GB;
// I find that "~\n" should interpreted as line continuation
// without mode change
// It should not be interpreted as line continuation with double
// byte mode on
break;
default:
// undefined ESC sequence '~X' are ignored since this is an
// illegal combination
*aDest++ = UCS2_NO_MAPPING;
iDestlen++;
break;
}
} else if (HZ_ENCODING_STATE == HZ_STATE_GB) {
*aDest++ = mUtil.GBKCharToUnicode(oddByte|0x80, srcByte|0x80);
mRunLength++;
break;
case HZ_STATE_ASCII:
default:
// default behavior also like an ASCII
// when the source is an ASCII
*aDest = CAST_CHAR_TO_UNICHAR(*aSrc);
aSrc++;
iDestlen++;
aDest++;
break;
} else {
NS_NOTREACHED("2-byte sequence that we don't know how to handle");
*aDest++ = UCS2_NO_MAPPING;
iDestlen++;
}
oddByte = 0;
mHZState &= ~HZ_STATE_ODD_BYTE_FLAG;
}
*aSrcLength = i+1;
}// for loop
} // for loop
mOddByte = HZ_ODD_BYTE_STATE ? oddByte : 0;
*aDestLength = iDestlen;
return NS_OK;
return res;
}

View File

@ -71,6 +71,7 @@ protected:
private:
PRInt16 mHZState;
PRUint32 mRunLength; // length of a run of 8-bit GB-encoded characters
char mOddByte; // first byte of a multi-byte sequence from a previous buffer
};