Bug 255990 Characters below U+0100 are not subject to line-breaking rules at all r=jshin+mrbkap, sr=roc

This commit is contained in:
masayuki%d-toybox.com 2006-07-14 04:57:05 +00:00
parent 2dbafa80ee
commit 42ba5d983c
7 changed files with 76 additions and 85 deletions

View File

@ -43,10 +43,10 @@
#define NS_LINEBREAKER_NEED_MORE_TEXT -1
// {E86B3375-BF89-11d2-B3AF-00805F8A6670}
// {7509772F-770C-44e8-AAFA-8032E5A35370}
#define NS_ILINEBREAKER_IID \
{ 0xe86b3375, 0xbf89, 0x11d2, \
{ 0xb3, 0xaf, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 } }
{ 0x7509772f, 0x770c, 0x44e8, \
{ 0xaa, 0xfa, 0x80, 0x32, 0xe5, 0xa3, 0x53, 0x70 } }
class nsILineBreaker : public nsISupports
@ -57,6 +57,10 @@ public:
const PRUnichar* aText2 ,
PRUint32 aTextLen2) = 0;
virtual PRBool CanBreakBetweenLatin1(PRUnichar aChar1,
PRUnichar aChar2) = 0;
virtual PRInt32 Next( const PRUnichar* aText, PRUint32 aLen,
PRUint32 aPos) = 0;

View File

@ -44,7 +44,7 @@ static const PRUint32 gLBClass00[32] = {
0x55555555, // U+0010 - U+0017
0x55555555, // U+0018 - U+001F
0x88438815, // U+0020 - U+0027
0x81515810, // U+0028 - U+002F
0x11515810, // U+0028 - U+002F
0x66666666, // U+0030 - U+0037
0x11501166, // U+0038 - U+003F
0x88888888, // U+0040 - U+0047

View File

@ -350,12 +350,19 @@ nsJISx4051LineBreaker::~nsJISx4051LineBreaker()
NS_IMPL_ISUPPORTS1(nsJISx4051LineBreaker, nsILineBreaker)
#define U_PERIOD ((PRUnichar) '.')
#define U_COMMA ((PRUnichar) ',')
#define U_SPACE ((PRUnichar) ' ')
#define U_RIGHT_SINGLE_QUOTATION_MARK ((PRUnichar) 0x2019)
#define U_PERIOD PRUnichar('.')
#define U_COMMA PRUnichar(',')
#define U_COLON PRUnichar(':')
#define U_SEMICOLON PRUnichar(';')
#define U_SLASH PRUnichar('/')
#define U_SPACE PRUnichar(' ')
#define U_NULL PRUnichar(0x0000)
#define U_RIGHT_SINGLE_QUOTATION_MARK PRUnichar(0x2019)
// True when c is one of the characters whose line-break class is chosen by
// ContextualAnalysis() from its neighbours instead of a plain GetClass()
// lookup: period, comma, colon, semicolon, slash, or the right single
// quotation mark.
// Note: c is evaluated several times, so pass an expression without side
// effects.
#define NEED_CONTEXTUAL_ANALYSIS(c) ((c) == U_PERIOD || \
(c) == U_COMMA || \
(c) == U_COLON || \
(c) == U_SEMICOLON || \
(c) == U_SLASH || \
(c) == U_RIGHT_SINGLE_QUOTATION_MARK)
#define NUMERIC_CLASS 6 // JIS x4051 class 15 is now mapped to simplified class 6
#define CHARACTER_CLASS 8 // JIS x4051 class 18 is now mapped to simplified class 8
@ -365,17 +372,17 @@ PRInt8 nsJISx4051LineBreaker::ContextualAnalysis(
PRUnichar prev, PRUnichar cur, PRUnichar next
)
{
if(U_COMMA == cur)
if(U_COMMA == cur || U_COLON == cur || U_SEMICOLON == cur)
{
if(IS_ASCII_DIGIT (prev) && IS_ASCII_DIGIT (next))
if((IS_ASCII_DIGIT(prev) || prev == U_NULL) && IS_ASCII_DIGIT(next))
return NUMERIC_CLASS;
}
else if(U_PERIOD == cur)
{
if((IS_ASCII_DIGIT (prev) || (0x0020 == prev)) &&
IS_ASCII_DIGIT (next))
if((IS_ASCII_DIGIT(prev) || prev == U_SPACE || prev == U_NULL) &&
IS_ASCII_DIGIT(next))
return NUMERIC_CLASS;
// By assigning a full stop character class only when it's followed by
// class 6 (numeric), 7, and 8 (character). Note that class 9 (Thai)
// doesn't matter, either way, we prevent lines from breaking around
@ -387,6 +394,12 @@ PRInt8 nsJISx4051LineBreaker::ContextualAnalysis(
if((pc > 5 || pc == 0) && GetClass(next) > 5)
return CHARACTER_CLASS;
}
else if(U_SLASH == cur)
{
// We don't need to check the previous character, because a slash only allows a break after it.
if (IS_ASCII_DIGIT(next))
return NUMERIC_CLASS;
}
else if(U_RIGHT_SINGLE_QUOTATION_MARK == cur)
{
// somehow people use this as ' in "it's" sometimes...
@ -396,6 +409,25 @@ PRInt8 nsJISx4051LineBreaker::ContextualAnalysis(
return this->GetClass(cur);
}
// Evaluates the break opportunity between two adjacent characters, both of
// which are expected to be below U+0100 (asserted in debug builds).
//
// Each character is mapped to a line-break class: characters matched by
// NEED_CONTEXTUAL_ANALYSIS go through ContextualAnalysis(), everything else
// through GetClass(). Only the two characters are known here, so U_NULL is
// passed for the neighbour that is not available (the character before
// aChar1 and the character after aChar2).
//
// Returns the GetPair() verdict for the two classes (judging by the method
// name, whether a break may occur between aChar1 and aChar2).
PRBool nsJISx4051LineBreaker::CanBreakBetweenLatin1(PRUnichar aChar1,
                                                    PRUnichar aChar2)
{
  NS_ASSERTION(aChar1 < 256 && aChar2 < 256, "invalid input");

  // Class of the leading character; its predecessor is unknown.
  const PRInt8 leadingClass = NEED_CONTEXTUAL_ANALYSIS(aChar1)
    ? this->ContextualAnalysis(U_NULL, aChar1, aChar2)
    : this->GetClass(aChar1);

  // Class of the trailing character; its successor is unknown.
  const PRInt8 trailingClass = NEED_CONTEXTUAL_ANALYSIS(aChar2)
    ? this->ContextualAnalysis(aChar1, aChar2, U_NULL)
    : this->GetClass(aChar2);

  return GetPair(leadingClass, trailingClass);
}
PRBool nsJISx4051LineBreaker::BreakInBetween(
const PRUnichar* aText1 , PRUint32 aTextLen1,
@ -408,34 +440,9 @@ PRBool nsJISx4051LineBreaker::BreakInBetween(
return PR_FALSE;
}
//search for CJK characters until a space is found.
//if CJK char is found before space, use 4051, otherwise western
PRInt32 cur;
for (cur= aTextLen1-1; cur>=0; cur--)
{
if (IS_SPACE(aText1[cur]))
break;
if (IS_CJK_CHAR(aText1[cur]))
goto ROUTE_CJK_BETWEEN;
}
for (cur= 0; cur < (PRInt32)aTextLen2; cur++)
{
if (IS_SPACE(aText2[cur]))
break;
if (IS_CJK_CHAR(aText2[cur]))
goto ROUTE_CJK_BETWEEN;
}
//now apply western rule.
return IS_SPACE(aText1[aTextLen1-1]) || IS_SPACE(aText2[0]);
ROUTE_CJK_BETWEEN:
PRInt8 c1, c2;
if(NEED_CONTEXTUAL_ANALYSIS(aText1[aTextLen1-1]))
c1 = this->ContextualAnalysis((aTextLen1>1)?aText1[aTextLen1-2]:0,
c1 = this->ContextualAnalysis((aTextLen1>1)?aText1[aTextLen1-2]:U_NULL,
aText1[aTextLen1-1],
aText2[0]);
else
@ -444,7 +451,7 @@ ROUTE_CJK_BETWEEN:
if(NEED_CONTEXTUAL_ANALYSIS(aText2[0]))
c2 = this->ContextualAnalysis(aText1[aTextLen1-1],
aText2[0],
(aTextLen2>1)?aText2[1]:0);
(aTextLen2>1)?aText2[1]:U_NULL);
else
c2 = this->GetClass(aText2[0]);
@ -466,26 +473,13 @@ PRInt32 nsJISx4051LineBreaker::Next(
NS_ASSERTION(aText, "aText shouldn't be null");
NS_ASSERTION(aLen > aPos, "Illegal value (length > position)");
//forward check for CJK characters until a space is found.
//if CJK char is found before space, use 4051, otherwise western
PRUint32 cur;
for (cur = aPos; cur < aLen; ++cur)
{
if (IS_SPACE(aText[cur]))
return cur;
if (IS_CJK_CHAR(aText[cur]))
goto ROUTE_CJK_NEXT;
}
return NS_LINEBREAKER_NEED_MORE_TEXT; // Need more text
ROUTE_CJK_NEXT:
PRInt8 c1, c2;
cur = aPos;
PRUint32 cur = aPos;
if(NEED_CONTEXTUAL_ANALYSIS(aText[cur]))
{
c1 = this->ContextualAnalysis((cur>0)?aText[cur-1]:0,
c1 = this->ContextualAnalysis((cur>0)?aText[cur-1]:U_NULL,
aText[cur],
(cur<(aLen-1)) ?aText[cur+1]:0);
(cur<(aLen-1)) ?aText[cur+1]:U_NULL);
} else {
c1 = this->GetClass(aText[cur]);
}
@ -497,9 +491,9 @@ ROUTE_CJK_NEXT:
{
if(NEED_CONTEXTUAL_ANALYSIS(aText[cur]))
{
c2= this->ContextualAnalysis((cur>0)?aText[cur-1]:0,
c2= this->ContextualAnalysis((cur>0)?aText[cur-1]:U_NULL,
aText[cur],
(cur<(aLen-1)) ?aText[cur+1]:0);
(cur<(aLen-1)) ?aText[cur+1]:U_NULL);
} else {
c2 = this->GetClass(aText[cur]);
}
@ -517,31 +511,13 @@ PRInt32 nsJISx4051LineBreaker::Prev(
{
NS_ASSERTION(aText, "aText shouldn't be null");
//backward check for CJK characters until a space is found.
//if CJK char is found before space, use 4051, otherwise western
PRUint32 cur;
for (cur = aPos - 1; cur > 0; --cur)
{
if (IS_SPACE(aText[cur]))
{
if (cur != aPos - 1) // XXXldb Why?
++cur;
return cur;
}
if (IS_CJK_CHAR(aText[cur]))
goto ROUTE_CJK_PREV;
}
return NS_LINEBREAKER_NEED_MORE_TEXT; // Need more text
ROUTE_CJK_PREV:
cur = aPos;
PRUint32 cur = aPos;
PRInt8 c1, c2;
if(NEED_CONTEXTUAL_ANALYSIS(aText[cur-1]))
{
c2 = this->ContextualAnalysis(((cur-1)>0)?aText[cur-2]:0,
c2 = this->ContextualAnalysis(((cur-1)>0)?aText[cur-2]:U_NULL,
aText[cur-1],
(cur<aLen) ?aText[cur]:0);
(cur<aLen) ?aText[cur]:U_NULL);
} else {
c2 = this->GetClass(aText[cur-1]);
}
@ -553,9 +529,9 @@ ROUTE_CJK_PREV:
{
if(NEED_CONTEXTUAL_ANALYSIS(aText[cur-1]))
{
c1= this->ContextualAnalysis(((cur-1)>0)?aText[cur-2]:0,
c1= this->ContextualAnalysis(((cur-1)>0)?aText[cur-2]:U_NULL,
aText[cur-1],
(cur<aLen) ?aText[cur]:0);
(cur<aLen) ?aText[cur]:U_NULL);
} else {
c1 = this->GetClass(aText[cur-1]);
}

View File

@ -48,6 +48,9 @@ public:
nsJISx4051LineBreaker();
virtual ~nsJISx4051LineBreaker();
PRBool CanBreakBetweenLatin1(PRUnichar aChar1,
PRUnichar aChar2);
PRBool BreakInBetween( const PRUnichar* aText1 , PRUint32 aTextLen1,
const PRUnichar* aText2 , PRUint32 aTextLen2);

View File

@ -1,4 +1,5 @@
0028;;1
002F;;2
005B;;1
007B;;1
2018;;1

View File

@ -348,8 +348,11 @@ nsTextTransformer::ScanNormalAsciiText_F(PRInt32* aWordLen,
bp2 += mBufferPos;
}
PRUnichar prevCh;
PRUnichar ch = 0;
for (; offset < fragLen; offset++) {
unsigned char ch = *cp++;
prevCh = (ch == ' ') ? CH_NBSP : ch;
ch = *cp++;
if (XP_IS_SPACE(ch)) {
break;
}
@ -357,6 +360,10 @@ nsTextTransformer::ScanNormalAsciiText_F(PRInt32* aWordLen,
ch = ' ';
*aWasTransformed = PR_TRUE;
}
else if (offset != mOffset &&
nsContentUtils::LineBreaker()->CanBreakBetweenLatin1(prevCh, ch)) {
break;
}
else if (IS_DISCARDED(ch)) {
// Strip discarded characters from the transformed output
continue;

View File

@ -1,7 +1,7 @@
This is a mail with a couple of long lines and
then a sig. This is used as test of the
format=flowed output in the nsHTMLToTXTSinkstream.
If this test fails and none else, it's likely the
then a sig. This is used as test of the format=
flowed output in the nsHTMLToTXTSinkstream. If
this test fails and none else, it's likely the
spaces at the ends of the lines that are missing.
They aren't easily seen without looking at the
data in an editor and checking where the end of