diff --git a/intl/lwbrk/public/nsILineBreaker.h b/intl/lwbrk/public/nsILineBreaker.h index 2c1a8658f435..85078823ae36 100644 --- a/intl/lwbrk/public/nsILineBreaker.h +++ b/intl/lwbrk/public/nsILineBreaker.h @@ -43,10 +43,10 @@ #define NS_LINEBREAKER_NEED_MORE_TEXT -1 -// {7509772F-770C-44e8-AAFA-8032E5A35370} +// {E86B3375-BF89-11d2-B3AF-00805F8A6670} #define NS_ILINEBREAKER_IID \ -{ 0x7509772f, 0x770c, 0x44e8, \ - { 0xaa, 0xfa, 0x80, 0x32, 0xe5, 0xa3, 0x53, 0x70 } } +{ 0xe86b3375, 0xbf89, 0x11d2, \ + { 0xb3, 0xaf, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 } } class nsILineBreaker : public nsISupports @@ -57,10 +57,6 @@ public: const PRUnichar* aText2 , PRUint32 aTextLen2) = 0; - virtual PRBool CanBreakBetweenLatin1(PRUnichar aChar1, - PRUnichar aChar2) = 0; - - virtual PRInt32 Next( const PRUnichar* aText, PRUint32 aLen, PRUint32 aPos) = 0; diff --git a/intl/lwbrk/src/jisx4501class.h b/intl/lwbrk/src/jisx4501class.h index a82b098d18ea..470671d98552 100644 --- a/intl/lwbrk/src/jisx4501class.h +++ b/intl/lwbrk/src/jisx4501class.h @@ -44,7 +44,7 @@ static const PRUint32 gLBClass00[32] = { 0x55555555, // U+0010 - U+0017 0x55555555, // U+0018 - U+001F 0x88438815, // U+0020 - U+0027 -0x11515810, // U+0028 - U+002F +0x81515810, // U+0028 - U+002F 0x66666666, // U+0030 - U+0037 0x11501166, // U+0038 - U+003F 0x88888888, // U+0040 - U+0047 diff --git a/intl/lwbrk/src/nsJISx4501LineBreaker.cpp b/intl/lwbrk/src/nsJISx4501LineBreaker.cpp index e82fd5526633..15cf6505d107 100644 --- a/intl/lwbrk/src/nsJISx4501LineBreaker.cpp +++ b/intl/lwbrk/src/nsJISx4501LineBreaker.cpp @@ -350,19 +350,12 @@ nsJISx4051LineBreaker::~nsJISx4051LineBreaker() NS_IMPL_ISUPPORTS1(nsJISx4051LineBreaker, nsILineBreaker) -#define U_PERIOD PRUnichar('.') -#define U_COMMA PRUnichar(',') -#define U_COLON PRUnichar(':') -#define U_SEMICOLON PRUnichar(';') -#define U_SLASH PRUnichar('/') -#define U_SPACE PRUnichar(' ') -#define U_NULL PRUnichar(0x0000) -#define U_RIGHT_SINGLE_QUOTATION_MARK PRUnichar(0x2019) 
+#define U_PERIOD ((PRUnichar) '.') +#define U_COMMA ((PRUnichar) ',') +#define U_SPACE ((PRUnichar) ' ') +#define U_RIGHT_SINGLE_QUOTATION_MARK ((PRUnichar) 0x2019) #define NEED_CONTEXTUAL_ANALYSIS(c) ((c) == U_PERIOD || \ (c) == U_COMMA || \ - (c) == U_COLON || \ - (c) == U_SEMICOLON || \ - (c) == U_SLASH || \ (c) == U_RIGHT_SINGLE_QUOTATION_MARK) #define NUMERIC_CLASS 6 // JIS x4051 class 15 is now map to simplified class 6 #define CHARACTER_CLASS 8 // JIS x4051 class 18 is now map to simplified class 8 @@ -372,17 +365,17 @@ PRInt8 nsJISx4051LineBreaker::ContextualAnalysis( PRUnichar prev, PRUnichar cur, PRUnichar next ) { - if(U_COMMA == cur || U_COLON == cur || U_SEMICOLON == cur) + if(U_COMMA == cur) { - if((IS_ASCII_DIGIT(prev) || prev == U_NULL) && IS_ASCII_DIGIT(next)) + if(IS_ASCII_DIGIT (prev) && IS_ASCII_DIGIT (next)) return NUMERIC_CLASS; } else if(U_PERIOD == cur) { - if((IS_ASCII_DIGIT(prev) || prev == U_SPACE || prev == U_NULL) && - IS_ASCII_DIGIT(next)) + if((IS_ASCII_DIGIT (prev) || (0x0020 == prev)) && + IS_ASCII_DIGIT (next)) return NUMERIC_CLASS; - + // By assigning a full stop character class only when it's followed by // class 6 (numeric), 7, and 8 (character). Note that class 9 (Thai) // doesn't matter, either way, we prevent lines from breaking around @@ -394,12 +387,6 @@ PRInt8 nsJISx4051LineBreaker::ContextualAnalysis( if((pc > 5 || pc == 0) && GetClass(next) > 5) return CHARACTER_CLASS; } - else if(U_SLASH == cur) - { - // We don't need to check prev character. Because SLASH breaks only after. - if (IS_ASCII_DIGIT(next)) - return NUMERIC_CLASS; - } else if(U_RIGHT_SINGLE_QUOTATION_MARK == cur) { // somehow people use this as ' in "it's" sometimes... 
@@ -409,25 +396,6 @@ PRInt8 nsJISx4051LineBreaker::ContextualAnalysis( return this->GetClass(cur); } -PRBool nsJISx4051LineBreaker::CanBreakBetweenLatin1(PRUnichar aChar1, - PRUnichar aChar2) -{ - NS_ASSERTION(aChar1 < 256 && aChar2 < 256, "invalid input"); - - PRInt8 c1, c2; - if(NEED_CONTEXTUAL_ANALYSIS(aChar1)) - c1 = this->ContextualAnalysis(U_NULL, aChar1, aChar2); - else - c1 = this->GetClass(aChar1); - - if(NEED_CONTEXTUAL_ANALYSIS(aChar2)) - c2 = this->ContextualAnalysis(aChar1, aChar2, U_NULL); - else - c2 = this->GetClass(aChar2); - - return GetPair(c1, c2); -} - PRBool nsJISx4051LineBreaker::BreakInBetween( const PRUnichar* aText1 , PRUint32 aTextLen1, @@ -440,9 +408,34 @@ PRBool nsJISx4051LineBreaker::BreakInBetween( return PR_FALSE; } + //search for CJK characters until a space is found. + //if CJK char is found before space, use 4051, otherwise western + PRInt32 cur; + + for (cur= aTextLen1-1; cur>=0; cur--) + { + if (IS_SPACE(aText1[cur])) + break; + if (IS_CJK_CHAR(aText1[cur])) + goto ROUTE_CJK_BETWEEN; + } + + for (cur= 0; cur < (PRInt32)aTextLen2; cur++) + { + if (IS_SPACE(aText2[cur])) + break; + if (IS_CJK_CHAR(aText2[cur])) + goto ROUTE_CJK_BETWEEN; + } + + //now apply western rule. 
+ return IS_SPACE(aText1[aTextLen1-1]) || IS_SPACE(aText2[0]); + +ROUTE_CJK_BETWEEN: + PRInt8 c1, c2; if(NEED_CONTEXTUAL_ANALYSIS(aText1[aTextLen1-1])) - c1 = this->ContextualAnalysis((aTextLen1>1)?aText1[aTextLen1-2]:U_NULL, + c1 = this->ContextualAnalysis((aTextLen1>1)?aText1[aTextLen1-2]:0, aText1[aTextLen1-1], aText2[0]); else @@ -451,7 +444,7 @@ PRBool nsJISx4051LineBreaker::BreakInBetween( if(NEED_CONTEXTUAL_ANALYSIS(aText2[0])) c2 = this->ContextualAnalysis(aText1[aTextLen1-1], aText2[0], - (aTextLen2>1)?aText2[1]:U_NULL); + (aTextLen2>1)?aText2[1]:0); else c2 = this->GetClass(aText2[0]); @@ -473,13 +466,26 @@ PRInt32 nsJISx4051LineBreaker::Next( NS_ASSERTION(aText, "aText shouldn't be null"); NS_ASSERTION(aLen > aPos, "Illegal value (length > position)"); + //forward check for CJK characters until a space is found. + //if CJK char is found before space, use 4051, otherwise western + PRUint32 cur; + for (cur = aPos; cur < aLen; ++cur) + { + if (IS_SPACE(aText[cur])) + return cur; + if (IS_CJK_CHAR(aText[cur])) + goto ROUTE_CJK_NEXT; + } + return NS_LINEBREAKER_NEED_MORE_TEXT; // Need more text + +ROUTE_CJK_NEXT: PRInt8 c1, c2; - PRUint32 cur = aPos; + cur = aPos; if(NEED_CONTEXTUAL_ANALYSIS(aText[cur])) { - c1 = this->ContextualAnalysis((cur>0)?aText[cur-1]:U_NULL, + c1 = this->ContextualAnalysis((cur>0)?aText[cur-1]:0, aText[cur], - (cur<(aLen-1)) ?aText[cur+1]:U_NULL); + (cur<(aLen-1)) ?aText[cur+1]:0); } else { c1 = this->GetClass(aText[cur]); } @@ -491,9 +497,9 @@ PRInt32 nsJISx4051LineBreaker::Next( { if(NEED_CONTEXTUAL_ANALYSIS(aText[cur])) { - c2= this->ContextualAnalysis((cur>0)?aText[cur-1]:U_NULL, + c2= this->ContextualAnalysis((cur>0)?aText[cur-1]:0, aText[cur], - (cur<(aLen-1)) ?aText[cur+1]:U_NULL); + (cur<(aLen-1)) ?aText[cur+1]:0); } else { c2 = this->GetClass(aText[cur]); } @@ -511,13 +517,31 @@ PRInt32 nsJISx4051LineBreaker::Prev( { NS_ASSERTION(aText, "aText shouldn't be null"); - PRUint32 cur = aPos; + //backward check for CJK characters 
until a space is found. + //if CJK char is found before space, use 4051, otherwise western + PRUint32 cur; + for (cur = aPos - 1; cur > 0; --cur) + { + if (IS_SPACE(aText[cur])) + { + if (cur != aPos - 1) // XXXldb Why? + ++cur; + return cur; + } + if (IS_CJK_CHAR(aText[cur])) + goto ROUTE_CJK_PREV; + } + + return NS_LINEBREAKER_NEED_MORE_TEXT; // Need more text + +ROUTE_CJK_PREV: + cur = aPos; PRInt8 c1, c2; if(NEED_CONTEXTUAL_ANALYSIS(aText[cur-1])) { - c2 = this->ContextualAnalysis(((cur-1)>0)?aText[cur-2]:U_NULL, + c2 = this->ContextualAnalysis(((cur-1)>0)?aText[cur-2]:0, aText[cur-1], - (cur<aLen) ?aText[cur]:U_NULL); + (cur<aLen) ?aText[cur]:0); } else { c2 = this->GetClass(aText[cur-1]); } @@ -529,9 +553,9 @@ PRInt32 nsJISx4051LineBreaker::Prev( { if(NEED_CONTEXTUAL_ANALYSIS(aText[cur-1])) { - c1= this->ContextualAnalysis(((cur-1)>0)?aText[cur-2]:U_NULL, + c1= this->ContextualAnalysis(((cur-1)>0)?aText[cur-2]:0, aText[cur-1], - (cur<aLen) ?aText[cur]:U_NULL); + (cur<aLen) ?aText[cur]:0); } else { c1 = this->GetClass(aText[cur-1]); } diff --git a/intl/lwbrk/src/nsJISx4501LineBreaker.h b/intl/lwbrk/src/nsJISx4501LineBreaker.h index b1a58d5c5506..6ad374f200e2 100644 --- a/intl/lwbrk/src/nsJISx4501LineBreaker.h +++ b/intl/lwbrk/src/nsJISx4501LineBreaker.h @@ -48,9 +48,6 @@ public: nsJISx4051LineBreaker(); virtual ~nsJISx4051LineBreaker(); - PRBool CanBreakBetweenLatin1(PRUnichar aChar1, - PRUnichar aChar2); - PRBool BreakInBetween( const PRUnichar* aText1 , PRUint32 aTextLen1, const PRUnichar* aText2 , PRUint32 aTextLen2); diff --git a/intl/lwbrk/tools/jisx4501class.txt b/intl/lwbrk/tools/jisx4501class.txt index c94d0d9a9559..3a7125cea8a1 100644 --- a/intl/lwbrk/tools/jisx4501class.txt +++ b/intl/lwbrk/tools/jisx4501class.txt @@ -1,5 +1,4 @@ 0028;;1 -002F;;2 005B;;1 007B;;1 2018;;1 diff --git a/layout/generic/nsTextTransformer.cpp b/layout/generic/nsTextTransformer.cpp index 07ef3cb1f8d9..fc17f13b81f2 100644 --- a/layout/generic/nsTextTransformer.cpp +++ b/layout/generic/nsTextTransformer.cpp @@ -348,11 +348,8 @@ nsTextTransformer::ScanNormalAsciiText_F(PRInt32* aWordLen, bp2 += mBufferPos; } - PRUnichar prevCh; 
- PRUnichar ch = 0; for (; offset < fragLen; offset++) { - prevCh = (ch == ' ') ? CH_NBSP : ch; - ch = *cp++; + unsigned char ch = *cp++; if (XP_IS_SPACE(ch)) { break; } @@ -360,10 +357,6 @@ nsTextTransformer::ScanNormalAsciiText_F(PRInt32* aWordLen, ch = ' '; *aWasTransformed = PR_TRUE; } - else if (offset != mOffset && - nsContentUtils::LineBreaker()->CanBreakBetweenLatin1(prevCh, ch)) { - break; - } else if (IS_DISCARDED(ch)) { // Strip discarded characters from the transformed output continue; diff --git a/parser/htmlparser/tests/outsinks/simplemail.out b/parser/htmlparser/tests/outsinks/simplemail.out index 72ef0be28907..0de2867e6e27 100644 --- a/parser/htmlparser/tests/outsinks/simplemail.out +++ b/parser/htmlparser/tests/outsinks/simplemail.out @@ -1,7 +1,7 @@ This is a mail with a couple of long lines and -then a sig. This is used as test of the format= -flowed output in the nsHTMLToTXTSinkstream. If -this test fails and none else, it's likely the +then a sig. This is used as test of the +format=flowed output in the nsHTMLToTXTSinkstream. +If this test fails and none else, it's likely the spaces at the ends of the lines that are missing. They aren't easily seen without looking at the data in an editor and checking where the end of