Bug 255990 Characters below U+0100 are not subject to line-breaking rules at all r=jshin+mrbkap, sr=roc

2024-10-09 11:25:00 +00:00 · 2006-07-14 04:57:05 +00:00 · 2006-07-14 04:57:05 +00:00 · 42ba5d983c
commit 42ba5d983c
parent 2dbafa80ee
7 changed files with 76 additions and 85 deletions
--- a/intl/lwbrk/public/nsILineBreaker.h
+++ b/intl/lwbrk/public/nsILineBreaker.h
@ -43,10 +43,10 @@

 #define NS_LINEBREAKER_NEED_MORE_TEXT -1

-// {E86B3375-BF89-11d2-B3AF-00805F8A6670}
+// {7509772F-770C-44e8-AAFA-8032E5A35370}
 #define NS_ILINEBREAKER_IID \
-{ 0xe86b3375, 0xbf89, 0x11d2, \
-    { 0xb3, 0xaf, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 } }
+{ 0x7509772f, 0x770c, 0x44e8, \
+    { 0xaa, 0xfa, 0x80, 0x32, 0xe5, 0xa3, 0x53, 0x70 } }


 class nsILineBreaker : public nsISupports
@ -57,6 +57,10 @@ public:
                                 const PRUnichar* aText2 , 
                                 PRUint32 aTextLen2) = 0;

+  virtual PRBool CanBreakBetweenLatin1(PRUnichar aChar1,
+                                       PRUnichar aChar2) = 0; 
+
+
  virtual PRInt32 Next( const PRUnichar* aText, PRUint32 aLen, 
                        PRUint32 aPos) = 0;

--- a/intl/lwbrk/src/jisx4501class.h
+++ b/intl/lwbrk/src/jisx4501class.h
@ -44,7 +44,7 @@ static const PRUint32 gLBClass00[32] = {
 0x55555555, // U+0010 - U+0017
 0x55555555, // U+0018 - U+001F
 0x88438815, // U+0020 - U+0027
-0x81515810, // U+0028 - U+002F
+0x11515810, // U+0028 - U+002F
 0x66666666, // U+0030 - U+0037
 0x11501166, // U+0038 - U+003F
 0x88888888, // U+0040 - U+0047
--- a/intl/lwbrk/src/nsJISx4501LineBreaker.cpp
+++ b/intl/lwbrk/src/nsJISx4501LineBreaker.cpp
@ -350,12 +350,19 @@ nsJISx4051LineBreaker::~nsJISx4051LineBreaker()

 NS_IMPL_ISUPPORTS1(nsJISx4051LineBreaker, nsILineBreaker)

-#define U_PERIOD ((PRUnichar) '.')
-#define U_COMMA ((PRUnichar) ',')
-#define U_SPACE ((PRUnichar) ' ')
-#define U_RIGHT_SINGLE_QUOTATION_MARK ((PRUnichar) 0x2019)
+#define U_PERIOD    PRUnichar('.')
+#define U_COMMA     PRUnichar(',')
+#define U_COLON     PRUnichar(':')
+#define U_SEMICOLON PRUnichar(';')
+#define U_SLASH     PRUnichar('/')
+#define U_SPACE     PRUnichar(' ')
+#define U_NULL      PRUnichar(0x0000)
+#define U_RIGHT_SINGLE_QUOTATION_MARK PRUnichar(0x2019)
 #define NEED_CONTEXTUAL_ANALYSIS(c) ((c) == U_PERIOD || \
                                     (c) == U_COMMA || \
+                                     (c) == U_COLON || \
+                                     (c) == U_SEMICOLON || \
+                                     (c) == U_SLASH || \
                                     (c) == U_RIGHT_SINGLE_QUOTATION_MARK)
 #define NUMERIC_CLASS  6 // JIS x4051 class 15 is now map to simplified class 6
 #define CHARACTER_CLASS  8 // JIS x4051 class 18 is now map to simplified class 8
@ -365,17 +372,17 @@ PRInt8  nsJISx4051LineBreaker::ContextualAnalysis(
  PRUnichar prev, PRUnichar cur, PRUnichar next
 )
 {
-   if(U_COMMA == cur)
+   if(U_COMMA == cur || U_COLON == cur || U_SEMICOLON == cur)
   {
-     if(IS_ASCII_DIGIT (prev) && IS_ASCII_DIGIT (next))
+     if((IS_ASCII_DIGIT(prev) || prev == U_NULL) && IS_ASCII_DIGIT(next))
       return NUMERIC_CLASS;
   }
   else if(U_PERIOD == cur)
   {
-     if((IS_ASCII_DIGIT (prev) || (0x0020 == prev)) && 
-         IS_ASCII_DIGIT (next))
+     if((IS_ASCII_DIGIT(prev) || prev == U_SPACE || prev == U_NULL) &&
+        IS_ASCII_DIGIT(next))
       return NUMERIC_CLASS;
- 
+
     // By assigning a full stop  character class only when it's followed by
     // class 6 (numeric), 7, and 8 (character). Note that class 9 (Thai) 
     // doesn't matter, either way, we prevent lines from breaking around 
@ -387,6 +394,12 @@ PRInt8  nsJISx4051LineBreaker::ContextualAnalysis(
     if((pc > 5 || pc == 0)  && GetClass(next) > 5)
       return CHARACTER_CLASS;
   }
+   else if(U_SLASH == cur)
+   {
+     // We don't need to check the prev character, because SLASH breaks only after.
+     if (IS_ASCII_DIGIT(next))
+       return NUMERIC_CLASS;
+   }
   else if(U_RIGHT_SINGLE_QUOTATION_MARK == cur)
   {
     // somehow people use this as ' in "it's" sometimes...
@ -396,6 +409,25 @@ PRInt8  nsJISx4051LineBreaker::ContextualAnalysis(
   return this->GetClass(cur);
 }

+PRBool nsJISx4051LineBreaker::CanBreakBetweenLatin1(PRUnichar aChar1,
+                                                    PRUnichar aChar2)
+{ // Classifies aChar1 and aChar2, then returns GetPair() for the two classes.
+  NS_ASSERTION(aChar1 < 256 && aChar2 < 256, "invalid input"); // both inputs are expected to be below U+0100
+
+  PRInt8 c1, c2; // c1: class computed for aChar1, c2: class computed for aChar2
+  if(NEED_CONTEXTUAL_ANALYSIS(aChar1))
+    c1 = this->ContextualAnalysis(U_NULL, aChar1, aChar2); // no character before aChar1 is known here, so prev is U_NULL
+  else 
+    c1 = this->GetClass(aChar1); 
+
+  if(NEED_CONTEXTUAL_ANALYSIS(aChar2))
+    c2 = this->ContextualAnalysis(aChar1, aChar2, U_NULL); // no character after aChar2 is known here, so next is U_NULL
+  else 
+    c2 = this->GetClass(aChar2); 
+
+  return GetPair(c1, c2); // NOTE(review): presumably the can-break lookup for the (c1, c2) class pair -- confirm in GetPair
+}
+

 PRBool nsJISx4051LineBreaker::BreakInBetween(
  const PRUnichar* aText1 , PRUint32 aTextLen1,
@ -408,34 +440,9 @@ PRBool nsJISx4051LineBreaker::BreakInBetween(
     return PR_FALSE;
  }

-  //search for CJK characters until a space is found. 
-  //if CJK char is found before space, use 4051, otherwise western
-  PRInt32 cur;
-
-  for (cur= aTextLen1-1; cur>=0; cur--)
-  {
-    if (IS_SPACE(aText1[cur]))
-      break;
-    if (IS_CJK_CHAR(aText1[cur]))
-      goto ROUTE_CJK_BETWEEN;
-  }
-
-  for (cur= 0; cur < (PRInt32)aTextLen2; cur++)
-  {
-    if (IS_SPACE(aText2[cur]))
-      break;
-    if (IS_CJK_CHAR(aText2[cur]))
-      goto ROUTE_CJK_BETWEEN;
-  }
-
-  //now apply western rule.
-  return IS_SPACE(aText1[aTextLen1-1]) || IS_SPACE(aText2[0]);
-
-ROUTE_CJK_BETWEEN:
-
  PRInt8 c1, c2;
  if(NEED_CONTEXTUAL_ANALYSIS(aText1[aTextLen1-1]))
-    c1 = this->ContextualAnalysis((aTextLen1>1)?aText1[aTextLen1-2]:0,
+    c1 = this->ContextualAnalysis((aTextLen1>1)?aText1[aTextLen1-2]:U_NULL,
                                  aText1[aTextLen1-1],
                                  aText2[0]);
  else 
@ -444,7 +451,7 @@ ROUTE_CJK_BETWEEN:
  if(NEED_CONTEXTUAL_ANALYSIS(aText2[0]))
    c2 = this->ContextualAnalysis(aText1[aTextLen1-1],
                                  aText2[0],
-                                  (aTextLen2>1)?aText2[1]:0);
+                                  (aTextLen2>1)?aText2[1]:U_NULL);
  else 
    c2 = this->GetClass(aText2[0]);

@ -466,26 +473,13 @@ PRInt32 nsJISx4051LineBreaker::Next(
  NS_ASSERTION(aText, "aText shouldn't be null");
  NS_ASSERTION(aLen > aPos, "Illegal value (length > position)");

-  //forward check for CJK characters until a space is found. 
-  //if CJK char is found before space, use 4051, otherwise western
-  PRUint32 cur;
-  for (cur = aPos; cur < aLen; ++cur)
-  {
-    if (IS_SPACE(aText[cur]))
-      return cur;
-    if (IS_CJK_CHAR(aText[cur]))
-      goto ROUTE_CJK_NEXT;
-  }
-  return NS_LINEBREAKER_NEED_MORE_TEXT; // Need more text
-
-ROUTE_CJK_NEXT:
  PRInt8 c1, c2;
-  cur = aPos;
+  PRUint32 cur = aPos;
  if(NEED_CONTEXTUAL_ANALYSIS(aText[cur]))
  {
-    c1 = this->ContextualAnalysis((cur>0)?aText[cur-1]:0,
+    c1 = this->ContextualAnalysis((cur>0)?aText[cur-1]:U_NULL,
                                  aText[cur],
-                                  (cur<(aLen-1)) ?aText[cur+1]:0);
+                                  (cur<(aLen-1)) ?aText[cur+1]:U_NULL);
  } else  {
    c1 = this->GetClass(aText[cur]);
  }
@ -497,9 +491,9 @@ ROUTE_CJK_NEXT:
  {
     if(NEED_CONTEXTUAL_ANALYSIS(aText[cur]))
     {
-       c2= this->ContextualAnalysis((cur>0)?aText[cur-1]:0,
+       c2= this->ContextualAnalysis((cur>0)?aText[cur-1]:U_NULL,
                                  aText[cur],
-                                  (cur<(aLen-1)) ?aText[cur+1]:0);
+                                  (cur<(aLen-1)) ?aText[cur+1]:U_NULL);
     } else {
       c2 = this->GetClass(aText[cur]);
     }
@ -517,31 +511,13 @@ PRInt32 nsJISx4051LineBreaker::Prev(
 {
  NS_ASSERTION(aText, "aText shouldn't be null");

-  //backward check for CJK characters until a space is found. 
-  //if CJK char is found before space, use 4051, otherwise western
-  PRUint32 cur;
-  for (cur = aPos - 1; cur > 0; --cur)
-  {
-    if (IS_SPACE(aText[cur]))
-    {
-      if (cur != aPos - 1) // XXXldb Why?
-        ++cur;
-      return cur;
-    }
-    if (IS_CJK_CHAR(aText[cur]))
-      goto ROUTE_CJK_PREV;
-  }
-
-  return NS_LINEBREAKER_NEED_MORE_TEXT; // Need more text
-
-ROUTE_CJK_PREV:
-  cur = aPos;
+  PRUint32 cur = aPos;
  PRInt8 c1, c2;
  if(NEED_CONTEXTUAL_ANALYSIS(aText[cur-1]))
  {
-    c2 = this->ContextualAnalysis(((cur-1)>0)?aText[cur-2]:0,
+    c2 = this->ContextualAnalysis(((cur-1)>0)?aText[cur-2]:U_NULL,
                                  aText[cur-1],
-                                  (cur<aLen) ?aText[cur]:0);
+                                  (cur<aLen) ?aText[cur]:U_NULL);
  } else  {
    c2 = this->GetClass(aText[cur-1]);
  }
@ -553,9 +529,9 @@ ROUTE_CJK_PREV:
  {
     if(NEED_CONTEXTUAL_ANALYSIS(aText[cur-1]))
     {
-       c1= this->ContextualAnalysis(((cur-1)>0)?aText[cur-2]:0,
+       c1= this->ContextualAnalysis(((cur-1)>0)?aText[cur-2]:U_NULL,
                                  aText[cur-1],
-                                  (cur<aLen) ?aText[cur]:0);
+                                  (cur<aLen) ?aText[cur]:U_NULL);
     } else {
       c1 = this->GetClass(aText[cur-1]);
     }
--- a/intl/lwbrk/src/nsJISx4501LineBreaker.h
+++ b/intl/lwbrk/src/nsJISx4501LineBreaker.h
@ -48,6 +48,9 @@ public:
  nsJISx4051LineBreaker();
  virtual ~nsJISx4051LineBreaker();

+  PRBool CanBreakBetweenLatin1(PRUnichar aChar1,
+                               PRUnichar aChar2);
+
  PRBool BreakInBetween( const PRUnichar* aText1 , PRUint32 aTextLen1,
                         const PRUnichar* aText2 , PRUint32 aTextLen2);

--- a/intl/lwbrk/tools/jisx4501class.txt
+++ b/intl/lwbrk/tools/jisx4501class.txt
@ -1,4 +1,5 @@
 0028;;1
+002F;;2
 005B;;1
 007B;;1
 2018;;1
--- a/layout/generic/nsTextTransformer.cpp
+++ b/layout/generic/nsTextTransformer.cpp
@ -348,8 +348,11 @@ nsTextTransformer::ScanNormalAsciiText_F(PRInt32* aWordLen,
    bp2 += mBufferPos;
  }

+  PRUnichar prevCh;
+  PRUnichar ch = 0;
  for (; offset < fragLen; offset++) {
-    unsigned char ch = *cp++;
+    prevCh = (ch == ' ') ? CH_NBSP : ch;
+    ch = *cp++;
    if (XP_IS_SPACE(ch)) {
      break;
    }
@ -357,6 +360,10 @@ nsTextTransformer::ScanNormalAsciiText_F(PRInt32* aWordLen,
      ch = ' ';
      *aWasTransformed = PR_TRUE;
    }
+    else if (offset != mOffset &&
+             nsContentUtils::LineBreaker()->CanBreakBetweenLatin1(prevCh, ch)) {
+      break;
+    }
    else if (IS_DISCARDED(ch)) {
      // Strip discarded characters from the transformed output
      continue;
--- a/parser/htmlparser/tests/outsinks/simplemail.out
+++ b/parser/htmlparser/tests/outsinks/simplemail.out
@ -1,7 +1,7 @@
 This is a mail with a couple of long lines and 
-then a sig. This is used as test of the 
-format=flowed output in the nsHTMLToTXTSinkstream. 
-If this test fails and none else, it's likely the 
+then a sig. This is used as test of the format=
+flowed output in the nsHTMLToTXTSinkstream. If 
+this test fails and none else, it's likely the 
 spaces at the ends of the lines that are missing. 
 They aren't easily seen without looking at the 
 data in an editor and checking where the end of