gecko-dev/layout/generic/nsTextTransformer.cpp

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 *
 * The contents of this file are subject to the Netscape Public
 * License Version 1.1 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.mozilla.org/NPL/
 *
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 *
 * The Original Code is Mozilla Communicator client code.
 *
 * The Initial Developer of the Original Code is Netscape Communications
 * Corporation.  Portions created by Netscape are
 * Copyright (C) 1998 Netscape Communications Corporation. All
 * Rights Reserved.
 *
 * Contributor(s):
 */
#include <ctype.h>
#include "nsCOMPtr.h"
#include "nsTextTransformer.h"
#include "nsIContent.h"
#include "nsIFrame.h"
#include "nsIStyleContext.h"
#include "nsITextContent.h"
#include "nsStyleConsts.h"
#include "nsILineBreaker.h"
#include "nsIWordBreaker.h"
#include "nsHTMLIIDs.h"
#include "nsIServiceManager.h"
#include "nsUnicharUtilCIID.h"
#include "nsICaseConversion.h"
#include "prenv.h"

nsAutoTextBuffer::nsAutoTextBuffer()
  : mBuffer(mAutoBuffer),
    mBufferLen(NS_TEXT_TRANSFORMER_AUTO_WORD_BUF_SIZE)
{
}

nsAutoTextBuffer::~nsAutoTextBuffer()
{
  if (mBuffer && (mBuffer != mAutoBuffer)) {
    delete [] mBuffer;
  }
}

nsresult
nsAutoTextBuffer::GrowBy(PRInt32 aAtLeast, PRBool aCopyToHead)
{
  PRInt32 newSize = mBufferLen * 2;
  if (newSize < mBufferLen + aAtLeast) {
    newSize = mBufferLen + aAtLeast + 100;
  }
  return GrowTo(newSize, aCopyToHead);
}

nsresult
nsAutoTextBuffer::GrowTo(PRInt32 aNewSize, PRBool aCopyToHead)
{
  if (aNewSize > mBufferLen) {
    PRUnichar* newBuffer = new PRUnichar[aNewSize];
    if (!newBuffer) {
      return NS_ERROR_OUT_OF_MEMORY;
    }
    nsCRT::memcpy(&newBuffer[aCopyToHead ? 0 : mBufferLen],
                  mBuffer, sizeof(PRUnichar) * mBufferLen);
    if (mBuffer != mAutoBuffer) {
      delete [] mBuffer;
    }
    mBuffer = newBuffer;
    mBufferLen = aNewSize;
  }
  return NS_OK;
}

//----------------------------------------------------------------------

static NS_DEFINE_IID(kUnicharUtilCID, NS_UNICHARUTIL_CID);
static NS_DEFINE_IID(kICaseConversionIID, NS_ICASECONVERSION_IID);

static nsICaseConversion* gCaseConv =  nsnull;

nsresult
nsTextTransformer::Initialize()
{
  nsresult res = NS_OK;
  if (!gCaseConv) {
    res = nsServiceManager::GetService(kUnicharUtilCID, kICaseConversionIID,
                                       (nsISupports**)&gCaseConv);
    NS_ASSERTION( NS_SUCCEEDED(res), "cannot get UnicharUtil");
    NS_ASSERTION( gCaseConv != NULL, "cannot get UnicharUtil");
  }
  return res;
}

void
nsTextTransformer::Shutdown()
{
  if (gCaseConv) {
    nsServiceManager::ReleaseService(kUnicharUtilCID, gCaseConv);
    gCaseConv = nsnull;
  }
}

// For now, we have only a single character to strip out. If we get
// any more, change this to use a bitset to lookup into.
//   CH_SHY - soft hyphen (discretionary hyphen)
#define IS_DISCARDED(_ch) \
  ((_ch) == CH_SHY)

#define MAX_UNIBYTE 127

MOZ_DECL_CTOR_COUNTER(nsTextTransformer);

nsTextTransformer::nsTextTransformer(nsILineBreaker* aLineBreaker,
                                     nsIWordBreaker* aWordBreaker)
  : mFrag(nsnull),
    mOffset(0),
    mMode(eNormal),
    mLineBreaker(aLineBreaker),
    mWordBreaker(aWordBreaker),
    mBufferPos(0),
    mLeaveAsAscii(PR_FALSE),
    mHasMultibyte(PR_FALSE),
    mTransformedTextIsAscii(PR_FALSE),
    mTextTransform(NS_STYLE_TEXT_TRANSFORM_NONE)
{
  MOZ_COUNT_CTOR(nsTextTransformer);

#ifdef DEBUG
  static PRBool firstTime = PR_TRUE;
  if (firstTime) {
    firstTime = PR_FALSE;
    SelfTest(aLineBreaker, aWordBreaker);
  }
#endif
}

nsTextTransformer::~nsTextTransformer()
{
  MOZ_COUNT_DTOR(nsTextTransformer);
}

nsresult
nsTextTransformer::Init(nsIFrame* aFrame,
                        nsIContent* aContent,
                        PRInt32 aStartingOffset,
                        PRBool aLeaveAsAscii)
{
  // Get the contents text content
  nsresult rv;
  nsCOMPtr<nsITextContent> tc = do_QueryInterface(aContent, &rv);
  if (tc.get()) {
    tc->GetText(&mFrag);

    // Sanitize aStartingOffset
    if (NS_WARN_IF_FALSE(aStartingOffset >= 0, "bad starting offset")) {
      aStartingOffset = 0;
    }
    else if (NS_WARN_IF_FALSE(aStartingOffset <= mFrag->GetLength(),
                              "bad starting offset")) {
      aStartingOffset = mFrag->GetLength();
    }
    mOffset = aStartingOffset;

    // Get the frames text style information
    const nsStyleText* styleText;
    aFrame->GetStyleData(eStyleStruct_Text, (const nsStyleStruct*&) styleText);
    if (NS_STYLE_WHITESPACE_PRE == styleText->mWhiteSpace) {
      mMode = ePreformatted;
    }
    else if (NS_STYLE_WHITESPACE_MOZ_PRE_WRAP == styleText->mWhiteSpace) {
      mMode = ePreWrap;
    }
    mTextTransform = styleText->mTextTransform;

    mLeaveAsAscii = aLeaveAsAscii;
    if (mLeaveAsAscii) {
      // See if the text fragment is 1-byte text
      // XXX Currently we only leave it as ascii for normal text and not for preformatted
      // or preformatted wrapped text
      if (mFrag->Is2b() || (eNormal != mMode)) {
        // We don't step down from Unicode to ascii
        mLeaveAsAscii = PR_FALSE;
      }
    }
  }
  return rv;
}

//----------------------------------------------------------------------

// wordlen==1, contentlen=newOffset-currentOffset, isWhitespace=t
PRInt32
nsTextTransformer::ScanNormalWhiteSpace_F()
{
  const nsTextFragment* frag = mFrag;
  PRInt32 fragLen = frag->GetLength();
  PRInt32 offset = mOffset;

  for (; offset < fragLen; offset++) {
    PRUnichar ch = frag->CharAt(offset);
    if (!XP_IS_SPACE(ch)) {
      // If character is not discardable then stop looping, otherwise
      // let the discarded character collapse with the other spaces.
      if (!IS_DISCARDED(ch)) {
        break;
      }
    }
  }

  // Make sure we have enough room in the transform buffer
  if (mBufferPos >= mTransformBuf.mBufferLen) {
    mTransformBuf.GrowBy(128);
  }

  if (mTransformedTextIsAscii) {
    unsigned char*  bp = (unsigned char*)mTransformBuf.mBuffer;
    bp[mBufferPos++] = ' ';
  } else {
    mTransformBuf.mBuffer[mBufferPos++] = PRUnichar(' ');
  }
  return offset;
}

void
nsTextTransformer::ConvertTransformedTextToUnicode()
{
  // Go backwards over the characters and convert them.
  PRInt32         lastChar = mBufferPos - 1;
  unsigned char*  cp1 = (unsigned char*)mTransformBuf.mBuffer + lastChar;
  PRUnichar*      cp2 = mTransformBuf.mBuffer + lastChar;

  NS_ASSERTION(mTransformBuf.mBufferLen >= mBufferPos,
               "transform buffer is too small");
  for (PRInt32 count = mBufferPos; count > 0; count--) {
    *cp2-- = PRUnichar(*cp1--);
  }
}

// wordlen==*aWordLen, contentlen=newOffset-currentOffset, isWhitespace=f
PRInt32
nsTextTransformer::ScanNormalAsciiText_F(PRInt32* aWordLen,
                                         PRBool*  aWasTransformed)
{
  const nsTextFragment* frag = mFrag;
  PRInt32 fragLen = frag->GetLength();
  PRInt32 offset = mOffset;
  PRInt32 prevBufferPos = mBufferPos;
  const unsigned char* cp = (const unsigned char*)frag->Get1b() + offset;
  union {
    unsigned char* bp1;
    PRUnichar* bp2;
  };
  bp2 = mTransformBuf.GetBuffer();
  if (mTransformedTextIsAscii) {
    bp1 += mBufferPos;
  } else {
    bp2 += mBufferPos;
  }

  for (; offset < fragLen; offset++) {
    unsigned char ch = *cp++;
    if (XP_IS_SPACE(ch)) {
      break;
    }
    if (CH_NBSP == ch) {
      ch = ' ';
      *aWasTransformed = PR_TRUE;
    }
    else if (IS_DISCARDED(ch)) {
      // Strip discarded characters from the transformed output
      continue;
    }
    if (ch > MAX_UNIBYTE) {
      // The text has a multibyte character so we can no longer leave the
      // text as ascii text
      mHasMultibyte = PR_TRUE;

      if (mTransformedTextIsAscii) {
        mTransformedTextIsAscii = PR_FALSE;
        *aWasTransformed = PR_TRUE;

        // Transform any existing ascii text to Unicode
        if (mBufferPos > 0) {
          ConvertTransformedTextToUnicode();
          bp2 = mTransformBuf.GetBuffer() + mBufferPos;
        }
      }
    }
    if (mBufferPos >= mTransformBuf.mBufferLen) {
      nsresult rv = mTransformBuf.GrowBy(128);
      if (NS_FAILED(rv)) {
        // If we run out of space then just truncate the text
        break;
      }
      bp2 = mTransformBuf.GetBuffer();
      if (mTransformedTextIsAscii) {
        bp1 += mBufferPos;
      } else {
        bp2 += mBufferPos;
      }
    }
    if (mTransformedTextIsAscii) {
      *bp1++ = ch;
    } else {
      *bp2++ = PRUnichar(ch);
    }
    mBufferPos++;
  }

  *aWordLen = mBufferPos - prevBufferPos;
  return offset;
}

// wordlen==*aWordLen, contentlen=newOffset-currentOffset, isWhitespace=f
PRInt32
nsTextTransformer::ScanNormalUnicodeText_F(PRBool   aForLineBreak,
                                           PRInt32* aWordLen,
                                           PRBool*  aWasTransformed)
{
  const nsTextFragment* frag = mFrag;
  const PRUnichar* cp0 = frag->Get2b();
  PRInt32 fragLen = frag->GetLength();
  PRInt32 offset = mOffset;

  PRUnichar firstChar = frag->CharAt(offset++);
  if (CH_NBSP == firstChar) {
    firstChar = ' ';
    *aWasTransformed = PR_TRUE;
  }
  mTransformBuf.mBuffer[mBufferPos++] = firstChar;
  if (firstChar > MAX_UNIBYTE) mHasMultibyte = PR_TRUE;

  // Only evaluate complex breaking logic if there are more characters
  // beyond the first to look at.
  PRInt32 numChars = 1;
  if (offset < fragLen) {
    const PRUnichar* cp = cp0 + offset;
    PRBool breakBetween = PR_FALSE;
    if (aForLineBreak) {
      mLineBreaker->BreakInBetween(&firstChar, 1, cp, (fragLen-offset), &breakBetween);
    }
    else {
      mWordBreaker->BreakInBetween(&firstChar, 1, cp, (fragLen-offset), &breakBetween);
    }

    if (!breakBetween) {
      // Find next position
      PRBool tryNextFrag;
      PRUint32 next;
      if (aForLineBreak) {
        mLineBreaker->Next(cp0, fragLen, offset, &next, &tryNextFrag);
      }
      else {
        mWordBreaker->Next(cp0, fragLen, offset, &next, &tryNextFrag);
      }
      numChars = (PRInt32) (next - (PRUint32) offset) + 1;

      // Since we know the number of characters we're adding grow the buffer
      // now before we start copying
      nsresult rv = mTransformBuf.GrowTo(mBufferPos + numChars);
      if (NS_FAILED(rv)) {
        numChars = mTransformBuf.GetBufferLength() - mBufferPos;
      }

      offset += numChars - 1;

      // 1. convert nbsp into space
      // 2. check for discarded characters
      // 3. check mHasMultibyte flag
      // 4. copy buffer
      PRUnichar* bp = &mTransformBuf.mBuffer[mBufferPos];
      const PRUnichar* end = cp + numChars - 1;
      while (cp < end) {
        PRUnichar ch = *cp++;
        if (CH_NBSP == ch) {
          ch = ' ';
        }
        else if (IS_DISCARDED(ch) || (ch == 0x0a) || (ch == 0x0d)) {
          // Strip discarded characters from the transformed output
          numChars--;
          continue;
        }
        if (ch > MAX_UNIBYTE) mHasMultibyte = PR_TRUE;
        *bp++ = ch;
        mBufferPos++;
      }
    }
  }

  *aWordLen = numChars;
  return offset;
}

// wordlen==*aWordLen, contentlen=newOffset-currentOffset, isWhitespace=t
PRInt32
nsTextTransformer::ScanPreWrapWhiteSpace_F(PRInt32* aWordLen)
{
  const nsTextFragment* frag = mFrag;
  PRInt32 fragLen = frag->GetLength();
  PRInt32 offset = mOffset;
  PRUnichar* bp = mTransformBuf.GetBuffer() + mBufferPos;
  PRUnichar* endbp = mTransformBuf.GetBufferEnd();
  PRInt32 prevBufferPos = mBufferPos;

  for (; offset < fragLen; offset++) {
    // This function is used for both Unicode and ascii strings so don't
    // make any assumptions about what kind of data it is
    PRUnichar ch = frag->CharAt(offset);
    if (!XP_IS_SPACE(ch) || (ch == '\t') || (ch == '\n')) {
      if (IS_DISCARDED(ch)) {
        // Keep looping if this is a discarded character
        continue;
      }
      break;
    }
    if (bp == endbp) {
      PRInt32 oldLength = bp - mTransformBuf.GetBuffer();
      nsresult rv = mTransformBuf.GrowBy(1000);
      if (NS_FAILED(rv)) {
        // If we run out of space (unlikely) then just chop the input
        break;
      }
      bp = mTransformBuf.GetBuffer() + oldLength;
      endbp = mTransformBuf.GetBufferEnd();
    }
    *bp++ = ' ';
    mBufferPos++;
  }

  *aWordLen = mBufferPos - prevBufferPos;
  return offset;
}

// wordlen==*aWordLen, contentlen=newOffset-currentOffset, isWhitespace=f
PRInt32
nsTextTransformer::ScanPreData_F(PRInt32* aWordLen,
                                 PRBool*  aWasTransformed)
{
  const nsTextFragment* frag = mFrag;
  PRInt32 fragLen = frag->GetLength();
  PRInt32 offset = mOffset;
  PRUnichar* bp = mTransformBuf.GetBuffer() + mBufferPos;
  PRUnichar* endbp = mTransformBuf.GetBufferEnd();
  PRInt32 prevBufferPos = mBufferPos;

  for (; offset < fragLen; offset++) {
    // This function is used for both Unicode and ascii strings so don't
    // make any assumptions about what kind of data it is
    PRUnichar ch = frag->CharAt(offset);
    if ((ch == '\t') || (ch == '\n')) {
      break;
    }
    if (CH_NBSP == ch) {
      ch = ' ';
      *aWasTransformed = PR_TRUE;
    }
    else if (IS_DISCARDED(ch)) {
      continue;
    }
    if (ch > MAX_UNIBYTE) mHasMultibyte = PR_TRUE;
    if (bp == endbp) {
      PRInt32 oldLength = bp - mTransformBuf.GetBuffer();
      nsresult rv = mTransformBuf.GrowBy(1000);
      if (NS_FAILED(rv)) {
        // If we run out of space (unlikely) then just chop the input
        break;
      }
      bp = mTransformBuf.GetBuffer() + oldLength;
      endbp = mTransformBuf.GetBufferEnd();
    }
    *bp++ = ch;
    mBufferPos++;
  }

  *aWordLen = mBufferPos - prevBufferPos;
  return offset;
}

// wordlen==*aWordLen, contentlen=newOffset-currentOffset, isWhitespace=f
PRInt32
nsTextTransformer::ScanPreAsciiData_F(PRInt32* aWordLen,
                                      PRBool*  aWasTransformed)
{
  const nsTextFragment* frag = mFrag;
  PRUnichar* bp = mTransformBuf.GetBuffer() + mBufferPos;
  PRUnichar* endbp = mTransformBuf.GetBufferEnd();
  const unsigned char* cp = (const unsigned char*) frag->Get1b();
  const unsigned char* end = cp + frag->GetLength();
  PRInt32 prevBufferPos = mBufferPos;
  cp += mOffset;

  while (cp < end) {
    PRUnichar ch = (PRUnichar) *cp++;
    if ((ch == '\t') || (ch == '\n')) {
      cp--;
      break;
    }
    if (CH_NBSP == ch) {
      ch = ' ';
      *aWasTransformed = PR_TRUE;
    }
    else if (IS_DISCARDED(ch)) {
      continue;
    }
    if (ch > MAX_UNIBYTE) mHasMultibyte = PR_TRUE;
    if (bp == endbp) {
      PRInt32 oldLength = bp - mTransformBuf.GetBuffer();
      nsresult rv = mTransformBuf.GrowBy(1000);
      if (NS_FAILED(rv)) {
        // If we run out of space (unlikely) then just chop the input
        break;
      }
      bp = mTransformBuf.GetBuffer() + oldLength;
      endbp = mTransformBuf.GetBufferEnd();
    }
    *bp++ = ch;
    mBufferPos++;
  }

  *aWordLen = mBufferPos - prevBufferPos;
  return cp - ((const unsigned char*)frag->Get1b());
}

//----------------------------------------

static void
AsciiToLowerCase(unsigned char* aText, PRInt32 aWordLen)
{
  while (aWordLen-- > 0) {
    *aText = tolower(*aText);
    aText++;
  }
}

static void
AsciiToUpperCase(unsigned char* aText, PRInt32 aWordLen)
{
  while (aWordLen-- > 0) {
    *aText = toupper(*aText);
    aText++;
  }
}

PRUnichar*
nsTextTransformer::GetNextWord(PRBool aInWord,
                               PRInt32* aWordLenResult,
                               PRInt32* aContentLenResult,
                               PRBool* aIsWhiteSpaceResult,
                               PRBool* aWasTransformed,
                               PRBool aResetTransformBuf,
                               PRBool aForLineBreak)
{
  const nsTextFragment* frag = mFrag;
  PRInt32 fragLen = frag->GetLength();
  PRInt32 offset = mOffset;
  PRInt32 wordLen = 0;
  PRBool isWhitespace = PR_FALSE;
  PRUnichar* result = nsnull;
  PRBool prevBufferPos;

  // Initialize OUT parameter
  *aWasTransformed = PR_FALSE;

  // See if we should reset the current buffer position back to the
  // beginning of the buffer
  if (aResetTransformBuf) {
    mBufferPos = 0;
    mTransformedTextIsAscii = mLeaveAsAscii;
  }
  prevBufferPos = mBufferPos;

  // Fix word breaking problem w/ PREFORMAT and PREWRAP
  // for word breaking, we should really go to the normal code
  if((! aForLineBreak) && (eNormal != mMode))
     mMode = eNormal;

  while (offset < fragLen) {
    PRUnichar firstChar = frag->CharAt(offset);

    // Eat up any discarded characters before dispatching
    if (IS_DISCARDED(firstChar)) {
      offset++;
      continue;
    }

    switch (mMode) {
      default:
      case eNormal:
        if (XP_IS_SPACE(firstChar)) {
          offset = ScanNormalWhiteSpace_F();
          if (firstChar != ' ') {
            *aWasTransformed = PR_TRUE;
          }
          wordLen = 1;
          isWhitespace = PR_TRUE;
        }
        else if (frag->Is2b()) {
          offset = ScanNormalUnicodeText_F(aForLineBreak, &wordLen, aWasTransformed);
        }
        else {
          offset = ScanNormalAsciiText_F(&wordLen, aWasTransformed);
        }
        break;

      case ePreformatted:
        if (('\n' == firstChar) || ('\t' == firstChar)) {
          mTransformBuf.mBuffer[mBufferPos++] = firstChar;
          offset++;
          wordLen = 1;
          isWhitespace = PR_TRUE;
        }
        else if (frag->Is2b()) {
          offset = ScanPreData_F(&wordLen, aWasTransformed);
        }
        else {
          offset = ScanPreAsciiData_F(&wordLen, aWasTransformed);
        }
        break;

      case ePreWrap:
        if (XP_IS_SPACE(firstChar)) {
          if (('\n' == firstChar) || ('\t' == firstChar)) {
            mTransformBuf.mBuffer[mBufferPos++] = firstChar;
            offset++;
            wordLen = 1;
          }
          else {
            offset = ScanPreWrapWhiteSpace_F(&wordLen);
          }
          isWhitespace = PR_TRUE;
        }
        else if (frag->Is2b()) {
          offset = ScanNormalUnicodeText_F(aForLineBreak, &wordLen, aWasTransformed);
        }
        else {
          offset = ScanNormalAsciiText_F(&wordLen, aWasTransformed);
        }
        break;
    }

    if (mTransformedTextIsAscii) {
      unsigned char* wordPtr = (unsigned char*)mTransformBuf.mBuffer + prevBufferPos;

      if (!isWhitespace) {
        switch (mTextTransform) {
        case NS_STYLE_TEXT_TRANSFORM_CAPITALIZE:
          *wordPtr = toupper(*wordPtr);
          break;
        case NS_STYLE_TEXT_TRANSFORM_LOWERCASE:
          AsciiToLowerCase(wordPtr, wordLen);
          break;
        case NS_STYLE_TEXT_TRANSFORM_UPPERCASE:
          AsciiToUpperCase(wordPtr, wordLen);
          break;
        }
      }
      result = (PRUnichar*)wordPtr;

    } else {
      result = &mTransformBuf.mBuffer[prevBufferPos];

      if (!isWhitespace) {
        switch (mTextTransform) {
        case NS_STYLE_TEXT_TRANSFORM_CAPITALIZE:
          gCaseConv->ToTitle(result, result, wordLen, !aInWord);
          break;
        case NS_STYLE_TEXT_TRANSFORM_LOWERCASE:
          gCaseConv->ToLower(result, result, wordLen);
          break;
        case NS_STYLE_TEXT_TRANSFORM_UPPERCASE:
          gCaseConv->ToUpper(result, result, wordLen);
          break;
        }
      }
    }

    break;
  }

  *aWordLenResult = wordLen;
  *aContentLenResult = offset - mOffset;
  *aIsWhiteSpaceResult = isWhitespace;

  // If the word length doesn't match the content length then we transformed
  // the text
  if ((mTextTransform != NS_STYLE_TEXT_TRANSFORM_NONE) ||
      (*aWordLenResult != *aContentLenResult)) {
    *aWasTransformed = PR_TRUE;
  }

  mOffset = offset;
  return result;
}

//----------------------------------------------------------------------

// wordlen==1, contentlen=newOffset-currentOffset, isWhitespace=t
PRInt32
nsTextTransformer::ScanNormalWhiteSpace_B()
{
  const nsTextFragment* frag = mFrag;
  PRInt32 offset = mOffset;

  while (--offset >= 0) {
    PRUnichar ch = frag->CharAt(offset);
    if (!XP_IS_SPACE(ch)) {
      // If character is not discardable then stop looping, otherwise
      // let the discarded character collapse with the other spaces.
      if (!IS_DISCARDED(ch)) {
        break;
      }
    }
  }

  mTransformBuf.mBuffer[mTransformBuf.mBufferLen - 1] = ' ';
  return offset;
}

// wordlen==*aWordLen, contentlen=newOffset-currentOffset, isWhitespace=f
PRInt32
nsTextTransformer::ScanNormalAsciiText_B(PRInt32* aWordLen)
{
  const nsTextFragment* frag = mFrag;
  PRInt32 offset = mOffset;
  PRUnichar* bp = mTransformBuf.GetBufferEnd();
  PRUnichar* startbp = mTransformBuf.GetBuffer();

  while (--offset >= 0) {
    PRUnichar ch = frag->CharAt(offset);
    if (XP_IS_SPACE(ch)) {
      break;
    }
    if (CH_NBSP == ch) {
      ch = ' ';
    }
    else if (IS_DISCARDED(ch)) {
      continue;
    }
    if (ch > MAX_UNIBYTE) mHasMultibyte = PR_TRUE;
    if (bp == startbp) {
      PRInt32 oldLength = mTransformBuf.mBufferLen;
      nsresult rv = mTransformBuf.GrowBy(1000);
      if (NS_FAILED(rv)) {
        // If we run out of space (unlikely) then just chop the input
        break;
      }
      bp = mTransformBuf.GetBufferEnd() - oldLength;
      startbp = mTransformBuf.GetBuffer();
    }
    *--bp = ch;
  }

  *aWordLen = mTransformBuf.GetBufferEnd() - bp;
  return offset;
}

// wordlen==*aWordLen, contentlen=newOffset-currentOffset, isWhitespace=f
PRInt32
nsTextTransformer::ScanNormalUnicodeText_B(PRBool aForLineBreak,
                                           PRInt32* aWordLen)
{
  const nsTextFragment* frag = mFrag;
  const PRUnichar* cp0 = frag->Get2b();
  PRInt32 offset = mOffset - 1;

  PRUnichar firstChar = frag->CharAt(offset);
  if (CH_NBSP == firstChar) {
    firstChar = ' ';
  }
  mTransformBuf.mBuffer[mTransformBuf.mBufferLen - 1] = firstChar;
  if (firstChar > MAX_UNIBYTE) mHasMultibyte = PR_TRUE;

  PRInt32 numChars = 1;
  if (offset > 0) {
    const PRUnichar* cp = cp0 + offset;
    PRBool breakBetween = PR_FALSE;
    if (aForLineBreak) {
      mLineBreaker->BreakInBetween(cp0, offset + 1,
                                   mTransformBuf.GetBufferEnd()-1, 1,
                                   &breakBetween);
    }
    else {
      mWordBreaker->BreakInBetween(cp0, offset + 1,
                                   mTransformBuf.GetBufferEnd()-1, 1,
                                   &breakBetween);
    }

    if (!breakBetween) {
      // Find next position
      PRBool tryPrevFrag;
      PRUint32 prev;
      if (aForLineBreak) {
        mLineBreaker->Prev(cp0, offset, offset, &prev, &tryPrevFrag);
      }
      else {
        mWordBreaker->Prev(cp0, offset, offset, &prev, &tryPrevFrag);
      }
      numChars = (PRInt32) ((PRUint32) offset - prev) + 1;

      // Grow buffer before copying
      nsresult rv = mTransformBuf.GrowTo(numChars);
      if (NS_FAILED(rv)) {
        numChars = mTransformBuf.GetBufferLength();
      }

      // 1. convert nbsp into space
      // 2. check mHasMultibyte flag
      // 3. copy buffer
      PRUnichar* bp = mTransformBuf.GetBufferEnd() - 1;
      const PRUnichar* end = cp - numChars + 1;
      while (cp > end) {
        PRUnichar ch = *--cp;
        if (CH_NBSP == ch) {
          ch = ' ';
        }
        else if (IS_DISCARDED(ch)) {
          continue;
        }
        if (ch > MAX_UNIBYTE) mHasMultibyte = PR_TRUE;
        *--bp = ch;
      }

      // Recompute offset and numChars in case we stripped something
      offset = offset - numChars;
      numChars =  mTransformBuf.GetBufferEnd() - bp;
    }
  }

  *aWordLen = numChars;
  return offset;
}

// wordlen==*aWordLen, contentlen=newOffset-currentOffset, isWhitespace=t
PRInt32
nsTextTransformer::ScanPreWrapWhiteSpace_B(PRInt32* aWordLen)
{
  const nsTextFragment* frag = mFrag;
  PRInt32 offset = mOffset;
  PRUnichar* bp = mTransformBuf.GetBufferEnd();
  PRUnichar* startbp = mTransformBuf.GetBuffer();

  while (--offset >= 0) {
    PRUnichar ch = frag->CharAt(offset);
    if (!XP_IS_SPACE(ch) || (ch == '\t') || (ch == '\n')) {
      // Keep looping if this is a discarded character
      if (IS_DISCARDED(ch)) {
        continue;
      }
      break;
    }
    if (bp == startbp) {
      PRInt32 oldLength = mTransformBuf.mBufferLen;
      nsresult rv = mTransformBuf.GrowBy(1000);
      if (NS_FAILED(rv)) {
        // If we run out of space (unlikely) then just chop the input
        break;
      }
      bp = mTransformBuf.GetBufferEnd() - oldLength;
      startbp = mTransformBuf.GetBuffer();
    }
    *--bp = ' ';
  }

  *aWordLen = mTransformBuf.GetBufferEnd() - bp;
  return offset;
}

// wordlen==*aWordLen, contentlen=newOffset-currentOffset, isWhitespace=f
PRInt32
nsTextTransformer::ScanPreData_B(PRInt32* aWordLen)
{
  const nsTextFragment* frag = mFrag;
  PRInt32 offset = mOffset;
  PRUnichar* bp = mTransformBuf.GetBufferEnd();
  PRUnichar* startbp = mTransformBuf.GetBuffer();

  while (--offset >= 0) {
    PRUnichar ch = frag->CharAt(offset);
    if ((ch == '\t') || (ch == '\n')) {
      break;
    }
    if (CH_NBSP == ch) {
      ch = ' ';
    }
    else if (IS_DISCARDED(ch)) {
      continue;
    }
    if (ch > MAX_UNIBYTE) mHasMultibyte = PR_TRUE;
    if (bp == startbp) {
      PRInt32 oldLength = mTransformBuf.mBufferLen;
      nsresult rv = mTransformBuf.GrowBy(1000);
      if (NS_FAILED(rv)) {
        // If we run out of space (unlikely) then just chop the input
        offset++;
        break;
      }
      bp = mTransformBuf.GetBufferEnd() - oldLength;
      startbp = mTransformBuf.GetBuffer();
    }
    *--bp = ch;
  }

  *aWordLen = mTransformBuf.GetBufferEnd() - bp;
  return offset;
}

//----------------------------------------

PRUnichar*
nsTextTransformer::GetPrevWord(PRBool aInWord,
                               PRInt32* aWordLenResult,
                               PRInt32* aContentLenResult,
                               PRBool* aIsWhiteSpaceResult,
                               PRBool aForLineBreak)
{
  const nsTextFragment* frag = mFrag;
  PRInt32 offset = mOffset;
  PRInt32 wordLen = 0;
  PRBool isWhitespace = PR_FALSE;
  PRUnichar* result = nsnull;

  // Fix word breaking problem w/ PREFORMAT and PREWRAP
  // for word breaking, we should really go to the normal code
  if((! aForLineBreak) && (eNormal != mMode))
     mMode = eNormal;

  while (--offset >= 0) {
    PRUnichar firstChar = frag->CharAt(offset);

    // Eat up any discarded characters before dispatching
    if (IS_DISCARDED(firstChar)) {
      continue;
    }

    switch (mMode) {
      default:
      case eNormal:
        if (XP_IS_SPACE(firstChar)) {
          offset = ScanNormalWhiteSpace_B();
          wordLen = 1;
          isWhitespace = PR_TRUE;
        }
        else if (frag->Is2b()) {
          offset = ScanNormalUnicodeText_B(aForLineBreak, &wordLen);
        }
        else {
          offset = ScanNormalAsciiText_B(&wordLen);
        }
        break;

      case ePreformatted:
        if (('\n' == firstChar) || ('\t' == firstChar)) {
          mTransformBuf.mBuffer[mTransformBuf.mBufferLen-1] = firstChar;
          offset--;  // make sure we overshoot
          wordLen = 1;
          isWhitespace = PR_TRUE;
        }
        else {
          offset = ScanPreData_B(&wordLen);
        }
        break;

      case ePreWrap:
        if (XP_IS_SPACE(firstChar)) {
          if (('\n' == firstChar) || ('\t' == firstChar)) {
            mTransformBuf.mBuffer[mTransformBuf.mBufferLen-1] = firstChar;
            offset--;  // make sure we overshoot
            wordLen = 1;
          }
          else {
            offset = ScanPreWrapWhiteSpace_B(&wordLen);
          }
          isWhitespace = PR_TRUE;
        }
        else if (frag->Is2b()) {
          offset = ScanNormalUnicodeText_B(aForLineBreak, &wordLen);
        }
        else {
          offset = ScanNormalAsciiText_B(&wordLen);
        }
        break;
    }

    // Backwards scanning routines *always* overshoot by one for the
    // returned offset value.
    offset = offset + 1;

    result = mTransformBuf.GetBufferEnd() - wordLen;

    if (!isWhitespace) {
      switch (mTextTransform) {
        case NS_STYLE_TEXT_TRANSFORM_CAPITALIZE:
          gCaseConv->ToTitle(result, result, wordLen, !aInWord);
          break;
        case NS_STYLE_TEXT_TRANSFORM_LOWERCASE:
          gCaseConv->ToLower(result, result, wordLen);
          break;
        case NS_STYLE_TEXT_TRANSFORM_UPPERCASE:
          gCaseConv->ToUpper(result, result, wordLen);
          break;
      }
    }
    break;
  }

  *aWordLenResult = wordLen;
  *aContentLenResult = mOffset - offset;
  *aIsWhiteSpaceResult = isWhitespace;

  mOffset = offset;
  return result;
}

//----------------------------------------------------------------------

// Self test logic for this class. This will (hopefully) make sure
// that the forward and backward word iterator methods continue to
// function as people change things...

#ifdef DEBUG
struct SelfTestSection {
  int length;
  int* data;
};

#define NUM_MODES 3

struct SelfTestData {
  const PRUnichar* text;
  SelfTestSection modes[NUM_MODES];
};

static PRUint8 preModeValue[NUM_MODES] = {
  NS_STYLE_WHITESPACE_NORMAL,
  NS_STYLE_WHITESPACE_PRE,
  NS_STYLE_WHITESPACE_MOZ_PRE_WRAP
};

static PRUnichar test1text[] = {
  'o', 'n', 'c', 'e', ' ', 'u', 'p', 'o', 'n', '\t',
  'a', ' ', 's', 'h', 'o', 'r', 't', ' ', 't', 'i', 'm', 'e', 0
};
static int test1Results[] = { 4, 1, 4, 1, 1, 1, 5, 1, 4 };
static int test1PreResults[] = { 9, 1, 12 };
static int test1PreWrapResults[] = { 4, 1, 4, 1, 1, 1, 5, 1, 4 };

static PRUnichar test2text[] = {
  0xF6, 'n', 'c', 'e', ' ', 0xFB, 'p', 'o', 'n', '\t',
  0xE3, ' ', 's', 'h', 0xF3, 'r', 't', ' ', 't', 0xEE, 'm', 'e', ' ', 0
};
static int test2Results[] = { 4, 1, 4, 1, 1, 1, 5, 1, 4, 1 };
static int test2PreResults[] = { 9, 1, 13 };
static int test2PreWrapResults[] = { 4, 1, 4, 1, 1, 1, 5, 1, 4, 1 };

static PRUnichar test3text[] = {
  0x0152, 'n', 'c', 'e', ' ', 'x', 'y', '\t', 'z', 'y', ' ', 0
};
static int test3Results[] = { 4, 1, 2, 1, 2, 1, };
static int test3PreResults[] = { 7, 1, 3, };
static int test3PreWrapResults[] = { 4, 1, 2, 1, 2, 1, };

static PRUnichar test4text[] = {
  'o', 'n', CH_SHY, 'c', 'e', ' ', CH_SHY, ' ', 'u', 'p', 'o', 'n', '\t',
  'a', ' ', 's', 'h', 'o', 'r', 't', ' ', 't', 'i', 'm', 'e', 0
};
static int test4Results[] = { 4, 1, 4, 1, 1, 1, 5, 1, 4 };
static int test4PreResults[] = { 10, 1, 12 };
static int test4PreWrapResults[] = { 4, 2, 4, 1, 1, 1, 5, 1, 4 };

static PRUnichar test5text[] = {
  CH_SHY, 0
};
static int test5Results[] = { 0 };
static int test5PreResults[] = { 0 };
static int test5PreWrapResults[] = { 0 };

#if 0
static PRUnichar test6text[] = {
  0x30d5, 0x30b8, 0x30c6, 0x30ec, 0x30d3, 0x306e, 0x97f3, 0x697d,
  0x756a, 0x7d44, 0x300c, 'H', 'E', 'Y', '!', ' ', 'H', 'E', 'Y', '!',
  '\t', 'H', 'E', 'Y', '!', 0x300d, 0x306e, 0x30db, 0x30fc, 0x30e0,
  0x30da, 0x30fc, 0x30b8, 0x3002, 0
};
static int test6Results[] = { 1, 1, 1, 1, 1,
                              1, 1, 1, 1, 1,
                              5, 1, 4, 1, 5,
                              1, 2, 1, 2, 2 };
static int test6PreResults[] = { 20, 1, 13 };
static int test6PreWrapResults[] = { 1, 1, 1, 1, 1,
                                     1, 1, 1, 1, 1,
                                     5, 1, 4, 1, 5,
                                     1, 2, 1, 2, 2 };
#endif

static SelfTestData tests[] = {
  { test1text,
    { { sizeof(test1Results)/sizeof(int), test1Results, },
      { sizeof(test1PreResults)/sizeof(int), test1PreResults, },
      { sizeof(test1PreWrapResults)/sizeof(int), test1PreWrapResults, } }
  },
  { test2text,
    { { sizeof(test2Results)/sizeof(int), test2Results, },
      { sizeof(test2PreResults)/sizeof(int), test2PreResults, },
      { sizeof(test2PreWrapResults)/sizeof(int), test2PreWrapResults, } }
  },
  { test3text,
    { { sizeof(test3Results)/sizeof(int), test3Results, },
      { sizeof(test3PreResults)/sizeof(int), test3PreResults, },
      { sizeof(test3PreWrapResults)/sizeof(int), test3PreWrapResults, } }
  },
  { test4text,
    { { sizeof(test4Results)/sizeof(int), test4Results, },
      { sizeof(test4PreResults)/sizeof(int), test4PreResults, },
      { sizeof(test4PreWrapResults)/sizeof(int), test4PreWrapResults, } }
  },
  { test5text,
    { { sizeof(test5Results)/sizeof(int), test5Results, },
      { sizeof(test5PreResults)/sizeof(int), test5PreResults, },
      { sizeof(test5PreWrapResults)/sizeof(int), test5PreWrapResults, } }
  },
#if 0
  { test6text,
    { { sizeof(test6Results)/sizeof(int), test6Results, },
      { sizeof(test6PreResults)/sizeof(int), test6PreResults, },
      { sizeof(test6PreWrapResults)/sizeof(int), test6PreWrapResults, } }
  },
#endif
};

#define NUM_TESTS (sizeof(tests) / sizeof(tests[0]))

void
nsTextTransformer::SelfTest(nsILineBreaker* aLineBreaker,
                            nsIWordBreaker* aWordBreaker)
{
  PRBool gNoisy = PR_FALSE;
  if (PR_GetEnv("GECKO_TEXT_TRANSFORMER_NOISY_SELF_TEST")) {
    gNoisy = PR_TRUE;
  }

  PRBool error = PR_FALSE;
  PRInt32 testNum = 0;
  SelfTestData* st = tests;
  SelfTestData* last = st + NUM_TESTS;
  for (; st < last; st++) {
    PRUnichar* bp;
    PRInt32 wordLen, contentLen;
    PRBool ws, transformed;

    PRBool isAsciiTest = PR_TRUE;
    const PRUnichar* cp = st->text;
    while (*cp) {
      if (*cp > 255) {
        isAsciiTest = PR_FALSE;
        break;
      }
      cp++;
    }

    nsTextFragment frag(st->text);
    nsTextTransformer tx(aLineBreaker, aWordBreaker);

    for (PRInt32 preMode = 0; preMode < NUM_MODES; preMode++) {
      // Do forwards test
      if (gNoisy) {
        nsAutoString uc2(st->text);
        printf("%s forwards test: '", isAsciiTest ? "ascii" : "unicode");
        fputs(uc2, stdout);
        printf("'\n");
      }
      tx.Init2(&frag, 0, preModeValue[preMode], NS_STYLE_TEXT_TRANSFORM_NONE);

      int* expectedResults = st->modes[preMode].data;
      int resultsLen = st->modes[preMode].length;

      while ((bp = tx.GetNextWord(PR_FALSE, &wordLen, &contentLen, &ws, &transformed))) {
        if (gNoisy) {
          nsAutoString tmp(bp, wordLen);
          printf("  '");
          fputs(tmp, stdout);
          printf("': ws=%s wordLen=%d (%d) contentLen=%d (offset=%d)\n",
                 ws ? "yes" : "no",
                 wordLen, *expectedResults, contentLen, tx.mOffset);
        }
        if (*expectedResults != wordLen) {
          error = PR_TRUE;
          break;
        }
        expectedResults++;
      }
      if (expectedResults != st->modes[preMode].data + resultsLen) {
        if (st->modes[preMode].data[0] != 0) {
          error = PR_TRUE;
        }
      }

      // Do backwards test
      if (gNoisy) {
        nsAutoString uc2(st->text);
        printf("%s backwards test: '", isAsciiTest ? "ascii" : "unicode");
        fputs(uc2, stdout);
        printf("'\n");
      }
      tx.Init2(&frag, frag.GetLength(), NS_STYLE_WHITESPACE_NORMAL,
               NS_STYLE_TEXT_TRANSFORM_NONE);
      expectedResults = st->modes[preMode].data + resultsLen;
      while ((bp = tx.GetPrevWord(PR_FALSE, &wordLen, &contentLen, &ws))) {
        --expectedResults;
        if (gNoisy) {
          nsAutoString tmp(bp, wordLen);
          printf("  '");
          fputs(tmp, stdout);
          printf("': ws=%s wordLen=%d contentLen=%d (offset=%d)\n",
                 ws ? "yes" : "no",
                 wordLen, contentLen, tx.mOffset);
        }
        if (*expectedResults != wordLen) {
          error = PR_TRUE;
          break;
        }
      }
      if (expectedResults != st->modes[preMode].data) {
        if (st->modes[preMode].data[0] != 0) {
          error = PR_TRUE;
        }
      }

      if (error) {
        fprintf(stderr, "nsTextTransformer: self test %d failed\n", testNum);
      }
      testNum++;
    }
  }
  if (error) {
    NS_ABORT();
  }
}

nsresult
nsTextTransformer::Init2(const nsTextFragment* aFrag,
                         PRInt32 aStartingOffset,
                         PRUint8 aWhiteSpace,
                         PRUint8 aTextTransform)
{
  mFrag = aFrag;

  // Sanitize aStartingOffset
  if (NS_WARN_IF_FALSE(aStartingOffset >= 0, "bad starting offset")) {
    aStartingOffset = 0;
  }
  else if (NS_WARN_IF_FALSE(aStartingOffset <= mFrag->GetLength(),
                            "bad starting offset")) {
    aStartingOffset = mFrag->GetLength();
  }
  mOffset = aStartingOffset;

  // Get the frames text style information
  if (NS_STYLE_WHITESPACE_PRE == aWhiteSpace) {
    mMode = ePreformatted;
  }
  else if (NS_STYLE_WHITESPACE_MOZ_PRE_WRAP == aWhiteSpace) {
    mMode = ePreWrap;
  }
  mTextTransform = aTextTransform;

  return NS_OK;
}
#endif /* DEBUG */