mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-12-05 12:05:22 +00:00
393 lines
14 KiB
C++
393 lines
14 KiB
C++
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
|
/* ***** BEGIN LICENSE BLOCK *****
|
|
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
|
*
|
|
* The contents of this file are subject to the Mozilla Public License Version
|
|
* 1.1 (the "License"); you may not use this file except in compliance with
|
|
* the License. You may obtain a copy of the License at
|
|
* http://www.mozilla.org/MPL/
|
|
*
|
|
* Software distributed under the License is distributed on an "AS IS" basis,
|
|
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
|
* for the specific language governing rights and limitations under the
|
|
* License.
|
|
*
|
|
* The Original Code is Mozilla Communicator client code.
|
|
*
|
|
* The Initial Developer of the Original Code is
|
|
* Netscape Communications Corporation.
|
|
* Portions created by the Initial Developer are Copyright (C) 1998
|
|
* the Initial Developer. All Rights Reserved.
|
|
*
|
|
* Contributor(s):
|
|
*
|
|
* Alternatively, the contents of this file may be used under the terms of
|
|
* either of the GNU General Public License Version 2 or later (the "GPL"),
|
|
* or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
|
* in which case the provisions of the GPL or the LGPL are applicable instead
|
|
* of those above. If you wish to allow use of your version of this file only
|
|
* under the terms of either the GPL or the LGPL, and not to allow others to
|
|
* use your version of this file under the terms of the MPL, indicate your
|
|
* decision by deleting the provisions above and replace them with the notice
|
|
* and other provisions required by the GPL or the LGPL. If you do not delete
|
|
* the provisions above, a recipient may use your version of this file under
|
|
* the terms of any one of the MPL, the GPL or the LGPL.
|
|
*
|
|
* ***** END LICENSE BLOCK ***** */
|
|
#ifndef nsTextTransformer_h___
|
|
#define nsTextTransformer_h___
|
|
|
|
#include "nsTextFragment.h"
|
|
#include "nsISupports.h"
|
|
#include "nsPresContext.h"
|
|
#include "nsIObserver.h"
|
|
#ifdef IBMBIDI
|
|
#include "nsBidi.h"
|
|
#include "nsBidiUtils.h"
|
|
#endif
|
|
|
|
class nsIContent;
|
|
class nsIFrame;
|
|
class nsILineBreaker;
|
|
class nsIWordBreaker;
|
|
|
|
// XXX I'm sure there are other special characters
// Code points (decimal) of characters the text transformer treats specially.
#define CH_NBSP   160   // no-break space, U+00A0
#define CH_ENSP   8194  //<!ENTITY ensp    CDATA "&#8194;" -- en space, U+2002 ISOpub -->
#define CH_EMSP   8195  //<!ENTITY emsp    CDATA "&#8195;" -- em space, U+2003 ISOpub -->
// U+2009 is 8201 decimal. The value used to be 8291, which is U+2063
// (invisible separator) and never matched a real thin space.
#define CH_THINSP 8201  //<!ENTITY thinsp  CDATA "&#8201;" -- thin space, U+2009 ISOpub -->
#define CH_ZWNJ   8204  //<!ENTITY zwnj    CDATA "&#8204;" -- zero width non-joiner, U+200C NEW RFC 2070 -->
#define CH_SHY    173   // soft hyphen (discretionary hyphen), U+00AD
|
|
|
|
#ifdef IBMBIDI
|
|
// Code points (decimal) of the joiner and bidi formatting characters.
#define CH_ZWJ  8205  //<!ENTITY zwj  CDATA "&#8205;" -- zero width joiner, U+200D NEW RFC 2070 -->
#define CH_LRM  8206  //<!ENTITY lrm  CDATA "&#8206;" -- left-to-right mark, U+200E NEW RFC 2070 -->
#define CH_RLM  8207  //<!ENTITY rlm  CDATA "&#8207;" -- right-to-left mark, U+200F NEW RFC 2070 -->
#define CH_LRE  8234  //<!CDATA "&#8234;" -- left-to-right embedding, U+202A -->
#define CH_RLE  8235  //<!CDATA "&#8235;" -- right-to-left embedding, U+202B -->
#define CH_PDF  8236  //<!CDATA "&#8236;" -- pop directional format, U+202C -->
#define CH_LRO  8237  //<!CDATA "&#8237;" -- left-to-right override, U+202D -->
#define CH_RLO  8238  //<!CDATA "&#8238;" -- right-to-left override, U+202E -->

// True when _ch falls in one of the two contiguous ranges of bidi controls
// defined above: [CH_LRM, CH_RLM] or [CH_LRE, CH_RLO]. CH_ZWJ is outside both
// ranges and is therefore not matched.
// Note: _ch is evaluated more than once; do not pass an expression with side
// effects.
#define IS_BIDI_CONTROL(_ch) \
  (((_ch) >= CH_LRM && (_ch) <= CH_RLM) \
  || ((_ch) >= CH_LRE && (_ch) <= CH_RLO))
|
|
#endif // IBMBIDI
|
|
|
|
// For now, we have only a couple of characters to strip out. If we get
// any more, change this to use a bitset to lookup into.
//   CH_SHY - soft hyphen (discretionary hyphen)
//   '\r'   - carriage return
// Note: _ch is evaluated more than once; do not pass an expression with side
// effects.
#ifdef IBMBIDI
// added BIDI formatting codes
#define IS_DISCARDED(_ch) \
  (((_ch) == CH_SHY) || ((_ch) == '\r') || IS_BIDI_CONTROL(_ch))
#else
#define IS_DISCARDED(_ch) \
  (((_ch) == CH_SHY) || ((_ch) == '\r'))
#endif
|
|
|
|
// True when ch has no bits set in 0xff80, i.e. a 16-bit value below 128.
// The argument is parenthesized so that compound expressions such as
// IS_ASCII_CHAR(a | b) are masked as a whole.
#define IS_ASCII_CHAR(ch) ((((ch)) & 0xff80) == 0)
|
|
|
|
// Number of PRUnichar slots in nsAutoTextBuffer's built-in buffer.
#define NS_TEXT_TRANSFORMER_AUTO_WORD_BUF_SIZE 128 // used to be 256

// Bit values stored in nsTextTransformer::mFlags. Each one is a distinct
// single bit so that they can be combined with | and tested with &.

// Indicates whether the transformed text should be left as ascii
#define NS_TEXT_TRANSFORMER_LEAVE_AS_ASCII 1

// Set if at any point during GetNextWord or GetPrevWord we
// run across a multibyte (> 127) unicode character.
#define NS_TEXT_TRANSFORMER_HAS_MULTIBYTE 2

// The text in the transform buffer is ascii
#define NS_TEXT_TRANSFORMER_TRANSFORMED_TEXT_IS_ASCII 4

#ifdef IBMBIDI
// The text in the transform buffer needs Arabic shaping
#define NS_TEXT_TRANSFORMER_DO_ARABIC_SHAPING 8

// The text in the transform buffer needs numeric shaping
#define NS_TEXT_TRANSFORMER_DO_NUMERIC_SHAPING 16
#endif
|
|
|
|
// A growable text buffer that tries to avoid using malloc by having a
|
|
// builtin buffer. Ideally used as an automatic variable.
|
|
class nsAutoTextBuffer {
|
|
public:
|
|
nsAutoTextBuffer();
|
|
~nsAutoTextBuffer();
|
|
|
|
nsresult GrowBy(PRInt32 aAtLeast, PRBool aCopyToHead = PR_TRUE);
|
|
|
|
nsresult GrowTo(PRInt32 aNewSize, PRBool aCopyToHead = PR_TRUE);
|
|
|
|
PRUnichar* GetBuffer() { return mBuffer; }
|
|
PRUnichar* GetBufferEnd() { return mBuffer + mBufferLen; }
|
|
PRInt32 GetBufferLength() const { return mBufferLen; }
|
|
|
|
PRUnichar* mBuffer;
|
|
PRInt32 mBufferLen;
|
|
PRUnichar mAutoBuffer[NS_TEXT_TRANSFORMER_AUTO_WORD_BUF_SIZE];
|
|
};
|
|
|
|
//----------------------------------------
|
|
|
|
/**
|
|
* This object manages the transformation of text:
|
|
*
|
|
* <UL>
|
|
* <LI>whitespace compression
|
|
* <LI>capitalization
|
|
* <LI>lowercasing
|
|
* <LI>uppercasing
|
|
* <LI>ascii to Unicode (if requested)
|
|
* <LI>discarded characters
|
|
* <LI>conversion of   that is not part of whitespace into a space
|
|
* <LI>tab and newline characters to space (normal text only)
|
|
* </UL>
|
|
*
|
|
* Note that no transformations are applied that would impact word
|
|
* breaking (like mapping into space, for example). In
|
|
* addition, this logic will not strip leading or trailing whitespace
|
|
* (across the entire run of text; leading whitespace can be skipped
|
|
* for a frames text because of whitespace compression).
|
|
*/
|
|
class nsTextTransformer {
|
|
public:
|
|
// Note: The text transformer does not hold a reference to the line
|
|
// breaker and work breaker objects
|
|
nsTextTransformer(nsILineBreaker* aLineBreaker,
|
|
nsIWordBreaker* aWordBreaker,
|
|
nsPresContext* aPresContext);
|
|
|
|
~nsTextTransformer();
|
|
|
|
/**
|
|
* Initialize the text transform. Use GetNextWord() and GetPrevWord()
|
|
* to iterate the text
|
|
*
|
|
* The default is to transform all text to Unicode; however, you can
|
|
* specify that the text should be left as ascii if possible. Note that
|
|
* we don't step the text down from Unicode to ascii (even if it doesn't
|
|
* contain multibyte characters) so this only happens for text fragments
|
|
* that contain 1-byte text.
|
|
* XXX This is currently not implemented for GetPreviousWord()
|
|
* @see TransformedTextIsAscii()
|
|
*/
|
|
nsresult Init(nsIFrame* aFrame,
|
|
nsIContent* aContent,
|
|
PRInt32 aStartingOffset,
|
|
PRBool aForceArabicShaping = PR_FALSE,
|
|
PRBool aLeaveAsAscii = PR_FALSE);
|
|
|
|
PRInt32 GetContentLength() const {
|
|
return mFrag ? mFrag->GetLength() : 0;
|
|
}
|
|
|
|
PRUnichar GetContentCharAt(PRInt32 aIndex) {
|
|
return (mFrag && aIndex < mFrag->GetLength()) ? mFrag->CharAt(aIndex) : 0;
|
|
}
|
|
|
|
/**
|
|
* Iterates the next word in the text fragment.
|
|
*
|
|
* Returns a pointer to the word, the number of characters in the word, the
|
|
* content length of the word, whether it is whitespace, and whether the
|
|
* text was transformed (any of the transformations listed above). The content
|
|
* length can be greater than the word length if whitespace compression occured
|
|
* or if characters were discarded
|
|
*
|
|
* The default behavior is to reset the transform buffer to the beginning,
|
|
* but you can choose to not reste it and buffer across multiple words
|
|
*/
|
|
PRUnichar* GetNextWord(PRBool aInWord,
|
|
PRInt32* aWordLenResult,
|
|
PRInt32* aContentLenResult,
|
|
PRBool* aIsWhitespaceResult,
|
|
PRBool* aWasTransformed,
|
|
PRBool aResetTransformBuf = PR_TRUE,
|
|
PRBool aForLineBreak = PR_TRUE,
|
|
PRBool aIsKeyboardSelect = PR_FALSE);
|
|
|
|
PRUnichar* GetPrevWord(PRBool aInWord,
|
|
PRInt32* aWordLenResult,
|
|
PRInt32* aContentLenResult,
|
|
PRBool* aIsWhitespaceResult,
|
|
PRBool aForLineBreak = PR_TRUE,
|
|
PRBool aIsKeyboardSelect = PR_FALSE);
|
|
|
|
|
|
// Returns PR_TRUE if the LEAVE_AS_ASCII flag is set
|
|
PRBool LeaveAsAscii() const {
|
|
return (mFlags & NS_TEXT_TRANSFORMER_LEAVE_AS_ASCII) != 0;
|
|
}
|
|
|
|
// Returns PR_TRUE if any of the characters are multibyte (greater than 127)
|
|
PRBool HasMultibyte() const {
|
|
return (mFlags & NS_TEXT_TRANSFORMER_HAS_MULTIBYTE) != 0;
|
|
}
|
|
|
|
// Returns PR_TRUE if the text in the transform bufer is ascii (i.e., it
|
|
// doesn't contain any multibyte characters)
|
|
PRBool TransformedTextIsAscii() const {
|
|
return (mFlags & NS_TEXT_TRANSFORMER_TRANSFORMED_TEXT_IS_ASCII) != 0;
|
|
}
|
|
|
|
#ifdef IBMBIDI
|
|
// Returns PR_TRUE if the text in the transform bufer needs Arabic
|
|
// shaping
|
|
PRBool NeedsArabicShaping() const {
|
|
return (mFlags & NS_TEXT_TRANSFORMER_DO_ARABIC_SHAPING) != 0;
|
|
}
|
|
|
|
// Returns PR_TRUE if the text in the transform bufer needs numeric
|
|
// shaping
|
|
PRBool NeedsNumericShaping() const {
|
|
return (mFlags & NS_TEXT_TRANSFORMER_DO_NUMERIC_SHAPING) != 0;
|
|
}
|
|
#endif
|
|
|
|
// Set or clears the LEAVE_AS_ASCII bit
|
|
void SetLeaveAsAscii(PRBool aValue) {
|
|
aValue ? mFlags |= NS_TEXT_TRANSFORMER_LEAVE_AS_ASCII :
|
|
mFlags &= (~NS_TEXT_TRANSFORMER_LEAVE_AS_ASCII);
|
|
}
|
|
|
|
// Set or clears the NS_TEXT_TRANSFORMER_HAS_MULTIBYTE bit
|
|
void SetHasMultibyte(PRBool aValue) {
|
|
aValue ? mFlags |= NS_TEXT_TRANSFORMER_HAS_MULTIBYTE :
|
|
mFlags &= (~NS_TEXT_TRANSFORMER_HAS_MULTIBYTE);
|
|
}
|
|
|
|
// Set or clears the NS_TEXT_TRANSFORMER_TRANSFORMED_TEXT_IS_ASCII bit
|
|
void SetTransformedTextIsAscii(PRBool aValue) {
|
|
aValue ? mFlags |= NS_TEXT_TRANSFORMER_TRANSFORMED_TEXT_IS_ASCII :
|
|
mFlags &= (~NS_TEXT_TRANSFORMER_TRANSFORMED_TEXT_IS_ASCII);
|
|
}
|
|
|
|
#ifdef IBMBIDI
|
|
// Set or clears the NS_TEXT_TRANSFORMER_TRANSFORMED_DO_ARABIC_SHAPING bit
|
|
void SetNeedsArabicShaping(PRBool aValue) {
|
|
aValue ? mFlags |= NS_TEXT_TRANSFORMER_DO_ARABIC_SHAPING :
|
|
mFlags &= (~NS_TEXT_TRANSFORMER_DO_ARABIC_SHAPING);
|
|
}
|
|
|
|
// Set or clears the NS_TEXT_TRANSFORMER_TRANSFORMED_DO_NUMERIC_SHAPING bit
|
|
void SetNeedsNumericShaping(PRBool aValue) {
|
|
aValue ? mFlags |= NS_TEXT_TRANSFORMER_DO_NUMERIC_SHAPING :
|
|
mFlags &= (~NS_TEXT_TRANSFORMER_DO_NUMERIC_SHAPING);
|
|
}
|
|
#endif
|
|
|
|
PRUnichar* GetWordBuffer() {
|
|
return mTransformBuf.GetBuffer();
|
|
}
|
|
|
|
PRInt32 GetWordBufferLength() const {
|
|
return mTransformBuf.GetBufferLength();
|
|
}
|
|
|
|
PRBool GetWordSelectEatSpaceAfter() {
|
|
return sWordSelectEatSpaceAfter;
|
|
}
|
|
|
|
PRBool GetWordSelectStopAtPunctuation() {
|
|
return sWordSelectStopAtPunctuation;
|
|
}
|
|
|
|
static nsresult Initialize();
|
|
static void Shutdown();
|
|
|
|
protected:
|
|
// Helper methods for GetNextWord (F == forwards)
|
|
PRInt32 ScanNormalWhiteSpace_F();
|
|
PRInt32 ScanNormalAsciiText_F(PRInt32* aWordLen,
|
|
PRBool* aWasTransformed);
|
|
PRInt32 ScanNormalAsciiText_F_ForWordBreak(PRInt32* aWordLen,
|
|
PRBool* aWasTransformed,
|
|
PRBool aIsKeyboardSelect);
|
|
PRInt32 ScanNormalUnicodeText_F(PRBool aForLineBreak,
|
|
PRInt32* aWordLen,
|
|
PRBool* aWasTransformed);
|
|
PRInt32 ScanPreWrapWhiteSpace_F(PRInt32* aWordLen);
|
|
PRInt32 ScanPreAsciiData_F(PRInt32* aWordLen,
|
|
PRBool* aWasTransformed);
|
|
PRInt32 ScanPreData_F(PRInt32* aWordLen,
|
|
PRBool* aWasTransformed);
|
|
|
|
// Helper methods for GetPrevWord (B == backwards)
|
|
PRInt32 ScanNormalWhiteSpace_B();
|
|
PRInt32 ScanNormalAsciiText_B(PRInt32* aWordLen, PRBool aIsKeyboardSelect);
|
|
PRInt32 ScanNormalUnicodeText_B(PRBool aForLineBreak, PRInt32* aWordLen);
|
|
PRInt32 ScanPreWrapWhiteSpace_B(PRInt32* aWordLen);
|
|
PRInt32 ScanPreData_B(PRInt32* aWordLen);
|
|
|
|
// Converts the current text in the transform buffer from ascii to
|
|
// Unicode
|
|
void ConvertTransformedTextToUnicode();
|
|
|
|
void LanguageSpecificTransform(PRUnichar* aText, PRInt32 aLen,
|
|
PRBool* aWasTransformed);
|
|
|
|
void DoArabicShaping(PRUnichar* aText, PRInt32& aTextLength, PRBool* aWasTransformed);
|
|
|
|
void DoNumericShaping(PRUnichar* aText, PRInt32& aTextLength, PRBool* aWasTransformed);
|
|
|
|
void StripZeroWidthJoinControls(PRUnichar* aSource, PRUnichar* aTarget, PRInt32& aTextLength, PRBool* aWasTransformed);
|
|
|
|
// The text fragment that we are looking at
|
|
const nsTextFragment* mFrag;
|
|
|
|
// Our current offset into the text fragment
|
|
PRInt32 mOffset;
|
|
|
|
// The frame's white-space mode we are using to process text
|
|
enum {
|
|
eNormal,
|
|
ePreformatted,
|
|
ePreWrap
|
|
} mMode;
|
|
|
|
nsILineBreaker* mLineBreaker; // [WEAK]
|
|
|
|
nsIWordBreaker* mWordBreaker; // [WEAK]
|
|
|
|
nsLanguageSpecificTransformType mLanguageSpecificTransformType;
|
|
|
|
#ifdef IBMBIDI
|
|
nsPresContext* mPresContext;
|
|
nsCharType mCharType;
|
|
#endif
|
|
|
|
// Buffer used to hold the transformed words from GetNextWord or
|
|
// GetPrevWord
|
|
nsAutoTextBuffer mTransformBuf;
|
|
|
|
// Our current position within the buffer. Used when iterating the next
|
|
// word, because we may be requested to buffer across multiple words
|
|
PRInt32 mBufferPos;
|
|
|
|
// The frame's text-transform state
|
|
PRUint8 mTextTransform;
|
|
|
|
// Flag for controling mLeaveAsAscii, mHasMultibyte, mTransformedTextIsAscii
|
|
PRUint8 mFlags;
|
|
|
|
// prefs used to configure the double-click word selection behavior
|
|
static int WordSelectPrefCallback(const char* aPref, void* aClosure);
|
|
static PRBool sWordSelectListenerPrefChecked; // have we read the prefs yet?
|
|
static PRBool sWordSelectEatSpaceAfter; // should we include whitespace up to next word?
|
|
static PRBool sWordSelectStopAtPunctuation; // should we stop at punctuation?
|
|
|
|
#ifdef DEBUG
|
|
static void SelfTest(nsILineBreaker* aLineBreaker,
|
|
nsIWordBreaker* aWordBreaker,
|
|
nsPresContext* aPresContext);
|
|
|
|
nsresult Init2(const nsTextFragment* aFrag,
|
|
PRInt32 aStartingOffset,
|
|
PRUint8 aWhiteSpace,
|
|
PRUint8 aTextTransform);
|
|
#endif
|
|
};
|
|
|
|
#endif /* nsTextTransformer_h___ */
|