mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-11-23 21:01:08 +00:00
45030f6970
This new version of clang 17 also slightly changed the formatting. # ignore-this-changeset Differential Revision: https://phabricator.services.mozilla.com/D215914
1295 lines
45 KiB
C++
1295 lines
45 KiB
C++
/* -*- Mode: C; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
|
/* This Source Code Form is subject to the terms of the Mozilla Public
|
|
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
|
|
|
#include "mozilla/TextUtils.h"
|
|
#include "mozTXTToHTMLConv.h"
|
|
#include "mozilla/intl/Segmenter.h"
|
|
#include "mozilla/Maybe.h"
|
|
#include "nsIThreadRetargetableStreamListener.h"
|
|
#include "nsNetUtil.h"
|
|
#include "nsUnicharUtils.h"
|
|
#include "nsUnicodeProperties.h"
|
|
#include "nsCRT.h"
|
|
#include "nsIExternalProtocolHandler.h"
|
|
#include "nsIURI.h"
|
|
|
|
#include <algorithm>
|
|
|
|
#ifdef DEBUG_BenB_Perf
|
|
# include "prtime.h"
|
|
# include "prinrval.h"
|
|
#endif
|
|
|
|
using mozilla::IsAscii;
|
|
using mozilla::IsAsciiAlpha;
|
|
using mozilla::IsAsciiDigit;
|
|
using mozilla::Maybe;
|
|
using mozilla::Some;
|
|
using mozilla::Span;
|
|
using mozilla::intl::GraphemeClusterBreakIteratorUtf16;
|
|
using mozilla::intl::GraphemeClusterBreakReverseIteratorUtf16;
|
|
|
|
const double growthRate = 1.2;
|
|
|
|
// Bug 183111, editor now replaces multiple spaces with leading
|
|
// 0xA0's and a single ending space, so need to treat 0xA0's as spaces.
|
|
// 0xA0 is the Latin1/Unicode character for "non-breaking space (nbsp)"
|
|
// Also recognize the Japanese ideographic space 0x3000 as a space.
|
|
static inline bool IsSpace(const char16_t aChar) {
|
|
return (nsCRT::IsAsciiSpace(aChar) || aChar == 0xA0 || aChar == 0x3000);
|
|
}
|
|
|
|
// Escape Char will take ch, escape it and append the result to
|
|
// aStringToAppendTo
|
|
void mozTXTToHTMLConv::EscapeChar(const char16_t ch,
|
|
nsAString& aStringToAppendTo,
|
|
bool inAttribute) {
|
|
switch (ch) {
|
|
case '<':
|
|
aStringToAppendTo.AppendLiteral("<");
|
|
break;
|
|
case '>':
|
|
aStringToAppendTo.AppendLiteral(">");
|
|
break;
|
|
case '&':
|
|
aStringToAppendTo.AppendLiteral("&");
|
|
break;
|
|
case '"':
|
|
if (inAttribute) {
|
|
aStringToAppendTo.AppendLiteral(""");
|
|
break;
|
|
}
|
|
// else fall through
|
|
[[fallthrough]];
|
|
default:
|
|
aStringToAppendTo += ch;
|
|
}
|
|
}
|
|
|
|
// EscapeStr takes the passed in string and
|
|
// escapes it IN PLACE.
|
|
void mozTXTToHTMLConv::EscapeStr(nsString& aInString, bool inAttribute) {
|
|
// the replace substring routines
|
|
// don't seem to work if you have a character
|
|
// in the in string that is also in the replacement
|
|
// string! =(
|
|
// aInString.ReplaceSubstring("&", "&");
|
|
// aInString.ReplaceSubstring("<", "<");
|
|
// aInString.ReplaceSubstring(">", ">");
|
|
for (uint32_t i = 0; i < aInString.Length();) {
|
|
switch (aInString[i]) {
|
|
case '<':
|
|
aInString.Cut(i, 1);
|
|
aInString.InsertLiteral(u"<", i);
|
|
i += 4; // skip past the integers we just added
|
|
break;
|
|
case '>':
|
|
aInString.Cut(i, 1);
|
|
aInString.InsertLiteral(u">", i);
|
|
i += 4; // skip past the integers we just added
|
|
break;
|
|
case '&':
|
|
aInString.Cut(i, 1);
|
|
aInString.InsertLiteral(u"&", i);
|
|
i += 5; // skip past the integers we just added
|
|
break;
|
|
case '"':
|
|
if (inAttribute) {
|
|
aInString.Cut(i, 1);
|
|
aInString.InsertLiteral(u""", i);
|
|
i += 6;
|
|
break;
|
|
}
|
|
// else fall through
|
|
[[fallthrough]];
|
|
default:
|
|
i++;
|
|
}
|
|
}
|
|
}
|
|
|
|
void mozTXTToHTMLConv::UnescapeStr(const char16_t* aInString, int32_t aStartPos,
|
|
int32_t aLength, nsString& aOutString) {
|
|
const char16_t* subString = nullptr;
|
|
for (uint32_t i = aStartPos; int32_t(i) - aStartPos < aLength;) {
|
|
int32_t remainingChars = i - aStartPos;
|
|
if (aInString[i] == '&') {
|
|
subString = &aInString[i];
|
|
if (!NS_strncmp(subString, u"<",
|
|
std::min(4, aLength - remainingChars))) {
|
|
aOutString.Append(char16_t('<'));
|
|
i += 4;
|
|
} else if (!NS_strncmp(subString, u">",
|
|
std::min(4, aLength - remainingChars))) {
|
|
aOutString.Append(char16_t('>'));
|
|
i += 4;
|
|
} else if (!NS_strncmp(subString, u"&",
|
|
std::min(5, aLength - remainingChars))) {
|
|
aOutString.Append(char16_t('&'));
|
|
i += 5;
|
|
} else if (!NS_strncmp(subString, u""",
|
|
std::min(6, aLength - remainingChars))) {
|
|
aOutString.Append(char16_t('"'));
|
|
i += 6;
|
|
} else {
|
|
aOutString += aInString[i];
|
|
i++;
|
|
}
|
|
} else {
|
|
aOutString += aInString[i];
|
|
i++;
|
|
}
|
|
}
|
|
}
|
|
|
|
void mozTXTToHTMLConv::CompleteAbbreviatedURL(const char16_t* aInString,
|
|
int32_t aInLength,
|
|
const uint32_t pos,
|
|
nsString& aOutString) {
|
|
NS_ASSERTION(int32_t(pos) < aInLength,
|
|
"bad args to CompleteAbbreviatedURL, see bug #190851");
|
|
if (int32_t(pos) >= aInLength) return;
|
|
|
|
if (aInString[pos] == '@') {
|
|
// only pre-pend a mailto url if the string contains a .domain in it..
|
|
// i.e. we want to linkify johndoe@foo.com but not "let's meet @8pm"
|
|
nsDependentString inString(aInString, aInLength);
|
|
if (inString.FindChar('.', pos) !=
|
|
kNotFound) // if we have a '.' after the @ sign....
|
|
{
|
|
aOutString.AssignLiteral("mailto:");
|
|
aOutString += aInString;
|
|
}
|
|
} else if (aInString[pos] == '.') {
|
|
if (ItMatchesDelimited(aInString, aInLength, u"www.", 4, LT_IGNORE,
|
|
LT_IGNORE)) {
|
|
aOutString.AssignLiteral("http://");
|
|
aOutString += aInString;
|
|
}
|
|
}
|
|
}
|
|
|
|
bool mozTXTToHTMLConv::FindURLStart(const char16_t* aInString,
|
|
int32_t aInLength, const uint32_t pos,
|
|
const modetype check, uint32_t& start) {
|
|
switch (check) { // no breaks, because end of blocks is never reached
|
|
case RFC1738: {
|
|
if (!NS_strncmp(&aInString[std::max(int32_t(pos - 4), 0)], u"<URL:", 5)) {
|
|
start = pos + 1;
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
case RFC2396E: {
|
|
nsDependentSubstring temp(aInString, aInLength);
|
|
int32_t i = pos <= 0 ? kNotFound : temp.RFindCharInSet(u"<>\"", pos - 1);
|
|
if (i != kNotFound &&
|
|
(temp[uint32_t(i)] == '<' || temp[uint32_t(i)] == '"')) {
|
|
start = uint32_t(++i);
|
|
return start < pos;
|
|
}
|
|
return false;
|
|
}
|
|
case freetext: {
|
|
int32_t i = pos - 1;
|
|
for (; i >= 0 &&
|
|
(IsAsciiAlpha(aInString[uint32_t(i)]) ||
|
|
IsAsciiDigit(aInString[uint32_t(i)]) ||
|
|
aInString[uint32_t(i)] == '+' || aInString[uint32_t(i)] == '-' ||
|
|
aInString[uint32_t(i)] == '.');
|
|
i--) {
|
|
;
|
|
}
|
|
if (++i >= 0 && uint32_t(i) < pos &&
|
|
IsAsciiAlpha(aInString[uint32_t(i)])) {
|
|
start = uint32_t(i);
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
case abbreviated: {
|
|
int32_t i = pos - 1;
|
|
// This disallows non-ascii-characters for email.
|
|
// Currently correct, but revisit later after standards changed.
|
|
bool isEmail = aInString[pos] == (char16_t)'@';
|
|
// These chars mark the start of the URL
|
|
for (; i >= 0 && aInString[uint32_t(i)] != '>' &&
|
|
aInString[uint32_t(i)] != '<' && aInString[uint32_t(i)] != '"' &&
|
|
aInString[uint32_t(i)] != '\'' && aInString[uint32_t(i)] != '`' &&
|
|
aInString[uint32_t(i)] != ',' && aInString[uint32_t(i)] != '{' &&
|
|
aInString[uint32_t(i)] != '[' && aInString[uint32_t(i)] != '(' &&
|
|
aInString[uint32_t(i)] != '|' && aInString[uint32_t(i)] != '\\' &&
|
|
!IsSpace(aInString[uint32_t(i)]) &&
|
|
(!isEmail || IsAscii(aInString[uint32_t(i)])) &&
|
|
(!isEmail || aInString[uint32_t(i)] != ')');
|
|
i--) {
|
|
;
|
|
}
|
|
if (++i >= 0 && uint32_t(i) < pos &&
|
|
(IsAsciiAlpha(aInString[uint32_t(i)]) ||
|
|
IsAsciiDigit(aInString[uint32_t(i)]))) {
|
|
start = uint32_t(i);
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
default:
|
|
return false;
|
|
} // switch
|
|
}
|
|
|
|
bool mozTXTToHTMLConv::FindURLEnd(const char16_t* aInString,
|
|
int32_t aInStringLength, const uint32_t pos,
|
|
const modetype check, const uint32_t start,
|
|
uint32_t& end) {
|
|
switch (check) { // no breaks, because end of blocks is never reached
|
|
case RFC1738:
|
|
case RFC2396E: {
|
|
nsDependentSubstring temp(aInString, aInStringLength);
|
|
|
|
int32_t i = temp.FindCharInSet(u"<>\"", pos + 1);
|
|
if (i != kNotFound &&
|
|
temp[uint32_t(i--)] ==
|
|
(check == RFC1738 || temp[start - 1] == '<' ? '>' : '"')) {
|
|
end = uint32_t(i);
|
|
return end > pos;
|
|
}
|
|
return false;
|
|
}
|
|
case freetext:
|
|
case abbreviated: {
|
|
uint32_t i = pos + 1;
|
|
bool isEmail = aInString[pos] == (char16_t)'@';
|
|
bool seenOpeningParenthesis = false; // there is a '(' earlier in the URL
|
|
bool seenOpeningSquareBracket =
|
|
false; // there is a '[' earlier in the URL
|
|
for (; int32_t(i) < aInStringLength; i++) {
|
|
// These chars mark the end of the URL
|
|
if (aInString[i] == '>' || aInString[i] == '<' || aInString[i] == '"' ||
|
|
aInString[i] == '`' || aInString[i] == '}' || aInString[i] == '{' ||
|
|
(aInString[i] == ')' && !seenOpeningParenthesis) ||
|
|
(aInString[i] == ']' && !seenOpeningSquareBracket) ||
|
|
// Allow IPv6 adresses like http://[1080::8:800:200C:417A]/foo.
|
|
(aInString[i] == '[' && i > 2 &&
|
|
(aInString[i - 1] != '/' || aInString[i - 2] != '/')) ||
|
|
IsSpace(aInString[i])) {
|
|
break;
|
|
}
|
|
// Disallow non-ascii-characters for email.
|
|
// Currently correct, but revisit later after standards changed.
|
|
if (isEmail && (aInString[i] == '(' || aInString[i] == '\'' ||
|
|
!IsAscii(aInString[i]))) {
|
|
break;
|
|
}
|
|
if (aInString[i] == '(') seenOpeningParenthesis = true;
|
|
if (aInString[i] == '[') seenOpeningSquareBracket = true;
|
|
}
|
|
// These chars are allowed in the middle of the URL, but not at end.
|
|
// Technically they are, but are used in normal text after the URL.
|
|
while (--i > pos && (aInString[i] == '.' || aInString[i] == ',' ||
|
|
aInString[i] == ';' || aInString[i] == '!' ||
|
|
aInString[i] == '?' || aInString[i] == '-' ||
|
|
aInString[i] == ':' || aInString[i] == '\'')) {
|
|
;
|
|
}
|
|
if (i > pos) {
|
|
end = i;
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
default:
|
|
return false;
|
|
} // switch
|
|
}
|
|
|
|
void mozTXTToHTMLConv::CalculateURLBoundaries(
|
|
const char16_t* aInString, int32_t aInStringLength, const uint32_t pos,
|
|
const uint32_t whathasbeendone, const modetype check, const uint32_t start,
|
|
const uint32_t end, nsString& txtURL, nsString& desc,
|
|
int32_t& replaceBefore, int32_t& replaceAfter) {
|
|
uint32_t descstart = start;
|
|
switch (check) {
|
|
case RFC1738: {
|
|
descstart = start - 5;
|
|
desc.Append(&aInString[descstart],
|
|
end - descstart + 2); // include "<URL:" and ">"
|
|
replaceAfter = end - pos + 1;
|
|
} break;
|
|
case RFC2396E: {
|
|
descstart = start - 1;
|
|
desc.Append(&aInString[descstart],
|
|
end - descstart + 2); // include brackets
|
|
replaceAfter = end - pos + 1;
|
|
} break;
|
|
case freetext:
|
|
case abbreviated: {
|
|
descstart = start;
|
|
desc.Append(&aInString[descstart],
|
|
end - start + 1); // don't include brackets
|
|
replaceAfter = end - pos;
|
|
} break;
|
|
default:
|
|
break;
|
|
} // switch
|
|
|
|
EscapeStr(desc, false);
|
|
|
|
txtURL.Append(&aInString[start], end - start + 1);
|
|
txtURL.StripWhitespace();
|
|
|
|
// FIX ME
|
|
nsAutoString temp2;
|
|
ScanTXT(nsDependentSubstring(&aInString[descstart], pos - descstart),
|
|
~kURLs /*prevents loop*/ & whathasbeendone, temp2);
|
|
replaceBefore = temp2.Length();
|
|
}
|
|
|
|
bool mozTXTToHTMLConv::ShouldLinkify(const nsCString& aURL) {
|
|
if (!mIOService) return false;
|
|
|
|
nsAutoCString scheme;
|
|
nsresult rv = mIOService->ExtractScheme(aURL, scheme);
|
|
if (NS_FAILED(rv)) return false;
|
|
|
|
if (scheme == "http" || scheme == "https" || scheme == "mailto") {
|
|
return true;
|
|
}
|
|
|
|
// Get the handler for this scheme.
|
|
nsCOMPtr<nsIProtocolHandler> handler;
|
|
rv = mIOService->GetProtocolHandler(scheme.get(), getter_AddRefs(handler));
|
|
if (NS_FAILED(rv)) return false;
|
|
|
|
// Is it an external protocol handler? If not, linkify it.
|
|
nsCOMPtr<nsIExternalProtocolHandler> externalHandler =
|
|
do_QueryInterface(handler);
|
|
if (!externalHandler) return true; // handler is built-in, linkify it!
|
|
|
|
// If external app exists for the scheme then linkify it.
|
|
bool exists;
|
|
rv = externalHandler->ExternalAppExistsForScheme(scheme, &exists);
|
|
return (NS_SUCCEEDED(rv) && exists);
|
|
}
|
|
|
|
bool mozTXTToHTMLConv::CheckURLAndCreateHTML(const nsString& txtURL,
|
|
const nsString& desc,
|
|
const modetype mode,
|
|
nsString& outputHTML) {
|
|
// Create *uri from txtURL
|
|
nsCOMPtr<nsIURI> uri;
|
|
nsresult rv;
|
|
// Lazily initialize mIOService
|
|
if (!mIOService) {
|
|
mIOService = do_GetIOService();
|
|
|
|
if (!mIOService) return false;
|
|
}
|
|
|
|
// See if the url should be linkified.
|
|
NS_ConvertUTF16toUTF8 utf8URL(txtURL);
|
|
if (!ShouldLinkify(utf8URL)) return false;
|
|
|
|
// it would be faster if we could just check to see if there is a protocol
|
|
// handler for the url and return instead of actually trying to create a
|
|
// url...
|
|
rv = mIOService->NewURI(utf8URL, nullptr, nullptr, getter_AddRefs(uri));
|
|
|
|
// Real work
|
|
if (NS_SUCCEEDED(rv) && uri) {
|
|
outputHTML.AssignLiteral("<a class=\"moz-txt-link-");
|
|
switch (mode) {
|
|
case RFC1738:
|
|
outputHTML.AppendLiteral("rfc1738");
|
|
break;
|
|
case RFC2396E:
|
|
outputHTML.AppendLiteral("rfc2396E");
|
|
break;
|
|
case freetext:
|
|
outputHTML.AppendLiteral("freetext");
|
|
break;
|
|
case abbreviated:
|
|
outputHTML.AppendLiteral("abbreviated");
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
nsAutoString escapedURL(txtURL);
|
|
EscapeStr(escapedURL, true);
|
|
|
|
outputHTML.AppendLiteral("\" href=\"");
|
|
outputHTML += escapedURL;
|
|
outputHTML.AppendLiteral("\">");
|
|
outputHTML += desc;
|
|
outputHTML.AppendLiteral("</a>");
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
NS_IMETHODIMP mozTXTToHTMLConv::FindURLInPlaintext(const char16_t* aInString,
|
|
int32_t aInLength,
|
|
int32_t aPos,
|
|
int32_t* aStartPos,
|
|
int32_t* aEndPos) {
|
|
// call FindURL on the passed in string
|
|
nsAutoString outputHTML; // we'll ignore the generated output HTML
|
|
|
|
*aStartPos = -1;
|
|
*aEndPos = -1;
|
|
|
|
FindURL(aInString, aInLength, aPos, kURLs, outputHTML, *aStartPos, *aEndPos);
|
|
|
|
return NS_OK;
|
|
}
|
|
|
|
bool mozTXTToHTMLConv::FindURL(const char16_t* aInString, int32_t aInLength,
|
|
const uint32_t pos,
|
|
const uint32_t whathasbeendone,
|
|
nsString& outputHTML, int32_t& replaceBefore,
|
|
int32_t& replaceAfter) {
|
|
enum statetype { unchecked, invalid, startok, endok, success };
|
|
static const modetype ranking[] = {RFC1738, RFC2396E, freetext, abbreviated};
|
|
|
|
statetype state[mozTXTToHTMLConv_lastMode + 1]; // 0(=unknown)..lastMode
|
|
/* I don't like this abuse of enums as index for the array,
|
|
but I don't know a better method */
|
|
|
|
// Define, which modes to check
|
|
/* all modes but abbreviated are checked for text[pos] == ':',
|
|
only abbreviated for '.', RFC2396E and abbreviated for '@' */
|
|
for (modetype iState = unknown; iState <= mozTXTToHTMLConv_lastMode;
|
|
iState = modetype(iState + 1)) {
|
|
state[iState] = aInString[pos] == ':' ? unchecked : invalid;
|
|
}
|
|
switch (aInString[pos]) {
|
|
case '@':
|
|
state[RFC2396E] = unchecked;
|
|
[[fallthrough]];
|
|
case '.':
|
|
state[abbreviated] = unchecked;
|
|
break;
|
|
case ':':
|
|
state[abbreviated] = invalid;
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
// Test, first successful mode wins, sequence defined by |ranking|
|
|
int32_t iCheck = 0; // the currently tested modetype
|
|
modetype check = ranking[iCheck];
|
|
for (; iCheck < mozTXTToHTMLConv_numberOfModes && state[check] != success;
|
|
iCheck++)
|
|
/* check state from last run.
|
|
If this is the first, check this one, which isn't = success yet */
|
|
{
|
|
check = ranking[iCheck];
|
|
|
|
uint32_t start, end;
|
|
|
|
if (state[check] == unchecked) {
|
|
if (FindURLStart(aInString, aInLength, pos, check, start)) {
|
|
state[check] = startok;
|
|
}
|
|
}
|
|
|
|
if (state[check] == startok) {
|
|
if (FindURLEnd(aInString, aInLength, pos, check, start, end)) {
|
|
state[check] = endok;
|
|
}
|
|
}
|
|
|
|
if (state[check] == endok) {
|
|
nsAutoString txtURL, desc;
|
|
int32_t resultReplaceBefore, resultReplaceAfter;
|
|
|
|
CalculateURLBoundaries(aInString, aInLength, pos, whathasbeendone, check,
|
|
start, end, txtURL, desc, resultReplaceBefore,
|
|
resultReplaceAfter);
|
|
|
|
if (aInString[pos] != ':') {
|
|
nsAutoString temp = txtURL;
|
|
txtURL.SetLength(0);
|
|
CompleteAbbreviatedURL(temp.get(), temp.Length(), pos - start, txtURL);
|
|
}
|
|
|
|
if (!txtURL.IsEmpty() &&
|
|
CheckURLAndCreateHTML(txtURL, desc, check, outputHTML)) {
|
|
replaceBefore = resultReplaceBefore;
|
|
replaceAfter = resultReplaceAfter;
|
|
state[check] = success;
|
|
}
|
|
} // if
|
|
} // for
|
|
return state[check] == success;
|
|
}
|
|
|
|
static inline bool IsAlpha(const uint32_t aChar) {
|
|
return mozilla::unicode::GetGenCategory(aChar) == nsUGenCategory::kLetter;
|
|
}
|
|
|
|
static inline bool IsDigit(const uint32_t aChar) {
|
|
return mozilla::unicode::GetGenCategory(aChar) == nsUGenCategory::kNumber;
|
|
}
|
|
|
|
bool mozTXTToHTMLConv::ItMatchesDelimited(const char16_t* aInString,
|
|
int32_t aInLength,
|
|
const char16_t* rep, int32_t aRepLen,
|
|
LIMTYPE before, LIMTYPE after) {
|
|
// this little method gets called a LOT. I found we were spending a
|
|
// lot of time just calculating the length of the variable "rep"
|
|
// over and over again every time we called it. So we're now passing
|
|
// an integer in here.
|
|
int32_t textLen = aInLength;
|
|
|
|
if (((before == LT_IGNORE && (after == LT_IGNORE || after == LT_DELIMITER)) &&
|
|
textLen < aRepLen) ||
|
|
((before != LT_IGNORE || (after != LT_IGNORE && after != LT_DELIMITER)) &&
|
|
textLen < aRepLen + 1) ||
|
|
(before != LT_IGNORE && after != LT_IGNORE && after != LT_DELIMITER &&
|
|
textLen < aRepLen + 2)) {
|
|
return false;
|
|
}
|
|
|
|
uint32_t text0 = aInString[0];
|
|
if (aInLength > 1 && NS_IS_SURROGATE_PAIR(text0, aInString[1])) {
|
|
text0 = SURROGATE_TO_UCS4(text0, aInString[1]);
|
|
}
|
|
// find length of the char/cluster to be ignored
|
|
int32_t ignoreLen = before == LT_IGNORE ? 0 : 1;
|
|
if (ignoreLen) {
|
|
GraphemeClusterBreakIteratorUtf16 ci(
|
|
Span<const char16_t>(aInString, aInLength));
|
|
ignoreLen = *ci.Next();
|
|
}
|
|
|
|
int32_t afterIndex = aRepLen + ignoreLen;
|
|
uint32_t textAfterPos = aInString[afterIndex];
|
|
if (aInLength > afterIndex + 1 &&
|
|
NS_IS_SURROGATE_PAIR(textAfterPos, aInString[afterIndex + 1])) {
|
|
textAfterPos = SURROGATE_TO_UCS4(textAfterPos, aInString[afterIndex + 1]);
|
|
}
|
|
|
|
return !((before == LT_ALPHA && !IsAlpha(text0)) ||
|
|
(before == LT_DIGIT && !IsDigit(text0)) ||
|
|
(before == LT_DELIMITER &&
|
|
(IsAlpha(text0) || IsDigit(text0) || text0 == *rep)) ||
|
|
(after == LT_ALPHA && !IsAlpha(textAfterPos)) ||
|
|
(after == LT_DIGIT && !IsDigit(textAfterPos)) ||
|
|
(after == LT_DELIMITER &&
|
|
(IsAlpha(textAfterPos) || IsDigit(textAfterPos) ||
|
|
textAfterPos == *rep)) ||
|
|
!Substring(Substring(aInString, aInString + aInLength), ignoreLen,
|
|
aRepLen)
|
|
.Equals(Substring(rep, rep + aRepLen),
|
|
nsCaseInsensitiveStringComparator));
|
|
}
|
|
|
|
uint32_t mozTXTToHTMLConv::NumberOfMatches(const char16_t* aInString,
|
|
int32_t aInStringLength,
|
|
const char16_t* rep, int32_t aRepLen,
|
|
LIMTYPE before, LIMTYPE after) {
|
|
uint32_t result = 0;
|
|
|
|
// Limit lookahead length to avoid pathological O(n^2) behavior; looking so
|
|
// far ahead is unlikely to be important for cases where styling marked-up
|
|
// fragments is actually useful anyhow.
|
|
const uint32_t len =
|
|
std::min(2000u, mozilla::AssertedCast<uint32_t>(aInStringLength));
|
|
GraphemeClusterBreakIteratorUtf16 ci(Span<const char16_t>(aInString, len));
|
|
for (uint32_t pos = 0; pos < len; pos = *ci.Next()) {
|
|
if (ItMatchesDelimited(aInString + pos, aInStringLength - pos, rep, aRepLen,
|
|
before, after)) {
|
|
result++;
|
|
}
|
|
}
|
|
return result;
|
|
}
|
|
|
|
// NOTE: the converted html for the phrase is appended to aOutString
|
|
// tagHTML and attributeHTML are plain ASCII (literal strings, in fact)
|
|
bool mozTXTToHTMLConv::StructPhraseHit(
|
|
const char16_t* aInString, int32_t aInStringLength, bool col0,
|
|
const char16_t* tagTXT, int32_t aTagTXTLen, const char* tagHTML,
|
|
const char* attributeHTML, nsAString& aOutString, uint32_t& openTags) {
|
|
/* We're searching for the following pattern:
|
|
LT_DELIMITER - "*" - ALPHA -
|
|
[ some text (maybe more "*"-pairs) - ALPHA ] "*" - LT_DELIMITER.
|
|
<strong> is only inserted, if existence of a pair could be verified
|
|
We use the first opening/closing tag, if we can choose */
|
|
|
|
const char16_t* newOffset = aInString;
|
|
int32_t newLength = aInStringLength;
|
|
if (!col0) // skip the first element?
|
|
{
|
|
newOffset = &aInString[1];
|
|
newLength = aInStringLength - 1;
|
|
}
|
|
|
|
// opening tag
|
|
if (ItMatchesDelimited(aInString, aInStringLength, tagTXT, aTagTXTLen,
|
|
(col0 ? LT_IGNORE : LT_DELIMITER),
|
|
LT_ALPHA) // is opening tag
|
|
&& NumberOfMatches(newOffset, newLength, tagTXT, aTagTXTLen, LT_ALPHA,
|
|
LT_DELIMITER) // remaining closing tags
|
|
> openTags) {
|
|
openTags++;
|
|
aOutString.Append('<');
|
|
aOutString.AppendASCII(tagHTML);
|
|
aOutString.Append(char16_t(' '));
|
|
aOutString.AppendASCII(attributeHTML);
|
|
aOutString.AppendLiteral("><span class=\"moz-txt-tag\">");
|
|
aOutString.Append(tagTXT);
|
|
aOutString.AppendLiteral("</span>");
|
|
return true;
|
|
}
|
|
|
|
// closing tag
|
|
if (openTags > 0 && ItMatchesDelimited(aInString, aInStringLength, tagTXT,
|
|
aTagTXTLen, LT_ALPHA, LT_DELIMITER)) {
|
|
openTags--;
|
|
aOutString.AppendLiteral("<span class=\"moz-txt-tag\">");
|
|
aOutString.Append(tagTXT);
|
|
aOutString.AppendLiteral("</span></");
|
|
aOutString.AppendASCII(tagHTML);
|
|
aOutString.Append(char16_t('>'));
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool mozTXTToHTMLConv::SmilyHit(const char16_t* aInString, int32_t aLength,
|
|
bool col0, const char* tagTXT,
|
|
const nsString& imageName, nsString& outputHTML,
|
|
int32_t& glyphTextLen) {
|
|
if (!aInString || !tagTXT || imageName.IsEmpty()) return false;
|
|
|
|
int32_t tagLen = strlen(tagTXT);
|
|
|
|
uint32_t delim = (col0 ? 0 : 1) + tagLen;
|
|
|
|
if ((col0 || IsSpace(aInString[0])) &&
|
|
(aLength <= int32_t(delim) || IsSpace(aInString[delim]) ||
|
|
(aLength > int32_t(delim + 1) &&
|
|
(aInString[delim] == '.' || aInString[delim] == ',' ||
|
|
aInString[delim] == ';' || aInString[delim] == '8' ||
|
|
aInString[delim] == '>' || aInString[delim] == '!' ||
|
|
aInString[delim] == '?') &&
|
|
IsSpace(aInString[delim + 1]))) &&
|
|
ItMatchesDelimited(aInString, aLength,
|
|
NS_ConvertASCIItoUTF16(tagTXT).get(), tagLen,
|
|
col0 ? LT_IGNORE : LT_DELIMITER, LT_IGNORE)
|
|
// Note: tests at different pos for LT_IGNORE and LT_DELIMITER
|
|
) {
|
|
if (!col0) {
|
|
outputHTML.Truncate();
|
|
outputHTML.Append(char16_t(' '));
|
|
}
|
|
|
|
outputHTML.Append(imageName); // emoji unicode
|
|
glyphTextLen = (col0 ? 0 : 1) + tagLen;
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
// the glyph is appended to aOutputString instead of the original string...
|
|
bool mozTXTToHTMLConv::GlyphHit(const char16_t* aInString, int32_t aInLength,
|
|
bool col0, nsAString& aOutputString,
|
|
int32_t& glyphTextLen) {
|
|
char16_t text0 = aInString[0];
|
|
char16_t text1 = aInString[1];
|
|
char16_t firstChar = (col0 ? text0 : text1);
|
|
|
|
// temporary variable used to store the glyph html text
|
|
nsAutoString outputHTML;
|
|
bool bTestSmilie;
|
|
bool bArg = false;
|
|
int i;
|
|
|
|
// refactor some of this mess to avoid code duplication and speed execution a
|
|
// bit there are two cases that need to be tried one after another. To avoid a
|
|
// lot of duplicate code, rolling into a loop
|
|
|
|
i = 0;
|
|
while (i < 2) {
|
|
bTestSmilie = false;
|
|
if (!i && (firstChar == ':' || firstChar == ';' || firstChar == '=' ||
|
|
firstChar == '>' || firstChar == '8' || firstChar == 'O')) {
|
|
// first test passed
|
|
|
|
bTestSmilie = true;
|
|
bArg = col0;
|
|
}
|
|
if (i && col0 &&
|
|
(text1 == ':' || text1 == ';' || text1 == '=' || text1 == '>' ||
|
|
text1 == '8' || text1 == 'O')) {
|
|
// second test passed
|
|
|
|
bTestSmilie = true;
|
|
bArg = false;
|
|
}
|
|
if (bTestSmilie && (SmilyHit(aInString, aInLength, bArg, ":-)",
|
|
u"🙂"_ns, // smile, U+1F642
|
|
outputHTML, glyphTextLen) ||
|
|
|
|
SmilyHit(aInString, aInLength, bArg, ":)",
|
|
u"🙂"_ns, // smile, U+1F642
|
|
outputHTML, glyphTextLen) ||
|
|
|
|
SmilyHit(aInString, aInLength, bArg, ":-D",
|
|
u"😂"_ns, // laughing, U+1F602
|
|
outputHTML, glyphTextLen) ||
|
|
|
|
SmilyHit(aInString, aInLength, bArg, ":-(",
|
|
u"🙁"_ns, // frown, U+1F641
|
|
outputHTML, glyphTextLen) ||
|
|
|
|
SmilyHit(aInString, aInLength, bArg, ":(",
|
|
u"🙁"_ns, // frown, U+1F641
|
|
outputHTML, glyphTextLen) ||
|
|
|
|
SmilyHit(aInString, aInLength, bArg, ":$",
|
|
u"😳"_ns, // embarassed, U+1F633
|
|
outputHTML, glyphTextLen) ||
|
|
|
|
SmilyHit(aInString, aInLength, bArg, ";-)",
|
|
u"😉"_ns, // wink, U+1F609
|
|
outputHTML, glyphTextLen) ||
|
|
|
|
SmilyHit(aInString, aInLength, col0, ";)",
|
|
u"😉"_ns, // wink, U+1F609
|
|
outputHTML, glyphTextLen) ||
|
|
|
|
SmilyHit(aInString, aInLength, bArg, ":-\\",
|
|
u"😕"_ns, // undecided, U+1F615
|
|
outputHTML, glyphTextLen) ||
|
|
|
|
SmilyHit(aInString, aInLength, bArg, ":-P",
|
|
u"😛"_ns, // tongue, U+1F61B
|
|
outputHTML, glyphTextLen) ||
|
|
|
|
SmilyHit(aInString, aInLength, bArg, ";-P",
|
|
u"😜"_ns, // winking face with tongue, U+1F61C
|
|
outputHTML, glyphTextLen) ||
|
|
|
|
SmilyHit(aInString, aInLength, bArg, "=-O",
|
|
u"😮"_ns, // surprise, U+1F62E
|
|
outputHTML, glyphTextLen) ||
|
|
|
|
SmilyHit(aInString, aInLength, bArg, ":-*",
|
|
u"😘"_ns, // kiss, U+1F618
|
|
outputHTML, glyphTextLen) ||
|
|
|
|
SmilyHit(aInString, aInLength, bArg, ">:o",
|
|
u"🤬"_ns, // swearing, U+1F92C
|
|
outputHTML, glyphTextLen) ||
|
|
|
|
SmilyHit(aInString, aInLength, bArg, ">:-o",
|
|
u"🤬"_ns, // swearing, U+1F92C
|
|
outputHTML, glyphTextLen) ||
|
|
|
|
SmilyHit(aInString, aInLength, bArg, ">:(",
|
|
u"😠"_ns, // angry, U+1F620
|
|
outputHTML, glyphTextLen) ||
|
|
|
|
SmilyHit(aInString, aInLength, bArg, ">:-(",
|
|
u"😠"_ns, // angry, U+1F620
|
|
outputHTML, glyphTextLen) ||
|
|
|
|
SmilyHit(aInString, aInLength, bArg, "8-)",
|
|
u"😎"_ns, // cool, U+1F60E
|
|
outputHTML, glyphTextLen) ||
|
|
|
|
SmilyHit(aInString, aInLength, bArg, ":-$",
|
|
u"🤑"_ns, // money, U+1F911
|
|
outputHTML, glyphTextLen) ||
|
|
|
|
SmilyHit(aInString, aInLength, bArg, ":-!",
|
|
u"😬"_ns, // foot, U+1F62C
|
|
outputHTML, glyphTextLen) ||
|
|
|
|
SmilyHit(aInString, aInLength, bArg, "O:-)",
|
|
u"😇"_ns, // innocent, U+1F607
|
|
outputHTML, glyphTextLen) ||
|
|
|
|
SmilyHit(aInString, aInLength, bArg, ":'(",
|
|
u"😭"_ns, // cry, U+1F62D
|
|
outputHTML, glyphTextLen) ||
|
|
|
|
SmilyHit(aInString, aInLength, bArg, ":-X",
|
|
u"🤐"_ns, // sealed, U+1F910
|
|
outputHTML, glyphTextLen))) {
|
|
aOutputString.Append(outputHTML);
|
|
return true;
|
|
}
|
|
i++;
|
|
}
|
|
if (text0 == '\f') {
|
|
aOutputString.AppendLiteral("<span class='moz-txt-formfeed'></span>");
|
|
glyphTextLen = 1;
|
|
return true;
|
|
}
|
|
if (text0 == '+' || text1 == '+') {
|
|
if (ItMatchesDelimited(aInString, aInLength, u" +/-", 4, LT_IGNORE,
|
|
LT_IGNORE)) {
|
|
aOutputString.AppendLiteral(" ±");
|
|
glyphTextLen = 4;
|
|
return true;
|
|
}
|
|
if (col0 && ItMatchesDelimited(aInString, aInLength, u"+/-", 3, LT_IGNORE,
|
|
LT_IGNORE)) {
|
|
aOutputString.AppendLiteral("±");
|
|
glyphTextLen = 3;
|
|
return true;
|
|
}
|
|
}
|
|
|
|
// x^2 => x<sup>2</sup>, also handle powers x^-2, x^0.5
|
|
// implement regular expression /[\dA-Za-z\)\]}]\^-?\d+(\.\d+)*[^\dA-Za-z]/
|
|
if (text1 == '^' &&
|
|
(IsAsciiDigit(text0) || IsAsciiAlpha(text0) || text0 == ')' ||
|
|
text0 == ']' || text0 == '}') &&
|
|
((2 < aInLength && IsAsciiDigit(aInString[2])) ||
|
|
(3 < aInLength && aInString[2] == '-' && IsAsciiDigit(aInString[3])))) {
|
|
// Find first non-digit
|
|
int32_t delimPos = 3; // skip "^" and first digit (or '-')
|
|
for (; delimPos < aInLength &&
|
|
(IsAsciiDigit(aInString[delimPos]) ||
|
|
(aInString[delimPos] == '.' && delimPos + 1 < aInLength &&
|
|
IsAsciiDigit(aInString[delimPos + 1])));
|
|
delimPos++) {
|
|
;
|
|
}
|
|
|
|
if (delimPos < aInLength && IsAsciiAlpha(aInString[delimPos])) {
|
|
return false;
|
|
}
|
|
|
|
outputHTML.Truncate();
|
|
outputHTML += text0;
|
|
outputHTML.AppendLiteral(
|
|
"<sup class=\"moz-txt-sup\">"
|
|
"<span style=\"display:inline-block;width:0;height:0;overflow:hidden\">"
|
|
"^</span>");
|
|
|
|
aOutputString.Append(outputHTML);
|
|
aOutputString.Append(&aInString[2], delimPos - 2);
|
|
aOutputString.AppendLiteral("</sup>");
|
|
|
|
glyphTextLen = delimPos /* - 1 + 1 */;
|
|
return true;
|
|
}
|
|
/*
|
|
The following strings are not substituted:
|
|
|TXT |HTML |Reason
|
|
+------+---------+----------
|
|
-> ← Bug #454
|
|
=> ⇐ dito
|
|
<- → dito
|
|
<= ⇒ dito
|
|
(tm) ™ dito
|
|
1/4 ¼ is triggered by 1/4 Part 1, 2/4 Part 2, ...
|
|
3/4 ¾ dito
|
|
1/2 ½ similar
|
|
*/
|
|
return false;
|
|
}
|
|
|
|
/***************************************************************************
|
|
Library-internal Interface
|
|
****************************************************************************/
|
|
|
|
NS_IMPL_ISUPPORTS(mozTXTToHTMLConv, mozITXTToHTMLConv, nsIStreamConverter,
|
|
nsIThreadRetargetableStreamListener, nsIStreamListener,
|
|
nsIRequestObserver)
|
|
|
|
int32_t mozTXTToHTMLConv::CiteLevelTXT(const char16_t* line,
|
|
uint32_t& logLineStart) {
|
|
int32_t result = 0;
|
|
int32_t lineLength = NS_strlen(line);
|
|
|
|
bool moreCites = true;
|
|
while (moreCites) {
|
|
/* E.g. the following lines count as quote:
|
|
|
|
> text
|
|
//#ifdef QUOTE_RECOGNITION_AGGRESSIVE
|
|
>text
|
|
//#ifdef QUOTE_RECOGNITION_AGGRESSIVE
|
|
> text
|
|
] text
|
|
USER> text
|
|
USER] text
|
|
//#endif
|
|
|
|
logLineStart is the position of "t" in this example
|
|
*/
|
|
uint32_t i = logLineStart;
|
|
|
|
#ifdef QUOTE_RECOGNITION_AGGRESSIVE
|
|
for (; int32_t(i) < lineLength && IsSpace(line[i]); i++);
|
|
for (; int32_t(i) < lineLength && IsAsciiAlpha(line[i]) &&
|
|
nsCRT::IsUpper(line[i]);
|
|
i++);
|
|
if (int32_t(i) < lineLength && (line[i] == '>' || line[i] == ']'))
|
|
#else
|
|
if (int32_t(i) < lineLength && line[i] == '>')
|
|
#endif
|
|
{
|
|
i++;
|
|
if (int32_t(i) < lineLength && line[i] == ' ') i++;
|
|
// sendmail/mbox
|
|
// Placed here for performance increase
|
|
const char16_t* indexString = &line[logLineStart];
|
|
// here, |logLineStart < lineLength| is always true
|
|
uint32_t minlength = std::min(uint32_t(6), NS_strlen(indexString));
|
|
if (Substring(indexString, indexString + minlength)
|
|
.Equals(Substring(u">From "_ns, 0, minlength),
|
|
nsCaseInsensitiveStringComparator)) {
|
|
// XXX RFC2646
|
|
moreCites = false;
|
|
} else {
|
|
result++;
|
|
logLineStart = i;
|
|
}
|
|
} else {
|
|
moreCites = false;
|
|
}
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
NS_IMETHODIMP
|
|
mozTXTToHTMLConv::ScanTXT(const nsAString& aInString, uint32_t whattodo,
|
|
nsAString& aOutString) {
|
|
if (aInString.Length() == 0) {
|
|
aOutString.Truncate();
|
|
return NS_OK;
|
|
}
|
|
|
|
if (!aOutString.SetCapacity(uint32_t(aInString.Length() * growthRate),
|
|
mozilla::fallible)) {
|
|
return NS_ERROR_OUT_OF_MEMORY;
|
|
}
|
|
|
|
bool doURLs = 0 != (whattodo & kURLs);
|
|
bool doGlyphSubstitution = 0 != (whattodo & kGlyphSubstitution);
|
|
bool doStructPhrase = 0 != (whattodo & kStructPhrase);
|
|
|
|
uint32_t structPhrase_strong = 0; // Number of currently open tags
|
|
uint32_t structPhrase_underline = 0;
|
|
uint32_t structPhrase_italic = 0;
|
|
uint32_t structPhrase_code = 0;
|
|
|
|
uint32_t endOfLastURLOutput = 0;
|
|
|
|
nsAutoString outputHTML; // moved here for performance increase
|
|
|
|
const char16_t* rawInputString = aInString.BeginReading();
|
|
uint32_t inLength = aInString.Length();
|
|
|
|
const Span<const char16_t> inString(aInString);
|
|
GraphemeClusterBreakIteratorUtf16 ci(inString);
|
|
uint32_t i = 0;
|
|
while (i < inLength) {
|
|
if (doGlyphSubstitution) {
|
|
int32_t glyphTextLen;
|
|
if (GlyphHit(&rawInputString[i], inLength - i, i == 0, aOutString,
|
|
glyphTextLen)) {
|
|
i = *ci.Seek(i + glyphTextLen - 1);
|
|
continue;
|
|
}
|
|
}
|
|
|
|
if (doStructPhrase) {
|
|
const char16_t* newOffset = rawInputString;
|
|
int32_t newLength = aInString.Length();
|
|
if (i > 0) // skip the first element?
|
|
{
|
|
GraphemeClusterBreakReverseIteratorUtf16 ri(
|
|
Span<const char16_t>(rawInputString, i));
|
|
Maybe<uint32_t> nextPos = ri.Next();
|
|
newOffset += *nextPos;
|
|
newLength -= *nextPos;
|
|
}
|
|
|
|
switch (aInString[i]) // Performance increase
|
|
{
|
|
case '*':
|
|
if (StructPhraseHit(newOffset, newLength, i == 0, u"*", 1, "b",
|
|
"class=\"moz-txt-star\"", aOutString,
|
|
structPhrase_strong)) {
|
|
i = *ci.Next();
|
|
continue;
|
|
}
|
|
break;
|
|
case '/':
|
|
if (StructPhraseHit(newOffset, newLength, i == 0, u"/", 1, "i",
|
|
"class=\"moz-txt-slash\"", aOutString,
|
|
structPhrase_italic)) {
|
|
i = *ci.Next();
|
|
continue;
|
|
}
|
|
break;
|
|
case '_':
|
|
if (StructPhraseHit(newOffset, newLength, i == 0, u"_", 1,
|
|
"span" /* <u> is deprecated */,
|
|
"class=\"moz-txt-underscore\"", aOutString,
|
|
structPhrase_underline)) {
|
|
i = *ci.Next();
|
|
continue;
|
|
}
|
|
break;
|
|
case '|':
|
|
if (StructPhraseHit(newOffset, newLength, i == 0, u"|", 1, "code",
|
|
"class=\"moz-txt-verticalline\"", aOutString,
|
|
structPhrase_code)) {
|
|
i = *ci.Next();
|
|
continue;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (doURLs) {
|
|
switch (aInString[i]) {
|
|
case ':':
|
|
case '@':
|
|
case '.':
|
|
if ((i == 0 || ((i > 0) && aInString[i - 1] != ' ')) &&
|
|
((i == aInString.Length() - 1) ||
|
|
(aInString[i + 1] != ' '))) // Performance increase
|
|
{
|
|
int32_t replaceBefore;
|
|
int32_t replaceAfter;
|
|
if (FindURL(rawInputString, aInString.Length(), i, whattodo,
|
|
outputHTML, replaceBefore, replaceAfter) &&
|
|
structPhrase_strong + structPhrase_italic +
|
|
structPhrase_underline + structPhrase_code ==
|
|
0
|
|
/* workaround for bug #19445 */) {
|
|
// Don't cut into previously inserted HTML (bug 1509493)
|
|
if (aOutString.Length() - replaceBefore < endOfLastURLOutput) {
|
|
break;
|
|
}
|
|
aOutString.Cut(aOutString.Length() - replaceBefore,
|
|
replaceBefore);
|
|
aOutString += outputHTML;
|
|
endOfLastURLOutput = aOutString.Length();
|
|
i = *ci.Seek(i + replaceAfter);
|
|
continue;
|
|
}
|
|
}
|
|
break;
|
|
} // switch
|
|
}
|
|
|
|
switch (aInString[i]) {
|
|
// Special symbols
|
|
case '<':
|
|
case '>':
|
|
case '&':
|
|
EscapeChar(aInString[i], aOutString, false);
|
|
i = *ci.Next();
|
|
break;
|
|
// Normal characters
|
|
default: {
|
|
const uint32_t oldIdx = i;
|
|
i = *ci.Next();
|
|
aOutString.Append(inString.FromTo(oldIdx, i));
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
return NS_OK;
|
|
}
|
|
|
|
NS_IMETHODIMP
|
|
mozTXTToHTMLConv::ScanHTML(const nsAString& input, uint32_t whattodo,
|
|
nsAString& aOutString) {
|
|
const nsPromiseFlatString& aInString = PromiseFlatString(input);
|
|
if (!aOutString.SetCapacity(uint32_t(aInString.Length() * growthRate),
|
|
mozilla::fallible)) {
|
|
return NS_ERROR_OUT_OF_MEMORY;
|
|
}
|
|
|
|
// some common variables we were recalculating
|
|
// every time inside the for loop...
|
|
int32_t lengthOfInString = aInString.Length();
|
|
const char16_t* uniBuffer = aInString.get();
|
|
|
|
#ifdef DEBUG_BenB_Perf
|
|
PRTime parsing_start = PR_IntervalNow();
|
|
#endif
|
|
|
|
// Look for simple entities not included in a tags and scan them.
|
|
// Skip all tags ("<[...]>") and content in an a link tag ("<a [...]</a>"),
|
|
// comment tag ("<!--[...]-->"), style tag, script tag or head tag.
|
|
// Unescape the rest (text between tags) and pass it to ScanTXT.
|
|
nsAutoCString canFollow(" \f\n\r\t>");
|
|
for (int32_t i = 0; i < lengthOfInString;) {
|
|
if (aInString[i] == '<') // html tag
|
|
{
|
|
int32_t start = i;
|
|
if (i + 2 < lengthOfInString && nsCRT::ToLower(aInString[i + 1]) == 'a' &&
|
|
canFollow.FindChar(aInString[i + 2]) != kNotFound)
|
|
// if a tag, skip until </a>.
|
|
// Make sure there's a white-space character after, not to match "abbr".
|
|
{
|
|
i = aInString.LowerCaseFindASCII("</a>", i);
|
|
if (i == kNotFound) {
|
|
i = lengthOfInString;
|
|
} else {
|
|
i += 4;
|
|
}
|
|
} else if (Substring(aInString, i + 1, 3).LowerCaseEqualsASCII("!--"))
|
|
// if out-commended code, skip until -->
|
|
{
|
|
i = aInString.Find(u"-->", i);
|
|
if (i == kNotFound) {
|
|
i = lengthOfInString;
|
|
} else {
|
|
i += 3;
|
|
}
|
|
} else if (i + 6 < lengthOfInString &&
|
|
Substring(aInString, i + 1, 5).LowerCaseEqualsASCII("style") &&
|
|
canFollow.FindChar(aInString[i + 6]) != kNotFound)
|
|
// if style tag, skip until </style>
|
|
{
|
|
i = aInString.LowerCaseFindASCII("</style>", i);
|
|
if (i == kNotFound) {
|
|
i = lengthOfInString;
|
|
} else {
|
|
i += 8;
|
|
}
|
|
} else if (i + 7 < lengthOfInString &&
|
|
Substring(aInString, i + 1, 6)
|
|
.LowerCaseEqualsASCII("script") &&
|
|
canFollow.FindChar(aInString[i + 7]) != kNotFound)
|
|
// if script tag, skip until </script>
|
|
{
|
|
i = aInString.LowerCaseFindASCII("</script>", i);
|
|
if (i == kNotFound) {
|
|
i = lengthOfInString;
|
|
} else {
|
|
i += 9;
|
|
}
|
|
} else if (i + 5 < lengthOfInString &&
|
|
Substring(aInString, i + 1, 4).LowerCaseEqualsASCII("head") &&
|
|
canFollow.FindChar(aInString[i + 5]) != kNotFound)
|
|
// if head tag, skip until </head>
|
|
// Make sure not to match <header>.
|
|
{
|
|
i = aInString.LowerCaseFindASCII("</head>", i);
|
|
if (i == kNotFound) {
|
|
i = lengthOfInString;
|
|
} else {
|
|
i += 7;
|
|
}
|
|
} else // just skip tag (attributes etc.)
|
|
{
|
|
i = aInString.FindChar('>', i);
|
|
if (i == kNotFound) {
|
|
i = lengthOfInString;
|
|
} else {
|
|
i++;
|
|
}
|
|
}
|
|
aOutString.Append(&uniBuffer[start], i - start);
|
|
} else {
|
|
uint32_t start = uint32_t(i);
|
|
i = aInString.FindChar('<', i);
|
|
if (i == kNotFound) i = lengthOfInString;
|
|
|
|
nsAutoStringN<256> tempString;
|
|
tempString.SetCapacity(uint32_t((uint32_t(i) - start) * growthRate));
|
|
UnescapeStr(uniBuffer, start, uint32_t(i) - start, tempString);
|
|
ScanTXT(tempString, whattodo, aOutString);
|
|
}
|
|
}
|
|
|
|
#ifdef DEBUG_BenB_Perf
|
|
printf("ScanHTML time: %d ms\n",
|
|
PR_IntervalToMilliseconds(PR_IntervalNow() - parsing_start));
|
|
#endif
|
|
return NS_OK;
|
|
}
|
|
|
|
/****************************************************************************
|
|
XPCOM Interface
|
|
*****************************************************************************/
|
|
|
|
NS_IMETHODIMP
|
|
mozTXTToHTMLConv::Convert(nsIInputStream* aFromStream, const char* aFromType,
|
|
const char* aToType, nsISupports* aCtxt,
|
|
nsIInputStream** _retval) {
|
|
return NS_ERROR_NOT_IMPLEMENTED;
|
|
}
|
|
|
|
NS_IMETHODIMP
|
|
mozTXTToHTMLConv::AsyncConvertData(const char* aFromType, const char* aToType,
|
|
nsIStreamListener* aListener,
|
|
nsISupports* aCtxt) {
|
|
return NS_ERROR_NOT_IMPLEMENTED;
|
|
}
|
|
|
|
NS_IMETHODIMP
|
|
mozTXTToHTMLConv::GetConvertedType(const nsACString& aFromType,
|
|
nsIChannel* aChannel, nsACString& aToType) {
|
|
return NS_ERROR_NOT_IMPLEMENTED;
|
|
}
|
|
|
|
NS_IMETHODIMP
|
|
mozTXTToHTMLConv::OnDataAvailable(nsIRequest* request, nsIInputStream* inStr,
|
|
uint64_t sourceOffset, uint32_t count) {
|
|
return NS_ERROR_NOT_IMPLEMENTED;
|
|
}
|
|
|
|
NS_IMETHODIMP
|
|
mozTXTToHTMLConv::OnDataFinished(nsresult aStatus) {
|
|
return NS_ERROR_NOT_IMPLEMENTED;
|
|
}
|
|
|
|
NS_IMETHODIMP
|
|
mozTXTToHTMLConv::CheckListenerChain() { return NS_ERROR_NOT_IMPLEMENTED; }
|
|
|
|
NS_IMETHODIMP
|
|
mozTXTToHTMLConv::MaybeRetarget(nsIRequest* request) {
|
|
return NS_ERROR_NOT_IMPLEMENTED;
|
|
}
|
|
|
|
NS_IMETHODIMP
|
|
mozTXTToHTMLConv::OnStartRequest(nsIRequest* request) {
|
|
return NS_ERROR_NOT_IMPLEMENTED;
|
|
}
|
|
|
|
NS_IMETHODIMP
|
|
mozTXTToHTMLConv::OnStopRequest(nsIRequest* request, nsresult aStatus) {
|
|
return NS_ERROR_NOT_IMPLEMENTED;
|
|
}
|
|
|
|
NS_IMETHODIMP
|
|
mozTXTToHTMLConv::CiteLevelTXT(const char16_t* line, uint32_t* logLineStart,
|
|
uint32_t* _retval) {
|
|
if (!logLineStart || !_retval || !line) return NS_ERROR_NULL_POINTER;
|
|
*_retval = CiteLevelTXT(line, *logLineStart);
|
|
return NS_OK;
|
|
}
|
|
|
|
nsresult MOZ_NewTXTToHTMLConv(mozTXTToHTMLConv** aConv) {
|
|
MOZ_ASSERT(aConv != nullptr, "null ptr");
|
|
if (!aConv) return NS_ERROR_NULL_POINTER;
|
|
|
|
RefPtr<mozTXTToHTMLConv> conv = new mozTXTToHTMLConv();
|
|
conv.forget(aConv);
|
|
// return (*aConv)->Init();
|
|
return NS_OK;
|
|
}
|