Bug 971043 - Implement getTranslationNodes function to retrieve the nodes of a webpage that contain meaningful text for translation. r=smaug

This commit is contained in:
Felipe Gomes 2014-04-05 00:21:08 -03:00
parent ce7b44032d
commit 4260890571
11 changed files with 469 additions and 3 deletions

View File

@ -200,6 +200,7 @@ public:
virtual nsresult AppendText(const char16_t* aBuffer, uint32_t aLength,
bool aNotify) MOZ_OVERRIDE;
virtual bool TextIsOnlyWhitespace() MOZ_OVERRIDE;
virtual bool HasTextForTranslation() MOZ_OVERRIDE;
virtual void AppendTextTo(nsAString& aResult) MOZ_OVERRIDE;
virtual bool AppendTextTo(nsAString& aResult,
const mozilla::fallible_t&) MOZ_OVERRIDE NS_WARN_UNUSED_RESULT;

View File

@ -39,8 +39,8 @@ enum nsLinkState {
// IID for the nsIContent interface
#define NS_ICONTENT_IID \
{ 0xafa52dfb, 0x9d92, 0x4592, \
{ 0xa1, 0xd2, 0x08, 0xc4, 0x92, 0x89, 0x7f, 0xce } }
{ 0x1329e5b7, 0x4bcd, 0x450c, \
{ 0xa2, 0x3a, 0x98, 0xc5, 0x85, 0xcd, 0x73, 0xf9 } }
/**
* A node of content in a document's content model. This interface
@ -533,6 +533,14 @@ public:
*/
virtual bool TextIsOnlyWhitespace() = 0;
/**
* Method to see if the text node contains data that is useful
* for a translation: i.e., it consists of more than just whitespace,
* digits and punctuation.
* NOTE: Always returns false for elements.
*/
virtual bool HasTextForTranslation() = 0;
/**
* Append the text content to aResult.
* NOTE: This asserts and returns for elements

View File

@ -1919,6 +1919,12 @@ FragmentOrElement::TextIsOnlyWhitespace()
return false;
}
// Elements and fragments are never themselves a source of translatable
// text, so this implementation unconditionally answers false.
bool
FragmentOrElement::HasTextForTranslation()
{
return false;
}
void
FragmentOrElement::AppendTextTo(nsAString& aResult)
{

View File

@ -990,6 +990,41 @@ nsGenericDOMDataNode::TextIsOnlyWhitespace()
return true;
}
// Reports whether this data node holds text that is worth translating.
//
//  - Text stored in the two-byte representation is accepted outright, on the
//    assumption that whatever forced the wider storage was an "interesting"
//    character.
//  - Text whose cached flags already mark it as whitespace-only is rejected
//    without scanning.
//  - Otherwise the one-byte buffer is scanned and the node qualifies as soon
//    as a letter is seen: ASCII a-z / A-Z, or one of the letters found in the
//    upper part of the first 256 Unicode code points (192-255, leaving out
//    215, the multiplication sign, and 247, the division sign).
bool
nsGenericDOMDataNode::HasTextForTranslation()
{
  if (mText.Is2b()) {
    return true;
  }

  const bool knownWhitespaceOnly =
    HasFlag(NS_CACHED_TEXT_IS_ONLY_WHITESPACE) &&
    HasFlag(NS_TEXT_IS_ONLY_WHITESPACE);
  if (knownWhitespaceOnly) {
    return false;
  }

  const char* cursor = mText.Get1b();
  const char* const stop = cursor + mText.GetLength();
  while (cursor < stop) {
    // Work on the unsigned value so that bytes above 127 compare correctly.
    const unsigned char c = static_cast<unsigned char>(*cursor++);

    const bool asciiLetter =
      (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    const bool highLetter = (c >= 192 && c != 215 && c != 247);

    if (asciiLetter || highLetter) {
      return true;
    }
  }

  return false;
}
void
nsGenericDOMDataNode::AppendTextTo(nsAString& aResult)
{

View File

@ -145,6 +145,7 @@ public:
virtual nsresult AppendText(const char16_t* aBuffer, uint32_t aLength,
bool aNotify) MOZ_OVERRIDE;
virtual bool TextIsOnlyWhitespace() MOZ_OVERRIDE;
virtual bool HasTextForTranslation() MOZ_OVERRIDE;
virtual void AppendTextTo(nsAString& aResult) MOZ_OVERRIDE;
virtual bool AppendTextTo(nsAString& aResult,
const mozilla::fallible_t&) MOZ_OVERRIDE NS_WARN_UNUSED_RESULT;

View File

@ -87,6 +87,7 @@
#include "nsIInterfaceRequestorUtils.h"
#include "GeckoProfiler.h"
#include "mozilla/Preferences.h"
#include "nsIContentIterator.h"
#ifdef XP_WIN
#undef GetClassName
@ -1597,6 +1598,91 @@ nsDOMWindowUtils::NodesFromRect(float aX, float aY,
aIgnoreRootScrollFrame, aFlushLayout, aReturn);
}
NS_IMETHODIMP
nsDOMWindowUtils::GetTranslationNodes(nsIDOMNode* aRoot,
nsITranslationNodeList** aRetVal)
{
if (!nsContentUtils::IsCallerChrome()) {
return NS_ERROR_DOM_SECURITY_ERR;
}
NS_ENSURE_ARG_POINTER(aRetVal);
nsCOMPtr<nsIContent> root = do_QueryInterface(aRoot);
NS_ENSURE_STATE(root);
nsCOMPtr<nsIDocument> doc = GetDocument();
NS_ENSURE_STATE(doc);
if (root->OwnerDoc() != doc) {
return NS_ERROR_DOM_WRONG_DOCUMENT_ERR;
}
nsTHashtable<nsPtrHashKey<nsIContent>> translationNodesHash(1000);
nsRefPtr<nsTranslationNodeList> list = new nsTranslationNodeList;
uint32_t limit = 15000;
// We begin iteration with content->GetNextNode because we want to explictly
// skip the root tag from being a translation node.
nsIContent* content = root;
while ((limit > 0) && (content = content->GetNextNode(root))) {
if (!content->IsHTML()) {
continue;
}
nsIAtom* localName = content->Tag();
// Skip elements that usually contain non-translatable text content.
if (localName == nsGkAtoms::script ||
localName == nsGkAtoms::iframe ||
localName == nsGkAtoms::frameset ||
localName == nsGkAtoms::frame ||
localName == nsGkAtoms::code ||
localName == nsGkAtoms::noscript ||
localName == nsGkAtoms::style) {
continue;
}
// An element is a translation node if it contains
// at least one text node that has meaningful data
// for translation
for (nsIContent* child = content->GetFirstChild();
child;
child = child->GetNextSibling()) {
if (child->HasTextForTranslation()) {
translationNodesHash.PutEntry(content);
bool isBlockFrame = false;
nsIFrame* frame = content->GetPrimaryFrame();
if (frame) {
isBlockFrame = frame->IsFrameOfType(nsIFrame::eBlockFrame);
}
bool isTranslationRoot = isBlockFrame;
if (!isBlockFrame) {
// If an element is not a block element, it still
// can be considered a translation root if the parent
// of this element didn't make into the list of nodes
// to be translated.
bool parentInList = false;
nsIContent* parent = content->GetParent();
if (parent) {
parentInList = translationNodesHash.Contains(parent);
}
isTranslationRoot = !parentInList;
}
list->AppendElement(content->AsDOMNode(), isTranslationRoot);
--limit;
break;
}
}
}
*aRetVal = list.forget().take();
return NS_OK;
}
static TemporaryRef<DataSourceSurface>
CanvasToDataSourceSurface(nsIDOMHTMLCanvasElement* aCanvas)
{
@ -3883,3 +3969,40 @@ nsDOMWindowUtils::SetAudioVolume(float aVolume)
return window->SetAudioVolume(aVolume);
}
// XPCOM boilerplate for nsTranslationNodeList: the interface map lists the
// two interfaces the class exposes (nsISupports and nsITranslationNodeList),
// and the NS_IMPL_ADDREF / NS_IMPL_RELEASE macros supply its reference
// counting methods.
NS_INTERFACE_MAP_BEGIN(nsTranslationNodeList)
NS_INTERFACE_MAP_ENTRY(nsISupports)
NS_INTERFACE_MAP_ENTRY(nsITranslationNodeList)
NS_INTERFACE_MAP_END
NS_IMPL_ADDREF(nsTranslationNodeList)
NS_IMPL_RELEASE(nsTranslationNodeList)
// Hands the node stored at aIndex back to the caller with an extra
// reference. The lookup goes through SafeElementAt and the result is only
// addrefed when non-null, so an index without an entry presumably leaves
// *aRetVal null; the call reports NS_OK either way.
NS_IMETHODIMP
nsTranslationNodeList::Item(uint32_t aIndex, nsIDOMNode** aRetVal)
{
  NS_ENSURE_ARG_POINTER(aRetVal);

  nsIDOMNode* node = mNodes.SafeElementAt(aIndex);
  NS_IF_ADDREF(node);
  *aRetVal = node;

  return NS_OK;
}
// Reports the translation-root flag recorded for the entry at aIndex.
// Indices at or past the end of the list report false instead of failing.
NS_IMETHODIMP
nsTranslationNodeList::IsTranslationRootAtIndex(uint32_t aIndex, bool* aRetVal)
{
  NS_ENSURE_ARG_POINTER(aRetVal);

  // The bounds test short-circuits, so ElementAt only ever sees a valid index.
  *aRetVal = (aIndex < mLength) && mNodeIsRoot.ElementAt(aIndex);

  return NS_OK;
}
// Getter for the list length: copies the mLength counter into *aRetVal.
// A null aRetVal is rejected by NS_ENSURE_ARG_POINTER.
NS_IMETHODIMP
nsTranslationNodeList::GetLength(uint32_t* aRetVal)
{
NS_ENSURE_ARG_POINTER(aRetVal);
*aRetVal = mLength;
return NS_OK;
}

View File

@ -25,6 +25,32 @@ namespace mozilla {
}
}
// List of translation nodes as exposed through nsITranslationNodeList.
//
// Entries live in two parallel arrays: mNodes holds the node and
// mNodeIsRoot, at the same index, holds whether that node is a translation
// root. mLength counts the entries appended so far.
class nsTranslationNodeList MOZ_FINAL : public nsITranslationNodeList
{
public:
  NS_DECL_ISUPPORTS
  NS_DECL_NSITRANSLATIONNODELIST

  nsTranslationNodeList()
    : mLength(0)
  {
    // Reserve room up front for 1000 entries in each array.
    mNodes.SetCapacity(1000);
    mNodeIsRoot.SetCapacity(1000);
  }

  // Adds aElement together with its translation-root flag to the end of
  // the list.
  void AppendElement(nsIDOMNode* aElement, bool aIsRoot)
  {
    mNodes.AppendElement(aElement);
    mNodeIsRoot.AppendElement(aIsRoot);
    ++mLength;
  }

private:
  nsTArray<nsCOMPtr<nsIDOMNode> > mNodes;
  nsTArray<bool> mNodeIsRoot;
  uint32_t mLength;
};
class nsDOMWindowUtils MOZ_FINAL : public nsIDOMWindowUtils,
public nsSupportsWeakReference
{

View File

@ -25,6 +25,8 @@ support-files =
[test_domwindowutils.html]
[test_e4x_for_each.html]
[test_error.html]
[test_getTranslationNodes.html]
[test_getTranslationNodes_limit.html]
[test_gsp-qualified.html]
[test_gsp-quirks.html]
[test_gsp-standards.html]

View File

@ -0,0 +1,210 @@
<!DOCTYPE HTML>
<html>
<head>
<title>Test for nsIDOMWindowUtils.getTranslationNodes</title>
<script type="application/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
<link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
</head>
<body onload="runTest()">
<script type="application/javascript">
var utils = SpecialPowers.wrap(window).
QueryInterface(SpecialPowers.Ci.nsIInterfaceRequestor).
getInterface(SpecialPowers.Ci.nsIDOMWindowUtils);
// Runs getTranslationNodes on one test-case root and checks the outcome
// against the root's "expected" attribute: a space-separated list of local
// names in result order, with "[root]" appended to every translation root.
function testTranslationRoot(rootNode) {
  var nodes = utils.getTranslationNodes(rootNode);
  var expected = rootNode.getAttribute("expected");

  is(nodes.length, expected.split(" ").length,
     "Correct number of translation nodes for testcase " + rootNode.id);

  // Build the same textual description from the actual result.
  var described = [];
  for (var idx = 0; idx < nodes.length; idx++) {
    var label = nodes.item(idx).localName;
    described.push(nodes.isTranslationRootAtIndex(idx) ? label + "[root]"
                                                       : label);
  }

  is(described.length, nodes.length,
     "Correct number of translation nodes for testcase " + rootNode.id);

  is(described.join(" "), expected,
     "Correct list of translation nodes for testcase " + rootNode.id);
}
// Test entry point: checks every <div expected="..."> test case in the page,
// then verifies that a node belonging to the iframe's document is rejected
// with a WrongDocumentError.
function runTest() {
  isnot(utils, null, "nsIDOMWindowUtils");

  for (var testcase of document.querySelectorAll("div[expected]")) {
    testTranslationRoot(testcase);
  }

  var frameDoc = document.getElementById("testiframe").contentDocument;
  var foreignDiv = frameDoc.querySelector("div");

  try {
    utils.getTranslationNodes(foreignDiv);
    ok(false, "Cannot use a node from a different document");
  } catch (e) {
    is(e.name, "WrongDocumentError", "Cannot use a node from a different document");
  }

  SimpleTest.finish();
}
SimpleTest.waitForExplicitFinish();
</script>
<!-- Test that an inline element inside a root is not a root -->
<div id="testcase1"
expected="div[root] span">
<div>
lorem ipsum <span>dolor</span> sit amet
</div>
</div>
<!-- Test that a usually inline element becomes a root if it is
displayed as a block -->
<div id="testcase2"
expected="div[root] span[root]">
<div>
lorem ipsum <span style="display: block;">dolor</span> sit amet
</div>
</div>
<!-- Test that the content-less <div> is ignored and only the
<p> with content is returned -->
<div id="testcase3"
expected="p[root]">
<div>
<p>lorem ipsum</p>
</div>
</div>
<!-- Test that an inline element whose parent is not a root
becomes a root -->
<div id="testcase4"
expected="span[root]">
<div>
<span>lorem ipsum</span>
</div>
</div>
<!-- Test siblings -->
<div id="testcase5"
expected="li[root] li[root]">
<ul>
<li>lorem</li>
<li>ipsum</li>
</ul>
</div>
<!-- Test <ul> with content outside li -->
<div id="testcase6"
expected="ul[root] li[root] li[root]">
<ul>Lorem
<li>lorem</li>
<li>ipsum</li>
</ul>
</div>
<!-- Test inline siblings -->
<div id="testcase7"
expected="ul[root] li li">
<ul>Lorem
<li style="display: inline">lorem</li>
<li style="display: inline">ipsum</li>
</ul>
</div>
<!-- Test inline siblings becoming roots -->
<div id="testcase8"
expected="li[root] li[root]">
<ul>
<li style="display: inline">lorem</li>
<li style="display: inline">ipsum</li>
</ul>
</div>
<!-- Test that nodes with only punctuation, whitespace
or numbers are ignored -->
<div id="testcase9"
expected="li[root] li[root]">
<ul>
<li>lorem</li>
<li>ipsum</li>
<li>-.,;'/!@#$%^*()</li>
<li>0123456789</li>
<li>
</li>
</ul>
</div>
<!-- Test paragraphs -->
<div id="testcase10"
expected="p[root] a b p[root] a b">
<p>Lorem ipsum <a href="a.htm">dolor</a> sit <b>amet</b>, consetetur</p>
<p>Lorem ipsum <a href="a.htm">dolor</a> sit <b>amet</b>, consetetur</p>
</div>
<!-- Test that a display:none element is not ignored -->
<div id="testcase11"
expected="p[root] a b">
<p>Lorem ipsum <a href="a.htm">dolor</a> sit <b style="display:none">amet</b>, consetetur</p>
</div>
<!-- Test that deep nesting does not cause useless content to be returned -->
<div id="testcase12"
expected="p[root]">
<div>
<div>
<div>
<p>Lorem ipsum</p>
</div>
</div>
</div>
</div>
<!-- Test that, with deep nesting, an ancestor that has its own text and a nested paragraph are both returned while the empty wrappers between them are not -->
<div id="testcase13"
expected="div[root] p[root]">
<div>Lorem ipsum
<div>
<div>
<p>Lorem ipsum</p>
</div>
</div>
</div>
</div>
<!-- Test that non-html elements and elements that usually have non-translatable
content are ignored -->
<div id="testcase14"
expected="div[root]">
<div>
Lorem Ipsum
<noscript>Lorem Ipsum</noscript>
<style>.dummyClass { color: blue; }</style>
<script> /* script tag */ </script>
<code> code </code>
<iframe id="testiframe"
src="data:text/html,<div>Lorem ipsum</div>">
</iframe>
<svg>lorem</svg>
<math>ipsum</math>
</div>
</div>
<!-- Test that nesting of inline elements won't produce roots as long as
the parents are in the list of translation nodes -->
<div id="testcase15"
expected="p[root] a b span em">
<p>Lorem <a>ipsum <b>dolor <span>sit</span> amet</b></a>, <em>consetetur</em></p>
</div>
</body>
</html>

View File

@ -0,0 +1,33 @@
<!DOCTYPE HTML>
<html>
<head>
<title>Test for nsIDOMWindowUtils.getTranslationNodes</title>
<script type="application/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
<link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
</head>
<body onload="runTest()">
<script type="application/javascript">
var utils = SpecialPowers.wrap(window).
QueryInterface(SpecialPowers.Ci.nsIInterfaceRequestor).
getInterface(SpecialPowers.Ci.nsIDOMWindowUtils);
// Test entry point: appends 16000 <b>a</b> elements to the body and checks
// that getTranslationNodes reports only 15000 of them.
function runTest() {
  isnot(utils, null, "nsIDOMWindowUtils");

  var body = document.body;
  for (var count = 0; count < 16000; count++) {
    var bold = document.createElement("b");
    bold.appendChild(document.createTextNode("a"));
    body.appendChild(bold);
  }

  var translationRoots = utils.getTranslationNodes(body);
  is (translationRoots.length, 15000, "Translation nodes were limited to 15000 nodes.");

  SimpleTest.finish();
}
SimpleTest.waitForExplicitFinish();
</script>
</body>
</html>

View File

@ -42,8 +42,9 @@ interface nsIURI;
interface nsIDOMEventTarget;
interface nsIRunnable;
interface nsICompositionStringSynthesizer;
interface nsITranslationNodeList;
[scriptable, uuid(f3148b3e-6db8-4a49-aa5c-de726449054d)]
[scriptable, uuid(3d977df2-1c0e-4b61-bc21-c6ee757a9191)]
interface nsIDOMWindowUtils : nsISupports {
/**
@ -805,6 +806,16 @@ interface nsIDOMWindowUtils : nsISupports {
in boolean aIgnoreRootScrollFrame,
in boolean aFlushLayout);
/**
* Get a list of nodes that have meaningful textual content to
* be translated. The implementation of this algorithm is in flux
* as we experiment and refine which approach works best.
*
* This method requires chrome privileges.
*
* @param aRoot The node whose descendants are examined. The root itself
*              is never part of the result, and it must belong to this
*              window's document, otherwise a WrongDocumentError is
*              thrown.
* @return The matching nodes together with a flag telling whether each
*         one is a translation root. The current implementation returns
*         at most 15000 nodes.
*/
nsITranslationNodeList getTranslationNodes(in nsIDOMNode aRoot);
/**
* Compare the two canvases, returning the number of differing pixels and
* the maximum difference in a channel. This will throw an error if
@ -1628,3 +1639,13 @@ interface nsIDOMWindowUtils : nsISupports {
*/
attribute float audioVolume;
};
/**
* The list of nodes returned by nsIDOMWindowUtils.getTranslationNodes.
* Each entry is a node plus a flag telling whether that node is a
* translation root.
*/
[scriptable, uuid(c694e359-7227-4392-a138-33c0cc1f15a6)]
interface nsITranslationNodeList : nsISupports {
// Number of nodes in the list.
readonly attribute unsigned long length;
// The node at the given index.
nsIDOMNode item(in unsigned long index);
// A translation root is a block element, or an inline element
// whose parent is not a translation node.
// Returns false when index is past the end of the list.
boolean isTranslationRootAtIndex(in unsigned long index);
};