Bug 1836974 - Use fastText pref in full-page translation r=gregtatum

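Uses the browser.translations.languageDetection.fastText pref to determine which
method of language identification to use in full-page translation: the
LanguageIdEngine when the pref is set, and the CLD-based LanguageDetector otherwise.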

Differential Revision: https://phabricator.services.mozilla.com/D180746
This commit is contained in:
Erik Nordin 2023-06-15 18:33:41 +00:00
parent 1f09c9f711
commit 89332109a8
5 changed files with 65 additions and 5 deletions

View File

@ -75,6 +75,7 @@ const intermittently_loaded_scripts = {
// Translations code which may be preffed on.
"resource://gre/actors/TranslationsChild.sys.mjs",
"resource://gre/modules/translation/LanguageDetector.sys.mjs",
"chrome://global/content/translations/language-id-engine.sys.mjs",
"resource://gre/modules/ConsoleAPIStorage.sys.mjs", // Logging related.

View File

@ -7,6 +7,20 @@ import { clearTimeout, setTimeout } from "resource://gre/modules/Timer.sys.mjs";
const WORKER_URL = "resource://gre/modules/translation/cld-worker.js";
/**
* The length of the substring to pull from the document's text for language
* identification.
*
* This value should ideally be one that is large enough to yield a confident
* identification result without being too large or expensive to extract.
*
* At this time, this value is not driven by statistical data or analysis.
*
* For the moment, while we investigate which language identification library
* we would like to use, keep this logic in sync with language-id-engine.sys.mjs
*/
const DOC_TEXT_TO_IDENTIFY_LENGTH = 1024;
export var workerManager = {
// Since Emscripten can handle heap growth, but not heap shrinkage, we
// need to refresh the worker after we've processed a particularly large
@ -150,4 +164,27 @@ export var LanguageDetector = {
return workerManager.detectLanguage(aParams);
},
/**
* Attempts to determine the language in which the document's content is written.
*
* For the moment, while we investigate which language identification library
* we would like to use, keep this logic in sync with language-id-engine.sys.mjs
* @returns {string | null}
*/
async detectLanguageFromDocument(aDocument) {
// Grab a selection of text.
let encoder = Cu.createDocumentEncoder("text/plain");
encoder.init(aDocument, "text/plain", encoder.SkipInvisibleContent);
let text = encoder
.encodeToStringWithMaxLength(DOC_TEXT_TO_IDENTIFY_LENGTH)
.replaceAll("\r", "")
.replaceAll("\n", " ");
const { language, confident } = await workerManager.detectLanguage({
text,
});
return confident ? language : null;
},
};

View File

@ -8,6 +8,8 @@ ChromeUtils.defineESModuleGetters(lazy, {
"chrome://global/content/translations/translations-engine.sys.mjs",
LanguageIdEngine:
"chrome://global/content/translations/language-id-engine.sys.mjs",
LanguageDetector:
"resource://gre/modules/translation/LanguageDetector.sys.mjs",
});
/**
@ -81,11 +83,16 @@ export class TranslationsChild extends JSWindowActorChild {
return this.document.documentElement.lang;
case "Translations:IdentifyLanguage": {
try {
const engine = await this.createLanguageIdEngine();
if (!engine) {
return null;
if (data.useFastText) {
const engine = await this.createLanguageIdEngine();
if (!engine) {
return null;
}
return engine.identifyLanguageFromDocument(this.document);
}
return engine.identifyLanguageFromDocument(this.document);
return lazy.LanguageDetector.detectLanguageFromDocument(
this.document
);
} catch (error) {
return null;
}

View File

@ -99,6 +99,12 @@ XPCOMUtils.defineLazyPreferenceGetter(
"browser.translations.simulateUnsupportedEngine"
);
// Exposes the "browser.translations.languageDetection.fastText" pref as
// lazy.useFastTextPref. Its value is forwarded as `useFastText` with the
// "Translations:IdentifyLanguage" query to select the language identification method.
XPCOMUtils.defineLazyPreferenceGetter(
lazy,
"useFastTextPref",
"browser.translations.languageDetection.fastText"
);
// At this time the signatures of the files are not being checked when they are being
// loaded from disk. This signature check involves hitting the network, and translations
// are explicitly an offline-capable feature. See Bug 1827265 for re-enabling this
@ -1671,7 +1677,9 @@ export class TranslationsParent extends JSWindowActorParent {
}
async queryIdentifyLanguage() {
return this.sendQuery("Translations:IdentifyLanguage").catch(error => {
return this.sendQuery("Translations:IdentifyLanguage", {
useFastText: lazy.useFastTextPref,
}).catch(error => {
if (this.#isDestroyed) {
// The actor was destroyed while this message was still being resolved.
return null;

View File

@ -44,6 +44,9 @@ const DOC_LANGUAGE_DETECTION_THRESHOLD = 0.65;
* identification result without being too large or expensive to extract.
*
* At this time, this value is not driven by statistical data or analysis.
*
* For the moment, while we investigate which language identification library
* we would like to use, keep this logic in sync with LanguageDetector.sys.mjs
*/
const DOC_TEXT_TO_IDENTIFY_LENGTH = 1024;
@ -196,6 +199,10 @@ export class LanguageIdEngine {
}
/**
* Attempts to determine the language in which the document's content is written.
*
* For the moment, while we investigate which language identification library
* we would like to use, keep this logic in sync with LanguageDetector.sys.mjs
* @returns {string | null}
*/
async identifyLanguageFromDocument(document) {