mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-11-24 05:11:16 +00:00
Bug 1836974 - Use fastText pref in full-page translation r=gregtatum
Uses the fastText pref to determine which method of language identification to use in full-page translation. Differential Revision: https://phabricator.services.mozilla.com/D180746
This commit is contained in:
parent
1f09c9f711
commit
89332109a8
@ -75,6 +75,7 @@ const intermittently_loaded_scripts = {
|
||||
|
||||
// Translations code which may be preffed on.
|
||||
"resource://gre/actors/TranslationsChild.sys.mjs",
|
||||
"resource://gre/modules/translation/LanguageDetector.sys.mjs",
|
||||
"chrome://global/content/translations/language-id-engine.sys.mjs",
|
||||
"resource://gre/modules/ConsoleAPIStorage.sys.mjs", // Logging related.
|
||||
|
||||
|
@ -7,6 +7,20 @@ import { clearTimeout, setTimeout } from "resource://gre/modules/Timer.sys.mjs";
|
||||
|
||||
const WORKER_URL = "resource://gre/modules/translation/cld-worker.js";
|
||||
|
||||
/**
|
||||
* The length of the substring to pull from the document's text for language
|
||||
* identification.
|
||||
*
|
||||
* This value should ideally be one that is large enough to yield a confident
|
||||
* identification result without being too large or expensive to extract.
|
||||
*
|
||||
* At this time, this value is not driven by statistical data or analysis.
|
||||
*
|
||||
* For the moment, while we investigate which language identification library
|
||||
* we would like to use, keep this logic in sync with language-id-engine.sys.mjs
|
||||
*/
|
||||
const DOC_TEXT_TO_IDENTIFY_LENGTH = 1024;
|
||||
|
||||
export var workerManager = {
|
||||
// Since Emscripten can handle heap growth, but not heap shrinkage, we
|
||||
// need to refresh the worker after we've processed a particularly large
|
||||
@ -150,4 +164,27 @@ export var LanguageDetector = {
|
||||
|
||||
return workerManager.detectLanguage(aParams);
|
||||
},
|
||||
|
||||
/**
|
||||
* Attempts to determine the language in which the document's content is written.
|
||||
*
|
||||
* For the moment, while we investigate which language identification library
|
||||
* we would like to use, keep this logic in sync with language-id-engine.sys.mjs
|
||||
* @returns {Promise<string | null>}
|
||||
*/
|
||||
async detectLanguageFromDocument(aDocument) {
|
||||
// Grab a selection of text.
|
||||
// Serialize the document as plain text through a document encoder.
let encoder = Cu.createDocumentEncoder("text/plain");
|
||||
// The SkipInvisibleContent flag is passed so the encoder is asked to leave
// out content that is not visible.
encoder.init(aDocument, "text/plain", encoder.SkipInvisibleContent);
|
||||
// Request at most DOC_TEXT_TO_IDENTIFY_LENGTH of encoded text, then drop
// carriage returns and turn newlines into spaces so the sample is one line.
let text = encoder
|
||||
.encodeToStringWithMaxLength(DOC_TEXT_TO_IDENTIFY_LENGTH)
|
||||
.replaceAll("\r", "")
|
||||
.replaceAll("\n", " ");
|
||||
|
||||
// Hand the sample to the worker-backed detector and wait for its result.
const { language, confident } = await workerManager.detectLanguage({
|
||||
text,
|
||||
});
|
||||
|
||||
// Only report a language when the detector marks its result as confident;
// otherwise resolve to null.
return confident ? language : null;
|
||||
},
|
||||
};
|
||||
|
@ -8,6 +8,8 @@ ChromeUtils.defineESModuleGetters(lazy, {
|
||||
"chrome://global/content/translations/translations-engine.sys.mjs",
|
||||
LanguageIdEngine:
|
||||
"chrome://global/content/translations/language-id-engine.sys.mjs",
|
||||
LanguageDetector:
|
||||
"resource://gre/modules/translation/LanguageDetector.sys.mjs",
|
||||
});
|
||||
|
||||
/**
|
||||
@ -81,11 +83,16 @@ export class TranslationsChild extends JSWindowActorChild {
|
||||
return this.document.documentElement.lang;
|
||||
case "Translations:IdentifyLanguage": {
|
||||
try {
|
||||
const engine = await this.createLanguageIdEngine();
|
||||
if (!engine) {
|
||||
return null;
|
||||
if (data.useFastText) {
|
||||
const engine = await this.createLanguageIdEngine();
|
||||
if (!engine) {
|
||||
return null;
|
||||
}
|
||||
return engine.identifyLanguageFromDocument(this.document);
|
||||
}
|
||||
return engine.identifyLanguageFromDocument(this.document);
|
||||
return lazy.LanguageDetector.detectLanguageFromDocument(
|
||||
this.document
|
||||
);
|
||||
} catch (error) {
|
||||
return null;
|
||||
}
|
||||
|
@ -99,6 +99,12 @@ XPCOMUtils.defineLazyPreferenceGetter(
|
||||
"browser.translations.simulateUnsupportedEngine"
|
||||
);
|
||||
|
||||
// Defines `lazy.useFastTextPref`, backed by the
// "browser.translations.languageDetection.fastText" preference.
// NOTE(review): presumably a boolean toggle selecting fastText-based language
// identification — confirm against the pref's declared type and default.
XPCOMUtils.defineLazyPreferenceGetter(
|
||||
lazy,
|
||||
"useFastTextPref",
|
||||
"browser.translations.languageDetection.fastText"
|
||||
);
|
||||
|
||||
// At this time the signatures of the files are not being checked when they are being
|
||||
// loaded from disk. This signature check involves hitting the network, and translations
|
||||
// are explicitly an offline-capable feature. See Bug 1827265 for re-enabling this
|
||||
@ -1671,7 +1677,9 @@ export class TranslationsParent extends JSWindowActorParent {
|
||||
}
|
||||
|
||||
async queryIdentifyLanguage() {
|
||||
return this.sendQuery("Translations:IdentifyLanguage").catch(error => {
|
||||
return this.sendQuery("Translations:IdentifyLanguage", {
|
||||
useFastText: lazy.useFastTextPref,
|
||||
}).catch(error => {
|
||||
if (this.#isDestroyed) {
|
||||
// The actor was destroyed while this message was still being resolved.
|
||||
return null;
|
||||
|
@ -44,6 +44,9 @@ const DOC_LANGUAGE_DETECTION_THRESHOLD = 0.65;
|
||||
* identification result without being too large or expensive to extract.
|
||||
*
|
||||
* At this time, this value is not driven by statistical data or analysis.
|
||||
*
|
||||
* For the moment, while we investigate which language identification library
|
||||
* we would like to use, keep this logic in sync with LanguageDetector.sys.mjs
|
||||
*/
|
||||
const DOC_TEXT_TO_IDENTIFY_LENGTH = 1024;
|
||||
|
||||
@ -196,6 +199,10 @@ export class LanguageIdEngine {
|
||||
}
|
||||
|
||||
/**
|
||||
* Attempts to determine the language in which the document's content is written.
|
||||
*
|
||||
* For the moment, while we investigate which language identification library
|
||||
* we would like to use, keep this logic in sync with LanguageDetector.sys.mjs
|
||||
* @returns {Promise<string | null>}
|
||||
*/
|
||||
async identifyLanguageFromDocument(document) {
|
||||
|
Loading…
Reference in New Issue
Block a user