mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-10-07 01:44:42 +00:00
Bug 1863793 - Add a Bergamot translator build script; r=translations-reviewers,nordzilla
Differential Revision: https://phabricator.services.mozilla.com/D193559
This commit is contained in:
parent
39d85a0ba9
commit
d740b0a3ec
3
.gitignore
vendored
3
.gitignore
vendored
@ -71,6 +71,9 @@ browser/components/newtab/content-src/asrouter/schemas/corpus/PanelTestProvider_
|
||||
# Ignore Pocket component build and dev assets
|
||||
browser/components/pocket/content/panels/css/main.compiled.css.map
|
||||
|
||||
# Ignore downloaded thirdparty build artifacts.
|
||||
toolkit/components/translations/bergamot-translator/thirdparty
|
||||
|
||||
# Build directories for js shell
|
||||
*_DBG.OBJ/
|
||||
*_OPT.OBJ/
|
||||
|
@ -69,6 +69,9 @@ compile_commands\.json
|
||||
# Ignore Pocket component build and dev assets
|
||||
browser/components/pocket/content/panels/css/main.compiled.css.map
|
||||
|
||||
# Ignore downloaded thirdparty build artifacts.
|
||||
toolkit/components/translations/bergamot-translator/thirdparty
|
||||
|
||||
# Build directories for js shell
|
||||
_DBG\.OBJ/
|
||||
_OPT\.OBJ/
|
||||
|
@ -1424,7 +1424,8 @@ toolkit/components/normandy/vendor/
|
||||
toolkit/components/passwordmgr/PasswordRulesParser.sys.mjs
|
||||
toolkit/components/protobuf/
|
||||
toolkit/components/translation/cld2/
|
||||
toolkit/components/translations/bergamot-translator
|
||||
toolkit/components/translations/bergamot-translator/thirdparty
|
||||
toolkit/components/translations/bergamot-translator/bergamot-translator.js
|
||||
toolkit/components/url-classifier/chromium/
|
||||
toolkit/components/utils/mozjexl.js
|
||||
toolkit/components/viaduct/fetch_msg_types.pb.cc
|
||||
|
269
toolkit/components/translations/bergamot-translator/build-bergamot.py
Executable file
269
toolkit/components/translations/bergamot-translator/build-bergamot.py
Executable file
@ -0,0 +1,269 @@
|
||||
#!/usr/bin/env python3
|
||||
#
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
"""
|
||||
Builds the Bergamot translations engine for integration with Firefox.
|
||||
|
||||
If you wish to test the Bergamot engine locally, then uncomment the .wasm line in
|
||||
the toolkit/components/translations/jar.mn after building the file. Just make sure
|
||||
not to check the code change in.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import multiprocessing
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
from collections import namedtuple
|
||||
|
||||
import yaml
|
||||
|
||||
# Everything is resolved relative to the directory containing this script.
DIR_PATH = os.path.realpath(os.path.dirname(__file__))
ROOT_PATH = os.path.join(DIR_PATH, "../../../..")
MOZ_YAML_PATH = os.path.join(DIR_PATH, "moz.yaml")
PATCHES_PATH = os.path.join(DIR_PATH, "patches")
FINAL_JS_PATH = os.path.join(DIR_PATH, "bergamot-translator.js")

# Downloaded checkouts and build output all live under the thirdparty directory.
THIRD_PARTY_PATH = os.path.join(DIR_PATH, "thirdparty")
BUILD_PATH = os.path.join(THIRD_PARTY_PATH, "build-wasm")
WASM_PATH = os.path.join(BUILD_PATH, "bergamot-translator-worker.wasm")
JS_PATH = os.path.join(BUILD_PATH, "bergamot-translator-worker.js")

BERGAMOT_PATH = os.path.join(THIRD_PARTY_PATH, "bergamot-translator")
MARIAN_PATH = os.path.join(BERGAMOT_PATH, "3rd_party/marian-dev")
GEMM_SCRIPT = os.path.join(BERGAMOT_PATH, "wasm/patch-artifacts-import-gemm-module.sh")

EMSDK_PATH = os.path.join(THIRD_PARTY_PATH, "emsdk")
EMSDK_ENV_PATH = os.path.join(EMSDK_PATH, "emsdk_env.sh")

# 3.1.47 had an error compiling sentencepiece.
EMSDK_VERSION = "3.1.8"
EMSDK_REVISION = "2346baa7bb44a4a0571cc75f1986ab9aaa35aa03"

# Pairs of (repository to patch, patch file), in the order they are applied.
patches = [
    (target_repo, os.path.join(PATCHES_PATH, patch_name))
    for target_repo, patch_name in (
        (BERGAMOT_PATH, "allocation-bergamot.patch"),
        (MARIAN_PATH, "allocation-marian.patch"),
    )
]

# RawTextHelpFormatter preserves whitespace in the help text.
parser = argparse.ArgumentParser(
    description=__doc__, formatter_class=argparse.RawTextHelpFormatter
)
parser.add_argument(
    "--clobber",
    action="store_true",
    help="Clobber the build artifacts",
)
parser.add_argument(
    "--debug",
    action="store_true",
    help="Build with debug symbols, useful for profiling",
)

# Names the attributes of the parsed arguments; used in type annotations.
ArgNamespace = namedtuple("ArgNamespace", ["clobber", "debug"])
|
||||
|
||||
|
||||
def git_clone_update(name: str, repo_path: str, repo_url: str, revision: str):
    """
    Ensure a git checkout exists at repo_path and is checked out at revision.

    The repository is cloned from repo_url when repo_path does not exist yet. When the
    checkout's HEAD differs from revision, the latest changes are fetched, the revision
    is checked out, and the submodules are updated recursively.

    Args:
        name: Human-readable name of the repository, only used in log output.
        repo_path: Directory of the local checkout.
        repo_url: URL to clone from when the checkout is missing.
        revision: The commit that HEAD is compared against and that gets checked out.

    Raises:
        subprocess.CalledProcessError: If any git command exits with a non-zero status.
    """
    if not os.path.exists(repo_path):
        print(f"\n⬇️ Clone the {name} repo into {repo_path}\n")
        subprocess.check_call(
            # Pass the destination explicitly. Otherwise git derives the directory name
            # from the URL, and the commands below (which run in repo_path) would fail
            # whenever the two do not happen to match.
            ["git", "clone", repo_url, repo_path],
            cwd=THIRD_PARTY_PATH,
        )

    local_head = subprocess.check_output(
        ["git", "rev-parse", "HEAD"],
        cwd=repo_path,
        text=True,
    ).strip()

    def run(command):
        return subprocess.check_call(command, cwd=repo_path)

    if local_head != revision:
        print(f"The head ({local_head}) and revision ({revision}) don't match.")
        print(f"\n🔎 Fetching the latest from {name}.\n")
        run(["git", "fetch", "--recurse-submodules"])

        print(f"🛒 Checking out the revision {revision}")
        run(["git", "checkout", revision])
        run(["git", "submodule", "update", "--init", "--recursive"])
|
||||
|
||||
|
||||
def install_and_activate_emscripten(args: ArgNamespace):
    """
    Fetch the emsdk checkout at EMSDK_REVISION, then install and activate EMSDK_VERSION.

    The args parameter is accepted but not read by this function.
    """
    git_clone_update(
        "emsdk",
        EMSDK_PATH,
        "https://github.com/emscripten-core/emsdk.git",
        EMSDK_REVISION,
    )

    steps = (
        (f"\n🛠️ Installing EMSDK version {EMSDK_VERSION}\n", "./emsdk install "),
        ("\n🛠️ Activating emsdk\n", "./emsdk activate "),
    )
    for banner, emsdk_command in steps:
        print(banner)
        # Run these commands in the shell so that the configuration is saved.
        subprocess.run(
            emsdk_command + EMSDK_VERSION,
            cwd=EMSDK_PATH,
            shell=True,
            check=True,
        )
|
||||
|
||||
|
||||
def install_bergamot():
    """Clone or update the Bergamot checkout to the origin url and revision in moz.yaml."""
    with open(MOZ_YAML_PATH, "r", encoding="utf8") as moz_yaml_file:
        moz_yaml = yaml.safe_load(moz_yaml_file.read())

    # The "origin" section of moz.yaml records where the vendored code comes from.
    origin_url = moz_yaml["origin"]["url"]
    origin_revision = moz_yaml["origin"]["revision"]

    git_clone_update(
        name="bergamot",
        repo_path=BERGAMOT_PATH,
        repo_url=origin_url,
        revision=origin_revision,
    )
|
||||
|
||||
|
||||
def to_human_readable(size):
    """Format a byte count as megabytes (two decimals) followed by the raw byte count."""
    bytes_per_megabyte = 1048576
    return "{:.2f}M ({} bytes)".format(size / bytes_per_megabyte, size)
|
||||
|
||||
|
||||
def apply_git_patch(repo_path, patch_path):
    """Apply the patch file at patch_path to the checkout at repo_path via `git apply`."""
    repo_name = os.path.basename(repo_path)
    print(f"Applying patch {patch_path} to {repo_name}")

    git_command = ["git", "apply", "--reject", patch_path]
    subprocess.check_call(git_command, cwd=repo_path)
|
||||
|
||||
|
||||
def revert_git_patch(repo_path, patch_path):
    """Reverse-apply (`git apply -R`) the patch at patch_path in the checkout at repo_path."""
    repo_name = os.path.basename(repo_path)
    print(f"Reverting patch {patch_path} from {repo_name}")

    git_command = ["git", "apply", "-R", "--reject", patch_path]
    subprocess.check_call(git_command, cwd=repo_path)
|
||||
|
||||
|
||||
def build_bergamot(args: ArgNamespace):
    """
    Build the Bergamot wasm artifacts in BUILD_PATH and report their sizes.

    The source patches listed in `patches` are applied for the duration of the build.
    Every patch that was applied successfully is reverted afterwards, in reverse order,
    whether or not the build succeeded.

    Args:
        args: The parsed command line. `clobber` removes an existing BUILD_PATH first,
            and `debug` passes -DCMAKE_BUILD_TYPE=RelWithDebInfo to CMake.

    Raises:
        subprocess.CalledProcessError: If patching, a build step, or gzip fails.
    """
    import shlex

    if args.clobber and os.path.exists(BUILD_PATH):
        shutil.rmtree(BUILD_PATH)

    if not os.path.exists(BUILD_PATH):
        os.mkdir(BUILD_PATH)

    # These commands require the emsdk environment variables to be set up.
    def run_shell(command):
        return subprocess.run(
            # "source" is not available in all shells, so invoke bash explicitly. The
            # script is handed to bash as a single argument, which avoids nesting it
            # inside another layer of shell quoting.
            ["bash", "-c", f"source {shlex.quote(EMSDK_ENV_PATH)} && {command}"],
            cwd=BUILD_PATH,
            check=True,
        )

    # Only patches recorded here are reverted. A patch whose `git apply` failed is not
    # recorded, so the revert step never runs against a patch that was not applied.
    applied_patches = []

    try:
        print("\n 🖌️ Applying source code patches\n")
        for repo_path, patch_path in patches:
            apply_git_patch(repo_path, patch_path)
            applied_patches.append((repo_path, patch_path))

        flags = ""
        if args.debug:
            flags = "-DCMAKE_BUILD_TYPE=RelWithDebInfo"

        print("\n 🏃 Running CMake for Bergamot\n")
        run_shell(
            f"emcmake cmake -DCOMPILE_WASM=on {flags} {shlex.quote(BERGAMOT_PATH)}"
        )

        print("\n 🏃 Building Bergamot with emmake\n")
        run_shell(f"emmake make -j {multiprocessing.cpu_count()}")

        print("\n 🪚 Patching Bergamot for gemm support\n")
        subprocess.check_call(["bash", GEMM_SCRIPT, BUILD_PATH])

        print("\n✅ Build complete\n")
        print(" " + JS_PATH)
        print(" " + WASM_PATH)

        # Get the sizes of the build artifacts.
        wasm_size = os.path.getsize(WASM_PATH)
        # Run gzip directly and count the bytes here. In a `gzip | wc` pipeline only
        # the exit status of `wc` is checked, so a gzip failure would go unnoticed.
        gzip_size = len(
            subprocess.run(
                ["gzip", "-c", WASM_PATH],
                check=True,
                capture_output=True,
            ).stdout
        )
        print(f" Uncompressed wasm size: {to_human_readable(wasm_size)}")
        print(f" Compressed wasm size: {to_human_readable(gzip_size)}")
    finally:
        print("\n🖌️ Reverting the source code patches\n")
        for repo_path, patch_path in reversed(applied_patches):
            revert_git_patch(repo_path, patch_path)
|
||||
|
||||
|
||||
def write_final_bergamot_js_file():
    """
    The generated JS file requires some light patching for integration.

    The build output at JS_PATH is wrapped in a `loadBergamot(Module)` function with a
    license header, `console.log(` calls are redirected to `Module.print(`, and a log
    statement is added to `updateGlobalBufferAndViews`. The result is formatted with
    `./mach eslint --fix` and then moved to FINAL_JS_PATH.

    Raises:
        subprocess.CalledProcessError: If eslint exits with a non-zero status. Its
            captured output is printed first, and the temporary file is left in place.
    """

    header = "\n".join(
        [
            "/* This Source Code Form is subject to the terms of the Mozilla Public",
            " * License, v. 2.0. If a copy of the MPL was not distributed with this",
            " * file, You can obtain one at http://mozilla.org/MPL/2.0/. */",
            "",
            "function loadBergamot(Module) {",
            "",
        ]
    )

    with open(JS_PATH, "r", encoding="utf8") as file:
        body = "".join(" " + line for line in file)

    source = header + body + " return Module;\n}"

    # Use the Module's printing.
    source = source.replace("console.log(", "Module.print(")

    # Add some instrumentation to the module's memory size.
    source = source.replace(
        "function updateGlobalBufferAndViews(buf) {",
        """
function updateGlobalBufferAndViews(buf) {
const mb = (buf.byteLength / 1_000_000).toFixed();
Module.print(
`Growing wasm buffer to ${mb}MB (${buf.byteLength} bytes).`
);
""",
    )

    print("\n Formatting the final bergamot file")
    # Create the file outside of this directory so it's not ignored by eslint.
    temp_path = os.path.join(DIR_PATH, "../temp-bergamot.js")
    with open(temp_path, "w", encoding="utf8") as file:
        file.write(source)

    try:
        subprocess.run(
            # Pass the arguments as a list so the path is never parsed by a shell.
            [os.path.join(ROOT_PATH, "mach"), "eslint", "--fix", temp_path],
            cwd=ROOT_PATH,
            check=True,
            capture_output=True,
            text=True,
        )
    except subprocess.CalledProcessError as error:
        # The output was captured, so it has to be printed explicitly. Otherwise the
        # only information about the failure would be the exit status.
        print(f"\n eslint failed on {temp_path}")
        if error.stdout:
            print(error.stdout)
        if error.stderr:
            print(error.stderr)
        raise

    print(f"\n Writing out final bergamot file: {FINAL_JS_PATH}")
    shutil.move(temp_path, FINAL_JS_PATH)
|
||||
|
||||
|
||||
def main():
    """Parse the command line, then fetch the dependencies, build, and write the JS file."""
    cli_args: ArgNamespace = parser.parse_args()

    # The checkouts and the build directory are all created inside this directory.
    third_party_missing = not os.path.exists(THIRD_PARTY_PATH)
    if third_party_missing:
        os.mkdir(THIRD_PARTY_PATH)

    install_and_activate_emscripten(cli_args)
    install_bergamot()
    build_bergamot(cli_args)
    write_final_bergamot_js_file()


# Only run the build when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
@ -16,15 +16,15 @@ origin:
|
||||
|
||||
# Full URL for the package's homepage/etc
|
||||
# Usually different from repository url
|
||||
url: https://github.com/mozilla/bergamot-translator/
|
||||
url: https://github.com/browsermt/bergamot-translator.git
|
||||
|
||||
# Human-readable identifier for this version/release
|
||||
# Generally "version NNN", "tag SSS", "bookmark SSS"
|
||||
release: v0.4.4
|
||||
release: v0.4.5
|
||||
|
||||
# Revision to pull in
|
||||
# Must be a long or short commit SHA (long preferred)
|
||||
revision: 5ae1b1ebb3fa9a3eabed8a64ca6798154bd486eb
|
||||
revision: 05a87784973b6e1cc591f1f1a9a05c5873d9971e
|
||||
|
||||
# The package's license, where possible using the mnemonic from
|
||||
# https://spdx.org/licenses/
|
||||
|
@ -0,0 +1,26 @@
|
||||
commit dfa705777729fd084f0187a90f9712eb76ea9209
|
||||
parent 05a87784973b6e1cc591f1f1a9a05c5873d9971e
|
||||
Author: Greg Tatum <tatum.creative@gmail.com>
|
||||
Date: Tue Nov 7 10:57:07 2023 -0600
|
||||
|
||||
Change allocation strategy
|
||||
|
||||
This fixes an issue where the memory would grow to 500mb by pre-allocating large
|
||||
workspaces. For some reason the "workspace" configuration for the Wasm build wasn't
|
||||
fixing this, but hard-coding the value does. Perhaps the configuration file in Bergamot
|
||||
is not working correctly, or it was just a mistake on the author's part. Empirically
|
||||
this value keeps memory from growing too rapidly, and does not degrade Wasm performance.
|
||||
|
||||
diff --git a/src/translator/translation_model.cpp b/src/translator/translation_model.cpp
|
||||
index 3f91ebb..61a299f 100644
|
||||
--- a/src/translator/translation_model.cpp
|
||||
+++ b/src/translator/translation_model.cpp
|
||||
@@ -59,7 +59,7 @@ void TranslationModel::loadBackend(size_t idx) {
|
||||
graph->setDefaultElementType(typeFromString(prec[0]));
|
||||
graph->setDevice(device_);
|
||||
graph->getBackend()->configureDevice(options_);
|
||||
- graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
|
||||
+ graph->reserveWorkspaceMB(5);
|
||||
|
||||
// Marian Model: Load from memoryBundle or shortList
|
||||
if (memory_.model.size() > 0 &&
|
@ -0,0 +1,25 @@
|
||||
commit 31a05b47381a5b22b57fe9af7805fa40a5c5e384
|
||||
parent 11c6ae7c46be21ef96ed10c60f28022fa968939f
|
||||
Author: Greg Tatum <tatum.creative@gmail.com>
|
||||
Date: Mon Nov 6 14:01:32 2023 -0600
|
||||
|
||||
Change allocation strategy for tensors
|
||||
|
||||
When tensors grow, they would pre-emptively allocate large amounts of memory, and
|
||||
would allocate ~500mb of memory for a single translation. Adjusting this value
|
||||
down appears to fix this issue. Empirically this value keeps memory from growing too
|
||||
rapidly, and does not degrade Wasm performance.
|
||||
|
||||
diff --git a/src/tensors/tensor_allocator.h b/src/tensors/tensor_allocator.h
|
||||
index e3bc79f9..66f8e44d 100644
|
||||
--- a/src/tensors/tensor_allocator.h
|
||||
+++ b/src/tensors/tensor_allocator.h
|
||||
@@ -13,7 +13,7 @@ class TensorAllocator {
|
||||
private:
|
||||
const size_t CHUNK = 128;
|
||||
const size_t MBYTE = 1024 * 1024;
|
||||
- const size_t GROW = CHUNK * MBYTE;
|
||||
+ const size_t GROW = MBYTE;
|
||||
const size_t ALIGN = 256;
|
||||
|
||||
Ptr<Backend> backend_;
|
@ -493,19 +493,19 @@ class BergamotUtils {
|
||||
*
|
||||
* https://github.com/mozilla/bergamot-translator/
|
||||
*
|
||||
* @param {ArrayBuffer} wasmBinary
|
||||
* @param {ArrayBuffer} wasm
|
||||
* @returns {Promise<Bergamot>}
|
||||
*/
|
||||
static initializeWasm(wasmBinary) {
|
||||
static initializeWasm(wasm) {
|
||||
return new Promise((resolve, reject) => {
|
||||
/** @type {number} */
|
||||
let start = performance.now();
|
||||
|
||||
/** @type {Bergamot} */
|
||||
const bergamot = loadBergamot({
|
||||
// This is the amount of memory that a simple run of Bergamot uses, in byte.
|
||||
INITIAL_MEMORY: 459_276_288,
|
||||
preRun: [],
|
||||
// This is the amount of memory that a simple run of Bergamot uses, in bytes.
|
||||
INITIAL_MEMORY: 234_291_200,
|
||||
print: log,
|
||||
onAbort() {
|
||||
reject(new Error("Error loading Bergamot wasm module."));
|
||||
},
|
||||
@ -519,7 +519,7 @@ class BergamotUtils {
|
||||
await Promise.resolve();
|
||||
resolve(bergamot);
|
||||
},
|
||||
wasmBinary,
|
||||
wasm,
|
||||
});
|
||||
});
|
||||
}
|
||||
|
@ -179,7 +179,8 @@ toolkit/components/normandy/vendor/
|
||||
toolkit/components/passwordmgr/PasswordRulesParser.sys.mjs
|
||||
toolkit/components/protobuf/
|
||||
toolkit/components/translation/cld2/
|
||||
toolkit/components/translations/bergamot-translator
|
||||
toolkit/components/translations/bergamot-translator/thirdparty
|
||||
toolkit/components/translations/bergamot-translator/bergamot-translator.js
|
||||
toolkit/components/url-classifier/chromium/
|
||||
toolkit/components/utils/mozjexl.js
|
||||
toolkit/components/viaduct/fetch_msg_types.pb.cc
|
||||
|
Loading…
Reference in New Issue
Block a user