From 8f6b156c98da4b3a12f29d09a25343995a4ed6ba Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Mon, 26 Jun 2023 20:02:49 +0000 Subject: [PATCH] Bug 1839658 - Introduce a hack to reduce the use of snapshot.debian.org. r=jcristau snapshot.debian.org is causing us some grief by being extremely slow (there are also occasions of it returning e.g. 404s for files that do exist, but those are less common). The reason we use snapshot.debian.org rather than deb.debian.org is for reproducibility of the Docker images. But, in practical terms, most of the packages we do pull from snapshot.debian.org are on deb.debian.org because they haven't received updates between the snapshot date and now. So we interpose the HTTP method from APT, such that when it requests a URL on snapshot.debian.org, we check whether the equivalent URL exists on deb.debian.org and make it use that instead. The downside is that we do need to pull Python first before being able to use the script, since it's not in the base Docker image. There are a few alternatives I've considered: - A shell script, but that's more fragile, and requires curl or wget, neither of which is available. - A Perl script, as Perl is available in the base image, but pulling the required libraries is as much data as pulling Python, spread across more packages, which unfortunately tends to make it slower to download. - A custom proxy server, but that felt like more work, and would still require the same amount of extra packages. I've kept the hack simple, not looking too deeply into the internal protocol used by APT to talk to its HTTP method. We may want to dig deeper in the future, though, if the 404s get too problematic. 
Differential Revision: https://phabricator.services.mozilla.com/D181830 --- taskcluster/docker/debian-raw/Dockerfile | 5 + .../docker/debian-raw/snapshot-hack.py | 105 ++++++++++++++++++ 2 files changed, 110 insertions(+) create mode 100755 taskcluster/docker/debian-raw/snapshot-hack.py diff --git a/taskcluster/docker/debian-raw/Dockerfile b/taskcluster/docker/debian-raw/Dockerfile index e1a650280256..c331b620ee34 100644 --- a/taskcluster/docker/debian-raw/Dockerfile +++ b/taskcluster/docker/debian-raw/Dockerfile @@ -11,6 +11,7 @@ CMD ["/bin/bash", "--login"] COPY topsrcdir/taskcluster/docker/recipes/setup_packages.sh /usr/local/sbin/ COPY taskcluster-hack.sh /usr/local/sbin +COPY snapshot-hack.py /usr/local/sbin COPY gpgvnoexpkeysig /usr/local/sbin ARG DIST @@ -57,6 +58,10 @@ RUN if [ -n "$DIST" ]; then for s in debian_$DIST debian_$DIST-updates debian_$D ) > /etc/apt/preferences.d/99taskcluster RUN apt-get update && \ + if grep -q snapshot.debian.org /etc/apt/sources.list; then \ + apt-get install python3-minimal libpython3-stdlib; \ + echo 'dir::bin::methods::http "/usr/local/sbin/snapshot-hack.py";' >> /etc/apt/apt.conf.d/99taskcluster; \ + fi && \ apt-get dist-upgrade && \ apt-get install \ apt-transport-https \ diff --git a/taskcluster/docker/debian-raw/snapshot-hack.py b/taskcluster/docker/debian-raw/snapshot-hack.py new file mode 100755 index 000000000000..6e880f0a741d --- /dev/null +++ b/taskcluster/docker/debian-raw/snapshot-hack.py @@ -0,0 +1,105 @@ +#!/usr/bin/python3 +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +import subprocess +import sys +import threading +import urllib.request +from urllib.parse import urlparse, urlunparse + +# This script interposes between APT and its HTTP method. APT sends queries on +# stdin, and expect responses on stdout. 
# We intercept those and change the snapshot.debian.org URLs it requests on the
# fly, if the equivalent URLs exist on deb.debian.org.

URI_HEADER = "URI: "

# Upper bound, in seconds, on the existence probe sent to deb.debian.org, so
# that a stalled mirror cannot block APT indefinitely.
PROBE_TIMEOUT = 30


def url_exists(url, timeout=PROBE_TIMEOUT):
    """Return True if a HEAD request for `url` is answered with HTTP 200.

    Any failure (malformed URL, network error, timeout, HTTP error status)
    is reported as False, so that the caller simply keeps the original URL.

    :param url: the URL to probe.
    :param timeout: maximum number of seconds to wait for the response.
    """
    try:
        req = urllib.request.Request(url, method="HEAD")
        # The context manager closes the connection once the status is read.
        with urllib.request.urlopen(req, timeout=timeout) as response:
            return response.getcode() == 200
    except Exception:
        # Deliberately best-effort: a failed probe must never break APT, it
        # only means we do not redirect this particular download.
        return False


def write_and_flush(fh, data):
    """Write `data` to the file object `fh` and flush it immediately.

    Both ends of the conversation are pipes, so nothing may linger in a
    buffer while the peer is waiting for it.
    """
    fh.write(data)
    fh.flush()


def rewrite_snapshot_url(url):
    """Return the deb.debian.org equivalent of a snapshot.debian.org .deb URL.

    The url is assumed to be of the form
    http://snapshot.debian.org/archive/section/yymmddThhmmssZ/...
    and is turned into http://deb.debian.org/section/...

    Returns None when `url` is not a .deb file on snapshot.debian.org, or
    when its path does not have the expected layout.
    """
    url_parts = urlparse(url)
    if url_parts.hostname != "snapshot.debian.org":
        return None
    if not url_parts.path.endswith(".deb"):
        return None

    # urlparse().path always starts with a / so path_parts is expected to
    # look like ["", "archive", "section", "yymmddThhmmssZ", ..., "x.deb"].
    path_parts = url_parts.path.split("/")
    if len(path_parts) < 5 or path_parts[1] != "archive":
        # Not the layout we know how to translate; leave the URL alone
        # rather than failing or building a bogus one.
        return None

    # Remove "yymmddThhmmssZ" first so that the index of "archive" is
    # unaffected, then "archive" itself.
    del path_parts[3]
    del path_parts[1]
    return urlunparse(
        url_parts._replace(netloc="deb.debian.org", path="/".join(path_parts))
    )


def output_handler(proc, url_mapping, lock):
    """Relay the real HTTP method's output to our stdout.

    Runs in its own thread. Every "URI: " line naming a URL we substituted
    is rewritten back, because APT expects the original url it requested.

    :param proc: the subprocess running the real HTTP method.
    :param url_mapping: dict mapping substituted URLs to the original ones.
    :param lock: lock guarding `url_mapping`, which main() also updates.
    """
    for line in proc.stdout:
        if line.startswith(URI_HEADER):
            url = line[len(URI_HEADER) :].rstrip()
            with lock:
                original_url = url_mapping.get(url)
            if original_url:
                line = line.replace(url, original_url)
        write_and_flush(sys.stdout, line)


def main():
    """Run the real APT HTTP method and proxy stdin/stdout to and from it.

    Requests read from stdin are forwarded to the real method; "URI: " lines
    pointing at a .deb on snapshot.debian.org are redirected to
    deb.debian.org when the file exists there.
    """
    proc = subprocess.Popen(
        ["/usr/lib/apt/methods/http"],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        text=True,
    )
    url_mapping = {}
    lock = threading.Lock()
    output_thread = threading.Thread(
        target=output_handler, args=(proc, url_mapping, lock), daemon=True
    )
    output_thread.start()

    while True:
        try:
            line = sys.stdin.readline()
        except KeyboardInterrupt:
            # When apt cuts the connection, we receive a KeyboardInterrupt.
            break
        if not line:
            break

        if line.startswith(URI_HEADER):
            url = line[len(URI_HEADER) :].rstrip()
            # For .deb packages, if we can find the file on deb.debian.org,
            # take it from there instead of snapshot.debian.org, because
            # deb.debian.org will be much faster. Hopefully, most files will
            # be available on deb.debian.org.
            modified_url = rewrite_snapshot_url(url)
            if modified_url and url_exists(modified_url):
                # Remember the substitution so that output_handler can hand
                # the original URL back to APT.
                with lock:
                    url_mapping[modified_url] = url
                line = line.replace(url, modified_url)
        write_and_flush(proc.stdin, line)

    # Closing stdin tells the real method to finish; drain what it still has
    # to say, then reap it so no zombie process is left behind.
    proc.stdin.close()
    output_thread.join()
    proc.wait()


if __name__ == "__main__":
    main()