Bug 1210538 - Add antivirus checks to release promotion graph a=rail
This commit is contained in: parent 877cfaed43, commit c048c0b00d
release/docker/beet-mover/Dockerfile (new file, 19 lines)
@@ -0,0 +1,19 @@
FROM ubuntu:vivid

RUN apt-get -q update \
    && apt-get install --yes -q \
    mercurial \
    python-dev \
    python-pip \
    python-virtualenv \
    libffi-dev \
    libssl-dev \
    libyaml-dev \
    libmysqlclient-dev \
    clamav \
    clamav-freshclam \
    curl \
    wget \
    && apt-get clean

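# freshclam refreshes the ClamAV virus-definition database at image build time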
RUN freshclam --verbose
testing/mozharness/external_tools/extract_and_run_command.py (new file, 205 lines)
@@ -0,0 +1,205 @@
#!/usr/bin/env python
"""\
Usage: extract_and_run_command.py [-j N] [command to run] -- [files and/or directories]

-j is the number of workers to start, defaulting to 1.

[command to run] must be a command that can accept one or many files
to process as arguments.

WARNING: This script does NOT respond to SIGINT. You must use SIGQUIT or SIGKILL to
terminate it early.
"""

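# For example, beet_mover.py (below) drives this script roughly as:
#     python extract_and_run_command.py -j4 clamscan --no-summary -- /path/to/cache
# where the cache path is illustrative and -j4 matches its default scan parallelization.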
### The canonical location for this file is
### https://hg.mozilla.org/build/tools/file/default/stage/extract_and_run_command.py
###
### Please update the copy in puppet to deploy new changes to
### stage.mozilla.org, see
### https://wiki.mozilla.org/ReleaseEngineering/How_To/Modify_scripts_on_stage

import logging
import os
from os import path
import sys
from Queue import Queue
import shutil
import subprocess
import tempfile
from threading import Thread
import time

logging.basicConfig(
    stream=sys.stdout, level=logging.INFO, format="%(message)s")
log = logging.getLogger(__name__)

try:
    # the future - https://github.com/mozilla/build-mar via a venv
    from mardor.marfile import BZ2MarFile
except ImportError:
    # the past - http://hg.mozilla.org/build/tools/file/default/buildfarm/utils/mar.py
    sys.path.append(
        path.join(path.dirname(path.realpath(__file__)), "../buildfarm/utils"))
    from mar import BZ2MarFile

SEVENZIP = "7za"


def extractMar(filename, tempdir):
    m = BZ2MarFile(filename)
    m.extractall(path=tempdir)


def extractExe(filename, tempdir):
    try:
        # We don't actually care about output, but we redirect to a tempfile
        # to avoid deadlocking in wait() when stdout=PIPE
        fd = tempfile.TemporaryFile()
        proc = subprocess.Popen([SEVENZIP, 'x', '-o%s' % tempdir, filename],
                                stdout=fd, stderr=subprocess.STDOUT)
        proc.wait()
    except subprocess.CalledProcessError:
        # Not all EXEs are 7-zip files, so we have to ignore extraction errors
        pass

# The keys here are matched against filename extensions, as returned
# by os.path.splitext (so they include the leading dot).
# The values are callables that accept two string arguments.
EXTRACTORS = {
    '.mar': extractMar,
    '.exe': extractExe,
}


def find_files(d):
    """yields all of the files in `d'"""
    for root, dirs, files in os.walk(d):
        for f in files:
            yield path.abspath(path.join(root, f))


def rchmod(d, mode=0755):
    """chmods everything in `d' to `mode', including `d' itself"""
    os.chmod(d, mode)
    for root, dirs, files in os.walk(d):
        for item in dirs:
            os.chmod(path.join(root, item), mode)
        for item in files:
            os.chmod(path.join(root, item), mode)


def maybe_extract(filename):
    """If an extractor is found for `filename', extracts it to a temporary
    directory and chmods it. The consumer is responsible for removing
    the extracted files, if desired."""
    ext = path.splitext(filename)[1]
    if ext not in EXTRACTORS:
        return None
    # Recreate the file's full path inside a fresh tempdir
    tempdir_root = tempfile.mkdtemp()
    tempdir = path.join(tempdir_root, filename.lstrip('/'))
    os.makedirs(tempdir)
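    # e.g. extracting "/releases/foo.mar" with a tempdir_root of "/tmp/tmpXYZ"
    # (names illustrative) places its contents under "/tmp/tmpXYZ/releases/foo.mar/"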
    EXTRACTORS[ext](filename, tempdir)
    rchmod(tempdir_root)
    return tempdir_root


def process(item, command):
    def format_time(t):
        return time.strftime("%H:%M:%S", time.localtime(t))
    # Buffer output to avoid interleaving of multiple workers' output
    logs = []
    args = [item]
    proc = None
    start = time.time()
    logs.append("START %s: %s" % (format_time(start), item))
    # If the file was extracted, we need to process all of its files, too.
    tempdir = maybe_extract(item)
    if tempdir:
        for f in find_files(tempdir):
            args.append(f)

    try:
        fd = tempfile.TemporaryFile()
        proc = subprocess.Popen(command + args, stdout=fd)
        proc.wait()
        if proc.returncode != 0:
            raise Exception("returned %s" % proc.returncode)
    finally:
        if tempdir:
            shutil.rmtree(tempdir)
        fd.seek(0)
        # rstrip() here to avoid an unnecessary newline, if it exists.
        logs.append(fd.read().rstrip())
        end = time.time()
        elapsed = end - start
        logs.append("END %s (%d seconds elapsed): %s\n" % (
            format_time(end), elapsed, item))
        # Now that we've got all of our output, print it. It's important that
        # the logging module is used for this, because "print" is not
        # thread-safe.
        log.info("\n".join(logs))


def worker(command, errors):
    item = q.get()
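    # None is the shutdown sentinel: the main thread enqueues one per worker
    # after all real work items have been queued (see the bottom of this file)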
    while item is not None:
        try:
            process(item, command)
        except:
            errors.put(item)
        item = q.get()

if __name__ == '__main__':
    # getopt is used in favour of optparse to enable "--" as a separator
    # between the command and list of files. optparse doesn't allow that.
    from getopt import getopt
    options, args = getopt(sys.argv[1:], 'j:h', ['help'])

    concurrency = 1
    for o, a in options:
        if o == '-j':
            concurrency = int(a)
        elif o in ('-h', '--help'):
            log.info(__doc__)
            sys.exit(0)

    if len(args) < 3 or '--' not in args:
        log.error(__doc__)
        sys.exit(1)

    command = []
    while args[0] != "--":
        command.append(args.pop(0))
    args.pop(0)

    q = Queue()
    errors = Queue()
    threads = []
    for i in range(concurrency):
        t = Thread(target=worker, args=(command, errors))
        t.start()
        threads.append(t)

    # find_files is a generator, so work will begin prior to it finding
    # all of the files
    for arg in args:
        if path.isfile(arg):
            q.put(arg)
        else:
            for f in find_files(arg):
                q.put(f)
    # Because the workers are started before we start populating the q
    # they can't use .empty() to determine whether or not they're done.
    # We also can't use q.join() or q.task_done(), because we need to
    # support Python 2.4. We know that find_files won't yield None,
    # so we can detect doneness by having workers die when they get None
    # as an item.
    for i in range(concurrency):
        q.put(None)

    for t in threads:
        t.join()

    if not errors.empty():
        log.error("Command failed for the following files:")
        while not errors.empty():
            log.error("  %s" % errors.get())
        sys.exit(1)
testing/mozharness/scripts/release/beet_mover.py
@@ -6,17 +6,21 @@
 # ***** END LICENSE BLOCK *****
 """beet_mover.py.
 
-downloads artifacts and uploads them to s3
+downloads artifacts, scans them and uploads them to s3
 """
 import hashlib
 import sys
 import os
 import pprint
 import re
+from os import listdir
+from os.path import isfile, join
 
 sys.path.insert(1, os.path.dirname(os.path.dirname(sys.path[0])))
 from mozharness.base.log import FATAL
 from mozharness.base.python import VirtualenvMixin
 from mozharness.base.script import BaseScript
+import mozharness
 
 
 def get_hash(content, hash_type="md5"):
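get_hash's body lies outside these hunks; judging from its call sites (it hashes file contents, md5 by default), a consistent sketch would be:

import hashlib

def get_hash(content, hash_type="md5"):
    # hypothetical reconstruction; the real body is not shown in this diff
    h = hashlib.new(hash_type)
    h.update(content)
    return h.hexdigest()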
@@ -85,8 +89,39 @@ CONFIG_OPTIONS = [
         "default": False,
         "help": "taskcluster task id to download artifacts from",
     }],
+    [["--exclude"], {
+        "dest": "excludes",
+        "action": "append",
+        "help": "List of filename patterns to exclude. See script source for default",
+    }],
+    [["-s", "--scan-parallelization"], {
+        "dest": "scan_parallelization",
+        "default": 4,
+        "type": "int",
+        "help": "Number of concurrent file scans",
+    }],
 ]
+
+DEFAULT_EXCLUDES = [
+    r"^.*tests.*$",
+    r"^.*crashreporter.*$",
+    r"^.*\.zip(\.asc)?$",
+    r"^.*\.log$",
+    r"^.*\.txt$",
+    r"^.*\.asc$",
+    r"^.*/partner-repacks.*$",
+    r"^.*.checksums(\.asc)?$",
+    r"^.*/logs/.*$",
+    r"^.*/jsshell.*$",
+    r"^.*json$",
+    r"^.*/host.*$",
+    r"^.*/mar-tools/.*$",
+    r"^.*gecko-unsigned-unaligned.apk$",
+    r"^.*robocop.apk$",
+    r"^.*contrib.*"
+]
+CACHE_DIR = 'cache'
 
 
 class BeetMover(BaseScript, VirtualenvMixin, object):
     def __init__(self, aws_creds):
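The new --exclude option uses "action": "append", so it may be passed repeatedly and the values accumulate into a list. A minimal stand-alone sketch of that behaviour using argparse (mozharness parses options through its own config layer, so this is illustrative only):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--exclude", dest="excludes", action="append",
                    help="filename pattern to exclude; may be repeated")
parser.add_argument("-s", "--scan-parallelization", dest="scan_parallelization",
                    type=int, default=4)

args = parser.parse_args(["--exclude", r"^.*\.log$", "--exclude", r"^.*\.txt$", "-s", "8"])
print(args.excludes)              # ['^.*\\.log$', '^.*\\.txt$']
print(args.scan_parallelization)  # 8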
@@ -98,6 +133,8 @@ class BeetMover(BaseScript, VirtualenvMixin, object):
             'activate-virtualenv',
             'generate-candidates-manifest',
             'verify-bits', # beets
+            'download-bits', # beets
+            'scan-bits', # beets
             'upload-bits', # beets
         ],
         'require_config_file': False,
@@ -111,6 +148,8 @@ class BeetMover(BaseScript, VirtualenvMixin, object):
             "boto",
             "PyYAML",
             "Jinja2",
+            "redo",
+            "mar",
         ],
         "virtualenv_path": "venv",
         'buckets': {
@@ -120,6 +159,7 @@ class BeetMover(BaseScript, VirtualenvMixin, object):
                 'product': 'firefox',
             },
         }
+        # todo: do excludes need to be configured via command line for specific builds?
         super(BeetMover, self).__init__(**beetmover_kwargs)
 
         c = self.config
@@ -128,6 +168,10 @@ class BeetMover(BaseScript, VirtualenvMixin, object):
         self.virtualenv_imports = None
         self.bucket = c['buckets']['production'] if c['production'] else c['buckets']['development']
         self.aws_key_id, self.aws_secret_key = aws_creds
+        # if excludes is set from command line, use it; otherwise use defaults
+        self.excludes = self.config.get('excludes', DEFAULT_EXCLUDES)
+        dirs = self.query_abs_dirs()
+        self.dest_dir = os.path.join(dirs['abs_work_dir'], CACHE_DIR)
 
     def activate_virtualenv(self):
         """
@@ -172,7 +216,7 @@ class BeetMover(BaseScript, VirtualenvMixin, object):
             # mirror current release folder structure
             "s3_prefix": 'pub/{}/candidates'.format(self.config['product']),
             "artifact_base_url": self.config['artifact_base_url'].format(
-                taskid=self.config['taskid'], subdir=self.config['artifact_sudbir']
+                taskid=self.config['taskid'], subdir=self.config['artifact_subdir']
             )
         }
         self.manifest = yaml.safe_load(template.render(**template_vars))
@@ -187,37 +231,60 @@ class BeetMover(BaseScript, VirtualenvMixin, object):
         # TODO
         self.log('skipping verification. unimplemented...')
 
+    def download_bits(self):
+        """
+        downloads list of artifacts to self.dest_dir dir based on a given manifest
+        """
+        self.log('downloading artifacts to the local work dir...')
+
+        # TODO - do we want to mirror/upload to more than one region?
+        dirs = self.query_abs_dirs()
+
+        for locale in self.manifest['mapping']:
+            for deliverable in self.manifest['mapping'][locale]:
+                self.log("downloading '{}' deliverable for '{}' locale".format(deliverable, locale))
+                # download locally to working dir
+                source = self.manifest['mapping'][locale][deliverable]['artifact']
+                file_name = self.retry(self.download_file,
+                                       args=[source],
+                                       kwargs={'parent_dir': dirs['abs_work_dir']},
+                                       error_level=FATAL)
+        self.log('Success!')
+
     def upload_bits(self):
         """
-        downloads and uploads list of artifacts to s3 candidates dir based on a given manifest
+        uploads list of artifacts to s3 candidates dir based on a given manifest
         """
-        self.log('downloading and uploading artifacts to s3...')
+        self.log('uploading artifacts to s3...')
         dirs = self.query_abs_dirs()
 
         # connect to s3
         boto = self.virtualenv_imports['boto']
         conn = boto.connect_s3(self.aws_key_id, self.aws_secret_key)
         bucket = conn.get_bucket(self.bucket)
 
+        # todo: change so this is not every entry in manifest - should exclude those that don't pass virus scan
+        # not sure how to determine this
         for locale in self.manifest['mapping']:
             for deliverable in self.manifest['mapping'][locale]:
                 self.log("uploading '{}' deliverable for '{}' locale".format(deliverable, locale))
+                # we have already downloaded the files locally so we can use that version
+                source = self.manifest['mapping'][locale][deliverable]['artifact']
+                downloaded_file = os.path.join(dirs['abs_work_dir'], self.get_filename_from_url(source))
                 self.upload_bit(
-                    source=self.manifest['mapping'][locale][deliverable]['artifact'],
+                    source=downloaded_file,
                     s3_key=self.manifest['mapping'][locale][deliverable]['s3_key'],
                     bucket=bucket,
                 )
         self.log('Success!')
 
 
     def upload_bit(self, source, s3_key, bucket):
-        # TODO - do we want to mirror/upload to more than one region?
-        dirs = self.query_abs_dirs()
         boto = self.virtualenv_imports['boto']
 
-        # download locally
-        file_name = self.retry(self.download_file,
-                               args=[source],
-                               kwargs={'parent_dir': dirs['abs_work_dir']},
-                               error_level=FATAL)
-        # todo need to copy from dir to s3
 
         self.info('uploading to s3 with key: {}'.format(s3_key))
         key = boto.s3.key.Key(bucket) # create new key
@@ -230,20 +297,46 @@ class BeetMover(BaseScript, VirtualenvMixin, object):
             key = bucket.new_key(s3_key)
 
             # set key value
-            self.retry(key.set_contents_from_filename, args=[file_name], error_level=FATAL),
+            self.retry(key.set_contents_from_filename, args=[source], error_level=FATAL),
 
             # key.make_public() may lead to race conditions, because
             # it doesn't pass version_id, so it may not set permissions
             bucket.set_canned_acl(acl_str='public-read', key_name=s3_key,
                                   version_id=key.version_id)
         else:
-            if not get_hash(key.get_contents_as_string()) == get_hash(open(file_name).read()):
+            if not get_hash(key.get_contents_as_string()) == get_hash(open(source).read()):
                 # for now, let's halt. If necessary, we can revisit this and allow for overwrites
                 # to the same buildnum release with different bits
                 self.fatal("`{}` already exists with different checksum.".format(s3_key))
             self.log("`{}` has the same MD5 checksum, not uploading".format(s3_key))
 
+    def scan_bits(self):
+
+        dirs = self.query_abs_dirs()
+
+        filenames = [f for f in listdir(dirs['abs_work_dir']) if isfile(join(dirs['abs_work_dir'], f))]
+        self.mkdir_p(self.dest_dir)
+        for file_name in filenames:
+            if self._matches_exclude(file_name):
+                self.info("Excluding {} from virus scan".format(file_name))
+            else:
+                self.info('Copying {} to {}'.format(file_name, self.dest_dir))
+                self.copyfile(os.path.join(dirs['abs_work_dir'], file_name), os.path.join(self.dest_dir, file_name))
+        self._scan_files()
+        self.info('Emptying {}'.format(self.dest_dir))
+        self.rmtree(self.dest_dir)
+
+    def _scan_files(self):
+        """Scan the files we've collected. We do the download and scan concurrently to make
+        it easier to have a coherent log afterwards. Uses the venv python."""
+        external_tools_path = os.path.join(
+            os.path.abspath(os.path.dirname(os.path.dirname(mozharness.__file__))), 'external_tools')
+        self.run_command([self.query_python_path(), os.path.join(external_tools_path, 'extract_and_run_command.py'),
+                          '-j{}'.format(self.config['scan_parallelization']),
+                          'clamscan', '--no-summary', '--', self.dest_dir])
+
+    def _matches_exclude(self, keyname):
+        return any(re.search(exclude, keyname) for exclude in self.excludes)
 
 if __name__ == '__main__':
     beet_mover = BeetMover(get_aws_auth())
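As a quick illustration of how the exclude list is applied (filenames invented for the example; the real list is DEFAULT_EXCLUDES above), _matches_exclude is a plain re.search over every pattern:

import re

excludes = [r"^.*tests.*$", r"^.*\.log$", r"^.*json$"]  # subset of DEFAULT_EXCLUDES

def matches_exclude(keyname, excludes=excludes):
    # mirrors BeetMover._matches_exclude: any pattern hit skips the virus scan
    return any(re.search(exclude, keyname) for exclude in excludes)

print(matches_exclude("firefox-44.0.en-US.win32.tests.zip"))      # True - excluded
print(matches_exclude("firefox-44.0.en-US.win32.installer.exe"))  # False - scanned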