Bug 1272083 - Download and unpacking should be performed in process. r=gps

Instead of downloading a file first and then unpacking it, we would like to
fetch it into memory and then unpacking directly from there.

This saves writing the file first to disk, thus, saving on IO.

MozReview-Commit-ID: JdNGnxIYEvy

--HG--
extra : rebase_source : 73c280fd23f3066d04dd3af07ac443db77dfd6d0
This commit is contained in:
Armen Zambrano Gasparnian 2016-08-25 11:04:16 -04:00
parent ef92f5ddf5
commit 9bad2fd425
2 changed files with 192 additions and 23 deletions

View File

@ -50,6 +50,8 @@ try:
except ImportError:
import json
from cStringIO import StringIO
from mozprocess import ProcessHandler
from mozharness.base.config import BaseConfig
from mozharness.base.log import SimpleFileLogger, MultiFileLogger, \
@ -458,28 +460,157 @@ class ScriptMixin(PlatformMixin):
**retry_args
)
def download_unpack(self, url, extract_to, extract_dirs=None,
error_level=FATAL):
"""Generic method to download and extract a compressed file.
The downloaded file will always be saved to the working directory and is not getting
deleted after extracting.
def _filter_entries(self, namelist, extract_dirs):
"""Filter entries of the archive based on the specified list of to extract dirs."""
filter_partial = functools.partial(fnmatch.filter, namelist)
entries = itertools.chain(*map(filter_partial, extract_dirs or ['*']))
for entry in entries:
yield entry
def unzip(self, file_object, extract_to='.', extract_dirs='*', verbose=False):
"""This method allows to extract a zip file without writing to disk first.
Args:
file_object (object): Any file like object that is seekable.
extract_to (str, optional): where to extract the compressed file.
extract_dirs (list, optional): directories inside the archive file to extract.
Defaults to '*'.
"""
compressed_file = StringIO(file_object.read())
try:
with zipfile.ZipFile(compressed_file) as bundle:
entries = self._filter_entries(bundle.namelist(), extract_dirs)
for entry in entries:
if verbose:
self.info(' {}'.format(entry))
bundle.extract(entry, path=extract_to)
# ZipFile doesn't preserve permissions during extraction:
# http://bugs.python.org/issue15795
fname = os.path.realpath(os.path.join(extract_to, entry))
mode = bundle.getinfo(entry).external_attr >> 16 & 0x1FF
# Only set permissions if attributes are available. Otherwise all
# permissions will be removed eg. on Windows.
if mode:
os.chmod(fname, mode)
except zipfile.BadZipfile as e:
self.exception('{}'.format(e.message))
def deflate(self, file_object, mode, extract_to='.', extract_dirs='*', verbose=False):
"""This method allows to extract a tar, tar.bz2 and tar.gz file without writing to disk first.
Args:
file_object (object): Any file like object that is seekable.
extract_to (str, optional): where to extract the compressed file.
extract_dirs (list, optional): directories inside the archive file to extract.
Defaults to `*`.
verbose (bool, optional): whether or not extracted content should be displayed.
Defaults to False.
"""
compressed_file = StringIO(file_object.read())
t = tarfile.open(fileobj=compressed_file, mode=mode)
t.extractall(path=extract_to)
def download_unpack(self, url, extract_to='.', extract_dirs='*', verbose=False):
"""Generic method to download and extract a compressed file without writing it to disk first.
Args:
url (str): URL where the file to be downloaded is located.
extract_to (str): directory where the downloaded file will
be extracted to.
extract_to (str, optional): directory where the downloaded file will
be extracted to.
extract_dirs (list, optional): directories inside the archive to extract.
Defaults to `None`.
error_level (str, optional): log level to use in case an error occurs.
Defaults to `FATAL`.
Defaults to `*`. It currently only applies to zip files.
Raises:
IOError: on `filename` file not found.
"""
dirs = self.query_abs_dirs()
archive = self.download_file(url, parent_dir=dirs['abs_work_dir'],
error_level=error_level)
self.unpack(archive, extract_to, extract_dirs=extract_dirs,
error_level=error_level)
# Many scripts overwrite this method and set extract_dirs to None
extract_dirs = '*' if extract_dirs is None else extract_dirs
EXTENSION_TO_MIMETYPE = {
'bz2': 'application/x-bzip2',
'gz': 'application/x-gzip',
'tar': 'application/x-tar',
'zip': 'application/zip',
}
MIMETYPES = {
'application/x-bzip2': {
'function': self.deflate,
'kwargs': {'mode': 'r:bz2'},
},
'application/x-gzip': {
'function': self.deflate,
'kwargs': {'mode': 'r:gz'},
},
'application/x-tar': {
'function': self.deflate,
'kwargs': {'mode': 'r'},
},
'application/zip': {
'function': self.unzip,
},
}
parsed_url = urlparse.urlparse(url)
# In case we're referrencing a file without file://
if parsed_url.scheme == '':
if not os.path.isfile(url):
raise IOError('Could not find file to extract: {}'.format(url))
url = 'file://%s' % os.path.abspath(url)
parsed_fd = urlparse.urlparse(url)
request = urllib2.Request(url)
response = urllib2.urlopen(request)
if parsed_url.scheme == 'file':
filename = url.split('/')[-1]
# XXX: bz2/gz instead of tar.{bz2/gz}
extension = filename[filename.rfind('.')+1:]
mimetype = EXTENSION_TO_MIMETYPE[extension]
else:
mimetype = response.headers.type
self.debug('Url: {}'.format(url))
self.debug('Mimetype: {}'.format(mimetype))
self.debug('Content-Encoding {}'.format(response.headers.get('Content-Encoding')))
function = MIMETYPES[mimetype]['function']
kwargs = {
'file_object': response,
'extract_to': extract_to,
'extract_dirs': extract_dirs,
'verbose': verbose,
}
kwargs.update(MIMETYPES[mimetype].get('kwargs', {}))
self.info('Downloading and extracting to {} these dirs {} from {}'.format(
extract_to,
', '.join(extract_dirs),
url,
))
retry_args = dict(
failure_status=None,
retry_exceptions=(urllib2.HTTPError, urllib2.URLError,
httplib.BadStatusLine,
socket.timeout, socket.error),
error_message="Can't download from {}".format(url),
error_level=FATAL,
)
self.retry(
function,
kwargs=kwargs,
**retry_args
)
def load_json_url(self, url, error_level=None, *args, **kwargs):
""" Returns a json object from a url (it retries). """
@ -1409,12 +1540,6 @@ class ScriptMixin(PlatformMixin):
IOError: on `filename` file not found.
"""
def _filter_entries(namelist):
"""Filter entries of the archive based on the specified list of to extract dirs."""
filter_partial = functools.partial(fnmatch.filter, namelist)
for entry in itertools.chain(*map(filter_partial, extract_dirs or ['*'])):
yield entry
if not os.path.isfile(filename):
raise IOError('Could not find file to extract: %s' % filename)
@ -1422,7 +1547,7 @@ class ScriptMixin(PlatformMixin):
try:
self.info('Using ZipFile to extract {} to {}'.format(filename, extract_to))
with zipfile.ZipFile(filename) as bundle:
for entry in _filter_entries(bundle.namelist()):
for entry in self._filter_entries(bundle.namelist(), extract_dirs):
if verbose:
self.info(' %s' % entry)
bundle.extract(entry, path=extract_to)
@ -1444,7 +1569,7 @@ class ScriptMixin(PlatformMixin):
try:
self.info('Using TarFile to extract {} to {}'.format(filename, extract_to))
with tarfile.open(filename) as bundle:
for entry in _filter_entries(bundle.getnames()):
for entry in self._filter_entries(bundle.getnames(), extract_dirs):
if verbose:
self.info(' %s' % entry)
bundle.extract(entry, path=extract_to)

View File

@ -259,6 +259,50 @@ class TestScript(unittest.TestCase):
self.assertTrue(error_logsize > 0,
msg="error list not working properly")
def test_download_unpack(self):
# NOTE: The action is called *download*, however, it can work for files in disk
self.s = get_debug_script_obj()
archives_path = os.path.join(here, 'helper_files', 'archives')
# Test basic decompression
for archive in ('archive.tar', 'archive.tar.bz2', 'archive.tar.gz', 'archive.zip'):
self.s.download_unpack(
url=os.path.join(archives_path, archive),
extract_to=self.tmpdir
)
self.assertIn('script.sh', os.listdir(os.path.join(self.tmpdir, 'bin')))
self.assertIn('lorem.txt', os.listdir(self.tmpdir))
shutil.rmtree(self.tmpdir)
# Test permissions for extracted entries from zip archive
self.s.download_unpack(
url=os.path.join(archives_path, 'archive.zip'),
extract_to=self.tmpdir,
)
file_stats = os.stat(os.path.join(self.tmpdir, 'bin', 'script.sh'))
orig_fstats = os.stat(os.path.join(archives_path, 'reference', 'bin', 'script.sh'))
self.assertEqual(file_stats.st_mode, orig_fstats.st_mode)
shutil.rmtree(self.tmpdir)
# Test unzip specific dirs only
self.s.download_unpack(
url=os.path.join(archives_path, 'archive.zip'),
extract_to=self.tmpdir,
extract_dirs=['bin/*']
)
self.assertIn('bin', os.listdir(self.tmpdir))
self.assertNotIn('lorem.txt', os.listdir(self.tmpdir))
shutil.rmtree(self.tmpdir)
# Test for invalid filenames (Windows only)
if PYWIN32:
with self.assertRaises(IOError):
self.s.download_unpack(
url=os.path.join(archives_path, 'archive_invalid_filename.zip'),
extract_to=self.tmpdir
)
def test_unpack(self):
self.s = get_debug_script_obj()