mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-11-24 21:31:04 +00:00
Bug 1800198 - Rewrite a pbzx extractor from scratch. r=gsvelto
It turns out the format doesn't look quite like what the original code made it out to be. There's also no need to extract parts separately to then post-process them in PackageSymbolDumper, the pbzx extractor can handle that itself. Differential Revision: https://phabricator.services.mozilla.com/D161878
This commit is contained in:
parent
7c1951aff7
commit
07bb461fac
@ -44,7 +44,6 @@ from __future__ import absolute_import
|
||||
import argparse
|
||||
import concurrent.futures
|
||||
import errno
|
||||
import glob
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
@ -147,30 +146,19 @@ def extract_payload(payload_path, output_path):
|
||||
return True
|
||||
elif header == "pb":
|
||||
logging.info("Extracting pbzx payload")
|
||||
extract = "parse_pbzx.py"
|
||||
extract = "extract_pbzx.py"
|
||||
|
||||
payload_dir = os.path.dirname(payload_path)
|
||||
# First, unpack the PBZX into cpio parts.
|
||||
subprocess.check_call(["parse_pbzx.py", payload_path], cwd=payload_dir)
|
||||
# Next, decompress any parts that are .xz, and feed them all into pax.
|
||||
# First, extract the PBZX into cpio.
|
||||
subprocess.check_call([extract, payload_path], cwd=payload_dir)
|
||||
# Next, feed the extracted PBZX into pax.
|
||||
pax_proc = subprocess.Popen(
|
||||
["pax", "-r", "-k", "-s", ":^/::"],
|
||||
stdin=subprocess.PIPE,
|
||||
cwd=output_path,
|
||||
)
|
||||
for part in sorted(glob.glob(os.path.join(payload_dir, "Payload.part*"))):
|
||||
if part.endswith(".xz"):
|
||||
logging.info("Extracting xz part {}".format(part))
|
||||
# This would be easier if we pulled in the lzma module...
|
||||
xz_proc = subprocess.Popen(
|
||||
["xz", "-dc", part], stdout=subprocess.PIPE, cwd=payload_dir
|
||||
)
|
||||
shutil.copyfileobj(xz_proc.stdout, pax_proc.stdin)
|
||||
xz_proc.wait()
|
||||
else:
|
||||
logging.info("Copying plain cpio part {}".format(part))
|
||||
with open(part, "rb") as f:
|
||||
shutil.copyfileobj(f, pax_proc.stdin)
|
||||
with open(payload_path + ".cpio", "rb") as f:
|
||||
shutil.copyfileobj(f, pax_proc.stdin)
|
||||
pax_proc.stdin.close()
|
||||
pax_proc.wait()
|
||||
return True
|
||||
|
40
tools/crashreporter/system-symbols/mac/extract_pbzx.py
Executable file
40
tools/crashreporter/system-symbols/mac/extract_pbzx.py
Executable file
@ -0,0 +1,40 @@
|
||||
#!/usr/bin/env python3
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
import lzma
|
||||
import struct
|
||||
import sys
|
||||
|
||||
|
||||
def extract_pbzx(pbzx_path):
|
||||
with open(pbzx_path, "rb") as f:
|
||||
magic = f.read(4)
|
||||
if magic != b"pbzx":
|
||||
raise Exception("Not a PBZX payload?")
|
||||
# The first thing in the file looks like the size of each
|
||||
# decompressed chunk except the last one. It should match
|
||||
# decompressed_size in all cases except last, but we don't
|
||||
# check.
|
||||
chunk_size = f.read(8)
|
||||
chunk_size = struct.unpack(">Q", chunk_size)[0]
|
||||
with open(pbzx_path + ".cpio", "wb") as out:
|
||||
while True:
|
||||
header = f.read(16)
|
||||
if header == b"":
|
||||
break
|
||||
if len(header) != 16:
|
||||
raise Exception("Corrupted PBZX payload?")
|
||||
decompressed_size, compressed_size = struct.unpack(">QQ", header)
|
||||
if compressed_size == decompressed_size:
|
||||
out.write(f.read(decompressed_size))
|
||||
else:
|
||||
data = lzma.decompress(f.read(compressed_size))
|
||||
if len(data) != decompressed_size:
|
||||
raise Exception("Corrupted PBZX payload?")
|
||||
out.write(data)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
extract_pbzx(sys.argv[1])
|
@ -1,107 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# This code is from https://gist.github.com/pudquick/ff412bcb29c9c1fa4b8d
|
||||
#
|
||||
# v2 pbzx stream handler
|
||||
# My personal writeup on the differences here:
|
||||
# https://gist.github.com/pudquick/29fcfe09c326a9b96cf5
|
||||
#
|
||||
# Pure python reimplementation of .cpio.xz content extraction from pbzx file
|
||||
# payload originally here:
|
||||
# http://www.tonymacx86.com/general-help/135458-pbzx-stream-parser.html
|
||||
#
|
||||
# Cleaned up C version (as the basis for my code) here, thanks to Pepijn Bruienne / @bruienne
|
||||
# https://gist.github.com/bruienne/029494bbcfb358098b41
|
||||
#
|
||||
# The python version of this code does not have an explicit license, but
|
||||
# is based on GPLv3 C code linked above.
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
from __future__ import absolute_import
|
||||
|
||||
import struct
|
||||
import sys
|
||||
|
||||
|
||||
def seekread(f, offset=None, length=0, relative=True):
|
||||
if offset is not None:
|
||||
# offset provided, let's seek
|
||||
f.seek(offset, [0, 1, 2][relative])
|
||||
if length != 0:
|
||||
return f.read(length)
|
||||
|
||||
|
||||
def parse_pbzx(pbzx_path):
|
||||
section = 0
|
||||
xar_out_path = "%s.part%02d.cpio.xz" % (pbzx_path, section)
|
||||
f = open(pbzx_path, "rb")
|
||||
# pbzx = f.read()
|
||||
# f.close()
|
||||
magic = seekread(f, length=4)
|
||||
if magic != b"pbzx":
|
||||
raise Exception("Error: Not a pbzx file")
|
||||
# Read 8 bytes for initial flags
|
||||
flags = seekread(f, length=8)
|
||||
# Interpret the flags as a 64-bit big-endian unsigned int
|
||||
flags = struct.unpack(">Q", flags)[0]
|
||||
xar_f = open(xar_out_path, "wb")
|
||||
while flags & (1 << 24):
|
||||
# Read in more flags
|
||||
flags = seekread(f, length=8)
|
||||
flags = struct.unpack(">Q", flags)[0]
|
||||
# Read in length
|
||||
f_length = seekread(f, length=8)
|
||||
f_length = struct.unpack(">Q", f_length)[0]
|
||||
xzmagic = seekread(f, length=6)
|
||||
if xzmagic != b"\xfd7zXZ\x00":
|
||||
# This isn't xz content, this is actually _raw decompressed cpio_
|
||||
# chunk of 16MB in size...
|
||||
# Let's back up ...
|
||||
seekread(f, offset=-6, length=0)
|
||||
# ... and split it out ...
|
||||
f_content = seekread(f, length=f_length)
|
||||
section += 1
|
||||
decomp_out = "%s.part%02d.cpio" % (pbzx_path, section)
|
||||
g = open(decomp_out, "wb")
|
||||
g.write(f_content)
|
||||
g.close()
|
||||
# Now to start the next section, which should hopefully be .xz
|
||||
# (we'll just assume it is ...)
|
||||
xar_f.close()
|
||||
section += 1
|
||||
new_out = "%s.part%02d.cpio.xz" % (pbzx_path, section)
|
||||
xar_f = open(new_out, "wb")
|
||||
else:
|
||||
f_length -= 6
|
||||
# This part needs buffering
|
||||
f_content = seekread(f, length=f_length)
|
||||
tail = seekread(f, offset=-2, length=2)
|
||||
xar_f.write(xzmagic)
|
||||
xar_f.write(f_content)
|
||||
if tail != b"YZ":
|
||||
xar_f.close()
|
||||
raise Exception("Error: Footer is not xar file footer")
|
||||
try:
|
||||
f.close()
|
||||
xar_f.close()
|
||||
except BaseException:
|
||||
pass
|
||||
|
||||
|
||||
def main():
|
||||
parse_pbzx(sys.argv[1])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
Loading…
Reference in New Issue
Block a user