Bug 1800198 - Rewrite a pbzx extractor from scratch. r=gsvelto

It turns out the format doesn't look quite like what the original code made it out to be. There's also no need to extract the parts separately and then post-process them in PackageSymbolDumper; the pbzx extractor can handle that itself. Differential Revision: https://phabricator.services.mozilla.com/D161878
2024-11-24 21:31:04 +00:00 · 2022-11-16 21:35:11 +00:00 · 2022-11-16 21:35:11 +00:00 · 07bb461fac
commit 07bb461fac
parent 7c1951aff7
3 changed files with 46 additions and 125 deletions
--- a/tools/crashreporter/system-symbols/mac/PackageSymbolDumper.py
+++ b/tools/crashreporter/system-symbols/mac/PackageSymbolDumper.py
@ -44,7 +44,6 @@ from __future__ import absolute_import
 import argparse
 import concurrent.futures
 import errno
-import glob
 import logging
 import os
 import shutil
@ -147,30 +146,19 @@ def extract_payload(payload_path, output_path):
            return True
        elif header == "pb":
            logging.info("Extracting pbzx payload")
-            extract = "parse_pbzx.py"
+            extract = "extract_pbzx.py"

            payload_dir = os.path.dirname(payload_path)
-            # First, unpack the PBZX into cpio parts.
-            subprocess.check_call(["parse_pbzx.py", payload_path], cwd=payload_dir)
-            # Next, decompress any parts that are .xz, and feed them all into pax.
+            # First, extract the PBZX into cpio.
+            subprocess.check_call([extract, payload_path], cwd=payload_dir)
+            # Next, feed the extracted cpio archive into pax.
            pax_proc = subprocess.Popen(
                ["pax", "-r", "-k", "-s", ":^/::"],
                stdin=subprocess.PIPE,
                cwd=output_path,
            )
-            for part in sorted(glob.glob(os.path.join(payload_dir, "Payload.part*"))):
-                if part.endswith(".xz"):
-                    logging.info("Extracting xz part {}".format(part))
-                    # This would be easier if we pulled in the lzma module...
-                    xz_proc = subprocess.Popen(
-                        ["xz", "-dc", part], stdout=subprocess.PIPE, cwd=payload_dir
-                    )
-                    shutil.copyfileobj(xz_proc.stdout, pax_proc.stdin)
-                    xz_proc.wait()
-                else:
-                    logging.info("Copying plain cpio part {}".format(part))
-                    with open(part, "rb") as f:
-                        shutil.copyfileobj(f, pax_proc.stdin)
+            with open(payload_path + ".cpio", "rb") as f:
+                shutil.copyfileobj(f, pax_proc.stdin)
            pax_proc.stdin.close()
            pax_proc.wait()
            return True
--- a/tools/crashreporter/system-symbols/mac/extract_pbzx.py
+++ b/tools/crashreporter/system-symbols/mac/extract_pbzx.py
@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import lzma
+import struct
+import sys
+
+
+def extract_pbzx(pbzx_path):
+    with open(pbzx_path, "rb") as f:
+        magic = f.read(4)
+        if magic != b"pbzx":
+            raise Exception("Not a PBZX payload?")
+        # The first field in the file looks like the size of each
+        # decompressed chunk except the last one. It should match
+        # decompressed_size for every chunk except the last, but we
+        # don't check that.
+        chunk_size = f.read(8)
+        chunk_size = struct.unpack(">Q", chunk_size)[0]
+        with open(pbzx_path + ".cpio", "wb") as out:
+            while True:
+                header = f.read(16)
+                if header == b"":
+                    break
+                if len(header) != 16:
+                    raise Exception("Corrupted PBZX payload?")
+                decompressed_size, compressed_size = struct.unpack(">QQ", header)
+                if compressed_size == decompressed_size:
+                    out.write(f.read(decompressed_size))
+                else:
+                    data = lzma.decompress(f.read(compressed_size))
+                    if len(data) != decompressed_size:
+                        raise Exception("Corrupted PBZX payload?")
+                    out.write(data)
+
+
+if __name__ == "__main__":
+    extract_pbzx(sys.argv[1])
--- a/tools/crashreporter/system-symbols/mac/parse_pbzx.py
+++ b/tools/crashreporter/system-symbols/mac/parse_pbzx.py
@ -1,107 +0,0 @@
-#!/usr/bin/env python3
-# This code is from https://gist.github.com/pudquick/ff412bcb29c9c1fa4b8d
-#
-# v2 pbzx stream handler
-# My personal writeup on the differences here:
-# https://gist.github.com/pudquick/29fcfe09c326a9b96cf5
-#
-# Pure python reimplementation of .cpio.xz content extraction from pbzx file
-# payload originally here:
-# http://www.tonymacx86.com/general-help/135458-pbzx-stream-parser.html
-#
-# Cleaned up C version (as the basis for my code) here, thanks to Pepijn Bruienne / @bruienne
-# https://gist.github.com/bruienne/029494bbcfb358098b41
-#
-# The python version of this code does not have an explicit license, but
-# is based on GPLv3 C code linked above.
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program.  If not, see <https://www.gnu.org/licenses/>.
-
-from __future__ import absolute_import
-
-import struct
-import sys
-
-
-def seekread(f, offset=None, length=0, relative=True):
-    if offset is not None:
-        # offset provided, let's seek
-        f.seek(offset, [0, 1, 2][relative])
-    if length != 0:
-        return f.read(length)
-
-
-def parse_pbzx(pbzx_path):
-    section = 0
-    xar_out_path = "%s.part%02d.cpio.xz" % (pbzx_path, section)
-    f = open(pbzx_path, "rb")
-    # pbzx = f.read()
-    # f.close()
-    magic = seekread(f, length=4)
-    if magic != b"pbzx":
-        raise Exception("Error: Not a pbzx file")
-    # Read 8 bytes for initial flags
-    flags = seekread(f, length=8)
-    # Interpret the flags as a 64-bit big-endian unsigned int
-    flags = struct.unpack(">Q", flags)[0]
-    xar_f = open(xar_out_path, "wb")
-    while flags & (1 << 24):
-        # Read in more flags
-        flags = seekread(f, length=8)
-        flags = struct.unpack(">Q", flags)[0]
-        # Read in length
-        f_length = seekread(f, length=8)
-        f_length = struct.unpack(">Q", f_length)[0]
-        xzmagic = seekread(f, length=6)
-        if xzmagic != b"\xfd7zXZ\x00":
-            # This isn't xz content, this is actually _raw decompressed cpio_
-            # chunk of 16MB in size...
-            # Let's back up ...
-            seekread(f, offset=-6, length=0)
-            # ... and split it out ...
-            f_content = seekread(f, length=f_length)
-            section += 1
-            decomp_out = "%s.part%02d.cpio" % (pbzx_path, section)
-            g = open(decomp_out, "wb")
-            g.write(f_content)
-            g.close()
-            # Now to start the next section, which should hopefully be .xz
-            # (we'll just assume it is ...)
-            xar_f.close()
-            section += 1
-            new_out = "%s.part%02d.cpio.xz" % (pbzx_path, section)
-            xar_f = open(new_out, "wb")
-        else:
-            f_length -= 6
-            # This part needs buffering
-            f_content = seekread(f, length=f_length)
-            tail = seekread(f, offset=-2, length=2)
-            xar_f.write(xzmagic)
-            xar_f.write(f_content)
-            if tail != b"YZ":
-                xar_f.close()
-                raise Exception("Error: Footer is not xar file footer")
-    try:
-        f.close()
-        xar_f.close()
-    except BaseException:
-        pass
-
-
-def main():
-    parse_pbzx(sys.argv[1])
-
-
-if __name__ == "__main__":
-    main()