Bug 1835431 - Handle hardlinks when unpacking a SDK. r=gsvelto

Differential Revision: https://phabricator.services.mozilla.com/D179281
2024-11-27 06:43:32 +00:00 · 2023-05-31 22:24:34 +00:00 · 2023-05-31 22:24:34 +00:00 · c6281d6814
commit c6281d6814
parent 59070c7a5d
4 changed files with 37 additions and 9 deletions
--- a/build/macosx/catalog.py
+++ b/build/macosx/catalog.py
@ -42,7 +42,7 @@ def show_package_content(url, digest=None, size=None):
    #     sys.exit(1)
    for name, content in unxar(BytesIO(package)):
        if name == "Payload":
-            for path, mode, _ in uncpio(Pbzx(content)):
+            for path, _, __ in uncpio(Pbzx(content)):
                if path:
                    print(path.decode("utf-8"))

--- a/python/mozbuild/mozpack/macpkg.py
+++ b/python/mozbuild/mozpack/macpkg.py
@ -14,6 +14,8 @@ import struct
 import zlib
 from xml.etree.ElementTree import XML

+from mozbuild.util import ReadOnlyNamespace
+

 class ZlibFile(object):
    def __init__(self, fileobj):
@ -179,6 +181,8 @@ def uncpio(fileobj):
            namesize,
            filesize,
        ) = struct.unpack(">6s6s6s6s6s6s6s11s6s11s", header)
+        dev = int(dev, 8)
+        ino = int(ino, 8)
        mode = int(mode, 8)
        nlink = int(nlink, 8)
        namesize = int(namesize, 8)
@ -197,7 +201,7 @@ def uncpio(fileobj):
        if name.startswith(b"/"):
            name = name[1:]
        content = Take(fileobj, filesize)
-        yield name, mode, content
+        yield name, ReadOnlyNamespace(mode=mode, nlink=nlink, dev=dev, ino=ino), content
        # Ensure the content is totally consumed
        while content.read(4096):
            pass
--- a/taskcluster/scripts/misc/unpack-sdk.py
+++ b/taskcluster/scripts/misc/unpack-sdk.py
@ -8,6 +8,7 @@ import shutil
 import stat
 import sys
 import tempfile
+from io import BytesIO
 from urllib.request import urlopen

 from mozpack.macpkg import Pbzx, uncpio, unxar
@ -36,27 +37,50 @@ def unpack_sdk(url, sha256, extract_prefix, out_dir="."):


 def extract_payload(fileobj, extract_prefix, out_dir="."):
-    for path, mode, content in uncpio(Pbzx(fileobj)):
+    hardlinks = {}
+    for path, st, content in uncpio(Pbzx(fileobj)):
+        # When there are hardlinks, normally a cpio stream is supposed to
+        # contain the data for all of them, but, even with compression, that
+        # can be a waste of space, so in some cpio streams (*cough* *cough*,
+        # Apple's, e.g. in Xcode), the files after the first one have dummy
+        # data.
+        # As we may be filtering the first file out (if it doesn't match
+        # extract_prefix), we need to keep its data around (we're not going
+        # to be able to rewind).
+        # We could do something fancy in the case where the first file is not
+        # filtered out, but in practice, it's not worth the extra complexity.
+        if stat.S_ISREG(st.mode) and st.nlink > 1:
+            key = (st.dev, st.ino)
+            hardlink = hardlinks.get(key)
+            if hardlink:
+                hardlink[0] -= 1
+                if hardlink[0] == 0:
+                    del hardlinks[key]
+            else:
+                hardlink = hardlinks[key] = [st.nlink - 1, BytesIO(content.read())]
+            content = hardlink[1]
+            content.seek(0)
+
        if not path:
            continue
        path = path.decode()
        if not path.startswith(extract_prefix):
            continue
        path = os.path.join(out_dir, path[len(extract_prefix) :].lstrip("/"))
-        if stat.S_ISDIR(mode):
+        if stat.S_ISDIR(st.mode):
            os.makedirs(path, exist_ok=True)
        else:
            parent = os.path.dirname(path)
            if parent:
                os.makedirs(parent, exist_ok=True)

-            if stat.S_ISLNK(mode):
+            if stat.S_ISLNK(st.mode):
                os.symlink(content.read(), path)
-            elif stat.S_ISREG(mode):
+            elif stat.S_ISREG(st.mode):
                with open(path, "wb") as out:
                    shutil.copyfileobj(content, out)
            else:
-                raise Exception(f"File mode {mode:o} is not supported")
+                raise Exception(f"File mode {st.mode:o} is not supported")


 if __name__ == "__main__":
--- a/tools/crashreporter/system-symbols/mac/PackageSymbolDumper.py
+++ b/tools/crashreporter/system-symbols/mac/PackageSymbolDumper.py
@ -181,8 +181,8 @@ def extract_payload(payload_path, output_path):
        elif header == b"pb":
            logging.info("Extracting pbzx payload")

-            for path, mode, content in uncpio(Pbzx(open(payload_path, "rb"))):
-                if not path or not stat.S_ISREG(mode):
+            for path, st, content in uncpio(Pbzx(open(payload_path, "rb"))):
+                if not path or not stat.S_ISREG(st.mode):
                    continue
                out = os.path.join(output_path, path.decode())
                os.makedirs(os.path.dirname(out), exist_ok=True)