Bug 1761150: Mach should use fresh system python, don't inherit sys.path r=ahal

Previously, when using the system Python packages, Mach would reuse the
values already existing in the `sys.path`. This had two benefits:
1. We didn't have to do work to calculate which paths the "system
   Python" specifically referred to.
2. This allowed us to support nested Mach calls
   (such as `./mach --virtualenv psutil ./mach build`).

However, it came with its own headaches, specifically around "consistent
imports" and Python subprocesses, such as in the following example
1. Mach runs "using system Python"
2. The "build" site (for example) would be activated. The current
   `sys.path` is included in the virtualenv's `mach.pth` file so that
   subprocesses will have access to the same packages as the primary
   Mach process
3. //Something// adds something to the `sys.path`
4. The build virtualenv is redundantly re-activated (such as due to
   generic setup code for some module). Though we don't physically
   re-activate the virtualenv, we //do// check that it's up-to-date to
   validate assumptions - **and it's not!** It's missing the
   //something// that was added to the `sys.path`.
5. We can't re-create the build virtualenv because it's already active,
   so:
6. An error is raised so that unexpected out-of-date-ness doesn't fly
   under the radar.

-----

This patch solves this by calculating the "system Python" paths in a
deterministic way. Then, even when using the system Python, the Mach and
command sites use define themselves in a consistent way.

This means that we can't do `./mach --virtualenv <venv> ./mach ...` to
shares `venv`'s packages with the whole new Mach process. Fortunately,
this has been replaced with  moving such packages into the nested Mach
command's requirements  definition instead.

TL;DR: more consistent, less failures. 👍

-----

One more detail, this time around the `sys_path()` function:
Ideally, we could calculate the standard library paths *and* the
"system" paths with only one Python subprocess. Unfortunately,
that's not the case:
* If `-S` isn't provided, then it's tricky to separate the standard
  library paths from the system paths
* If `-S` is provided, then doing `site.getsitepackages()` in a
  virtualenv returns the *system* site packages, not the virtualenv's.

To work around this, we call the external python twice, and in
parallel. This allows resolving the information we need while avoiding
performance costs.

Differential Revision: https://phabricator.services.mozilla.com/D143200
This commit is contained in:
Mitchell Hentges 2022-04-12 18:40:14 +00:00
parent 7f0d8e4745
commit 15510f9f26
3 changed files with 96 additions and 70 deletions

View File

@ -17,7 +17,6 @@ import shutil
import site
import subprocess
import sys
from collections import OrderedDict
import sysconfig
from pathlib import Path
import tempfile
@ -320,12 +319,8 @@ class MachSiteManager:
if self._site_packages_source == SitePackagesSource.NONE:
return SiteUpToDateResult(True)
elif self._site_packages_source == SitePackagesSource.SYSTEM:
pthfile_lines = [
*self._requirements.pths_as_absolute(self._topsrcdir),
*sys.path,
]
_assert_pip_check(
self._topsrcdir, pthfile_lines, "mach", self._requirements
self._topsrcdir, self._sys_path(), "mach", self._requirements
)
return SiteUpToDateResult(True)
elif self._site_packages_source == SitePackagesSource.VENV:
@ -367,35 +362,17 @@ class MachSiteManager:
if self._site_packages_source == SitePackagesSource.VENV
else sys.executable,
):
if self._site_packages_source == SitePackagesSource.SYSTEM:
# Add our Mach modules to the sys.path.
# Note that "[0:0]" is used to ensure that Mach's modules are prioritized
# over the system modules. Since Mach-env activation happens so early in
# the Mach lifecycle, we can assume that no system packages have been
# imported yet, and this is a safe operation to do.
sys.path[0:0] = self._requirements.pths_as_absolute(self._topsrcdir)
# Remove duplicates (most likely to creep in during nested Mach calls
# that use the "system" python).
sys.path = list(OrderedDict.fromkeys(sys.path))
elif self._site_packages_source == SitePackagesSource.NONE:
# Since the system packages aren't used, clean the sys.path
# down to just the standard library.
sys.path = list(self._metadata.original_python.stdlib_paths())
sys.path[0:0] = self._requirements.pths_as_absolute(self._topsrcdir)
elif self._site_packages_source == SitePackagesSource.VENV:
# Don't activate Mach virtualenv if this Python process was already
# started from the Mach virtualenv.
if Path(sys.prefix) != Path(self._metadata.prefix):
# Since the system packages aren't used, clean the sys.path
# down to just the standard library.
sys.path = list(self._metadata.original_python.stdlib_paths())
# Activate the Mach virtualenv in the current Python context. This
# automatically adds the virtualenv's "site-packages" to our scope, in
# addition to our first-party/vendored modules since they're specified
# in the "mach.pth" file.
activate_virtualenv(self._virtualenv())
# Reset the sys.path to insulate ourselves from the environment.
# This should be safe to do, since activation of the Mach site happens so
# early in the Mach lifecycle that no packages should have been imported
# from external sources yet.
sys.path = self._sys_path()
if self._site_packages_source == SitePackagesSource.VENV:
# Activate the Mach virtualenv in the current Python context. This
# automatically adds the virtualenv's "site-packages" to our scope, in
# addition to our first-party/vendored modules since they're specified
# in the "mach.pth" file.
activate_virtualenv(self._virtualenv())
def _build(self):
if self._site_packages_source != SitePackagesSource.VENV:
@ -413,6 +390,27 @@ class MachSiteManager:
self._metadata,
)
def _sys_path(self):
if self._site_packages_source == SitePackagesSource.SYSTEM:
stdlib_paths, system_site_paths = self._metadata.original_python.sys_path()
return [
*stdlib_paths,
*self._requirements.pths_as_absolute(self._topsrcdir),
*system_site_paths,
]
elif self._site_packages_source == SitePackagesSource.NONE:
stdlib_paths = self._metadata.original_python.sys_path_stdlib()
return [
*stdlib_paths,
*self._requirements.pths_as_absolute(self._topsrcdir),
]
elif self._site_packages_source == SitePackagesSource.VENV:
stdlib_paths = self._metadata.original_python.sys_path_stdlib()
return [
*stdlib_paths,
# self._requirements will be added as part of the virtualenv activation.
]
def _pthfile_lines(self, environment):
return [
# Prioritize vendored and first-party modules first.
@ -702,28 +700,8 @@ class CommandSiteManager:
mach_site_packages_source = self._mach_site_packages_source
if mach_site_packages_source == SitePackagesSource.SYSTEM:
# When Mach is using the system environment, add it next.
stdlib_paths = self._metadata.original_python.stdlib_paths()
system_sys_path = [p for p in sys.path if p not in stdlib_paths]
# When a virtualenv is activated, it implicitly adds some paths to the
# sys.path. When this function is run from such an activated virtualenv,
# we don't want to include its paths in the pthfile because they'd be
# redundant - so, scrub them.
# Note that some platforms include just a site's $site-packages-dir to the
# sys.path, while other platforms (such as Windows) add the $prefix as well.
# We can catch these cases by matching all paths that start with $prefix.
prefix_normalized = os.path.normcase(
os.path.normpath(self._virtualenv.prefix)
)
system_sys_path = [
p
for p in system_sys_path
if not os.path.normcase(os.path.normpath(p)).startswith(
prefix_normalized
)
]
lines.extend(system_sys_path)
_, system_site_paths = self._metadata.original_python.sys_path()
lines.extend(system_site_paths)
elif mach_site_packages_source == SitePackagesSource.VENV:
# When Mach is using its on-disk virtualenv, add its site-packages directory.
assert self._mach_virtualenv_root
@ -738,9 +716,6 @@ class CommandSiteManager:
# source to import from.
lines.extend(_deprioritize_venv_packages(self._virtualenv))
# De-duplicate
lines = list(OrderedDict.fromkeys(lines))
# Note that an on-disk virtualenv is always created for commands, even if they
# are using the system as their site-packages source. This is to support use
# cases where a fresh Python process must be created, but it also must have
@ -804,10 +779,13 @@ class PythonVirtualenv:
return os.path.join(normalized_venv_root, relative_path)
def site_packages_dirs(self):
return [
self.resolve_sysconfig_packages_path("purelib"),
self.resolve_sysconfig_packages_path("platlib"),
]
# De-dupe with set literal in case purelib and platlib are identical
return list(
{
self.resolve_sysconfig_packages_path("purelib"),
self.resolve_sysconfig_packages_path("platlib"),
}
)
def pip_install_with_constraints(self, pip_args):
"""Create a pip constraints file or existing packages
@ -935,19 +913,64 @@ class ExternalPythonSite:
self.python_path = python_executable
@functools.lru_cache(maxsize=None)
def stdlib_paths(self):
stdlib_paths = subprocess.check_output(
def sys_path(self):
"""Return lists of sys.path entries: one for standard library, one for the site
These two lists are calculated at the same time so that we can interpret them
in a single Python subprocess, as running a whole Python instance is
very expensive in the context of Mach initialization.
"""
env = {
k: v
for k, v in os.environ.items()
# Don't include items injected by IDEs into the system path.
if k not in ("PYTHONPATH", "PYDEVD_LOAD_VALUES_ASYNC")
}
stdlib = subprocess.Popen(
[
self.python_path,
# Don't "import site", so we don't include system/user pip-installed
# packages.
# Don't "import site" right away, so we can split the standard library
# paths from the site paths.
"-S",
"-c",
"import sys; print(sys.path)",
"import sys; from collections import OrderedDict; "
# Skip the first item in the sys.path, as it's the working directory
# of the invoked script (so, in this case, "").
# Use list(OrderectDict...) to de-dupe items, such as when using
# pyenv on Linux.
"print(list(OrderedDict.fromkeys(sys.path[1:])))",
],
universal_newlines=True,
env=env,
stdout=subprocess.PIPE,
)
return ast.literal_eval(stdlib_paths)
system = subprocess.Popen(
[
self.python_path,
"-c",
"import sys; import site; " "packages = site.getsitepackages(); "
# Only add the "user site packages" if not in a virtualenv (which is
# identified by the prefix == base_prefix check
"packages.insert(0, site.getusersitepackages()) if "
" sys.prefix == sys.base_prefix else None;"
"print(packages)",
],
universal_newlines=True,
env=env,
stdout=subprocess.PIPE,
)
# Run python processes in parallel - they take roughly the same time, so this
# cuts this functions run time in half.
stdlib_out, _ = stdlib.communicate()
system_out, _ = system.communicate()
assert stdlib.returncode == 0
assert system.returncode == 0
return ast.literal_eval(stdlib_out), ast.literal_eval(system_out)
def sys_path_stdlib(self):
"""Return list of default sys.path entries for the standard library"""
stdlib, _ = self.sys_path()
return stdlib
@functools.lru_cache(maxsize=None)

2
python/sites/lint.txt Normal file
View File

@ -0,0 +1,2 @@

View File

@ -72,6 +72,7 @@ def get_global_excludes(**lintargs):
category="devenv",
description="Run linters.",
parser=setup_argument_parser,
virtualenv_name="lint",
)
def lint(command_context, *runargs, **lintargs):
"""Run linters."""