Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Deprecate conda_build.utils.samefile in favor of path to package mapping #5130

Merged
merged 10 commits into from
Jan 4, 2024
Merged
45 changes: 37 additions & 8 deletions conda_build/inspect_pkg.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,13 @@
)

from .deprecations import deprecated
from .utils import on_mac, on_win, samefile
from .utils import on_mac, on_win


@deprecated("3.28.0", "24.1.0")
@lru_cache(maxsize=None)
def dist_files(prefix: str | os.PathLike | Path, dist: Dist) -> set[str]:
if (prec := PrefixData(prefix).get(dist.name, None)) is None:
if (prec := PrefixData(str(prefix)).get(dist.name, None)) is None:
return set()
elif MatchSpec(dist).match(prec):
return set(prec["files"])
Expand All @@ -62,19 +62,48 @@ def which_package(
path: str | os.PathLike | Path,
prefix: str | os.PathLike | Path,
) -> Iterable[PrefixRecord]:
"""
"""Detect which package(s) a path belongs to.

Given the path (of a (presumably) conda installed file) iterate over
the conda packages the file came from. Usually the iteration yields
only one package.

We use lstat since a symlink doesn't clobber the file it points to.
"""
prefix = Path(prefix)
# historically, path was relative to prefix just to be safe we append to prefix
# (pathlib correctly handles this even if path is absolute)
path = prefix / path

# historically, path was relative to prefix, just to be safe we append to prefix
# get lstat before calling _file_package_mapping in case path doesn't exist
try:
lstat = (prefix / path).lstat()
except FileNotFoundError:
# FileNotFoundError: path doesn't exist
return
else:
yield from _file_package_mapping(prefix).get(lstat, ())


@lru_cache(maxsize=None)
def _file_package_mapping(prefix: Path) -> dict[os.stat_result, set[PrefixRecord]]:
"""Map paths to package records.

We use lstat since a symlink doesn't clobber the file it points to.
"""
mapping: dict[os.stat_result, set[PrefixRecord]] = {}
for prec in PrefixData(str(prefix)).iter_records():
if any(samefile(prefix / file, path) for file in prec["files"]):
yield prec
for file in prec["files"]:
# packages are capable of removing files installed by other dependencies from
# the build prefix, in those cases lstat will fail, while which_package wont
# return the correct package(s) in such a condition we choose to not worry about
# it since this file to package lookup exists primarily to detect clobbering
try:
lstat = (prefix / file).lstat()
except FileNotFoundError:
# FileNotFoundError: path doesn't exist
continue
else:
mapping.setdefault(lstat, set()).add(prec)
return mapping


def print_object_info(info, key):
Expand Down
1 change: 1 addition & 0 deletions conda_build/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2222,6 +2222,7 @@ def is_conda_pkg(pkg_path: str) -> bool:
)


@deprecated("3.28.3", "24.1.0")
def samefile(path1: Path, path2: Path) -> bool:
try:
return path1.samefile(path2)
Expand Down
19 changes: 19 additions & 0 deletions news/5126-samefile-regression
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
### Enhancements

* <news item>

### Bug fixes

* Update `which_package` to use a cached mapping of files to packages (`O(1)`) instead of relying on `Path.samefile` comparisons (`O(n * m)`). (#5126 via #5130)

### Deprecations

* <news item>

### Docs

* <news item>

### Other

* <news item>
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -129,4 +129,5 @@ markers = [
"slow: execute the slow tests if active",
"sanity: execute the sanity tests",
"no_default_testing_config: used internally to disable monkeypatching for testing_config",
"benchmark: execute the benchmark tests",
]
17 changes: 11 additions & 6 deletions tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import pytest

from conda_build.config import Config, get_or_merge_config
from conda_build.utils import on_win, samefile
from conda_build.utils import on_win


@pytest.fixture
Expand Down Expand Up @@ -39,20 +39,25 @@ def test_keep_old_work(config: Config, build_id: str, tmp_path: Path):
config.croot = tmp_path
config.build_id = build_id

magic = "a_touched_file.magic"

# empty working directory
orig_dir = Path(config.work_dir)
assert orig_dir.exists()
assert not len(os.listdir(config.work_dir))

# touch a file so working directory is not empty
(orig_dir / "a_touched_file.magic").touch()
assert len(os.listdir(config.work_dir))
(orig_dir / magic).touch()
assert orig_dir.exists()
assert len(os.listdir(config.work_dir)) == 1
assert Path(config.work_dir, magic).exists()

config.compute_build_id("a_new_name", reset=True)

# working directory should still exist and have the touched file
assert not samefile(orig_dir, config.work_dir)
# working directory should still exist (in new location) and have the touched file
assert not orig_dir.exists()
assert len(os.listdir(config.work_dir))
assert len(os.listdir(config.work_dir)) == 1
assert Path(config.work_dir, magic).exists()


@pytest.mark.skipif(on_win, reason="Windows uses only the short prefix")
Expand Down
74 changes: 68 additions & 6 deletions tests/test_inspect_pkg.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,11 @@
from __future__ import annotations

import json
import os
from pathlib import Path
from uuid import uuid4

import pytest
from conda.core.prefix_data import PrefixData

from conda_build.inspect_pkg import which_package
Expand Down Expand Up @@ -100,20 +103,79 @@ def test_which_package(tmp_path: Path):

precs_hardlinkA = list(which_package(tmp_path / "hardlinkA", tmp_path))
assert len(precs_hardlinkA) == 1
assert precs_hardlinkA[0] == precA
assert set(precs_hardlinkA) == {precA}

precs_shared = list(which_package(tmp_path / "shared", tmp_path))
assert len(precs_shared) == 2
assert set(precs_shared) == {precA, precB}

precs_internal = list(which_package(tmp_path / "internal", tmp_path))
assert len(precs_internal) == 1
assert precs_internal[0] == precA
assert set(precs_internal) == {precA}

precs_external = list(which_package(tmp_path / "external", tmp_path))
assert len(precs_external) == 2
assert set(precs_external) == {precA, precB}
assert len(precs_external) == 1
assert set(precs_external) == {precA}

precs_hardlinkB = list(which_package(tmp_path / "hardlinkB", tmp_path))
assert len(precs_hardlinkB) == 2
assert set(precs_hardlinkB) == {precA, precB}
assert len(precs_hardlinkB) == 1
assert set(precs_hardlinkB) == {precB}


@pytest.mark.benchmark
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Noice!! This'll go well with codspeed.

def test_which_package_battery(tmp_path: Path):
# regression: https://github.com/conda/conda-build/issues/5126
# create a dummy environment
(tmp_path / "conda-meta").mkdir()
(tmp_path / "conda-meta" / "history").touch()
(tmp_path / "lib").mkdir()

# dummy packages with files
removed = []
for _ in range(100):
name = f"package_{uuid4().hex}"

# mock a package with 100 files
files = [f"lib/{uuid4().hex}" for _ in range(100)]
for file in files:
(tmp_path / file).touch()

# mock a removed file
remove = f"lib/{uuid4().hex}"
files.append(remove)
removed.append(remove)

(tmp_path / "conda-meta" / f"{name}-1-0.json").write_text(
json.dumps(
{
"build": "0",
"build_number": 0,
"channel": f"{name}-channel",
"files": files,
"name": name,
"paths_data": {
"paths": [
{"_path": file, "path_type": "hardlink", "size_in_bytes": 0}
for file in files
],
"paths_version": 1,
},
"version": "1",
}
)
)

# every path should return exactly one package
for subdir, _, files in os.walk(tmp_path / "lib"):
for file in files:
path = Path(subdir, file)

assert len(list(which_package(path, tmp_path))) == 1

# removed files should return no packages
# this occurs when, e.g., a package removes files installed by another package
for file in removed:
assert not len(list(which_package(tmp_path / file, tmp_path)))

# missing files should return no packages
assert not len(list(which_package(tmp_path / "missing", tmp_path)))
Loading