Miscellaneous render speedups #5225

Closed · wants to merge 10 commits
6 changes: 4 additions & 2 deletions conda_build/config.py
@@ -8,6 +8,7 @@
 import copy
 import math
 import os
+import pickle
 import re
 import shutil
 import time
@@ -822,9 +823,10 @@ def clean_pkgs(self):

     def copy(self):
         new = copy.copy(self)
-        new.variant = copy.deepcopy(self.variant)
+        # Use pickle.loads(pickle.dumps(...)) as a faster copy.deepcopy alternative.
+        new.variant = pickle.loads(pickle.dumps(self.variant, -1))
         if hasattr(self, "variants"):
-            new.variants = copy.deepcopy(self.variants)
+            new.variants = pickle.loads(pickle.dumps(self.variants, -1))
         return new

     # context management - automatic cleanup if self.dirty or self.keep_old_work is not True
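For context, the pickle round-trip used above is a well-known fast path for deep-copying plain data structures. A minimal standalone sketch (with illustrative variant data, not taken from this PR) comparing the two:

```python
import copy
import pickle
import timeit

# Illustrative variant dict; real conda-build variants are similarly nested
# plain data (dicts/lists/strings), which is what makes pickling safe here.
variant = {
    "python": ["3.9", "3.10", "3.11"],
    "numpy": ["1.23"],
    "zip_keys": [["python", "numpy"]],
}

deepcopy_t = timeit.timeit(lambda: copy.deepcopy(variant), number=10_000)
# protocol -1 selects the highest available pickle protocol, as in the PR
pickle_t = timeit.timeit(
    lambda: pickle.loads(pickle.dumps(variant, -1)), number=10_000
)
print(f"deepcopy: {deepcopy_t:.3f}s, pickle round-trip: {pickle_t:.3f}s")
```

The trade-off: the pickle round-trip only works for picklable values and ignores `__deepcopy__` hooks, which is acceptable for plain variant data.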
136 changes: 109 additions & 27 deletions conda_build/metadata.py
@@ -109,6 +109,31 @@ def remove_constructor(cls, tag):
 # used to avoid recomputing/rescanning recipe contents for used variables
 used_vars_cache = {}

+# Placeholder singletons for os/os.environ used to memoize select_lines().
+_selector_placeholder_os = object()
+_selector_placeholder_os_environ = object()
+
+
+@lru_cache(maxsize=200)
Comment (Member Author):

These maxsize=200 values are more or less arbitrarily chosen. The default is 128 IIRC, but I wanted to specify some (reasonable) number so that the temptation to add maxsize=None later is lessened. We may deliberately not want to use maxsize=None, to avoid unbounded resource usage downstream, e.g. in case multiple recipes are rendered in the same process.
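For readers unfamiliar with the trade-off being discussed: a bounded lru_cache evicts the least-recently-used entry once maxsize is reached, while maxsize=None grows without limit. A small illustration:

```python
from functools import lru_cache

@lru_cache(maxsize=2)
def square(x):
    print(f"computing {x}")
    return x * x

square(1)  # miss: computes
square(2)  # miss: computes
square(1)  # hit: served from cache
square(3)  # miss: computes, evicts 2 (least recently used)
square(2)  # miss again: was evicted, so recomputed
print(square.cache_info())  # CacheInfo(hits=1, misses=4, maxsize=2, currsize=2)
```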

Comment (Member):

Memoizing compiled patterns might give a slight advantage here.

Comment (Member Author):

As mentioned by @dholth below, the re module already caches compiled patterns. In contrast to re's module-level functions, the wrappers here cache not only on the pattern but also on the searched string. This is because we have redundant re.*(same_pattern, same_string) calls due to the repeated re-processing mentioned in #5224 (comment). As such, the memoizations here do not try to sidestep the recompilation of patterns but the actual regex searches/matches.
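To make the distinction concrete: re keeps an internal cache of compiled patterns (512 entries in current CPython), so repeated re.search(pattern, ...) calls skip compilation, but each call still scans the string. Memoizing on (pattern, string) skips the scan as well. A rough sketch of the difference:

```python
import re
import timeit
from functools import lru_cache

@lru_cache(maxsize=200)
def re_search_memoized(pattern, text, flags=0):
    return re.search(pattern, text, flags)

text = "x" * 100_000 + "needle"

# compiled-pattern cache only: still rescans 100k chars on every call
plain = timeit.timeit(lambda: re.search("needle$", text), number=1_000)
# result memoization: scans once, then returns the cached Match object
memo = timeit.timeit(lambda: re_search_memoized("needle$", text), number=1_000)
print(f"re.search: {plain:.4f}s, memoized: {memo:.4f}s")
```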

+def re_findall(pattern, text, flags=0):
+    if isinstance(pattern, re.Pattern):
+        return pattern.findall(text, flags)
+    return re.findall(pattern, text, flags)
+
+
+@lru_cache(maxsize=200)
+def re_match(pattern, text, flags=0):
+    if isinstance(pattern, re.Pattern):
+        return pattern.match(text, flags)
+    return re.match(pattern, text, flags)
+
+
+@lru_cache(maxsize=200)
+def re_search(pattern, text, flags=0):
+    if isinstance(pattern, re.Pattern):
+        return pattern.search(text, flags)
+    return re.search(pattern, text, flags)
Comment on lines +118 to +135
Comment (Member):

Not a fan of this repetition and the pattern of having to cache regex.

Comment (Member Author):

Personally, I'm also triggered by unnecessary repetition, but I'm fine with it in this case since it should be a set-and-forget thing.

That said, it would make sense to profile this again (for majorly affected cases) to see which of these memoization wrappers have individually significant impact. Meaning, we possibly only need a subset of them to get roughly the same results.
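One cheap way to gather that data, since lru_cache exposes hit/miss counters (a hypothetical profiling snippet, not part of the PR):

```python
from conda_build import metadata

# ... render a representative set of recipes first, then:
for wrapper in (metadata.re_findall, metadata.re_match, metadata.re_search):
    # a high hit count relative to misses means the wrapper is pulling its weight
    print(wrapper.__name__, wrapper.cache_info())
```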



 def get_selectors(config: Config) -> dict[str, bool]:
     """Aggregates selectors for use in recipe templating.
@@ -241,7 +266,7 @@ def ns_cfg(config: Config) -> dict[str, bool]:
# "NameError: name 'var' is not defined", where var is the variable that is not defined. This gets
# returned
def parseNameNotFound(error):
m = re.search("'(.+?)'", str(error))
m = re_search("'(.+?)'", str(error))
if len(m.groups()) == 1:
return m.group(1)
else:
@@ -268,6 +293,58 @@ def eval_selector(selector_string, namespace, variants_in_place):


 def select_lines(data, namespace, variants_in_place):
+    # Try to turn namespace into a hashable representation for memoization.
+    try:
+        namespace_copy = namespace.copy()
+        if namespace_copy.get("os") is os:
+            namespace_copy["os"] = _selector_placeholder_os
+        if namespace_copy.get("environ") is os.environ:
+            namespace_copy["environ"] = _selector_placeholder_os_environ
+        if "pin_run_as_build" in namespace_copy:
+            # This raises TypeError if pin_run_as_build is not a dict of dicts.
+            try:
+                namespace_copy["pin_run_as_build"] = tuple(
+                    (key, tuple((k, v) for k, v in value.items()))
+                    for key, value in namespace_copy["pin_run_as_build"].items()
+                )
+            except (AttributeError, TypeError, ValueError):
+                # AttributeError: no .items method
+                # TypeError: .items() not iterable of iterables
+                # ValueError: .items() not iterable of only (k, v) tuples
+                raise TypeError
+        for k, v in namespace_copy.items():
+            # Convert lists/sets/tuples to tuples (of tuples if they contain
+            # list/set elements). Copy any other type verbatim and rather fall
+            # back to the non-memoized version to avoid wrong/lossy conversions.
+            if isinstance(v, (list, set, tuple)):
+                namespace_copy[k] = tuple(
+                    tuple(e) if isinstance(e, (list, set)) else e for e in v
+                )
+        namespace_tuple = tuple(namespace_copy.items())
+        # Raise TypeError if anything in namespace_tuple is not hashable.
+        hash(namespace_tuple)
+    except TypeError:
+        return _select_lines(data, namespace, variants_in_place)
+    return _select_lines_memoized(data, namespace_tuple, variants_in_place)
Comment on lines +296 to +328
Comment (Member):

This needs tests to verify that it's doing the right thing.

Comment (Member Author):

+1 Makes sense to add/extend tests for select_lines which ensure repeated calls to select_lines and the non-memoized _select_lines always yield the same results.

Comment (Contributor):

Enabling CodSpeed benchmarks and adding a new test_select_lines_battery test in #5233 so we can better measure improvements.
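A sketch of the kind of equivalence test being discussed, assuming the names select_lines/_select_lines land as shown in this diff (the real benchmark battery is in #5233):

```python
import os
from conda_build.metadata import select_lines, _select_lines

def test_select_lines_memoized_matches_unmemoized():
    data = "always\nlinux-only  # [linux]\nwin-only  # [win]\n"
    namespace = {"linux": True, "win": False, "os": os, "environ": os.environ}
    expected = _select_lines(data, namespace, variants_in_place=False)
    # repeated calls exercise both the cold and the warm cache path
    for _ in range(3):
        assert select_lines(data, dict(namespace), variants_in_place=False) == expected
```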



+@lru_cache(maxsize=200)
+def _select_lines_memoized(data, namespace_tuple, variants_in_place):
+    # Convert namespace_tuple to dict and undo the os/environ/pin_run_as_build
+    # replacements done in select_lines.
+    namespace = dict(namespace_tuple)
+    if namespace.get("os") is _selector_placeholder_os:
+        namespace["os"] = os
+    if namespace.get("environ") is _selector_placeholder_os_environ:
+        namespace["environ"] = os.environ
+    if "pin_run_as_build" in namespace:
+        namespace["pin_run_as_build"] = {
+            key: dict(value) for key, value in namespace["pin_run_as_build"]
+        }
+    return _select_lines(data, namespace, variants_in_place)
+
+
+def _select_lines(data, namespace, variants_in_place):
     lines = []

     for i, line in enumerate(data.splitlines()):
@@ -280,8 +357,7 @@ def select_lines(data, namespace, variants_in_place):
         if line.lstrip().startswith("#"):
             # Don't bother with comment only lines
             continue
-        m = sel_pat.match(line)
-        if m:
+        if "[" in line and "]" in line and (m := re_match(sel_pat, line)):
             cond = m.group(3)
             try:
                 if eval_selector(cond, namespace, variants_in_place):
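The rewritten line combines two micro-optimizations: a cheap substring test that skips the regex entirely for lines that cannot contain a selector, and an assignment expression (Python 3.8+) that folds the match-and-test into one condition. Schematically, with a simplified selector pattern (not conda-build's exact sel_pat):

```python
import re

# simplified stand-in for sel_pat; for illustration only
simple_sel_pat = re.compile(r"(.+?)\s*#\s*\[(.+)\]\s*$")

line = "  - python  # [py>=38]"
# most recipe lines have no brackets, so the `in` checks reject them
# without ever invoking the regex engine
if "[" in line and "]" in line and (m := simple_sel_pat.match(line)):
    print(m.group(2))  # -> py>=38
```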
@@ -888,8 +964,8 @@ def get_output_dicts_from_metadata(metadata, outputs=None):
         outputs.append(OrderedDict(name=metadata.name()))
     for out in outputs:
         if (
-            "package:" in metadata.get_recipe_text()
-            and out.get("name") == metadata.name()
+            out.get("name") == metadata.name()
+            and "package:" in metadata.get_recipe_text()
         ):
             combine_top_level_metadata_with_output(metadata, out)
     return outputs
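The swap above is pure short-circuit economics: `and` never evaluates its right operand when the left is falsy, so the cheap name comparison should come before the call that re-extracts the whole recipe text. A toy demonstration:

```python
import timeit

def cheap():
    return False  # e.g. out.get("name") == metadata.name()

def expensive():
    return "package:" in ("x" * 1_000_000)  # e.g. scanning recipe text

slow = timeit.timeit(lambda: expensive() and cheap(), number=1_000)
fast = timeit.timeit(lambda: cheap() and expensive(), number=1_000)
print(f"expensive first: {slow:.3f}s, cheap first: {fast:.3f}s")
```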
@@ -1014,9 +1090,10 @@ def get_updated_output_dict_from_reparsed_metadata(original_dict, new_outputs):
     return output_d


+@lru_cache(maxsize=200)
 def _filter_recipe_text(text, extract_pattern=None):
     if extract_pattern:
-        match = re.search(extract_pattern, text, flags=re.MULTILINE | re.DOTALL)
+        match = re_search(extract_pattern, text, flags=re.MULTILINE | re.DOTALL)
         text = (
             "\n".join({string for string in match.groups() if string}) if match else ""
         )
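One caveat worth keeping in mind with @lru_cache on functions like this (and re_findall above) that return mutable objects: every caller receives the same cached instance, so results must be treated as read-only. A minimal illustration of the hazard:

```python
from functools import lru_cache

@lru_cache(maxsize=200)
def tokens(text):
    return text.split()

first = tokens("a b")
first.append("c")     # mutates the object stored in the cache
print(tokens("a b"))  # ['a', 'b', 'c'] -- later callers see the mutation
```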
@@ -1622,7 +1699,7 @@ def get_hash_contents(self):
         dependencies = [
             req
             for req in dependencies
-            if not exclude_pattern.match(req) or " " in self.config.variant[req]
+            if not re_match(exclude_pattern, req) or " " in self.config.variant[req]
         ]

         # retrieve values - this dictionary is what makes up the hash.
@@ -1666,12 +1743,11 @@ def build_id(self):
             raise RuntimeError(
                 f"Couldn't extract raw recipe text for {self.name()} output"
             )
-        raw_recipe_text = self.extract_package_and_build_text()
-        raw_manual_build_string = re.search(r"\s*string:", raw_recipe_text)
+        raw_manual_build_string = re_search(r"\s*string:", raw_recipe_text)
         # user setting their own build string. Don't modify it.
         if manual_build_string and not (
             raw_manual_build_string
-            and re.findall(r"h\{\{\s*PKG_HASH\s*\}\}", raw_manual_build_string.string)
+            and re_findall(r"h\{\{\s*PKG_HASH\s*\}\}", raw_manual_build_string.string)
         ):
             check_bad_chrs(manual_build_string, "build/string")
             out = manual_build_string
@@ -1680,7 +1756,7 @@
             out = build_string_from_metadata(self)
         if self.config.filename_hashing and self.final:
             hash_ = self.hash_dependencies()
-            if not re.findall("h[0-9a-f]{%s}" % self.config.hash_length, out):
+            if not re_findall("h[0-9a-f]{%s}" % self.config.hash_length, out):
                 ret = out.rsplit("_", 1)
                 try:
                     int(ret[0])
@@ -2038,7 +2114,7 @@ def uses_jinja(self):
             return False
         with open(self.meta_path, "rb") as f:
             meta_text = UnicodeDammit(f.read()).unicode_markup
-        matches = re.findall(r"{{.*}}", meta_text)
+        matches = re_findall(r"{{.*}}", meta_text)
         return len(matches) > 0

     @property
@@ -2053,7 +2129,7 @@ def uses_vcs_in_meta(self) -> Literal["git", "svn", "mercurial"] | None:
         with open(self.meta_path, "rb") as f:
             meta_text = UnicodeDammit(f.read()).unicode_markup
         for _vcs in vcs_types:
-            matches = re.findall(rf"{_vcs.upper()}_[^\.\s\'\"]+", meta_text)
+            matches = re_findall(rf"{_vcs.upper()}_[^\.\s\'\"]+", meta_text)
             if len(matches) > 0 and _vcs != self.get_value("package/name"):
                 if _vcs == "hg":
                     _vcs = "mercurial"
@@ -2076,7 +2152,7 @@ def uses_vcs_in_build(self) -> Literal["git", "svn", "mercurial"] | None:
             # 1. the vcs command, optionally with an exe extension
             # 2. a subcommand - for example, "clone"
             # 3. a target url or other argument
-            matches = re.findall(
+            matches = re_findall(
                 rf"{vcs}(?:\.exe)?(?:\s+\w+\s+[\w\/\.:@]+)",
                 build_script,
                 flags=re.IGNORECASE,
@@ -2146,7 +2222,7 @@ def extract_single_output_text(
         # first, need to figure out which index in our list of outputs the name matches.
         # We have to do this on rendered data, because templates can be used in output names
         recipe_text = self.extract_outputs_text(apply_selectors=apply_selectors)
-        output_matches = output_re.findall(recipe_text)
+        output_matches = re_findall(output_re, recipe_text)
         outputs = self.meta.get("outputs") or (
             self.parent_outputs if hasattr(self, "parent_outputs") else None
         )
@@ -2187,15 +2263,15 @@ def numpy_xx(self) -> bool:
         """This is legacy syntax that we need to support for a while. numpy x.x means
         "pin run as build" for numpy. It was special-cased to only numpy."""
         text = self.extract_requirements_text()
-        return bool(numpy_xx_re.search(text))
+        return bool(re_search(numpy_xx_re, text))

     @property
     def uses_numpy_pin_compatible_without_xx(self) -> tuple[bool, bool]:
         text = self.extract_requirements_text()
-        compatible_search = numpy_compatible_re.search(text)
+        compatible_search = re_search(numpy_compatible_re, text)
         max_pin_search = None
         if compatible_search:
-            max_pin_search = numpy_compatible_x_re.search(text)
+            max_pin_search = re_search(numpy_compatible_x_re, text)
         # compatible_search matches simply use of pin_compatible('numpy')
         # max_pin_search quantifies the actual number of x's in the max_pin field. The max_pin
         # field can be absent, which is equivalent to a single 'x'
@@ -2212,21 +2288,22 @@ def uses_subpackage(self):
             if "name" in out:
                 name_re = re.compile(r"^{}(\s|\Z|$)".format(out["name"]))
                 in_reqs = any(
-                    name_re.match(req) for req in self.get_depends_top_and_out("run")
+                    re_match(name_re, req)
+                    for req in self.get_depends_top_and_out("run")
                 )
                 if in_reqs:
                     break
         subpackage_pin = False
         if not in_reqs and self.meta_path:
             data = self.extract_requirements_text(force_top_level=True)
             if data:
-                subpackage_pin = re.search(r"{{\s*pin_subpackage\(.*\)\s*}}", data)
+                subpackage_pin = re_search(r"{{\s*pin_subpackage\(.*\)\s*}}", data)
         return in_reqs or bool(subpackage_pin)

     @property
     def uses_new_style_compiler_activation(self):
         text = self.extract_requirements_text()
-        return bool(re.search(r"\{\{\s*compiler\(.*\)\s*\}\}", text))
+        return bool(re_search(r"\{\{\s*compiler\(.*\)\s*\}\}", text))

     def validate_features(self):
         if any(
@@ -2287,7 +2364,7 @@ def variant_in_source(self):
                 # We use this variant in the top-level recipe.
                 # constrain the stored variants to only this version in the output
                 # variant mapping
-                if re.search(
+                if re_search(
                     r"\s*\{\{\s*%s\s*(?:.*?)?\}\}" % key, self.extract_source_text()
                 ):
                     return True
@@ -2371,15 +2448,19 @@ def get_output_metadata(self, output):
                 )
                 if build_reqs:
                     build_reqs = [
-                        req for req in build_reqs if not subpackage_pattern.match(req)
+                        req
+                        for req in build_reqs
+                        if not re_match(subpackage_pattern, req)
                     ]
                 if host_reqs:
                     host_reqs = [
-                        req for req in host_reqs if not subpackage_pattern.match(req)
+                        req
+                        for req in host_reqs
+                        if not re_match(subpackage_pattern, req)
                     ]
                 if run_reqs:
                     run_reqs = [
-                        req for req in run_reqs if not subpackage_pattern.match(req)
+                        req for req in run_reqs if not re_match(subpackage_pattern, req)
                     ]

             requirements = {}
@@ -2642,12 +2723,13 @@ def get_loop_vars(self):
         return variants.get_vars(_variants, loop_only=True)

     def get_used_loop_vars(self, force_top_level=False, force_global=False):
+        loop_vars = set(self.get_loop_vars())
         return {
             var
             for var in self.get_used_vars(
                 force_top_level=force_top_level, force_global=force_global
             )
-            if var in self.get_loop_vars()
+            if var in loop_vars
         }

     def get_rendered_recipe_text(
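The one-line addition above is classic loop-invariant hoisting: the original comprehension re-ran self.get_loop_vars() for every candidate variable, and converting the hoisted result to a set also turns each membership test into an O(1) lookup. The general shape:

```python
def loop_vars_source():
    # stand-in for self.get_loop_vars(); imagine this call is expensive
    return ["python", "numpy"]

candidates = [f"var{i}" for i in range(1000)] + ["python"]

# before: loop_vars_source() runs once per candidate (1001 times)
slow = {v for v in candidates if v in loop_vars_source()}

# after: one call, one set, O(1) lookups
loop_vars = set(loop_vars_source())
fast = {v for v in candidates if v in loop_vars}
assert slow == fast
```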
@@ -2813,7 +2895,7 @@ def _get_used_vars_meta_yaml_helper(
         reqs_re = re.compile(
             r"requirements:.+?(?=^\w|\Z|^\s+-\s(?=name|type))", flags=re.M | re.S
         )
-        reqs_text = reqs_re.search(recipe_text)
+        reqs_text = re_search(reqs_re, recipe_text)
         reqs_text = reqs_text.group() if reqs_text else ""

         return reqs_text, recipe_text
18 changes: 10 additions & 8 deletions conda_build/variants.py
@@ -707,16 +707,14 @@ def get_package_variants(recipedir_or_metadata, config=None, variants=None):
 def get_vars(variants, loop_only=False):
     """For purposes of naming/identifying, provide a way of identifying which variables contribute
     to the matrix dimensionality"""
+    first_variant, *other_variants = variants
     special_keys = {"pin_run_as_build", "zip_keys", "ignore_version"}
-    special_keys.update(set(ensure_list(variants[0].get("extend_keys"))))
+    special_keys.update(set(ensure_list(first_variant.get("extend_keys"))))
     loop_vars = [
         k
-        for k in variants[0]
+        for k, v in first_variant.items()
         if k not in special_keys
-        and (
-            not loop_only
-            or any(variant[k] != variants[0][k] for variant in variants[1:])
-        )
+        and (not loop_only or any(variant[k] != v for variant in other_variants))
     ]
     return loop_vars

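The rewrite leans on starred unpacking to name the first variant once and on dict.items() to avoid re-indexing variants[0][k] inside the inner loop. Reduced to its essentials:

```python
variants = [
    {"python": "3.9", "numpy": "1.23"},
    {"python": "3.10", "numpy": "1.23"},
]
first_variant, *other_variants = variants

# keys whose values differ across variants are the loop (matrix) dimensions
loop_vars = [
    k
    for k, v in first_variant.items()
    if any(variant[k] != v for variant in other_variants)
]
print(loop_vars)  # ['python']
```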
@@ -773,7 +771,9 @@ def find_used_variables_in_shell_script(variant, file_path):
         text = f.read()
     used_variables = set()
     for v in variant:
-        variant_regex = r"(^[^$]*?\$\{?\s*%s\s*[\s|\}])" % v
+        if v not in text:
+            continue
+        variant_regex = rf"(^[^$]*?\$\{{?\s*{re.escape(v)}\s*[\s|\}}])"
         if re.search(variant_regex, text, flags=re.MULTILINE | re.DOTALL):
             used_variables.add(v)
     return used_variables
@@ -784,7 +784,9 @@ def find_used_variables_in_batch_script(variant, file_path):
         text = f.read()
     used_variables = set()
     for v in variant:
-        variant_regex = r"\%" + v + r"\%"
+        if v not in text:
+            continue
+        variant_regex = rf"\%{re.escape(v)}\%"
         if re.search(variant_regex, text, flags=re.MULTILINE | re.DOTALL):
             used_variables.add(v)
     return used_variables
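Both script scanners now apply the same two fixes: a plain `v not in text` containment test that skips the regex for absent variables, and re.escape so variant names containing regex metacharacters (e.g. "c++") cannot break or mis-anchor the pattern. A condensed standalone version:

```python
import re

text = "REM build script\nset PY=%PYTHON%\necho %c++_compiler%\n"
variant = ["PYTHON", "c++_compiler", "unused_var"]

used = set()
for v in variant:
    if v not in text:  # cheap rejection before any regex work
        continue
    # re.escape: '+' would otherwise be a quantifier and never match literally
    if re.search(rf"\%{re.escape(v)}\%", text):
        used.add(v)
print(used)  # {'PYTHON', 'c++_compiler'}
```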
19 changes: 19 additions & 0 deletions news/5225-render-misc-speedups
@@ -0,0 +1,19 @@
+### Enhancements
+
+* Miscellaneous recipe rendering speed ups. (#5224 via #5225)
+
+### Bug fixes
+
+* <news item>
+
+### Deprecations
+
+* <news item>
+
+### Docs
+
+* <news item>
+
+### Other
+
+* <news item>