diff --git a/docs/deployment.rst b/docs/deployment.rst index 0c146756..652c2791 100644 --- a/docs/deployment.rst +++ b/docs/deployment.rst @@ -124,28 +124,55 @@ exodus-gw will continue to function but will skip cache flush operations. Enabling the feature requires the deployment of two sets of configuration. -Firstly, in the ``exodus-gw.ini`` section for the relevant environment, -set ``cache_flush_urls`` to enable cache flush by URL and/or -``cache_flush_arl_templates`` to enable cache flushing by ARL. Both options -can be used together as needed. +Firstly, in ``exodus-gw.ini``, define some cache flush rules under +sections named ``[cache_flush.{rule_name}]``. + +Each rule must define a list of URL/ARL ``templates`` for calculating +the cache keys to flush. Rules may optionally define ``includes`` and +``excludes`` to select specific paths where the rule should be applied. + +Once rules are defined, enable them for a specific environment by listing +them in ``cache_flush_rules`` under that environment's configuration. +See the following example: .. code-block:: ini [env.live] - # Root URL(s) of CDN properties for which to flush cache. - # Several can be provided. - cache_flush_urls = - https://cdn1.example.com - https://cdn2.example.com - - # Templates of ARL(s) for which to flush cache. + # Rule(s) to activate for this environment. + # + # This example supposes that there are two CDN hostnames in use, + # one of which exposes all content *except* a certain subtree + # and one which exposes *only* that subtree. + cache_flush_rules = + cdn1 + cdn2 + + [cache_flush.cdn1] + # URL or ARL template(s) for which to flush cache. + # # Templates can use placeholders: # - path: path of a file under CDN root - # - ttl (optional): a TTL value will be substituted - cache_flush_arl_templates = + # - ttl: a TTL value will be substituted + templates = + https://cdn1.example.com S/=/123/22334455/{ttl}/cdn1.example.com/{path} + + # Suppose that "/files" is restricted to cdn2, then the + # exclusion pattern here will avoid unnecessarily flushing + # cdn1 cache for paths underneath that subtree. + excludes = + ^/files/ + + [cache_flush.cdn2] + templates = + https://cdn2.example.com S/=/123/22334455/{ttl}/cdn2.example.com/{path} + # This rule only applies to this subtree, which was excluded + # from the other rule. + includes = + ^/files/ + Secondly, use environment variables to deploy credentials for the Fast Purge API, according to the below table. The fields here correspond to those used by the `.edgerc file `_ diff --git a/exodus_gw/settings.py b/exodus_gw/settings.py index c20f9bea..ffc4eee2 100644 --- a/exodus_gw/settings.py +++ b/exodus_gw/settings.py @@ -1,5 +1,8 @@ import configparser import os +import re +from collections.abc import Iterable +from dataclasses import dataclass from enum import Enum from typing import Any @@ -26,6 +29,136 @@ def split_ini_list(raw: str | None) -> list[str]: return [elem.strip() for elem in raw.split("\n") if elem.strip()] +@dataclass +class CacheFlushRule: + name: str + """Name of this rule (from the config file).""" + + templates: list[str] + """List of URL/ARL templates. + + Each template may be either: + - a base URL, e.g. "https://cdn.example.com/cdn-root" + - an ARL template, e.g. "S/=/123/22334455/{ttl}/cdn1.example.com/{path}" + + Templates may contain 'ttl' and 'path' placeholders to be substituted + when calculating cache keys for flush. + When there is no 'path' in a template, the path will instead be + appended. + """ + + includes: list[re.Pattern[str]] + """List of patterns applied to decide whether this rule is + applicable to any given path. + + Patterns are non-anchored regular expressions. + A path must match at least one pattern in order for cache flush + to occur for that path. + + There is a default pattern of ".*", meaning that all paths will + be included by default. + + Note that these includes are evaluated *after* the set of paths + for flush have already been filtered to include only entry points + (e.g. repomd.xml and other mutable paths). It is not possible to + use this mechanism to enable cache flushing of non-entry-point + paths. + """ + + excludes: list[re.Pattern[str]] + """List of patterns applied to decide whether this rule should + be skipped for any given path. + + Patterns are non-anchored regular expressions. + If a path matches any pattern, cache flush won't occur. + + excludes are applied after includes. + """ + + def matches(self, path: str) -> bool: + """True if this rule matches the given path.""" + + # We always match against absolute paths with a leading /, + # regardless of how the input was formatted. + path = "/" + path.removeprefix("/") + + # Must match at least one 'includes'. + for pattern in self.includes: + if pattern.search(path): + break + else: + return False + + # Must not match any 'excludes'. + for pattern in self.excludes: + if pattern.search(path): + return False + + return True + + @classmethod + def load_all( + cls: type["CacheFlushRule"], + config: configparser.ConfigParser, + env_section: str, + names: Iterable[str], + ) -> list["CacheFlushRule"]: + + out: list[CacheFlushRule] = [] + for rule_name in names: + section_name = f"cache_flush.{rule_name}" + templates = split_ini_list(config.get(section_name, "templates")) + includes = [ + re.compile(s) + for s in split_ini_list( + config.get(section_name, "includes", fallback=".*") + ) + ] + excludes = [ + re.compile(s) + for s in split_ini_list( + config.get(section_name, "excludes", fallback=None) + ) + ] + out.append( + cls( + name=rule_name, + templates=templates, + includes=includes, + excludes=excludes, + ) + ) + + # backwards-compatibility: if no rules were defined, but old-style + # cache flush config was specified, read it into a rule with default + # 'includes' and 'excludes'. + if not names and ( + config.has_option(env_section, "cache_flush_urls") + or config.has_option(env_section, "cache_flush_arl_templates") + ): + out.append( + cls( + name=f"{env_section}-legacy", + templates=split_ini_list( + config.get( + env_section, "cache_flush_urls", fallback=None + ) + ) + + split_ini_list( + config.get( + env_section, + "cache_flush_arl_templates", + fallback=None, + ) + ), + includes=[re.compile(r".*")], + excludes=[], + ) + ) + + return out + + class Environment(object): def __init__( self, @@ -36,8 +169,7 @@ def __init__( config_table, cdn_url, cdn_key_id, - cache_flush_urls=None, - cache_flush_arl_templates=None, + cache_flush_rules=None, ): self.name = name self.aws_profile = aws_profile @@ -46,10 +178,7 @@ def __init__( self.config_table = config_table self.cdn_url = cdn_url self.cdn_key_id = cdn_key_id - self.cache_flush_urls = split_ini_list(cache_flush_urls) - self.cache_flush_arl_templates = split_ini_list( - cache_flush_arl_templates - ) + self.cache_flush_rules: list[CacheFlushRule] = cache_flush_rules or [] @property def cdn_private_key(self): @@ -63,8 +192,8 @@ def fastpurge_enabled(self) -> bool: are available for this environment. """ return ( - # *at least one* URL or ARL template must be set... - (self.cache_flush_urls or self.cache_flush_arl_templates) + # There must be at least one cache flush rule in config... + bool(self.cache_flush_rules) # ... and *all* fastpurge credentials must be set and self.fastpurge_access_token and self.fastpurge_client_secret @@ -373,9 +502,12 @@ def load_settings() -> Settings: config_table = config.get(env, "config_table", fallback=None) cdn_url = config.get(env, "cdn_url", fallback=None) cdn_key_id = config.get(env, "cdn_key_id", fallback=None) - cache_flush_urls = config.get(env, "cache_flush_urls", fallback=None) - cache_flush_arl_templates = config.get( - env, "cache_flush_arl_templates", fallback=None + + cache_flush_rule_names = split_ini_list( + config.get(env, "cache_flush_rules", fallback=None) + ) + cache_flush_rules = CacheFlushRule.load_all( + config, env, cache_flush_rule_names ) settings.environments.append( @@ -387,8 +519,7 @@ def load_settings() -> Settings: config_table=config_table, cdn_url=cdn_url, cdn_key_id=cdn_key_id, - cache_flush_urls=cache_flush_urls, - cache_flush_arl_templates=cache_flush_arl_templates, + cache_flush_rules=cache_flush_rules, ) ) diff --git a/exodus_gw/worker/cache.py b/exodus_gw/worker/cache.py index f7f6c853..73545f6a 100644 --- a/exodus_gw/worker/cache.py +++ b/exodus_gw/worker/cache.py @@ -81,18 +81,25 @@ def urls_for_flush(self): for p in uris_with_aliases(self.paths, self.aliases) ] - for cdn_base_url in self.env.cache_flush_urls: - for path in path_list: - out.append(os.path.join(cdn_base_url, path)) - - for arl_template in self.env.cache_flush_arl_templates: - for path in path_list: - out.append( - arl_template.format( - path=path, - ttl=self.arl_ttl(path), + for path in path_list: + # Figure out the templates applicable to this path + templates: list[str] = [] + for rule in self.env.cache_flush_rules: + if rule.matches(path): + templates.extend(rule.templates) + + for template in templates: + if "{path}" in template: + # interpret as a template with placeholders + out.append( + template.format( + path=path.removeprefix("/"), + ttl=self.arl_ttl(path), + ) ) - ) + else: + # no {path} placeholder, interpret as a root URL + out.append(os.path.join(template, path)) return out diff --git a/tests/worker/test_cdn_cache.py b/tests/worker/test_cdn_cache.py index 41a67d20..ac4ef6e2 100644 --- a/tests/worker/test_cdn_cache.py +++ b/tests/worker/test_cdn_cache.py @@ -154,14 +154,33 @@ def test_flush_cdn_cache_typical( cdn_url = http://localhost:8049/_/cookie cdn_key_id = XXXXXXXXXXXXXX -cache_flush_urls = - https://cdn1.example.com - https://cdn2.example.com/root +cache_flush_rules = + cdn1 + cdn2 + repomd -cache_flush_arl_templates = +[cache_flush.cdn1] +templates = + https://cdn1.example.com S/=/123/4567/{ttl}/cdn1.example.com/{path} cid=/// +excludes = + /not-cdn1/ + /also-not-cdn1/ + +[cache_flush.cdn2] +templates = + https://cdn2.example.com/root S/=/234/6677/{ttl}/cdn2.example.com/other/{path} x/y/z +[cache_flush.repomd] +templates = + S/=/234/6677/{ttl}/special-repomd-template/{path} x/y/z +# This contrived rule applies to repomd.xml files, but excludes +# one path to test includes and excludes together. +includes = + /repomd\\.xml$ +excludes = + /also """ ) @@ -211,6 +230,8 @@ def test_flush_cdn_cache_typical( # - alias resolution # - treeinfo special case "/path/one/repodata/repomd.xml", + "path/not-cdn1/repodata/repomd.xml", + "path/also-not-cdn1/repodata/repomd.xml", "path/rhui/two/listing", "third/path", "/some/misc/treeinfo", @@ -240,35 +261,145 @@ def test_flush_cdn_cache_typical( # It should have flushed cache for all the expected URLs, # using both the CDN root URLs and the ARL templates - assert sorted(fp_client._purged_urls) == [ - # Used the ARL templates. Note the different TTL values - # for different paths, and also the paths both before and - # after alias resolution are flushed. - "S/=/123/4567/10m/cdn1.example.com/path/rhui/two/listing cid=///", - "S/=/123/4567/10m/cdn1.example.com/path/two/listing cid=///", - # note only the kickstart treeinfo appears, the other is filtered. - "S/=/123/4567/30d/cdn1.example.com/some/kickstart/treeinfo cid=///", - "S/=/123/4567/30d/cdn1.example.com/third/path cid=///", - "S/=/123/4567/4h/cdn1.example.com/path/one-dest/repodata/repomd.xml cid=///", - "S/=/123/4567/4h/cdn1.example.com/path/one/repodata/repomd.xml cid=///", - "S/=/234/6677/10m/cdn2.example.com/other/path/rhui/two/listing x/y/z", - "S/=/234/6677/10m/cdn2.example.com/other/path/two/listing x/y/z", - "S/=/234/6677/30d/cdn2.example.com/other/some/kickstart/treeinfo x/y/z", - "S/=/234/6677/30d/cdn2.example.com/other/third/path x/y/z", - "S/=/234/6677/4h/cdn2.example.com/other/path/one-dest/repodata/repomd.xml x/y/z", - "S/=/234/6677/4h/cdn2.example.com/other/path/one/repodata/repomd.xml x/y/z", - # Used the CDN URL which didn't have a leading path. - "https://cdn1.example.com/path/one-dest/repodata/repomd.xml", - "https://cdn1.example.com/path/one/repodata/repomd.xml", - "https://cdn1.example.com/path/rhui/two/listing", - "https://cdn1.example.com/path/two/listing", - "https://cdn1.example.com/some/kickstart/treeinfo", - "https://cdn1.example.com/third/path", - # Used the CDN URL which had a leading path. - "https://cdn2.example.com/root/path/one-dest/repodata/repomd.xml", - "https://cdn2.example.com/root/path/one/repodata/repomd.xml", - "https://cdn2.example.com/root/path/rhui/two/listing", - "https://cdn2.example.com/root/path/two/listing", - "https://cdn2.example.com/root/some/kickstart/treeinfo", - "https://cdn2.example.com/root/third/path", - ] + assert sorted(fp_client._purged_urls) == sorted( + [ + # Used the ARL templates. Note the different TTL values + # for different paths, and also the paths both before and + # after alias resolution are flushed. + "S/=/123/4567/10m/cdn1.example.com/path/rhui/two/listing cid=///", + "S/=/123/4567/10m/cdn1.example.com/path/two/listing cid=///", + # note only the kickstart treeinfo appears, the other is filtered. + "S/=/123/4567/30d/cdn1.example.com/some/kickstart/treeinfo cid=///", + "S/=/123/4567/30d/cdn1.example.com/third/path cid=///", + "S/=/123/4567/4h/cdn1.example.com/path/one-dest/repodata/repomd.xml cid=///", + "S/=/123/4567/4h/cdn1.example.com/path/one/repodata/repomd.xml cid=///", + "S/=/234/6677/10m/cdn2.example.com/other/path/rhui/two/listing x/y/z", + "S/=/234/6677/10m/cdn2.example.com/other/path/two/listing x/y/z", + "S/=/234/6677/30d/cdn2.example.com/other/some/kickstart/treeinfo x/y/z", + "S/=/234/6677/30d/cdn2.example.com/other/third/path x/y/z", + "S/=/234/6677/4h/cdn2.example.com/other/path/one-dest/repodata/repomd.xml x/y/z", + "S/=/234/6677/4h/cdn2.example.com/other/path/one/repodata/repomd.xml x/y/z", + # note the following two repomd.xml did NOT get flushed against cdn1.example.com + # due to matching 'excludes' + "S/=/234/6677/4h/cdn2.example.com/other/path/not-cdn1/repodata/repomd.xml x/y/z", + "S/=/234/6677/4h/cdn2.example.com/other/path/also-not-cdn1/repodata/repomd.xml x/y/z", + # special rule just for repomd.xml was activated for these paths. + # Note the "also-not-cdn1" path was filtered by an exclude. + "S/=/234/6677/4h/special-repomd-template/path/not-cdn1/repodata/repomd.xml x/y/z", + "S/=/234/6677/4h/special-repomd-template/path/one-dest/repodata/repomd.xml x/y/z", + "S/=/234/6677/4h/special-repomd-template/path/one/repodata/repomd.xml x/y/z", + # Used the CDN URL which didn't have a leading path. + "https://cdn1.example.com/path/one-dest/repodata/repomd.xml", + "https://cdn1.example.com/path/one/repodata/repomd.xml", + "https://cdn1.example.com/path/rhui/two/listing", + "https://cdn1.example.com/path/two/listing", + "https://cdn1.example.com/some/kickstart/treeinfo", + "https://cdn1.example.com/third/path", + # Used the CDN URL which had a leading path. + "https://cdn2.example.com/root/path/one-dest/repodata/repomd.xml", + "https://cdn2.example.com/root/path/one/repodata/repomd.xml", + "https://cdn2.example.com/root/path/not-cdn1/repodata/repomd.xml", + "https://cdn2.example.com/root/path/also-not-cdn1/repodata/repomd.xml", + "https://cdn2.example.com/root/path/rhui/two/listing", + "https://cdn2.example.com/root/path/two/listing", + "https://cdn2.example.com/root/some/kickstart/treeinfo", + "https://cdn2.example.com/root/third/path", + ] + ) + + +def test_flush_cdn_cache_legacy_config( + db: Session, + caplog: pytest.LogCaptureFixture, + mock_boto3_client, + fake_message_id: str, + tmp_path: pathlib.Path, + monkeypatch: pytest.MonkeyPatch, +): + """flush_cdn_cache performs expected cache flushes in + a typical usage scenario where old-style config (no "rules") + has been used in exodus-gw.ini. + """ + + # Write an ini file with some fastpurge stuff under our control. + conf_path = tmp_path / "exodus-gw.ini" + conf_path.write_text( + """ +[env.cachetest] +aws_profile = cachetest +bucket = my-bucket +table = my-table +config_table = my-config + +cdn_url = http://localhost:8049/_/cookie +cdn_key_id = XXXXXXXXXXXXXX + +cache_flush_urls = + https://cdn1.example.com + https://cdn2.example.com + +cache_flush_arl_templates = + S/=/234/6677/{ttl}/cdn1.example.com/other/{path} x/y/z + S/=/234/6677/{ttl}/cdn2.example.com/other/{path} x/y/z +""" + ) + + # Make load_settings use our config file above. + monkeypatch.setenv("EXODUS_GW_INI_PATH", str(conf_path)) + + # Provide some fastpurge credentials + monkeypatch.setenv("EXODUS_GW_FASTPURGE_HOST_CACHETEST", "fphost") + monkeypatch.setenv("EXODUS_GW_FASTPURGE_CLIENT_TOKEN_CACHETEST", "ctok") + monkeypatch.setenv("EXODUS_GW_FASTPURGE_CLIENT_SECRET_CACHETEST", "csec") + monkeypatch.setenv("EXODUS_GW_FASTPURGE_ACCESS_TOKEN_CACHETEST", "atok") + + settings = load_settings() + + task = Task(id=fake_message_id) + task.state = "NOT_STARTED" + db.add(task) + db.commit() + + # It should run to completion... + flush_cdn_cache( + paths=[ + "/path/to/repo1/repodata/repomd.xml", + "/path/to/repo2/repodata/repomd.xml", + ], + env="cachetest", + settings=settings, + ) + + # The task should have succeeded + db.refresh(task) + assert task.state == "COMPLETE" + + # Check how it used the fastpurge client + fp_client = FakeFastPurgeClient.INSTANCE + + # It should have created a client + assert fp_client + + # It should have provided the credentials from env vars + assert fp_client._kwargs["auth"] == { + "access_token": "atok", + "client_secret": "csec", + "client_token": "ctok", + "host": "fphost", + } + + # It should have flushed cache for all the expected URLs + assert sorted(fp_client._purged_urls) == sorted( + [ + # This is flat legacy config which applies the same templates to all paths, + # so cdn1 and cdn2 get exactly the same content flushed. + "S/=/234/6677/4h/cdn1.example.com/other/path/to/repo1/repodata/repomd.xml x/y/z", + "S/=/234/6677/4h/cdn1.example.com/other/path/to/repo2/repodata/repomd.xml x/y/z", + "S/=/234/6677/4h/cdn2.example.com/other/path/to/repo1/repodata/repomd.xml x/y/z", + "S/=/234/6677/4h/cdn2.example.com/other/path/to/repo2/repodata/repomd.xml x/y/z", + "https://cdn1.example.com/path/to/repo1/repodata/repomd.xml", + "https://cdn1.example.com/path/to/repo2/repodata/repomd.xml", + "https://cdn2.example.com/path/to/repo1/repodata/repomd.xml", + "https://cdn2.example.com/path/to/repo2/repodata/repomd.xml", + ] + )