From d9abb29ca34c8956ab8885359eff81a383ae51ab Mon Sep 17 00:00:00 2001 From: Rohan McGovern Date: Tue, 13 Aug 2024 12:56:32 +1000 Subject: [PATCH] Fix rhui_alias resolution during cache flush [RHELDST-26143] It was intended to take RHUI aliases into account during cache flush, but this was not implemented correctly. The main issue was that alias resolution was only implemented in the direction of (src => dest), but in the case of RHUI, we are always publishing content on the destination (non-RHUI) side of the alias. Hence we need to also consider the aliases in the (dest => src) direction to figure out that the src (RHUI) side of the alias is relevant for cache flush. A second issue was that, if multiple aliases were involved in a path, we would not flush cache for all of them. This couldn't work because the uri_alias function was designed to return only a single value, after resolution of all aliases. When multiple aliases are in play, content is actually accessible at all the paths calculated during each step of the resolution, so we need to keep track of intermediate paths and flush cache for all of them. This now behaves correctly in the typical case of both $releasever and RHUI aliases being in play, e.g. for /content/dist/rhel9/9/foo, it will now correctly calculate the 4 typical paths under which content is accessible: - /content/dist/rhel9/9/foo - /content/dist/rhel9/9.4/foo - /content/dist/rhel9/rhui/9/foo - /content/dist/rhel9/rhui/9.4/foo --- exodus_gw/aws/dynamodb.py | 75 +++++++++++++++++-- exodus_gw/aws/util.py | 133 ++++++++++++++++++++++++--------- tests/aws/test_uri_alias.py | 109 ++++++++++++++++++++++++++- tests/worker/test_cdn_cache.py | 12 +-- tests/worker/test_publish.py | 7 +- 5 files changed, 286 insertions(+), 50 deletions(-) diff --git a/exodus_gw/aws/dynamodb.py b/exodus_gw/aws/dynamodb.py index 395f5fd2..8d4b5ca1 100644 --- a/exodus_gw/aws/dynamodb.py +++ b/exodus_gw/aws/dynamodb.py @@ -60,15 +60,77 @@ def _aliases(self, alias_types: list[str]) -> list[tuple[str, str]]: def aliases_for_write(self) -> list[tuple[str, str]]: # Aliases used when writing items to DynamoDB. # - # Exclude rhui aliases (for now? RHELDST-18849). + # Note that these aliases are traversed only in the src => dest + # direction, which intentionally results in a kind of + # canonicalization of paths at publish time. + # + # Example: + # + # Given an alias of: + # src: /content/dist/rhel8/8 + # dest: /content/dist/rhel8/8.8 + # + # If /content/dist/rhel8/8/foo is published, we write + # an item with path /content/dist/rhel8/8.8/foo and DO NOT + # write /content/dist/rhel8/8/foo. + # + # Note also that rhui_alias is not included here. + # It's not needed because, in practice, all of the content + # accessed via those aliases is always *published* on the + # destination side of an alias. + # + # Example: + # + # Given an alias of: + # src: /content/dist/rhel8/rhui + # dest: /content/dist/rhel8 + # + # Content is always published under the /content/dist/rhel8 paths, + # therefore there is no need to resolve the above alias at publish + # time. + # + # It is possible that processing rhui_alias here would not be + # incorrect, but also possible that adding it might have some + # unintended effects. return self._aliases(["origin_alias", "releasever_alias"]) @property def aliases_for_flush(self) -> list[tuple[str, str]]: # Aliases used when flushing cache. - return self._aliases( - ["origin_alias", "releasever_alias", "rhui_alias"] - ) + out = self._aliases(["origin_alias", "releasever_alias", "rhui_alias"]) + + # When calculating paths for cache flush, the aliases should be resolved + # in *both* directions, so also return inverted copies of the aliases. + # + # Example: + # + # Given an alias of: + # src: /content/dist/rhel8/8 + # dest: /content/dist/rhel8/8.8 + # + # - If /content/dist/rhel8/8/foo is published, then + # /content/dist/rhel8/8.8/foo should also have cache flushed + # (src => dest) + # - If /content/dist/rhel8/8.8/foo is published, then + # /content/dist/rhel8/8/foo should also have cache flushed + # (dest => src) + # + # In practice, while it seems technically correct to do this for all aliases, + # this is mainly needed for RHUI content. + # + # Example: + # + # Given an alias of: + # src: /content/dist/rhel8/rhui + # dest: /content/dist/rhel8 + # + # We are always publishing content only on the destination side of + # this alias. If we don't traverse the alias from dest => src then we + # will miss the fact that /content/dist/rhel8/rhui paths should also + # have cache flushed. + out = out + [(dest, src) for (src, dest) in out] + + return out def query_definitions(self) -> dict[str, Any]: """Query the definitions in the config_table. If definitions are found, return them. Otherwise, @@ -123,7 +185,10 @@ def create_request( # updated timestamp. from_date = str(item.updated) - web_uri = uri_alias(item.web_uri, uri_aliases) + # Resolve aliases. We only write to the deepest path + # after all alias resolution, hence only using the + # first result from uri_alias. + web_uri = uri_alias(item.web_uri, uri_aliases)[0] if delete: request[table_name].append( diff --git a/exodus_gw/aws/util.py b/exodus_gw/aws/util.py index d0ffc503..ef0d4816 100644 --- a/exodus_gw/aws/util.py +++ b/exodus_gw/aws/util.py @@ -161,43 +161,109 @@ def get_reader(cls, request): return cls(request) -def uri_alias(uri: str, aliases: list[tuple[str, str]]): +def uri_alias(uri: str, aliases: list[tuple[str, str]]) -> list[str]: # Resolve every alias between paths within the uri (e.g. # allow RHUI paths to be aliased to non-RHUI). # # Aliases are expected to come from cdn-definitions. + # + # Returns an ordered list of URIs, guaranteed to always contain + # at least one element (the original URI). + # URIs are returned for each intermediate step in alias resolution, + # and are ordered by depth, from deepest to most shallow. + # + # For example: + # - if there are no applicable aliases, returns [uri] + # - if there is one matching alias, returns [uri_beyond_alias, uri] + # - if one alias matches, and then beyond that another alias matches + # (meaning two levels deep of alias resolution), returns: + # [uri_beyond_both_aliases, uri_beyond_first_alias, uri] + # + out: list[str] = [uri] + uri_alias_recurse(out, uri, aliases) + return out - new_uri = "" - remaining: list[tuple[str, str]] = aliases - - # We do multiple passes here to ensure that nested aliases - # are resolved correctly, regardless of the order in which - # they're provided. - while remaining: - processed: list[tuple[str, str]] = [] - - for src, dest in remaining: - if uri.startswith(src + "/") or uri == src: - new_uri = uri.replace(src, dest, 1) - LOG.debug( - "Resolved alias:\n\tsrc: %s\n\tdest: %s", - uri, + +def uri_alias_recurse( + accum: list[str], + uri: str, + aliases: list[tuple[str, str]], + depth=0, + maxdepth=4, +): + if depth > maxdepth: + # Runaway recursion breaker. + # + # There is no known path to get here, and if we ever do it's + # probably a bug. + # + # The point of this check is to avoid such a bug + # causing publish to completely break or hang. + LOG.warning( + "Aliases too deeply nested, bailing out at %s (URIs so far: %s)", + uri, + accum, + ) + return + + def add_out(new_uri: str) -> bool: + # Prepends a new URI to the output list + # (or shifts the existing URI to the front if it was already there) + # Return True if the URI was newly added. + out = True + if new_uri in accum: + accum.remove(new_uri) + out = False + accum.insert(0, new_uri) + return out + + for src, dest in aliases: + if uri.startswith(src + "/") or uri == src: + new_uri = uri.replace(src, dest, 1) + LOG.debug( + "Resolved alias:\n\tsrc: %s\n\tdest: %s", + uri, + new_uri, + extra={"event": "publish", "success": True}, + ) + if add_out(new_uri): + # We've calculated a new URI and it's the first + # time we've seen it. We have to recursively apply + # the aliases again to this new URI. + # + # Before doing that, we'll subtract *this* alias from the set, + # meaning that it won't be considered recursively. + # + # We do this because otherwise RHUI aliases are infinitely + # recursive. For example, since: + # + # /content/dist/rhel8/rhui + # => /content/dist/rhel8 + # + # That means this also works: + # /content/dist/rhel8/rhui/rhui + # => /content/dist/rhel8/rhui + # => /content/dist/rhel8 + # + # In fact you could insert as many "rhui" components into the path + # as you want and still ultimately resolve to the same path. + # That's the way the symlinks actually worked on the legacy CDN. + # + # But this is not desired behavior, we know that every alias is intended + # to be resolved in the URL a maximum of once, hence this adjustment. + sub_aliases = [ + (subsrc, subdest) + for (subsrc, subdest) in aliases + if (subsrc, subdest) != (src, dest) + ] + + uri_alias_recurse( + accum, new_uri, - extra={"event": "publish", "success": True}, + sub_aliases, + depth=depth + 1, + maxdepth=maxdepth, ) - uri = new_uri - processed.append((src, dest)) - - if not processed: - # We didn't resolve any alias, then we're done processing. - break - - # We resolved at least one alias, so we need another round - # in case others apply now. But take out anything we've already - # processed, so it is not possible to recurse. - remaining = [r for r in remaining if r not in processed] - - return uri def uris_with_aliases( @@ -211,10 +277,7 @@ def uris_with_aliases( # We accept inputs both with and without leading '/', normalize. uri = "/" + uri.removeprefix("/") - # This always goes into the set we'll process. - out.add(uri) - - # The value after alias resolution also goes into the set. - out.add(uri_alias(uri, aliases)) + for resolved in uri_alias(uri, aliases): + out.add(resolved) return sorted(out) diff --git a/tests/aws/test_uri_alias.py b/tests/aws/test_uri_alias.py index 4c86d8e5..b15ab4b0 100644 --- a/tests/aws/test_uri_alias.py +++ b/tests/aws/test_uri_alias.py @@ -14,14 +14,20 @@ ("/content/origin", "/origin"), ("/origin/rpm", "/origin/rpms"), ], - "/origin/rpms/path/to/file.iso", + [ + "/origin/rpms/path/to/file.iso", + "/content/origin/rpms/path/to/file.iso", + ], ), ( "/content/dist/rhel8/8/path/to/file.rpm", [ ("/content/dist/rhel8/8", "/content/dist/rhel8/8.5"), ], - "/content/dist/rhel8/8.5/path/to/file.rpm", + [ + "/content/dist/rhel8/8.5/path/to/file.rpm", + "/content/dist/rhel8/8/path/to/file.rpm", + ], ), ], ids=["origin", "releasever"], @@ -30,7 +36,104 @@ def test_uri_alias(input, aliases, output, caplog): caplog.set_level(DEBUG, logger="exodus-gw") assert uri_alias(input, aliases) == output assert ( - f'"message": "Resolved alias:\\n\\tsrc: {input}\\n\\tdest: {output}", ' + f'"message": "Resolved alias:\\n\\tsrc: {input}\\n\\tdest: {output[0]}", ' '"event": "publish", ' '"success": true' in caplog.text ) + + +def test_uri_alias_multi_level_write(): + # uri_alias should support resolving aliases multiple levels deep + # and return values in the right order, using single-direction aliases + # (as is typical in the write case) + uri = "/content/other/1/repo" + aliases = [ + # The data here is made up as there is not currently any identified + # realistic scenario having multi-level aliases during write. + ("/content/testproduct/1", "/content/testproduct/1.1.0"), + ("/content/other", "/content/testproduct"), + ] + + out = uri_alias(uri, aliases) + assert out == [ + "/content/testproduct/1.1.0/repo", + "/content/testproduct/1/repo", + "/content/other/1/repo", + ] + + +def test_uri_alias_multi_level_flush(): + # uri_alias should support resolving aliases multiple levels deep + # and return values in the right order, using bi-directional aliases + # (as is typical in the flush case). + # + # This data is realistic for the common case where releasever + # and rhui aliases are both in play. + + uri = "/content/dist/rhel8/8/some-repo/" + aliases = [ + # The caller is providing aliases in both src => dest and + # dest => src directions, as in the "cache flush" case. + ("/content/dist/rhel8/8", "/content/dist/rhel8/8.8"), + ("/content/dist/rhel8/8.8", "/content/dist/rhel8/8"), + ("/content/dist/rhel8/rhui", "/content/dist/rhel8"), + ("/content/dist/rhel8", "/content/dist/rhel8/rhui"), + ] + + out = uri_alias(uri, aliases) + # We don't verify the order here because, with bi-directional aliases + # provided, it does not really make sense to consider either side of + # the alias as "deeper" than the other. + assert sorted(out) == sorted( + [ + # It should return the repo on both sides of the + # releasever alias... + "/content/dist/rhel8/8/some-repo/", + "/content/dist/rhel8/8.8/some-repo/", + # And *also* on both sides of the releasever alias, beyond + # the RHUI alias. + "/content/dist/rhel8/rhui/8/some-repo/", + "/content/dist/rhel8/rhui/8.8/some-repo/", + ] + ) + + +def test_uri_alias_limit(caplog: pytest.LogCaptureFixture): + # uri_alias applies some limit on the alias resolution depth. + # + # This test exists to exercise the path of code intended to + # prevent runaway recursion. There is no known way how to + # actually trigger runaway recursion, so we are just providing + # an unrealistic config with more levels of alias than are + # actually used on production. + + uri = "/path/a/repo" + aliases = [ + ("/path/a", "/path/b"), + ("/path/b", "/path/c"), + ("/path/c", "/path/d"), + ("/path/d", "/path/e"), + ("/path/e", "/path/f"), + ("/path/f", "/path/g"), + ("/path/g", "/path/h"), + ("/path/h", "/path/i"), + ] + + out = uri_alias(uri, aliases) + + # It should have stopped resolving aliases at some point. + # Note that exactly where it stops is rather abitrary, the + # max depth currently is simply hardcoded. + assert out == [ + "/path/f/repo", + "/path/e/repo", + "/path/d/repo", + "/path/c/repo", + "/path/b/repo", + "/path/a/repo", + ] + + # It should have warned us about this. + assert ( + "Aliases too deeply nested, bailing out at /path/f/repo" in caplog.text + ) diff --git a/tests/worker/test_cdn_cache.py b/tests/worker/test_cdn_cache.py index a51c925b..41a67d20 100644 --- a/tests/worker/test_cdn_cache.py +++ b/tests/worker/test_cdn_cache.py @@ -193,7 +193,7 @@ def test_flush_cdn_cache_typical( {"src": "/path/one", "dest": "/path/one-dest"}, ], "rhui_alias": [ - {"src": "/path/two", "dest": "/path/two-dest"}, + {"src": "/path/rhui/two", "dest": "/path/two"}, ], } ) @@ -211,7 +211,7 @@ def test_flush_cdn_cache_typical( # - alias resolution # - treeinfo special case "/path/one/repodata/repomd.xml", - "path/two/listing", + "path/rhui/two/listing", "third/path", "/some/misc/treeinfo", "/some/kickstart/treeinfo", @@ -244,14 +244,14 @@ def test_flush_cdn_cache_typical( # Used the ARL templates. Note the different TTL values # for different paths, and also the paths both before and # after alias resolution are flushed. - "S/=/123/4567/10m/cdn1.example.com/path/two-dest/listing cid=///", + "S/=/123/4567/10m/cdn1.example.com/path/rhui/two/listing cid=///", "S/=/123/4567/10m/cdn1.example.com/path/two/listing cid=///", # note only the kickstart treeinfo appears, the other is filtered. "S/=/123/4567/30d/cdn1.example.com/some/kickstart/treeinfo cid=///", "S/=/123/4567/30d/cdn1.example.com/third/path cid=///", "S/=/123/4567/4h/cdn1.example.com/path/one-dest/repodata/repomd.xml cid=///", "S/=/123/4567/4h/cdn1.example.com/path/one/repodata/repomd.xml cid=///", - "S/=/234/6677/10m/cdn2.example.com/other/path/two-dest/listing x/y/z", + "S/=/234/6677/10m/cdn2.example.com/other/path/rhui/two/listing x/y/z", "S/=/234/6677/10m/cdn2.example.com/other/path/two/listing x/y/z", "S/=/234/6677/30d/cdn2.example.com/other/some/kickstart/treeinfo x/y/z", "S/=/234/6677/30d/cdn2.example.com/other/third/path x/y/z", @@ -260,14 +260,14 @@ def test_flush_cdn_cache_typical( # Used the CDN URL which didn't have a leading path. "https://cdn1.example.com/path/one-dest/repodata/repomd.xml", "https://cdn1.example.com/path/one/repodata/repomd.xml", - "https://cdn1.example.com/path/two-dest/listing", + "https://cdn1.example.com/path/rhui/two/listing", "https://cdn1.example.com/path/two/listing", "https://cdn1.example.com/some/kickstart/treeinfo", "https://cdn1.example.com/third/path", # Used the CDN URL which had a leading path. "https://cdn2.example.com/root/path/one-dest/repodata/repomd.xml", "https://cdn2.example.com/root/path/one/repodata/repomd.xml", - "https://cdn2.example.com/root/path/two-dest/listing", + "https://cdn2.example.com/root/path/rhui/two/listing", "https://cdn2.example.com/root/path/two/listing", "https://cdn2.example.com/root/some/kickstart/treeinfo", "https://cdn2.example.com/root/third/path", diff --git a/tests/worker/test_publish.py b/tests/worker/test_publish.py index 6431d890..76487a2c 100644 --- a/tests/worker/test_publish.py +++ b/tests/worker/test_publish.py @@ -102,11 +102,16 @@ def test_commit( ) assert sorted([(pp.env, pp.web_uri) for pp in published_paths]) == sorted( [ - # Note that both sides of the 1 alias are recorded. + # Note that both sides of the 1 alias are recorded, including + # beyond the RHUI alias. ("test", "/content/testproduct/1/repo/"), ("test", "/content/testproduct/1.1.0/repo/"), ("test", "/content/testproduct/1/repo/repomd.xml"), ("test", "/content/testproduct/1.1.0/repo/repomd.xml"), + ("test", "/content/testproduct/rhui/1/repo/"), + ("test", "/content/testproduct/rhui/1/repo/repomd.xml"), + ("test", "/content/testproduct/rhui/1.1.0/repo/"), + ("test", "/content/testproduct/rhui/1.1.0/repo/repomd.xml"), ] )