Fix rhui_alias resolution during cache flush [RHELDST-26143]

It was intended to take RHUI aliases into account during cache flush, but this was not implemented correctly. The main issue was that alias resolution was only implemented in the direction of (src => dest), but in the case of RHUI, we are always publishing content on the destination (non-RHUI) side of the alias. Hence we need to also consider the aliases in the (dest => src) direction to figure out that the src (RHUI) side of the alias is relevant for cache flush. A second issue was that, if multiple aliases were involved in a path, we would not flush cache for all of them. This couldn't work because the uri_alias function was designed to return only a single value, after resolution of all aliases. When multiple aliases are in play, content is actually accessible at all the paths calculated during each step of the resolution, so we need to keep track of intermediate paths and flush cache for all of them. This now behaves correctly in the typical case of both $releasever and RHUI aliases being in play, e.g. for /content/dist/rhel9/9/foo, it will now correctly calculate the 4 typical paths under which content is accessible: - /content/dist/rhel9/9/foo - /content/dist/rhel9/9.4/foo - /content/dist/rhel9/rhui/9/foo - /content/dist/rhel9/rhui/9.4/foo
release-engineering · Aug 13, 2024 · 2cec920 · 2cec920
1 parent e3fde79
commit 2cec920
Show file tree

Hide file tree

Showing 5 changed files with 286 additions and 50 deletions.
diff --git a/exodus_gw/aws/dynamodb.py b/exodus_gw/aws/dynamodb.py
@@ -60,15 +60,77 @@ def _aliases(self, alias_types: list[str]) -> list[tuple[str, str]]:
     def aliases_for_write(self) -> list[tuple[str, str]]:
         # Aliases used when writing items to DynamoDB.
         #
-        # Exclude rhui aliases (for now? RHELDST-18849).
+        # Note that these aliases are traversed only in the src => dest
+        # direction, which intentionally results in a kind of
+        # canonicalization of paths at publish time.
+        #
+        # Example:
+        #
+        #   Given an alias of:
+        #     src:  /content/dist/rhel8/8
+        #     dest: /content/dist/rhel8/8.8
+        #
+        # If /content/dist/rhel8/8/foo is published, we write
+        # an item with path /content/dist/rhel8/8.8/foo and DO NOT
+        # write /content/dist/rhel8/8/foo.
+        #
+        # Note also that rhui_alias is not included here.
+        # It's not needed because, in practice, all of the content
+        # accessed via those aliases is always *published* on the
+        # destination side of an alias.
+        #
+        # Example:
+        #
+        #   Given an alias of:
+        #     src:  /content/dist/rhel8/rhui
+        #     dest: /content/dist/rhel8
+        #
+        # Content is always published under the /content/dist/rhel8 paths,
+        # therefore there is no need to resolve the above alias at publish
+        # time.
+        #
+        # It is possible that processing rhui_alias here would not be
+        # incorrect, but also possible that adding it might have some
+        # unintended effects.
         return self._aliases(["origin_alias", "releasever_alias"])
 
     @property
     def aliases_for_flush(self) -> list[tuple[str, str]]:
         # Aliases used when flushing cache.
-        return self._aliases(
-            ["origin_alias", "releasever_alias", "rhui_alias"]
-        )
+        out = self._aliases(["origin_alias", "releasever_alias", "rhui_alias"])
+
+        # When calculating paths for cache flush, the aliases should be resolved
+        # in *both* directions, so also return inverted copies of the aliases.
+        #
+        # Example:
+        #
+        #   Given an alias of:
+        #     src:  /content/dist/rhel8/8
+        #     dest: /content/dist/rhel8/8.8
+        #
+        #   - If /content/dist/rhel8/8/foo is published, then
+        #     /content/dist/rhel8/8.8/foo should also have cache flushed
+        #     (src => dest)
+        #   - If /content/dist/rhel8/8.8/foo is published, then
+        #     /content/dist/rhel8/8/foo should also have cache flushed
+        #     (dest => src)
+        #
+        # In practice, while it seems technically correct to do this for all aliases,
+        # this is mainly needed for RHUI content.
+        #
+        # Example:
+        #
+        #   Given an alias of:
+        #     src:  /content/dist/rhel8/rhui
+        #     dest: /content/dist/rhel8
+        #
+        # We are always publishing content only on the destination side of
+        # this alias. If we don't traverse the alias from dest => src then we
+        # will miss the fact that /content/dist/rhel8/rhui paths should also
+        # have cache flushed.
+        out = out + [(dest, src) for (src, dest) in out]
+
+        return out
 
     def query_definitions(self) -> dict[str, Any]:
         """Query the definitions in the config_table. If definitions are found, return them. Otherwise,
@@ -123,7 +185,10 @@ def create_request(
             # updated timestamp.
             from_date = str(item.updated)
 
-            web_uri = uri_alias(item.web_uri, uri_aliases)
+            # Resolve aliases. We only write to the deepest path
+            # after all alias resolution, hence only using the
+            # first result from uri_alias.
+            web_uri = uri_alias(item.web_uri, uri_aliases)[0]
 
             if delete:
                 request[table_name].append(

diff --git a/exodus_gw/aws/util.py b/exodus_gw/aws/util.py
@@ -161,43 +161,109 @@ def get_reader(cls, request):
         return cls(request)
 
 
-def uri_alias(uri: str, aliases: list[tuple[str, str]]):
+def uri_alias(uri: str, aliases: list[tuple[str, str]]) -> list[str]:
     # Resolve every alias between paths within the uri (e.g.
     # allow RHUI paths to be aliased to non-RHUI).
     #
     # Aliases are expected to come from cdn-definitions.
+    #
+    # Returns an ordered list of URIs, guaranteed to always contain
+    # at least one element (the original URI).
+    # URIs are returned for each intermediate step in alias resolution,
+    # and are ordered by depth, from deepest to most shallow.
+    #
+    # For example:
+    #  - if there are no applicable aliases, returns [uri]
+    #  - if there is one matching alias, returns [uri_beyond_alias, uri]
+    #  - if one alias matches, and then beyond that another alias matches
+    #    (meaning two levels deep of alias resolution), returns:
+    #    [uri_beyond_both_aliases, uri_beyond_first_alias, uri]
+    #
+    out: list[str] = [uri]
+    uri_alias_recurse(out, uri, aliases)
+    return out
 
-    new_uri = ""
-    remaining: list[tuple[str, str]] = aliases
-
-    # We do multiple passes here to ensure that nested aliases
-    # are resolved correctly, regardless of the order in which
-    # they're provided.
-    while remaining:
-        processed: list[tuple[str, str]] = []
-
-        for src, dest in remaining:
-            if uri.startswith(src + "/") or uri == src:
-                new_uri = uri.replace(src, dest, 1)
-                LOG.debug(
-                    "Resolved alias:\n\tsrc: %s\n\tdest: %s",
-                    uri,
+
+def uri_alias_recurse(
+    accum: list[str],
+    uri: str,
+    aliases: list[tuple[str, str]],
+    depth=0,
+    maxdepth=4,
+):
+    if depth > maxdepth:
+        # Runaway recursion breaker.
+        #
+        # There is no known path to get here, and if we ever do it's
+        # probably a bug.
+        #
+        # The point of this check is to avoid such a bug
+        # causing publish to completely break or hang.
+        LOG.warning(
+            "Aliases too deeply nested, bailing out at %s (URIs so far: %s)",
+            uri,
+            accum,
+        )
+        return
+
+    def add_out(new_uri: str) -> bool:
+        # Prepends a new URI to the output list
+        # (or shifts the existing URI to the front if it was already there)
+        # Return True if the URI was newly added.
+        out = True
+        if new_uri in accum:
+            accum.remove(new_uri)
+            out = False
+        accum.insert(0, new_uri)
+        return out
+
+    for src, dest in aliases:
+        if uri.startswith(src + "/") or uri == src:
+            new_uri = uri.replace(src, dest, 1)
+            LOG.debug(
+                "Resolved alias:\n\tsrc: %s\n\tdest: %s",
+                uri,
+                new_uri,
+                extra={"event": "publish", "success": True},
+            )
+            if add_out(new_uri):
+                # We've calculated a new URI and it's the first
+                # time we've seen it. We have to recursively apply
+                # the aliases again to this new URI.
+                #
+                # Before doing that, we'll subtract *this* alias from the set,
+                # meaning that it won't be considered recursively.
+                #
+                # We do this because otherwise RHUI aliases are infinitely
+                # recursive. For example, since:
+                #
+                #   /content/dist/rhel8/rhui
+                #     => /content/dist/rhel8
+                #
+                # That means this also works:
+                #   /content/dist/rhel8/rhui/rhui
+                #     => /content/dist/rhel8/rhui
+                #       => /content/dist/rhel8
+                #
+                # In fact you could insert as many "rhui" components into the path
+                # as you want and still ultimately resolve to the same path.
+                # That's the way the symlinks actually worked on the legacy CDN.
+                #
+                # But this is not desired behavior, we know that every alias is intended
+                # to be resolved in the URL a maximum of once, hence this adjustment.
+                sub_aliases = [
+                    (subsrc, subdest)
+                    for (subsrc, subdest) in aliases
+                    if (subsrc, subdest) != (src, dest)
+                ]
+
+                uri_alias_recurse(
+                    accum,
                     new_uri,
-                    extra={"event": "publish", "success": True},
+                    sub_aliases,
+                    depth=depth + 1,
+                    maxdepth=maxdepth,
                 )
-                uri = new_uri
-                processed.append((src, dest))
-
-        if not processed:
-            # We didn't resolve any alias, then we're done processing.
-            break
-
-        # We resolved at least one alias, so we need another round
-        # in case others apply now. But take out anything we've already
-        # processed, so it is not possible to recurse.
-        remaining = [r for r in remaining if r not in processed]
-
-    return uri
 
 
 def uris_with_aliases(
@@ -211,10 +277,7 @@ def uris_with_aliases(
         # We accept inputs both with and without leading '/', normalize.
         uri = "/" + uri.removeprefix("/")
 
-        # This always goes into the set we'll process.
-        out.add(uri)
-
-        # The value after alias resolution also goes into the set.
-        out.add(uri_alias(uri, aliases))
+        for resolved in uri_alias(uri, aliases):
+            out.add(resolved)
 
     return sorted(out)
diff --git a/tests/aws/test_uri_alias.py b/tests/aws/test_uri_alias.py
@@ -14,14 +14,20 @@
                 ("/content/origin", "/origin"),
                 ("/origin/rpm", "/origin/rpms"),
             ],
-            "/origin/rpms/path/to/file.iso",
+            [
+                "/origin/rpms/path/to/file.iso",
+                "/content/origin/rpms/path/to/file.iso",
+            ],
         ),
         (
             "/content/dist/rhel8/8/path/to/file.rpm",
             [
                 ("/content/dist/rhel8/8", "/content/dist/rhel8/8.5"),
             ],
-            "/content/dist/rhel8/8.5/path/to/file.rpm",
+            [
+                "/content/dist/rhel8/8.5/path/to/file.rpm",
+                "/content/dist/rhel8/8/path/to/file.rpm",
+            ],
         ),
     ],
     ids=["origin", "releasever"],
@@ -30,7 +36,104 @@ def test_uri_alias(input, aliases, output, caplog):
     caplog.set_level(DEBUG, logger="exodus-gw")
     assert uri_alias(input, aliases) == output
     assert (
-        f'"message": "Resolved alias:\\n\\tsrc: {input}\\n\\tdest: {output}", '
+        f'"message": "Resolved alias:\\n\\tsrc: {input}\\n\\tdest: {output[0]}", '
         '"event": "publish", '
         '"success": true' in caplog.text
     )
+
+
+def test_uri_alias_multi_level_write():
+    # uri_alias should support resolving aliases multiple levels deep
+    # and return values in the right order, using single-direction aliases
+    # (as is typical in the write case)
+    uri = "/content/other/1/repo"
+    aliases = [
+        # The data here is made up as there is not currently any identified
+        # realistic scenario having multi-level aliases during write.
+        ("/content/testproduct/1", "/content/testproduct/1.1.0"),
+        ("/content/other", "/content/testproduct"),
+    ]
+
+    out = uri_alias(uri, aliases)
+    assert out == [
+        "/content/testproduct/1.1.0/repo",
+        "/content/testproduct/1/repo",
+        "/content/other/1/repo",
+    ]
+
+
+def test_uri_alias_multi_level_flush():
+    # uri_alias should support resolving aliases multiple levels deep
+    # and return values in the right order, using bi-directional aliases
+    # (as is typical in the flush case).
+    #
+    # This data is realistic for the common case where releasever
+    # and rhui aliases are both in play.
+
+    uri = "/content/dist/rhel8/8/some-repo/"
+    aliases = [
+        # The caller is providing aliases in both src => dest and
+        # dest => src directions, as in the "cache flush" case.
+        ("/content/dist/rhel8/8", "/content/dist/rhel8/8.8"),
+        ("/content/dist/rhel8/8.8", "/content/dist/rhel8/8"),
+        ("/content/dist/rhel8/rhui", "/content/dist/rhel8"),
+        ("/content/dist/rhel8", "/content/dist/rhel8/rhui"),
+    ]
+
+    out = uri_alias(uri, aliases)
+    # We don't verify the order here because, with bi-directional aliases
+    # provided, it does not really make sense to consider either side of
+    # the alias as "deeper" than the other.
+    assert sorted(out) == sorted(
+        [
+            # It should return the repo on both sides of the
+            # releasever alias...
+            "/content/dist/rhel8/8/some-repo/",
+            "/content/dist/rhel8/8.8/some-repo/",
+            # And *also* on both sides of the releasever alias, beyond
+            # the RHUI alias.
+            "/content/dist/rhel8/rhui/8/some-repo/",
+            "/content/dist/rhel8/rhui/8.8/some-repo/",
+        ]
+    )
+
+
+def test_uri_alias_limit(caplog: pytest.LogCaptureFixture):
+    # uri_alias applies some limit on the alias resolution depth.
+    #
+    # This test exists to exercise the path of code intended to
+    # prevent runaway recursion. There is no known way how to
+    # actually trigger runaway recursion, so we are just providing
+    # an unrealistic config with more levels of alias than are
+    # actually used on production.
+
+    uri = "/path/a/repo"
+    aliases = [
+        ("/path/a", "/path/b"),
+        ("/path/b", "/path/c"),
+        ("/path/c", "/path/d"),
+        ("/path/d", "/path/e"),
+        ("/path/e", "/path/f"),
+        ("/path/f", "/path/g"),
+        ("/path/g", "/path/h"),
+        ("/path/h", "/path/i"),
+    ]
+
+    out = uri_alias(uri, aliases)
+
+    # It should have stopped resolving aliases at some point.
+    # Note that exactly where it stops is rather abitrary, the
+    # max depth currently is simply hardcoded.
+    assert out == [
+        "/path/f/repo",
+        "/path/e/repo",
+        "/path/d/repo",
+        "/path/c/repo",
+        "/path/b/repo",
+        "/path/a/repo",
+    ]
+
+    # It should have warned us about this.
+    assert (
+        "Aliases too deeply nested, bailing out at /path/f/repo" in caplog.text
+    )