Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix rhui_alias resolution during cache flush [RHELDST-26143] #733

Merged
merged 1 commit into from
Aug 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 70 additions & 5 deletions exodus_gw/aws/dynamodb.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,15 +60,77 @@ def _aliases(self, alias_types: list[str]) -> list[tuple[str, str]]:
def aliases_for_write(self) -> list[tuple[str, str]]:
    """Aliases used when writing items to DynamoDB.

    Only origin and releasever aliases are applied here, and only in
    the src => dest direction, which intentionally results in a kind
    of canonicalization of paths at publish time.

    Example: given an alias of

        src:  /content/dist/rhel8/8
        dest: /content/dist/rhel8/8.8

    if /content/dist/rhel8/8/foo is published, we write an item with
    path /content/dist/rhel8/8.8/foo and DO NOT write
    /content/dist/rhel8/8/foo.

    Note also that rhui_alias is not included here.  It's not needed
    because, in practice, all of the content accessed via those
    aliases is always *published* on the destination side of an
    alias.

    Example: given an alias of

        src:  /content/dist/rhel8/rhui
        dest: /content/dist/rhel8

    content is always published under the /content/dist/rhel8 paths,
    therefore there is no need to resolve the above alias at publish
    time.  It is possible that processing rhui_alias here would not
    be incorrect, but also possible that adding it might have some
    unintended effects.
    """
    wanted_types = ["origin_alias", "releasever_alias"]
    return self._aliases(wanted_types)

@property
def aliases_for_flush(self) -> list[tuple[str, str]]:
    """Aliases used when flushing cache.

    Returns the configured origin, releasever and rhui aliases, plus
    an inverted (dest => src) copy of each, because when calculating
    paths for cache flush the aliases should be resolved in *both*
    directions.

    Example: given an alias of

        src:  /content/dist/rhel8/8
        dest: /content/dist/rhel8/8.8

    - If /content/dist/rhel8/8/foo is published, then
      /content/dist/rhel8/8.8/foo should also have cache flushed
      (src => dest)
    - If /content/dist/rhel8/8.8/foo is published, then
      /content/dist/rhel8/8/foo should also have cache flushed
      (dest => src)

    In practice, while it seems technically correct to do this for
    all aliases, this is mainly needed for RHUI content.

    Example: given an alias of

        src:  /content/dist/rhel8/rhui
        dest: /content/dist/rhel8

    we are always publishing content only on the destination side of
    this alias.  If we don't traverse the alias from dest => src then
    we will miss the fact that /content/dist/rhel8/rhui paths should
    also have cache flushed.
    """
    out = self._aliases(["origin_alias", "releasever_alias", "rhui_alias"])

    # Add an inverted copy of every alias so that callers traverse
    # them in both directions.
    out = out + [(dest, src) for (src, dest) in out]

    return out

def query_definitions(self) -> dict[str, Any]:
"""Query the definitions in the config_table. If definitions are found, return them. Otherwise,
Expand Down Expand Up @@ -123,7 +185,10 @@ def create_request(
# updated timestamp.
from_date = str(item.updated)

web_uri = uri_alias(item.web_uri, uri_aliases)
# Resolve aliases. We only write to the deepest path
# after all alias resolution, hence only using the
# first result from uri_alias.
web_uri = uri_alias(item.web_uri, uri_aliases)[0]

if delete:
request[table_name].append(
Expand Down
133 changes: 98 additions & 35 deletions exodus_gw/aws/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,43 +161,109 @@ def get_reader(cls, request):
return cls(request)


def uri_alias(uri: str, aliases: list[tuple[str, str]]) -> list[str]:
    """Resolve every alias between paths within the uri (e.g. allow
    RHUI paths to be aliased to non-RHUI).

    Aliases are expected to come from cdn-definitions.

    Returns an ordered list of URIs, guaranteed to always contain
    at least one element (the original URI).
    URIs are returned for each intermediate step in alias resolution,
    and are ordered by depth, from deepest to most shallow.

    For example:
    - if there are no applicable aliases, returns [uri]
    - if there is one matching alias, returns [uri_beyond_alias, uri]
    - if one alias matches, and then beyond that another alias matches
      (meaning two levels deep of alias resolution), returns:
      [uri_beyond_both_aliases, uri_beyond_first_alias, uri]
    """
    out: list[str] = [uri]
    uri_alias_recurse(out, uri, aliases)
    return out


def uri_alias_recurse(
    accum: list[str],
    uri: str,
    aliases: list[tuple[str, str]],
    depth: int = 0,
    maxdepth: int = 4,
):
    # Recursive worker for uri_alias: prepends onto accum every URI
    # reachable from uri by applying the given aliases, each alias at
    # most once per resolution chain.
    if depth > maxdepth:
        # Runaway recursion breaker.
        #
        # There is no known path to get here, and if we ever do it's
        # probably a bug.
        #
        # The point of this check is to avoid such a bug
        # causing publish to completely break or hang.
        LOG.warning(
            "Aliases too deeply nested, bailing out at %s (URIs so far: %s)",
            uri,
            accum,
        )
        return

    def add_out(new_uri: str) -> bool:
        # Prepends a new URI to the output list
        # (or shifts the existing URI to the front if it was already there)
        # Return True if the URI was newly added.
        out = True
        if new_uri in accum:
            accum.remove(new_uri)
            out = False
        accum.insert(0, new_uri)
        return out

    for src, dest in aliases:
        if uri.startswith(src + "/") or uri == src:
            new_uri = uri.replace(src, dest, 1)
            LOG.debug(
                "Resolved alias:\n\tsrc: %s\n\tdest: %s",
                uri,
                new_uri,
                extra={"event": "publish", "success": True},
            )
            if add_out(new_uri):
                # We've calculated a new URI and it's the first
                # time we've seen it. We have to recursively apply
                # the aliases again to this new URI.
                #
                # Before doing that, we'll subtract *this* alias from the set,
                # meaning that it won't be considered recursively.
                #
                # We do this because otherwise RHUI aliases are infinitely
                # recursive. For example, since:
                #
                #   /content/dist/rhel8/rhui
                #     => /content/dist/rhel8
                #
                # That means this also works:
                #   /content/dist/rhel8/rhui/rhui
                #     => /content/dist/rhel8/rhui
                #     => /content/dist/rhel8
                #
                # In fact you could insert as many "rhui" components into the
                # path as you want and still ultimately resolve to the same
                # path.  That's the way the symlinks actually worked on the
                # legacy CDN.
                #
                # But this is not desired behavior; we know that every alias
                # is intended to be resolved in the URL a maximum of once,
                # hence this adjustment.
                sub_aliases = [
                    (subsrc, subdest)
                    for (subsrc, subdest) in aliases
                    if (subsrc, subdest) != (src, dest)
                ]

                uri_alias_recurse(
                    accum,
                    new_uri,
                    sub_aliases,
                    depth=depth + 1,
                    maxdepth=maxdepth,
                )


def uris_with_aliases(
Expand All @@ -211,10 +277,7 @@ def uris_with_aliases(
# We accept inputs both with and without leading '/', normalize.
uri = "/" + uri.removeprefix("/")

# This always goes into the set we'll process.
out.add(uri)

# The value after alias resolution also goes into the set.
out.add(uri_alias(uri, aliases))
for resolved in uri_alias(uri, aliases):
out.add(resolved)

return sorted(out)
109 changes: 106 additions & 3 deletions tests/aws/test_uri_alias.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,20 @@
("/content/origin", "/origin"),
("/origin/rpm", "/origin/rpms"),
],
"/origin/rpms/path/to/file.iso",
[
"/origin/rpms/path/to/file.iso",
"/content/origin/rpms/path/to/file.iso",
],
),
(
"/content/dist/rhel8/8/path/to/file.rpm",
[
("/content/dist/rhel8/8", "/content/dist/rhel8/8.5"),
],
"/content/dist/rhel8/8.5/path/to/file.rpm",
[
"/content/dist/rhel8/8.5/path/to/file.rpm",
"/content/dist/rhel8/8/path/to/file.rpm",
],
),
],
ids=["origin", "releasever"],
Expand All @@ -30,7 +36,104 @@ def test_uri_alias(input, aliases, output, caplog):
caplog.set_level(DEBUG, logger="exodus-gw")
assert uri_alias(input, aliases) == output
assert (
f'"message": "Resolved alias:\\n\\tsrc: {input}\\n\\tdest: {output}", '
f'"message": "Resolved alias:\\n\\tsrc: {input}\\n\\tdest: {output[0]}", '
'"event": "publish", '
'"success": true' in caplog.text
)


def test_uri_alias_multi_level_write():
    # uri_alias should support resolving aliases multiple levels deep
    # and return values in the right order, using single-direction
    # aliases (as is typical in the write case).

    input_uri = "/content/other/1/repo"

    # The alias data is made up, as there is not currently any
    # identified realistic scenario having multi-level aliases
    # during write.
    alias_pairs = [
        ("/content/testproduct/1", "/content/testproduct/1.1.0"),
        ("/content/other", "/content/testproduct"),
    ]

    expected = [
        "/content/testproduct/1.1.0/repo",
        "/content/testproduct/1/repo",
        "/content/other/1/repo",
    ]
    assert uri_alias(input_uri, alias_pairs) == expected


def test_uri_alias_multi_level_flush():
    # uri_alias should support resolving aliases multiple levels deep
    # and return values in the right order, using bi-directional aliases
    # (as is typical in the flush case).
    #
    # This data is realistic for the common case where releasever
    # and rhui aliases are both in play.

    input_uri = "/content/dist/rhel8/8/some-repo/"

    # The caller provides aliases in both src => dest and dest => src
    # directions, as in the "cache flush" case.
    releasever = ("/content/dist/rhel8/8", "/content/dist/rhel8/8.8")
    rhui = ("/content/dist/rhel8/rhui", "/content/dist/rhel8")
    both_directions = [
        releasever,
        (releasever[1], releasever[0]),
        rhui,
        (rhui[1], rhui[0]),
    ]

    resolved = uri_alias(input_uri, both_directions)

    # The order is not verified because, with bi-directional aliases
    # provided, it does not really make sense to consider either side
    # of the alias as "deeper" than the other.
    expected = [
        # It should return the repo on both sides of the
        # releasever alias...
        "/content/dist/rhel8/8/some-repo/",
        "/content/dist/rhel8/8.8/some-repo/",
        # And *also* on both sides of the releasever alias, beyond
        # the RHUI alias.
        "/content/dist/rhel8/rhui/8/some-repo/",
        "/content/dist/rhel8/rhui/8.8/some-repo/",
    ]
    assert sorted(resolved) == sorted(expected)


def test_uri_alias_limit(caplog: pytest.LogCaptureFixture):
    # uri_alias applies some limit on the alias resolution depth.
    #
    # This test exists to exercise the path of code intended to
    # prevent runaway recursion. There is no known way how to
    # actually trigger runaway recursion, so we are just providing
    # an unrealistic config with more levels of alias than are
    # actually used on production.

    start_uri = "/path/a/repo"

    # A chain of aliases: /path/a => /path/b => ... => /path/i.
    hops = "abcdefghi"
    chained_aliases = [
        (f"/path/{lhs}", f"/path/{rhs}") for lhs, rhs in zip(hops, hops[1:])
    ]

    resolved = uri_alias(start_uri, chained_aliases)

    # It should have stopped resolving aliases at some point.
    # Note that exactly where it stops is rather arbitrary, the
    # max depth currently is simply hardcoded.
    assert resolved == [f"/path/{c}/repo" for c in "fedcba"]

    # It should have warned us about this.
    assert (
        "Aliases too deeply nested, bailing out at /path/f/repo" in caplog.text
    )
Loading