Replace current_master for current_primary #78

Open · wants to merge 8 commits into base: main
.github/workflows/ci.yaml · 2 changes: 1 addition & 1 deletion
@@ -109,4 +109,4 @@ jobs:
           provider: microk8s
           bootstrap-options: "--agent-version 2.9.29"
       - name: Run integration tests
-        run: tox -e integration-scaling
\ No newline at end of file
+        run: tox -e integration-scaling
lib/charms/redis_k8s/v0/redis.py · 8 changes: 4 additions & 4 deletions
@@ -132,7 +132,7 @@ def __init__(self, charm, port):

Review comment (Contributor): LIBPATCH on the lib will need updating to 6

     def _on_relation_changed(self, event):
         """Handle the relation changed event."""
-        event.relation.data[self.model.unit]["hostname"] = self._get_master_ip()
+        event.relation.data[self.model.unit]["hostname"] = self._get_primary_ip()
         event.relation.data[self.model.unit]["port"] = str(self._port)
         # The reactive Redis charm also exposes 'password'. When tackling
         # https://github.com/canonical/redis-k8s/issues/7 add 'password'
@@ -146,6 +146,6 @@ def _bind_address(self, event):
             return address
         return self.app.name

-    def _get_master_ip(self) -> str:
-        """Gets the ip of the current redis master."""
-        return socket.gethostbyname(self._charm.current_master)
+    def _get_primary_ip(self) -> str:
+        """Gets the ip of the current redis primary process."""
+        return socket.gethostbyname(self._charm.current_primary)
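Following up the review comment above: charm libraries declare their version through the LIBAPI/LIBPATCH module constants, and a revision like this rename needs a patch bump before the library is republished. A minimal sketch of the expected change, assuming the library currently declares LIBPATCH = 5 (implied by the reviewer asking for 6):

# lib/charms/redis_k8s/v0/redis.py (hypothetical excerpt; values assumed)
LIBAPI = 0    # major API version of the redis_k8s library, unchanged by a rename
LIBPATCH = 6  # assumed bump from 5, so charms fetching the lib pick up the new revision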
src/charm.py · 74 changes: 37 additions & 37 deletions
@@ -88,20 +88,20 @@ def _redis_pebble_ready(self, event) -> None:
         # In the event of a pod restart on the same node the upgrade event is not fired.
         # The IP might change, so the data needs to be propagated
         for relation in self.model.relations[REDIS_REL_NAME]:
-            relation.data[self.model.unit]["hostname"] = socket.gethostbyname(self.current_master)
+            relation.data[self.model.unit]["hostname"] = socket.gethostbyname(self.current_primary)

     def _upgrade_charm(self, event: UpgradeCharmEvent) -> None:
         """Handle the upgrade_charm event.

         Check for failover status and update connection information for redis relation and
-        current_master.
+        current_primary.
         Also tries to store the certificates on the redis container, as new `juju attach-resource`
         will trigger this event.
         """
         self._store_certificates()

         # NOTE: This is the case of a single unit deployment. If that's the case, the charm
-        # doesn't need to check for failovers or figure out who the master is.
+        # doesn't need to check for failovers or figure out who the primary is.
         if not self._peers.units:
             # NOTE: pod restart or charm upgrade can come along with pod IP changes, and
             # during those process, the leader-elected and any relation events are not emitted.
@@ -126,9 +126,9 @@ def _upgrade_charm(self, event: UpgradeCharmEvent) -> None:
         # unit is not a leader, add a key to the application databag so peer_relation_changed
         # triggers for the leader unit and application databag is updated.
         if self.unit.is_leader():
-            info = self.sentinel.get_master_info(host=k8s_host)
-            logger.debug(f"Master info: {info}")
-            logger.info(f"Unit {self.unit.name} updating master info to {info['ip']}")
+            info = self.sentinel.get_primary_info(host=k8s_host)
+            logger.debug(f"Primary info: {info}")
+            logger.info(f"Unit {self.unit.name} updating primary info to {info['ip']}")
             self._peers.data[self.app][LEADER_HOST_KEY] = info["ip"]
         else:
             relations = self.model.relations[REDIS_REL_NAME]
@@ -150,17 +150,17 @@ def _leader_elected(self, event) -> None:
         if not self.get_sentinel_password():
             logger.info("Creating sentinel password")
             self._peers.data[self.app][SENTINEL_PASSWORD_KEY] = self._generate_password()
-        # NOTE: if current_master is not set yet, the application is being deployed for the
+        # NOTE: if current_primary is not set yet, the application is being deployed for the
         # first time. Otherwise, we check for failover in case previous juju leader was redis
-        # master as well.
-        if self.current_master is None:
+        # primary as well.
+        if self.current_primary is None:
             logger.info(
                 "Initial replication, setting leader-host to {}".format(self.unit_pod_hostname)
             )
             self._peers.data[self.app][LEADER_HOST_KEY] = self.unit_pod_hostname
         else:
             # TODO extract to method shared with relation_departed
-            self._update_application_master()
+            self._update_application_primary()
             self._update_quorum()
             if not self._is_failover_finished():
                 logger.info("Failover didn't finish, deferring")
@@ -199,21 +199,21 @@ def _update_status(self, _) -> None:
         """
         logger.info("Beginning update_status")
         if self.unit.is_leader():
-            self._update_application_master()
+            self._update_application_primary()
             self._redis_check()

     def _peer_relation_changed(self, event):
         """Handle relation for joining units."""
-        if not self._master_up_to_date():
-            logger.error(f"Unit {self.unit.name} doesn't agree on tracked master")
+        if not self._primary_up_to_date():
+            logger.error(f"Unit {self.unit.name} doesn't agree on tracked primary")
             if not self._is_failover_finished():
                 logger.info("Failover didn't finish, deferring")
                 event.defer()
                 return

         if self.unit.is_leader():
-            # Update who the current master is
-            self._update_application_master()
+            # Update who the current primary is
+            self._update_application_primary()

             # (DEPRECATE) If legacy relation exists, layer might need to be
             # reconfigured to remove auth
@@ -224,7 +224,7 @@ def _peer_relation_changed(self, event):
         if relations:
             for relation in relations:
                 relation.data[self.model.unit]["hostname"] = socket.gethostbyname(
-                    self.current_master
+                    self.current_primary
                 )
         if self._peers.data[self.unit].get("upgrading", "false") == "true":
             self._peers.data[self.unit]["upgrading"] = ""
@@ -247,8 +247,8 @@ def _peer_relation_departed(self, event):
         if not self.unit.is_leader():
             return

-        if not self._master_up_to_date():
-            self._update_application_master()
+        if not self._primary_up_to_date():
+            self._update_application_primary()

         # Quorum is updated beforehand, since removal of more units than current majority
         # could lead to the cluster never reaching quorum.
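A worked example of the quorum comment above, with invented numbers and assuming a simple majority quorum (the charm's actual formula lives in `expected_quorum`, not shown in this diff):

# Sketch of the majority arithmetic the comment describes.
units_before, departing = 5, 3
old_quorum = units_before // 2 + 1      # 3, assuming majority quorum
remaining = units_before - departing    # 2 sentinels left
print(remaining >= old_quorum)          # False: the old quorum of 3 is unreachable
print(remaining >= remaining // 2 + 1)  # True once quorum is recomputed to 2 first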
@@ -376,9 +376,9 @@ def _redis_extra_flags(self) -> str:
                 f"--tls-ca-cert-file {self._storage_path}/ca.crt",
             ]

-        # Check that current unit is master
-        if self.current_master != self.unit_pod_hostname:
-            extra_flags += [f"--replicaof {self.current_master} {REDIS_PORT}"]
+        # Check that current unit is primary
+        if self.current_primary != self.unit_pod_hostname:
+            extra_flags += [f"--replicaof {self.current_primary} {REDIS_PORT}"]

         if self.config["enable-tls"]:
             extra_flags += ["--tls-replication yes"]
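To make the branch above concrete, here is a standalone sketch of how the primary/replica decision maps to redis-server flags. The hostnames are invented for illustration; REDIS_PORT mirrors the charm's constant:

# Illustrative only: mirrors the decision in _redis_extra_flags above.
REDIS_PORT = 6379
current_primary = "redis-k8s-0.redis-k8s-endpoints"    # hypothetical value from the peer databag
unit_pod_hostname = "redis-k8s-1.redis-k8s-endpoints"  # hypothetical hostname of this unit

extra_flags = []
if current_primary != unit_pod_hostname:
    # Not the tracked primary: start Redis as a replica of it.
    extra_flags += [f"--replicaof {current_primary} {REDIS_PORT}"]

print(extra_flags)  # ['--replicaof redis-k8s-0.redis-k8s-endpoints 6379']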
@@ -455,8 +455,8 @@ def unit_pod_hostname(self, name="") -> str:
         return socket.getfqdn(name)

     @property
-    def current_master(self) -> Optional[str]:
-        """Get the current master."""
+    def current_primary(self) -> Optional[str]:
+        """Get the current primary."""
         return self._peers.data[self.app].get(LEADER_HOST_KEY)

     def _valid_app_databag(self) -> bool:
@@ -472,7 +472,7 @@ def _valid_app_databag(self) -> bool:
         if self._peers.data[self.app].get("enable-password", "true") == "false":
             password = True

-        return bool(password and self.current_master)
+        return bool(password and self.current_primary)

     def _generate_password(self) -> str:
         """Generate a random 16 character password string.
@@ -574,38 +574,38 @@ def _redis_client(self, hostname="localhost") -> Redis:
         finally:
             client.close()

-    def _master_up_to_date(self, host="0.0.0.0") -> bool:
-        """Check if stored master is the same as sentinel tracked.
+    def _primary_up_to_date(self, host="0.0.0.0") -> bool:
+        """Check if stored primary is the same as sentinel tracked.

         Returns:
             host: string to connect to sentinel
         """
-        info = self.sentinel.get_master_info(host=host)
+        info = self.sentinel.get_primary_info(host=host)
         if info is None:
             return False
-        elif (info["ip"] == self.current_master) and ("s_down" not in info["flags"]):
+        elif (info["ip"] == self.current_primary) and ("s_down" not in info["flags"]):
             return True

         return False

-    def _update_application_master(self) -> None:
-        """Use Sentinel to update the current master hostname."""
-        info = self.sentinel.get_master_info()
-        logger.debug(f"Master info: {info}")
+    def _update_application_primary(self) -> None:
+        """Use Sentinel to update the current primary hostname."""
+        info = self.sentinel.get_primary_info()
+        logger.debug(f"Primary info: {info}")
         if info is None:
-            logger.warning("Could not update current master")
+            logger.warning("Could not update current primary")
             return

-        logger.info(f"Unit {self.unit.name} updating master info to {info['ip']}")
+        logger.info(f"Unit {self.unit.name} updating primary info to {info['ip']}")
         self._peers.data[self.app][LEADER_HOST_KEY] = info["ip"]

     def _sentinel_failover(self, departing_unit_name: str) -> None:
-        """Try to failover the current master.
+        """Try to failover the current primary.

         This method should only be called from juju leader, to avoid more than one
         sentinel sending failovers concurrently.
         """
-        if self._k8s_hostname(departing_unit_name) != self.current_master:
+        if self._k8s_hostname(departing_unit_name) != self.current_primary:
             # No failover needed
             return
@@ -628,7 +628,7 @@ def _is_failover_finished(self, host="localhost") -> bool:
             True if failover is finished, false otherwise
         """
         logger.warning("Checking if failover is finished.")
-        info = self.sentinel.get_master_info(host=host)
+        info = self.sentinel.get_primary_info(host=host)
         if info is None:
             logger.warning("Could not check failover status")
             return False
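A hedged sketch of the `_primary_up_to_date` check shown above, fed with a hand-written Sentinel reply (real values come from `get_primary_info`; the flags field can also carry values like `s_down` or `o_down` when Sentinel suspects the primary is down):

# Invented reply; the real dict is built by get_primary_info in src/sentinel.py.
info = {"ip": "redis-k8s-0.redis-k8s-endpoints", "flags": "master"}
current_primary = "redis-k8s-0.redis-k8s-endpoints"  # hypothetical stored hostname

up_to_date = info["ip"] == current_primary and "s_down" not in info["flags"]
print(up_to_date)  # True; a mismatched ip or an s_down flag would make this False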
src/sentinel.py · 18 changes: 9 additions & 9 deletions
@@ -108,12 +108,12 @@ def _render_sentinel_config_file(self) -> None:
         # render the template file with the correct values.
         rendered = template.render(
             hostname=self.charm.unit_pod_hostname,
-            master_name=self.charm._name,
+            primary_name=self.charm._name,
             sentinel_port=SENTINEL_PORT,
-            redis_master=self.charm.current_master,
+            redis_primary=self.charm.current_primary,
             redis_port=REDIS_PORT,
             quorum=self.expected_quorum,
-            master_password=self.charm._get_password(),
+            primary_password=self.charm._get_password(),
             sentinel_password=self.charm.get_sentinel_password(),
         )
         self._copy_file(SENTINEL_CONFIG_PATH, rendered, "sentinel")
@@ -147,17 +147,17 @@ def _copy_file(self, path: str, rendered: str, container: str) -> None:
             group="redis",
         )

-    def get_master_info(self, host="0.0.0.0") -> Optional[dict]:
-        """Connect to sentinel and return the current master."""
+    def get_primary_info(self, host="0.0.0.0") -> Optional[dict]:
+        """Connect to sentinel and return the current primary."""
         with self.sentinel_client(host) as sentinel:
             try:
-                # get sentinel info about the master
-                master_info = sentinel.execute_command(f"SENTINEL MASTER {self.charm._name}")
+                # get sentinel info about the primary
+                primary_info = sentinel.execute_command(f"SENTINEL MASTER {self.charm._name}")

-                # NOTE: master info from redis comes like a list:
+                # NOTE: primary info from redis comes like a list:
                 # ['key1', 'value1', 'key2', 'value2', ...]
                 # this creates a dictionary in a more readable form.
-                return dict(zip(master_info[::2], master_info[1::2]))
+                return dict(zip(primary_info[::2], primary_info[1::2]))

             except (ConnectionError, TimeoutError) as e:
                 logger.error("Error when connecting to sentinel: {}".format(e))
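The dict(zip(...)) line above is worth unpacking: redis-py returns the SENTINEL MASTER reply as a flat list of alternating keys and values, so pairing the even-indexed slice with the odd-indexed slice rebuilds the mapping. A small sketch with an invented subset of the real reply fields:

# Sketch of the reply parsing in get_primary_info, with an invented reply.
raw = ["name", "redis-k8s", "ip", "10.1.33.7", "port", "6379", "flags", "master"]
info = dict(zip(raw[::2], raw[1::2]))  # keys at even indices, values at odd ones
print(info["ip"], info["flags"])       # 10.1.33.7 master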
templates/sentinel.conf.j2 · 12 changes: 6 additions & 6 deletions
@@ -1,11 +1,11 @@
 port {{ sentinel_port }}
-sentinel monitor {{ master_name }} {{ redis_master }} {{ redis_port }} {{ quorum }}
-sentinel down-after-milliseconds {{ master_name }} 5000
-sentinel failover-timeout {{ master_name }} 30000
-sentinel parallel-syncs {{ master_name }} 1
+sentinel monitor {{ primary_name }} {{ redis_primary }} {{ redis_port }} {{ quorum }}
+sentinel down-after-milliseconds {{ primary_name }} 5000
+sentinel failover-timeout {{ primary_name }} 30000
+sentinel parallel-syncs {{ primary_name }} 1

-{% if master_password != None %}
-sentinel auth-pass {{ master_name }} {{ master_password }}
+{% if primary_password != None %}
+sentinel auth-pass {{ primary_name }} {{ primary_password }}
 {% endif %}
 requirepass {{ sentinel_password }}

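For context on how this template is consumed: `_render_sentinel_config_file` (in the src/sentinel.py diff above) renders it with Jinja2 and copies the result into the sentinel container. A minimal sketch with invented values, assuming it runs from the repository root:

# Sketch: rendering the template above with invented values.
from jinja2 import Template

with open("templates/sentinel.conf.j2") as f:
    template = Template(f.read())

print(
    template.render(
        hostname="redis-k8s-0.redis-k8s-endpoints",       # unit_pod_hostname in the charm
        sentinel_port=26379,
        primary_name="redis-k8s",
        redis_primary="redis-k8s-0.redis-k8s-endpoints",  # hypothetical current primary
        redis_port=6379,
        quorum=2,
        primary_password="redis-secret",       # invented for the example
        sentinel_password="sentinel-secret",   # invented for the example
    )
)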
tests/integration/test_scaling.py · 32 changes: 16 additions & 16 deletions
@@ -50,7 +50,7 @@ async def test_build_and_deploy(ops_test: OpsTest):
     assert ops_test.model.applications[APP_NAME].units[0].workload_status == "active"


-@pytest.mark.run(before="test_scale_down_departing_master")
+@pytest.mark.run(before="test_scale_down_departing_primary")
 async def test_scale_up_replication_after_failover(ops_test: OpsTest):
     """Trigger a failover and scale up the application, then test replication status."""
     unit_map = await get_unit_map(ops_test)
@@ -60,15 +60,15 @@ async def test_scale_up_replication_after_failover(ops_test: OpsTest):
     leader_address = await get_address(ops_test, unit_num=leader_num)
     password = await get_password(ops_test, leader_num)

-    # Set some key on the master replica.
+    # Set some key on the primary.
     leader_client = Redis(leader_address, password=password)
     leader_client.set("testKey", "myValue")
     leader_client.close()

     sentinel_password = await get_sentinel_password(ops_test)
     logger.info("retrieved sentinel password for %s: %s", APP_NAME, password)

-    # Trigger a master failover
+    # Trigger a primary failover
     sentinel = Redis(leader_address, password=sentinel_password, port=26379, decode_responses=True)
     sentinel.execute_command(f"SENTINEL failover {APP_NAME}")
     # Give time so sentinel updates information of failover
@@ -94,13 +94,13 @@ async def test_scale_up_replication_after_failover(ops_test: OpsTest):
         timeout=1000,
     )

-    master_info = sentinel.execute_command(f"SENTINEL MASTER {APP_NAME}")
-    master_info = dict(zip(master_info[::2], master_info[1::2]))
+    primary_info = sentinel.execute_command(f"SENTINEL MASTER {APP_NAME}")
+    primary_info = dict(zip(primary_info[::2], primary_info[1::2]))

     # General checks that the system is aware of the new unit
-    assert master_info["num-slaves"] == "3"
-    assert master_info["quorum"] == "3"
-    assert master_info["num-other-sentinels"] == "3"
+    assert primary_info["num-slaves"] == "3"
+    assert primary_info["quorum"] == "3"
+    assert primary_info["num-other-sentinels"] == "3"

     unit_map = await get_unit_map(ops_test)
     # Check that the initial key is still replicated across units
@@ -112,7 +112,7 @@ async def test_scale_up_replication_after_failover(ops_test: OpsTest):


 @pytest.mark.run(after="test_scale_up_replication_after_failover")
-async def test_scale_down_departing_master(ops_test: OpsTest):
+async def test_scale_down_departing_primary(ops_test: OpsTest):
     """Failover to the last unit and scale down."""
     unit_map = await get_unit_map(ops_test)
     logger.info("Unit mapping: {}".format(unit_map))
@@ -130,13 +130,13 @@ async def test_scale_down_departing_master(ops_test: OpsTest):
     last_redis = Redis(last_address, password=password, decode_responses=True)

     # INITIAL SETUP #
-    # Sanity check that the added unit on the previous test is not a master
+    # Sanity check that the added unit on the previous test is not a primary
     assert last_redis.execute_command("ROLE")[0] != "master"

     # Make the added unit a priority during failover
     last_redis.execute_command("CONFIG SET replica-priority 1")
     time.sleep(1)
-    # Failover so the last unit becomes master
+    # Failover so the last unit becomes primary
     sentinel.execute_command(f"SENTINEL FAILOVER {APP_NAME}")
     # Give time so sentinel updates information of failover
     time.sleep(60)
@@ -158,12 +158,12 @@ async def test_scale_down_departing_master(ops_test: OpsTest):
         assert client.get("testKey") == b"myValue"
         client.close()

-    master_info = sentinel.execute_command(f"SENTINEL MASTER {APP_NAME}")
-    master_info = dict(zip(master_info[::2], master_info[1::2]))
+    primary_info = sentinel.execute_command(f"SENTINEL MASTER {APP_NAME}")
+    primary_info = dict(zip(primary_info[::2], primary_info[1::2]))

     # General checks that the system is reconfigured after departed leader
-    assert master_info["num-slaves"] == "2"
-    assert master_info["quorum"] == "2"
-    assert master_info["num-other-sentinels"] == "2"
+    assert primary_info["num-slaves"] == "2"
+    assert primary_info["quorum"] == "2"
+    assert primary_info["num-other-sentinels"] == "2"

     sentinel.close()