From 7614a186c32d930b2c1c7087ae611d5af2697ae3 Mon Sep 17 00:00:00 2001 From: Smail KOURTA Date: Tue, 24 Sep 2024 09:54:01 +0400 Subject: [PATCH] [DPE-4359] TLS ca/certificate rotation without downtime (#447) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Issue We currently make use of PEM files for the: - CA - Certificates (admin, unit-http/unit-transport) - private keys This is fine in all cases but the case where the CA is rotated, in which case downtime will be incurred as soon as the first unit gets a new CA and restarts. We need to switch from individual certificate resources to: - java keystores: for private keys, admin/unit-http-transport certificates - java trust stores: for CA We need to implement the whole 2-stages rolling restart routine for this case. ## Solution ### Implementation: Ensure that: 1. TLS certificates rotation on expiration works as expected without downtime 2. TLS CA certificates rotation on expiration works as expected without downtime 3. TLS certificates rotation works as expected after a CA rotation, without downtime 4. For both simple and large deployments ### Testing: - unit tests - integration tests coverage for the 4 previous cases ### Documentation: Document on charmhub the whole TLS rotation workflow for both deployment modes: - simple - large --------- Co-authored-by: René Radoi Co-authored-by: Mehdi Bendriss Co-authored-by: Judit Novak --- lib/charms/opensearch/v0/constants_charm.py | 1 + .../opensearch/v0/helper_conf_setter.py | 34 + .../opensearch/v0/opensearch_base_charm.py | 111 +- lib/charms/opensearch/v0/opensearch_config.py | 22 +- lib/charms/opensearch/v0/opensearch_distro.py | 3 +- .../v0/opensearch_relation_peer_cluster.py | 6 + .../opensearch/v0/opensearch_secrets.py | 8 +- lib/charms/opensearch/v0/opensearch_tls.py | 288 +++- tests/integration/plugins/test_plugins.py | 4 +- tests/integration/tls/test_ca_rotation.py | 101 ++ tests/unit/helpers.py | 60 +- tests/unit/lib/test_opensearch_base_charm.py | 7 + tests/unit/lib/test_opensearch_config.py | 4 - tests/unit/lib/test_opensearch_distro.py | 10 +- tests/unit/lib/test_opensearch_secrets.py | 3 +- tests/unit/lib/test_opensearch_tls.py | 1347 ++++++++++++++++- tests/unit/resources/config/opensearch.yml | 7 +- tests/unit/test_charm.py | 2 +- 18 files changed, 1903 insertions(+), 115 deletions(-) create mode 100644 tests/integration/tls/test_ca_rotation.py diff --git a/lib/charms/opensearch/v0/constants_charm.py b/lib/charms/opensearch/v0/constants_charm.py index 73f9d0c8e..b24aebdc2 100644 --- a/lib/charms/opensearch/v0/constants_charm.py +++ b/lib/charms/opensearch/v0/constants_charm.py @@ -80,6 +80,7 @@ SecurityIndexInitProgress = "Initializing the security index..." AdminUserInitProgress = "Configuring admin user..." TLSNewCertsRequested = "Requesting new TLS certificates..." +TLSCaRotation = "Applying new CA certificate..." HorizontalScaleUpSuggest = "Horizontal scale up advised: {} shards unassigned." WaitingForOtherUnitServiceOps = "Waiting for other units to complete the ops on their service." NewIndexRequested = "new index {index} requested" diff --git a/lib/charms/opensearch/v0/helper_conf_setter.py b/lib/charms/opensearch/v0/helper_conf_setter.py index 099f96811..be2f55fc8 100755 --- a/lib/charms/opensearch/v0/helper_conf_setter.py +++ b/lib/charms/opensearch/v0/helper_conf_setter.py @@ -146,6 +146,20 @@ def replace( """ pass + @abstractmethod + def append( + self, + config_file: str, + text_to_append: str, + ) -> None: + """Append any string to a text file. + + Args: + config_file (str): Path to the source config file + text_to_append (str): The str to append to the config file + """ + pass + @staticmethod def __clean_base_path(base_path: str): if base_path is None: @@ -283,6 +297,26 @@ def replace( with open(output_file, "w") as g: g.write(data) + @override + def append( + self, + config_file: str, + text_to_append: str, + ) -> None: + """Append any string to a text file. + + Args: + config_file (str): Path to the source config file + text_to_append (str): The str to append to the config file + """ + path = f"{self.base_path}{config_file}" + + if not exists(path): + raise FileNotFoundError(f"{path} not found.") + + with open(path, "a") as f: + f.write("\n" + text_to_append) + def __dump(self, data: Dict[str, any], output_type: OutputType, target_file: str): """Write the YAML data on the corresponding "output_type" stream.""" if not data: diff --git a/lib/charms/opensearch/v0/opensearch_base_charm.py b/lib/charms/opensearch/v0/opensearch_base_charm.py index 9a030f638..fbb62aa78 100644 --- a/lib/charms/opensearch/v0/opensearch_base_charm.py +++ b/lib/charms/opensearch/v0/opensearch_base_charm.py @@ -33,13 +33,14 @@ ServiceIsStopping, ServiceStartError, ServiceStopped, + TLSCaRotation, TLSNewCertsRequested, TLSNotFullyConfigured, TLSRelationBrokenError, TLSRelationMissing, WaitingToStart, ) -from charms.opensearch.v0.constants_tls import TLS_RELATION, CertType +from charms.opensearch.v0.constants_tls import CertType from charms.opensearch.v0.helper_charm import Status, all_units, format_unit_name from charms.opensearch.v0.helper_cluster import ClusterTopology, Node from charms.opensearch.v0.helper_networking import get_host_ip, units_ips @@ -189,7 +190,7 @@ def __init__(self, *args, distro: Type[OpenSearchDistribution] = None): self.peers_data = RelationDataStore(self, PeerRelationName) self.secrets = OpenSearchSecrets(self, PeerRelationName) self.tls = OpenSearchTLS( - self, TLS_RELATION, self.opensearch.paths.jdk, self.opensearch.paths.certs + self, PeerRelationName, self.opensearch.paths.jdk, self.opensearch.paths.certs ) self.status = Status(self) self.health = OpenSearchHealth(self) @@ -448,9 +449,6 @@ def _on_peer_relation_created(self, event: RelationCreatedEvent): "Adding units during an upgrade is not supported. The charm may be in a broken, unrecoverable state" ) - # Store the "Admin" certificate, key and CA on the disk of the new unit - self.tls.store_admin_tls_secrets_if_applies() - def _on_peer_relation_joined(self, event: RelationJoinedEvent): """Event received by all units when a new node joins the cluster.""" if self.upgrade_in_progress: @@ -460,8 +458,6 @@ def _on_peer_relation_joined(self, event: RelationJoinedEvent): def _on_peer_relation_changed(self, event: RelationChangedEvent): """Handle peer relation changes.""" - self.tls.store_admin_tls_secrets_if_applies() - if self.unit.is_leader() and self.opensearch.is_node_up(): health = self.health.apply() if self._is_peer_rel_changed_deferred: @@ -576,7 +572,7 @@ def _on_opensearch_data_storage_detaching(self, _: StorageDetachingEvent): # no # release lock self.node_lock.release() - def _on_update_status(self, event: UpdateStatusEvent): + def _on_update_status(self, event: UpdateStatusEvent): # noqa: C901 """On update status event. We want to periodically check for the following: @@ -758,6 +754,11 @@ def _on_get_password_action(self, event: ActionEvent): } ) + def on_tls_ca_rotation(self): + """Called when adding new CA to the trust store.""" + self.status.set(MaintenanceStatus(TLSCaRotation)) + self._restart_opensearch_event.emit() + def on_tls_conf_set( self, event: CertificateAvailableEvent, scope: Scope, cert_type: CertType, renewal: bool ): @@ -790,12 +791,25 @@ def on_tls_conf_set( self.tls.store_admin_tls_secrets_if_applies() # In case of renewal of the unit transport layer cert - restart opensearch - if renewal and self.is_admin_user_configured() and self.tls.is_fully_configured(): - try: - self.tls.reload_tls_certificates() - except OpenSearchHttpError: - logger.error("Could not reload TLS certificates via API, will restart.") - self._restart_opensearch_event.emit() + if renewal and self.is_admin_user_configured(): + if self.tls.is_fully_configured(): + try: + self.tls.reload_tls_certificates() + except OpenSearchHttpError: + logger.error("Could not reload TLS certificates via API, will restart.") + self._restart_opensearch_event.emit() + else: + # the chain.pem file should only be updated after applying the new certs + # otherwise there could be TLS verification errors after renewing the CA + self.tls.update_request_ca_bundle() + self.status.clear(TLSNotFullyConfigured) + self.tls.reset_ca_rotation_state() + # cleaning the former CA certificate from the truststore + # must only be done AFTER all renewed certificates are available and loaded + self.tls.remove_old_ca() + else: + event.defer() + return def on_tls_relation_broken(self, _: RelationBrokenEvent): """As long as all certificates are produced, we don't do anything.""" @@ -827,6 +841,18 @@ def is_every_unit_marked_as_started(self) -> bool: except OpenSearchHttpError: return False + def is_tls_full_configured_in_cluster(self) -> bool: + """Check if TLS is configured in all the units of the current cluster.""" + rel = self.model.get_relation(PeerRelationName) + for unit in all_units(self): + if ( + rel.data[unit].get("tls_configured") != "True" + or "tls_ca_renewing" in rel.data[unit] + or "tls_ca_renewed" in rel.data[unit] + ): + return False + return True + def is_admin_user_configured(self) -> bool: """Check if admin user configured.""" # In case the initialisation of the admin user is not finished yet @@ -895,18 +921,23 @@ def _start_opensearch(self, event: _StartOpenSearch) -> None: # noqa: C901 self.peers_data.delete(Scope.UNIT, "started") - if not self.node_lock.acquired: - # (Attempt to acquire lock even if `event.ignore_lock`) - if event.ignore_lock: - # Only used for force upgrades - logger.debug("Starting without lock") - else: - logger.debug("Lock to start opensearch not acquired. Will retry next event") - event.defer() - return + if event.ignore_lock: + # Only used for force upgrades + logger.debug("Starting without lock") + elif not self.node_lock.acquired: + logger.debug("Lock to start opensearch not acquired. Will retry next event") + event.defer() + return if not self._can_service_start(): + # after rotating the CA and certificates: + # the last host in the cluster to restart might not be able to connect to the other + # hosts anymore, because it is the last to renew the pem-file for requests + # in this case we update the pem-file to be able to connect and start the host + if self.peers_data.get(Scope.UNIT, "tls_ca_renewed", False): + self.tls.update_request_ca_bundle() self.node_lock.release() + logger.info("Could not start opensearch service. Will retry next event.") event.defer() return @@ -939,6 +970,11 @@ def _start_opensearch(self, event: _StartOpenSearch) -> None: # noqa: C901 self.unit.status = BlockedStatus(str(e)) return + # we should update the chain.pem file to avoid TLS verification errors + # this happens on restarts after applying a new admin cert on CA rotation + if self.peers_data.get(Scope.UNIT, "tls_ca_renewed", False): + self.tls.update_request_ca_bundle() + try: self.opensearch.start( wait_until_http_200=( @@ -947,7 +983,11 @@ def _start_opensearch(self, event: _StartOpenSearch) -> None: # noqa: C901 ) ) self._post_start_init(event) - except (OpenSearchHttpError, OpenSearchStartTimeoutError, OpenSearchNotFullyReadyError): + except ( + OpenSearchHttpError, + OpenSearchStartTimeoutError, + OpenSearchNotFullyReadyError, + ) as e: self.node_lock.release() # In large deployments with cluster-manager-only-nodes, the startup might fail # for the cluster-manager if a joining data node did not yet initialize the @@ -955,6 +995,7 @@ def _start_opensearch(self, event: _StartOpenSearch) -> None: # noqa: C901 if self.opensearch_peer_cm.is_provider(typ="main"): self.peer_cluster_provider.refresh_relation_data(event, can_defer=False) event.defer() + logger.warning(e) except (OpenSearchStartError, OpenSearchUserMgmtError) as e: logger.warning(e) self.node_lock.release() @@ -992,7 +1033,7 @@ def _post_start_init(self, event: _StartOpenSearch): # noqa: C901 try: nodes = self._get_nodes(use_localhost=self.opensearch.is_node_up()) except OpenSearchHttpError: - logger.debug("Failed to get online nodes") + logger.info("Failed to get online nodes") event.defer() return @@ -1043,6 +1084,7 @@ def _post_start_init(self, event: _StartOpenSearch): # noqa: C901 # clear waiting to start status self.status.clear(WaitingToStart) + self.status.clear(ServiceStartError) self.status.clear(PClusterNoDataNode) if event.after_upgrade: @@ -1100,6 +1142,23 @@ def _post_start_init(self, event: _StartOpenSearch): # noqa: C901 if self.opensearch_peer_cm.is_provider(): self.peer_cluster_provider.refresh_relation_data(event, can_defer=False) + # update the peer relation data for TLS CA rotation routine + self.tls.reset_ca_rotation_state() + if self.is_tls_full_configured_in_cluster(): + self.status.clear(TLSCaRotation) + self.status.clear(TLSNotFullyConfigured) + + # request new certificates after rotating the CA + if self.peers_data.get(Scope.UNIT, "tls_ca_renewing", False) and self.peers_data.get( + Scope.UNIT, "tls_ca_renewed", False + ): + self.status.set(MaintenanceStatus(TLSNotFullyConfigured)) + self.tls.request_new_unit_certificates() + if self.unit.is_leader(): + self.tls.request_new_admin_certificate() + else: + self.tls.store_admin_tls_secrets_if_applies() + def _stop_opensearch(self, *, restart=False) -> None: """Stop OpenSearch if possible.""" self.status.set(WaitingStatus(ServiceIsStopping)) @@ -1144,7 +1203,9 @@ def _restart_opensearch(self, event: _RestartOpenSearch) -> None: try: self._stop_opensearch(restart=True) + logger.info("Restarting OpenSearch.") except OpenSearchStopError as e: + logger.info(f"Error while Restarting Opensearch: {e}") logger.exception(e) self.node_lock.release() event.defer() diff --git a/lib/charms/opensearch/v0/opensearch_config.py b/lib/charms/opensearch/v0/opensearch_config.py index 896705932..a9ffa2d89 100644 --- a/lib/charms/opensearch/v0/opensearch_config.py +++ b/lib/charms/opensearch/v0/opensearch_config.py @@ -64,6 +64,11 @@ def set_client_auth(self): True, ) + self._opensearch.config.append( + self.JVM_OPTIONS, + "-Djdk.tls.client.protocols=TLSv1.2", + ) + def set_admin_tls_conf(self, secrets: Dict[str, any]): """Configures the admin certificate.""" self._opensearch.config.put( @@ -89,12 +94,11 @@ def set_node_tls_conf(self, cert_type: CertType, truststore_pwd: str, keystore_p f"{self._opensearch.paths.certs_relative}/{cert if cert == 'ca' else cert_type}.p12", ) - for store_type, certificate_type in [("keystore", cert_type.val), ("truststore", "ca")]: - self._opensearch.config.put( - self.CONFIG_YML, - f"plugins.security.ssl.{target_conf_layer}.{store_type}_alias", - certificate_type, - ) + self._opensearch.config.put( + self.CONFIG_YML, + f"plugins.security.ssl.{target_conf_layer}.keystore_alias", + cert_type.val, + ) for store_type, pwd in [("keystore", keystore_pwd), ("truststore", truststore_pwd)]: self._opensearch.config.put( @@ -103,6 +107,12 @@ def set_node_tls_conf(self, cert_type: CertType, truststore_pwd: str, keystore_p pwd, ) + self._opensearch.config.put( + self.CONFIG_YML, + f"plugins.security.ssl.{target_conf_layer}.enabled_protocols", + "TLSv1.2", + ) + def append_transport_node(self, ip_pattern_entries: List[str], append: bool = True): """Set the IP address of the new unit in nodes_dn.""" if not append: diff --git a/lib/charms/opensearch/v0/opensearch_distro.py b/lib/charms/opensearch/v0/opensearch_distro.py index 3e0b3105d..7eb4e79e3 100644 --- a/lib/charms/opensearch/v0/opensearch_distro.py +++ b/lib/charms/opensearch/v0/opensearch_distro.py @@ -192,7 +192,8 @@ def is_node_up(self, host: Optional[str] = None) -> bool: timeout=1, ) return resp_code < 400 - except (OpenSearchHttpError, Exception): + except (OpenSearchHttpError, Exception) as e: + logger.debug(f"Error when checking if host {host} is up: {e}") return False def run_bin(self, bin_script_name: str, args: str = None, stdin: str = None) -> str: diff --git a/lib/charms/opensearch/v0/opensearch_relation_peer_cluster.py b/lib/charms/opensearch/v0/opensearch_relation_peer_cluster.py index d77941128..39193d694 100644 --- a/lib/charms/opensearch/v0/opensearch_relation_peer_cluster.py +++ b/lib/charms/opensearch/v0/opensearch_relation_peer_cluster.py @@ -634,6 +634,7 @@ def _set_security_conf(self, data: PeerClusterRelData) -> None: # store the app admin TLS resources if not stored self.charm.tls.store_new_tls_resources(CertType.APP_ADMIN, data.credentials.admin_tls) + self.charm.tls.update_request_ca_bundle() # take over the internal users from the main orchestrator self.charm.user_manager.put_internal_user(AdminUser, data.credentials.admin_password_hash) @@ -916,6 +917,11 @@ def _error_set_from_tls(self, peer_cluster_rel_data: PeerClusterRelData) -> bool blocked_msg = "CA certificate mismatch between clusters." should_sever_relation = True + if not peer_cluster_rel_data.credentials.admin_tls["truststore-password"]: + logger.info("Relation data for TLS is missing.") + blocked_msg = "CA truststore-password not available." + should_sever_relation = True + if not blocked_msg: self._clear_errors("error_from_tls") return False diff --git a/lib/charms/opensearch/v0/opensearch_secrets.py b/lib/charms/opensearch/v0/opensearch_secrets.py index eca9da76a..c36e7b9f6 100644 --- a/lib/charms/opensearch/v0/opensearch_secrets.py +++ b/lib/charms/opensearch/v0/opensearch_secrets.py @@ -114,13 +114,7 @@ def _on_secret_changed(self, event: SecretChangedEvent): # noqa: C901 logger.debug("Secret change for %s", str(label_key)) - # Leader has to maintain TLS and Dashboards relation credentials - if not is_leader and label_key == CertType.APP_ADMIN.val: - self._charm.tls.store_new_tls_resources(CertType.APP_ADMIN, event.secret.get_content()) - if self._charm.tls.is_fully_configured(): - self._charm.peers_data.put(Scope.UNIT, "tls_configured", True) - - elif is_leader and label_key == self._charm.secrets.password_key(KibanaserverUser): + if is_leader and label_key == self._charm.secrets.password_key(KibanaserverUser): self._charm.opensearch_provider.update_dashboards_password() # Non-leader units need to maintain local users in internal_users.yml diff --git a/lib/charms/opensearch/v0/opensearch_tls.py b/lib/charms/opensearch/v0/opensearch_tls.py index 43981e411..a4f585f95 100644 --- a/lib/charms/opensearch/v0/opensearch_tls.py +++ b/lib/charms/opensearch/v0/opensearch_tls.py @@ -30,6 +30,7 @@ from charms.opensearch.v0.helper_security import generate_password from charms.opensearch.v0.models import DeploymentType from charms.opensearch.v0.opensearch_exceptions import ( + OpenSearchCmdError, OpenSearchError, OpenSearchHttpError, ) @@ -122,8 +123,13 @@ def request_new_admin_certificate(self) -> None: """Request the generation of a new admin certificate.""" if not self.charm.unit.is_leader(): return - - self._request_certificate(Scope.APP, CertType.APP_ADMIN) + admin_secrets = self.charm.secrets.get_object(Scope.APP, CertType.APP_ADMIN.val) or {} + self._request_certificate( + Scope.APP, + CertType.APP_ADMIN, + admin_secrets.get("key"), + admin_secrets.get("key-password"), + ) def request_new_unit_certificates(self) -> None: """Requests a new certificate with the given scope and type from the tls operator.""" @@ -150,9 +156,7 @@ def _on_tls_relation_created(self, event: RelationCreatedEvent) -> None: if not (deployment_desc := self.charm.opensearch_peer_cm.deployment_desc()): event.defer() return - - admin_secrets = self.charm.secrets.get_object(Scope.APP, CertType.APP_ADMIN.val) or {} - + admin_cert = self.charm.secrets.get_object(Scope.APP, CertType.APP_ADMIN.val) or {} if self.charm.unit.is_leader() and deployment_desc.typ == DeploymentType.MAIN_ORCHESTRATOR: # create passwords for both ca trust_store/admin key_store self._create_keystore_pwd_if_not_exists(Scope.APP, CertType.APP_ADMIN, "ca") @@ -160,9 +164,8 @@ def _on_tls_relation_created(self, event: RelationCreatedEvent) -> None: Scope.APP, CertType.APP_ADMIN, CertType.APP_ADMIN.val ) - if not admin_secrets: - self._request_certificate(Scope.APP, CertType.APP_ADMIN) - elif not admin_secrets.get("truststore-password"): + self._request_certificate(Scope.APP, CertType.APP_ADMIN) + elif not admin_cert.get("truststore-password"): logger.debug("Truststore-password from main-orchestrator not available yet.") event.defer() return @@ -186,7 +189,7 @@ def _on_tls_relation_broken(self, event: RelationBrokenEvent) -> None: ) self.charm.on_tls_relation_broken(event) - def _on_certificate_available(self, event: CertificateAvailableEvent) -> None: + def _on_certificate_available(self, event: CertificateAvailableEvent) -> None: # noqa: C901 """Enable TLS when TLS certificate available. CertificateAvailableEvents fire whenever a new certificate is created by the TLS charm. @@ -202,9 +205,11 @@ def _on_certificate_available(self, event: CertificateAvailableEvent) -> None: if not self.charm.unit.is_leader() and scope == Scope.APP: return - old_cert = secrets.get("cert", None) - renewal = old_cert is not None and old_cert != event.certificate + if self.is_ca_rotation_ongoing(): + event.defer() + return + old_cert = secrets.get("cert", None) ca_chain = "\n".join(event.chain[::-1]) self.charm.secrets.put_object( @@ -218,25 +223,52 @@ def _on_certificate_available(self, event: CertificateAvailableEvent) -> None: merge=True, ) - # currently only make sure there is a CA - # TODO: workflow for replacement will be added later - if self._read_stored_ca() is None: - self.store_new_ca(self.charm.secrets.get_object(scope, cert_type.val)) + current_stored_ca = self._read_stored_ca() + if current_stored_ca != event.ca: + if not self.store_new_ca(self.charm.secrets.get_object(scope, cert_type.val)): + logger.debug("Could not store new CA certificate.") + event.defer() + return + # replacing the current CA initiates a rolling restart and certificate renewal + # the workflow is the following: + # get new CA -> set tls_ca_renewing -> restart -> post_start_init -> set tls_ca_renewed + # -> request new certs -> get new certs -> on_tls_conf_set + # -> delete both tls_ca_renewing and tls_ca_renewed + if current_stored_ca: + self.charm.peers_data.put(Scope.UNIT, "tls_ca_renewing", True) + self.charm.on_tls_ca_rotation() + return # store the certificates and keys in a key store self.store_new_tls_resources( cert_type, self.charm.secrets.get_object(scope, cert_type.val) ) + # apply the chain.pem file for API requests, only if the CA cert has not been updated + admin_secrets = self.charm.secrets.get_object(Scope.APP, CertType.APP_ADMIN.val) or {} + if admin_secrets.get("chain") and not self._read_stored_ca(alias="old-ca"): + self.update_request_ca_bundle() + # store the admin certificates in non-leader units + # if admin cert not available we need to defer, otherwise it will never be stored if not self.charm.unit.is_leader(): - if self.all_certificates_available(): - admin_secrets = self.charm.secrets.get_object(Scope.APP, CertType.APP_ADMIN.val) + if admin_secrets.get("cert"): self.store_new_tls_resources(CertType.APP_ADMIN, admin_secrets) + else: + event.defer() + return for relation in self.charm.opensearch_provider.relations: self.charm.opensearch_provider.update_certs(relation.id, ca_chain) + # broadcast secret updates for certs and CA to related sub-clusters + if self.charm.unit.is_leader() and self.charm.opensearch_peer_cm.is_provider(typ="main"): + self.charm.peer_cluster_provider.refresh_relation_data(event, can_defer=False) + + renewal = self._read_stored_ca(alias="old-ca") is not None or ( + old_cert is not None and old_cert != event.certificate + ) + try: self.charm.on_tls_conf_set(event, scope, cert_type, renewal) except OpenSearchError as e: @@ -457,37 +489,67 @@ def _create_keystore_pwd_if_not_exists(self, scope: Scope, cert_type: CertType, merge=True, ) - def store_new_ca(self, secrets: Dict[str, Any]): + def store_new_ca(self, secrets: Dict[str, Any]) -> bool: # noqa: C901 """Add new CA cert to trust store.""" keytool = f"sudo {self.jdk_path}/bin/keytool" - admin_secrets = self.charm.secrets.get_object(Scope.APP, CertType.APP_ADMIN.val) or {} - if self.charm.unit.is_leader(): + if not (deployment_desc := self.charm.opensearch_peer_cm.deployment_desc()): + return False + + if self.charm.unit.is_leader() and deployment_desc.typ == DeploymentType.MAIN_ORCHESTRATOR: self._create_keystore_pwd_if_not_exists(Scope.APP, CertType.APP_ADMIN, "ca") + admin_secrets = self.charm.secrets.get_object(Scope.APP, CertType.APP_ADMIN.val) or {} + if not ((secrets or {}).get("ca-cert") and admin_secrets.get("truststore-password")): - logging.error("CA cert not found, quitting.") - return + logging.error("CA cert or truststore-password not found, quitting.") + return False alias = "ca" store_path = f"{self.certs_path}/{alias}.p12" - with tempfile.NamedTemporaryFile(mode="w+t") as ca_tmp_file: - ca_tmp_file.write(secrets.get("ca-cert")) - ca_tmp_file.flush() - + try: run_cmd( - f"""{keytool} -importcert \ - -trustcacerts \ - -noprompt \ + f"""{keytool} -changealias \ -alias {alias} \ + -destalias old-{alias} \ -keystore {store_path} \ - -file {ca_tmp_file.name} \ -storetype PKCS12 """, f"-storepass {admin_secrets.get('truststore-password')}", ) - run_cmd(f"sudo chmod +r {store_path}") + logger.info(f"Current CA {alias} was renamed to old-{alias}.") + except OpenSearchCmdError as e: + # This message means there was no "ca" alias or store before, if it happens ignore + if not ( + f"Alias <{alias}> does not exist" in e.out + or "Keystore file does not exist" in e.out + ): + raise + + with tempfile.NamedTemporaryFile(mode="w+t") as ca_tmp_file: + ca_tmp_file.write(secrets.get("ca-cert")) + ca_tmp_file.flush() + + try: + run_cmd( + f"""{keytool} -importcert \ + -trustcacerts \ + -noprompt \ + -alias {alias} \ + -keystore {store_path} \ + -file {ca_tmp_file.name} \ + -storetype PKCS12 + """, + f"-storepass {admin_secrets.get('truststore-password')}", + ) + run_cmd(f"sudo chmod +r {store_path}") + logger.info("New CA was added to truststore.") + except OpenSearchCmdError as e: + logging.error(f"Error storing the ca-cert: {e}") + return False + + return True def _read_stored_ca(self, alias: str = "ca") -> Optional[str]: """Load stored CA cert.""" @@ -497,10 +559,14 @@ def _read_stored_ca(self, alias: str = "ca") -> Optional[str]: if not (exists(ca_trust_store) and secrets): return None - stored_certs = run_cmd( - f"openssl pkcs12 -in {ca_trust_store}", - f"-passin pass:{secrets.get('truststore-password')}", - ).out + try: + stored_certs = run_cmd( + f"openssl pkcs12 -in {ca_trust_store}", + f"-passin pass:{secrets.get('truststore-password')}", + ).out + except OpenSearchCmdError as e: + logging.error(f"Error reading the current truststore: {e}") + return # parse output to retrieve the current CA (in case there are many) start_cert_marker = "-----BEGIN CERTIFICATE-----" @@ -512,8 +578,54 @@ def _read_stored_ca(self, alias: str = "ca") -> Optional[str]: return None + def remove_old_ca(self) -> None: + """Remove old CA cert from trust store.""" + keytool = f"sudo {self.jdk_path}/bin/keytool" + ca_trust_store = f"{self.certs_path}/ca.p12" + old_alias = "old-ca" + + secrets = self.charm.secrets.get_object(Scope.APP, CertType.APP_ADMIN.val) + store_pwd = secrets.get("truststore-password") + + try: + run_cmd( + f"""{keytool} \ + -list \ + -keystore {ca_trust_store} \ + -storepass {store_pwd} \ + -alias {old_alias} \ + -storetype PKCS12""" + ) + except OpenSearchCmdError as e: + # This message means there was no "ca" alias or store before, if it happens ignore + if f"Alias <{old_alias}> does not exist" in e.out: + return + + run_cmd( + f"""{keytool} \ + -delete \ + -keystore {ca_trust_store} \ + -storepass {store_pwd} \ + -alias {old_alias} \ + -storetype PKCS12""" + ) + logger.info(f"Removed {old_alias} from truststore.") + + def update_request_ca_bundle(self) -> None: + """Create a new chain.pem file for requests module""" + admin_secret = self.charm.secrets.get_object(Scope.APP, CertType.APP_ADMIN.val) + + # we store the pem format to make it easier for the python requests lib + self.charm.opensearch.write_file( + f"{self.certs_path}/chain.pem", + admin_secret["chain"], + ) + def store_new_tls_resources(self, cert_type: CertType, secrets: Dict[str, Any]): """Add key and cert to keystore.""" + if self.is_ca_rotation_ongoing(): + return + cert_name = cert_type.val store_path = f"{self.certs_path}/{cert_type}.p12" @@ -523,17 +635,10 @@ def store_new_tls_resources(self, cert_type: CertType, secrets: Dict[str, Any]): else: self._create_keystore_pwd_if_not_exists(Scope.UNIT, cert_type, cert_type.val) - if not (secrets.get("key") and secrets.get("chain")): - logging.error("TLS key or chain not found, quitting.") + if not secrets.get("key"): + logging.error("TLS key not found, quitting.") return - # we store the pem format to make it easier for the python requests lib - if cert_type == CertType.APP_ADMIN: - self.charm.opensearch.write_file( - f"{self.certs_path}/chain.pem", - secrets["chain"], - ) - try: os.remove(store_path) except OSError: @@ -556,26 +661,69 @@ def store_new_tls_resources(self, cert_type: CertType, secrets: Dict[str, Any]): -out {store_path} \ -name {cert_name} """ - args = f"-passout pass:{secrets.get(f'keystore-password')}" + args = f"-passout pass:{secrets.get('keystore-password')}" if secrets.get("key-password"): args = f"{args} -passin pass:{secrets.get('key-password')}" run_cmd(cmd, args) run_cmd(f"sudo chmod +r {store_path}") + except OpenSearchCmdError as e: + logging.error(f"Error storing the TLS certificates for {cert_name}: {e}") finally: tmp_key.close() tmp_cert.close() + logger.info(f"TLS certificate for {cert_name} stored.") - def all_tls_resources_stored(self, only_unit_resources: bool = False) -> bool: + def all_tls_resources_stored(self, only_unit_resources: bool = False) -> bool: # noqa: C901 """Check if all TLS resources are stored on disk.""" cert_types = [CertType.UNIT_TRANSPORT, CertType.UNIT_HTTP] if not only_unit_resources: cert_types.append(CertType.APP_ADMIN) + # compare issuer of the cert with the issuer of the CA + # if they don't match, certs are not up-to-date and need to be renewed after CA rotation + if not (current_ca := self._read_stored_ca()): + return False + + # to make sure the content is processed correctly by openssl, temporary store it in a file + tmp_ca_file = tempfile.NamedTemporaryFile(mode="w+t") + tmp_ca_file.write(current_ca) + tmp_ca_file.flush() + tmp_ca_file.seek(0) + + try: + ca_issuer = run_cmd(f"openssl x509 -in {tmp_ca_file.name} -noout -issuer").out + except OpenSearchCmdError as e: + logger.error(f"Error reading the current truststore: {e}") + return False + finally: + tmp_ca_file.close() + for cert_type in cert_types: if not exists(f"{self.certs_path}/{cert_type}.p12"): return False + scope = Scope.APP if cert_type == CertType.APP_ADMIN else Scope.UNIT + secret = self.charm.secrets.get_object(scope, cert_type.val) + + try: + cert_issuer = run_cmd( + f"openssl pkcs12 -in {self.certs_path}/{cert_type}.p12", + f"""-nodes \ + -passin pass:{secret.get('keystore-password')} \ + | openssl x509 -noout -issuer + """, + ).out + except OpenSearchCmdError as e: + logger.error(f"Error reading the current certificate: {e}") + return False + except AttributeError as e: + logger.error(f"Error reading secret: {e}") + return False + + if cert_issuer != ca_issuer: + return False + return True def all_certificates_available(self) -> bool: @@ -586,15 +734,9 @@ def all_certificates_available(self) -> bool: if not admin_secrets or not admin_secrets.get("cert"): return False - admin_ca = admin_secrets.get("ca") - for cert_type in [CertType.UNIT_TRANSPORT, CertType.UNIT_HTTP]: unit_secrets = secrets.get_object(Scope.UNIT, cert_type.val) - if ( - not unit_secrets - or not unit_secrets.get("cert") - or unit_secrets.get("ca") != admin_ca - ): + if not unit_secrets or not unit_secrets.get("cert"): return False return True @@ -678,3 +820,43 @@ def reload_tls_certificates(self): finally: tmp_cert.close() tmp_key.close() + + def reset_ca_rotation_state(self) -> None: + """Handle internal flags during CA rotation routine.""" + if not self.charm.peers_data.get(Scope.UNIT, "tls_ca_renewing", False): + # if the CA is not being renewed we don't have to do anything here + return + + # if this flag is set, the CA rotation routine is complete for this unit + if self.charm.peers_data.get(Scope.UNIT, "tls_ca_renewed", False): + self.charm.peers_data.delete(Scope.UNIT, "tls_ca_renewing") + self.charm.peers_data.delete(Scope.UNIT, "tls_ca_renewed") + else: + # this means only the CA rotation completed, still need to create certificates + self.charm.peers_data.put(Scope.UNIT, "tls_ca_renewed", True) + + def ca_rotation_complete_in_cluster(self) -> bool: + """Check whether the CA rotation completed in all units.""" + rotation_complete = True + rel = self.model.get_relation(PeerRelationName) + + for unit in rel.units: + if not rel.data[unit].get("tls_ca_renewed"): + logger.debug(f"TLS CA rotation not complete for unit {unit}.") + rotation_complete = False + break + + return rotation_complete + + def is_ca_rotation_ongoing(self) -> bool: + """Check whether the CA rotation is currently in progress.""" + if ( + self.charm.peers_data.get(Scope.UNIT, "tls_ca_renewing", False) + and not self.charm.peers_data.get(Scope.UNIT, "tls_ca_renewed", False) + or self.charm.peers_data.get(Scope.UNIT, "tls_ca_renewed", False) + and not self.ca_rotation_complete_in_cluster() + ): + logger.debug("TLS CA rotation ongoing, will not update tls certificates.") + return True + + return False diff --git a/tests/integration/plugins/test_plugins.py b/tests/integration/plugins/test_plugins.py index 902fa72fa..c91b191d1 100644 --- a/tests/integration/plugins/test_plugins.py +++ b/tests/integration/plugins/test_plugins.py @@ -94,7 +94,7 @@ async def _wait_for_units(ops_test: OpsTest, deployment_type: str) -> None: apps_statuses=["active"], units_statuses=["active"], wait_for_exact_units={APP_NAME: 3}, - timeout=1200, + timeout=1800, idle_period=IDLE_PERIOD, ) return @@ -114,7 +114,7 @@ async def _wait_for_units(ops_test: OpsTest, deployment_type: str) -> None: FAILOVER_ORCHESTRATOR_NAME: 2, APP_NAME: 1, }, - timeout=1200, + timeout=1800, idle_period=IDLE_PERIOD, ) diff --git a/tests/integration/tls/test_ca_rotation.py b/tests/integration/tls/test_ca_rotation.py new file mode 100644 index 000000000..0f6cc2afc --- /dev/null +++ b/tests/integration/tls/test_ca_rotation.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. + +import logging + +import pytest +import requests +from pytest_operator.plugin import OpsTest + +from ..ha.continuous_writes import ContinuousWrites +from ..helpers import ( + APP_NAME, + MODEL_CONFIG, + SERIES, + UNIT_IDS, + get_leader_unit_ip, + get_secret_by_label, +) +from ..helpers_deployments import wait_until + +logger = logging.getLogger(__name__) + + +TLS_CERTIFICATES_APP_NAME = "self-signed-certificates" + + +@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "xlarge"]) +@pytest.mark.group(1) +@pytest.mark.abort_on_fail +@pytest.mark.skip_if_deployed +async def test_build_and_deploy_active(ops_test: OpsTest) -> None: + """Build and deploy one unit of OpenSearch.""" + my_charm = await ops_test.build_charm(".") + await ops_test.model.set_config(MODEL_CONFIG) + + await ops_test.model.deploy( + my_charm, + num_units=len(UNIT_IDS), + series=SERIES, + ) + + # Deploy TLS Certificates operator. + config = {"ca-common-name": "CN_CA"} + await ops_test.model.deploy(TLS_CERTIFICATES_APP_NAME, channel="stable", config=config) + await wait_until(ops_test, apps=[TLS_CERTIFICATES_APP_NAME], apps_statuses=["active"]) + + # Relate it to OpenSearch to set up TLS. + await ops_test.model.integrate(APP_NAME, TLS_CERTIFICATES_APP_NAME) + await wait_until( + ops_test, + apps=[APP_NAME], + apps_statuses=["active"], + units_statuses=["active"], + timeout=1800, + wait_for_exact_units=len(UNIT_IDS), + ) + + +@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "xlarge"]) +@pytest.mark.group(1) +@pytest.mark.abort_on_fail +async def test_rollout_new_ca(ops_test: OpsTest) -> None: + """Test that the cluster restarted and functional after processing a new CA certificate""" + c_writes = ContinuousWrites(ops_test, APP_NAME) + await c_writes.start() + + # trigger a rollout of the new CA by changing the config on TLS Provider side + new_config = {"ca-common-name": "NEW_CA"} + await ops_test.model.applications[TLS_CERTIFICATES_APP_NAME].set_config(new_config) + + writes_count = await c_writes.count() + + await wait_until( + ops_test, + apps=[APP_NAME], + apps_statuses=["active"], + units_statuses=["active"], + timeout=1800, + idle_period=60, + wait_for_exact_units=len(UNIT_IDS), + ) + + more_writes = await c_writes.count() + await c_writes.stop() + assert more_writes > writes_count, "Writes have not continued during CA rotation" + + # using the SSL API requires authentication with app-admin cert and key + leader_unit_ip = await get_leader_unit_ip(ops_test) + url = f"https://{leader_unit_ip}:9200/_plugins/_security/api/ssl/certs" + admin_secret = await get_secret_by_label(ops_test, "opensearch:app:app-admin") + + with open("admin.cert", "w") as cert: + cert.write(admin_secret["cert"]) + + with open("admin.key", "w") as key: + key.write(admin_secret["key"]) + + response = requests.get(url, cert=("admin.cert", "admin.key"), verify=False) + data = response.json() + assert new_config["ca-common-name"] in data["http_certificates_list"][0]["issuer_dn"] diff --git a/tests/unit/helpers.py b/tests/unit/helpers.py index 50babc6ba..60b55f939 100644 --- a/tests/unit/helpers.py +++ b/tests/unit/helpers.py @@ -123,7 +123,7 @@ def mock_response_mynode( "build_type": "tar", "build_hash": "30dd870855093c9dca23fc6f8cfd5c0d7c83127d", "total_indexing_buffer": 107374182, - "roles": ["cluster_manager", "data", "ingest", "ml"], + "roles": ["cluster_manager", "coordinating_only", "data", "ingest", "ml"], "attributes": { "shard_indexing_pressure_enabled": "true", "app_id": "617e5f02-5be5-4e25-85f0-276b2347a5ad/opensearch", @@ -143,6 +143,7 @@ def mock_response_mynode( "data", "ingest", "ml", + "coordinating_only", "cluster_manager", ], }, @@ -171,6 +172,63 @@ def mock_response_mynode( ) +def mock_response_lock_not_requested(host): + expected_response = {"unit-name": ""} + + responses.add( + method="GET", + url=f"https://{host}:9200/.charm_node_lock/_source/0", + json=expected_response, + status=200, + ) + + +def mock_response_health_green(host, cluster_name: str = CLUSTER_NAME): + expected_response = { + "cluster_name": cluster_name, + "status": "green", + "timed_out": False, + "number_of_nodes": 2, + "number_of_data_nodes": 2, + "discovered_master": True, + "active_primary_shards": 6, + "active_shards": 12, + "relocating_shards": 0, + "initializing_shards": 0, + "unassigned_shards": 0, + "delayed_unassigned_shards": 0, + "number_of_pending_tasks": 0, + "number_of_in_flight_fetch": 0, + "task_max_waiting_in_queue_millis": 0, + "active_shards_percent_as_number": 100.0, + } + + responses.add( + method="GET", + url=f"https://{host}:9200/_cluster/health", + json=expected_response, + status=200, + ) + + +def mock_response_put_http_cert(host): + responses.add( + method="PUT", + url=f"https://{host}:9200/_plugins/_security/api/ssl/http/reloadcerts", + json={"status": "OK", "message": "updated transport certs"}, + status=201, + ) + + +def mock_response_put_transport_cert(host): + responses.add( + method="PUT", + url=f"https://{host}:9200/_plugins/_security/api/ssl/transport/reloadcerts", + json={"status": "OK", "message": "updated transport certs"}, + status=201, + ) + + def get_relation_unit(model: Model, relation_name: str, unit_name: str) -> Unit | None: """Get the Unit object from the relation that matches unit_name.""" relation = model.get_relation(relation_name) diff --git a/tests/unit/lib/test_opensearch_base_charm.py b/tests/unit/lib/test_opensearch_base_charm.py index 959269c95..150d2a7bc 100644 --- a/tests/unit/lib/test_opensearch_base_charm.py +++ b/tests/unit/lib/test_opensearch_base_charm.py @@ -357,12 +357,16 @@ def test_on_update_status(self, _, cert_expiration_remaining_hours, _stop_opense @patch(f"{BASE_CHARM_CLASS}.is_admin_user_configured") @patch(f"{BASE_LIB_PATH}.opensearch_tls.OpenSearchTLS.is_fully_configured") @patch(f"{BASE_LIB_PATH}.opensearch_tls.OpenSearchTLS.reload_tls_certificates") + @patch(f"{BASE_LIB_PATH}.opensearch_tls.OpenSearchTLS.update_request_ca_bundle") + @patch(f"{BASE_LIB_PATH}.opensearch_tls.OpenSearchTLS.remove_old_ca") def test_reload_tls_certs_without_restart( self, store_admin_tls_secrets_if_applies, is_admin_user_configured, is_fully_configured, reload_tls_certificates, + update_request_ca_bundle, + remove_old_ca, ): """Test that tls configuration set does not trigger restart.""" cert = "cert_12345" @@ -375,6 +379,9 @@ def test_reload_tls_certs_without_restart( store_admin_tls_secrets_if_applies.assert_called_once() reload_tls_certificates.assert_called_once() + update_request_ca_bundle.assert_called_once() + + remove_old_ca.assert_called_once() self.charm._restart_opensearch_event.emit.assert_not_called() def test_app_peers_data(self): diff --git a/tests/unit/lib/test_opensearch_config.py b/tests/unit/lib/test_opensearch_config.py index ed56eb86a..4be5fcd3e 100644 --- a/tests/unit/lib/test_opensearch_config.py +++ b/tests/unit/lib/test_opensearch_config.py @@ -149,10 +149,6 @@ def test_set_node_tls_conf(self): opensearch_conf[f"plugins.security.ssl.{layer}.keystore_alias"], f"unit-{layer}", ) - self.assertEqual( - opensearch_conf[f"plugins.security.ssl.{layer}.truststore_alias"], - "ca", - ) self.assertEqual( opensearch_conf[f"plugins.security.ssl.{layer}.keystore_password"], diff --git a/tests/unit/lib/test_opensearch_distro.py b/tests/unit/lib/test_opensearch_distro.py index 12d2012b8..5e0b9270c 100644 --- a/tests/unit/lib/test_opensearch_distro.py +++ b/tests/unit/lib/test_opensearch_distro.py @@ -62,7 +62,7 @@ def test_distro_current_online_ok(self, _): node = self.charm.opensearch.current() assert isinstance(node, Node) assert node.app.name == "opensearch" - assert node.roles == ["cluster_manager", "data", "ingest", "ml"] + assert node.roles == ["cluster_manager", "coordinating_only", "data", "ingest", "ml"] assert not node.temperature def test_distro_current_api_unavail_primary_fallback_to_static_conf(self): @@ -70,7 +70,9 @@ def test_distro_current_api_unavail_primary_fallback_to_static_conf(self): node = self.charm.opensearch.current() assert isinstance(node, Node) assert node.temperature == "hot" - assert sorted(node.roles) == sorted(["cluster_manager", "data", "ingest", "ml"]) + assert sorted(node.roles) == sorted( + ["cluster_manager", "coordinating_only", "data", "ingest", "ml"] + ) # NOTE: app is still retrieved from Deployment Description assert node.app.name == "opensearch" @@ -84,7 +86,9 @@ def test_distro_current_api_unavail_deployment_unavail_all_fallback_to_static_co assert isinstance(node, Node) assert node.app.name == "opensearch" assert node.temperature == "hot" - assert sorted(node.roles) == sorted(["cluster_manager", "data", "ingest", "ml"]) + assert sorted(node.roles) == sorted( + ["cluster_manager", "coordinating_only", "data", "ingest", "ml"] + ) # NOTE: app is still retrieved from Deployment Description assert node.app.name == "opensearch" diff --git a/tests/unit/lib/test_opensearch_secrets.py b/tests/unit/lib/test_opensearch_secrets.py index 21d4c00de..d84b559df 100644 --- a/tests/unit/lib/test_opensearch_secrets.py +++ b/tests/unit/lib/test_opensearch_secrets.py @@ -100,8 +100,7 @@ def test_on_secret_changed_unit(self, mock_store_tls_resources): event.secret.label = f"opensearch:app:{CertType.APP_ADMIN.val}" self.secrets._on_secret_changed(event) - mock_store_tls_resources.assert_called() - mock_store_tls_resources.assert_called_with(CertType.APP_ADMIN, event.secret.get_content()) + mock_store_tls_resources.assert_not_called() def test_interface(self): """We want to make sure that the following public methods are always supported.""" diff --git a/tests/unit/lib/test_opensearch_tls.py b/tests/unit/lib/test_opensearch_tls.py index 2c45a26c4..269861272 100644 --- a/tests/unit/lib/test_opensearch_tls.py +++ b/tests/unit/lib/test_opensearch_tls.py @@ -2,13 +2,21 @@ # See LICENSE file for licensing details. """Unit test for the helper_cluster library.""" +import itertools +import re import socket import unittest from unittest import mock from unittest.mock import MagicMock, Mock, patch -from charms.opensearch.v0.constants_charm import PeerRelationName +import responses +from charms.opensearch.v0.constants_charm import ( + PeerRelationName, + TLSCaRotation, + TLSNotFullyConfigured, +) from charms.opensearch.v0.constants_tls import TLS_RELATION, CertType +from charms.opensearch.v0.helper_conf_setter import YamlConfigSetter from charms.opensearch.v0.models import ( App, DeploymentDescription, @@ -20,10 +28,25 @@ State, ) from charms.opensearch.v0.opensearch_internal_data import Scope +from ops.model import ActiveStatus, MaintenanceStatus from ops.testing import Harness +from parameterized import parameterized from charm import OpenSearchOperatorCharm from tests.helpers import create_utf8_encoded_private_key, patch_network_get +from tests.unit.helpers import ( + mock_response_health_green, + mock_response_lock_not_requested, + mock_response_nodes, + mock_response_put_http_cert, + mock_response_put_transport_cert, + mock_response_root, +) + + +def single_space(input: str) -> str: + """Replace multiple spaces with one.""" + return " ".join(input.split()) @patch_network_get("1.1.1.1") @@ -57,11 +80,12 @@ class TestOpenSearchTLS(unittest.TestCase): def setUp(self, _) -> None: self.harness = Harness(OpenSearchOperatorCharm) self.addCleanup(self.harness.cleanup) - self.harness.add_network("1.1.1.1", endpoint=PeerRelationName) + self.rel_id = self.harness.add_network("1.1.1.1", endpoint=PeerRelationName) self.harness.add_network("1.1.1.1", endpoint=TLS_RELATION) self.harness.begin() self.charm = self.harness.charm - self.harness.add_relation(PeerRelationName, self.charm.app.name) + self.rel_id = self.harness.add_relation(PeerRelationName, self.charm.app.name) + self.harness.add_relation_unit(self.rel_id, f"{self.charm.app.name}/0") self.harness.add_relation(TLS_RELATION, self.charm.app.name) self.secret_store = self.charm.secrets @@ -69,6 +93,8 @@ def setUp(self, _) -> None: socket.getfqdn = Mock() socket.getfqdn.return_value = "nebula" + self.charm.opensearch.config = YamlConfigSetter(base_path="tests/unit/resources/config") + @patch(f"{PEER_CLUSTERS_MANAGER}.deployment_desc") @patch(f"{BASE_LIB_PATH}.opensearch_tls.get_host_public_ip") @patch("socket.getfqdn") @@ -132,7 +158,7 @@ def test_find_secret(self): @patch("charms.opensearch.v0.opensearch_tls.OpenSearchTLS._request_certificate") @patch("charm.OpenSearchOperatorCharm._put_or_update_internal_user_leader") @patch("charm.OpenSearchOperatorCharm._purge_users") - def test_on_relation_joined_admin(self, _, __, _request_certificate, deployment_desc): + def test_on_relation_created_admin(self, _, __, _request_certificate, deployment_desc): """Test on certificate relation created event.""" deployment_desc.return_value = DeploymentDescription( config=PeerClusterConfig(cluster_name="", init_hold=False, roles=[]), @@ -161,7 +187,44 @@ def test_on_relation_joined_admin(self, _, __, _request_certificate, deployment_ @patch("charms.opensearch.v0.opensearch_tls.OpenSearchTLS._request_certificate") @patch("charm.OpenSearchOperatorCharm._put_or_update_internal_user_leader") @patch("charm.OpenSearchOperatorCharm._purge_users") - def test_on_relation_joined_non_admin(self, _, __, _request_certificate, deployment_desc): + def test_on_relation_created_only_main_orchestrator_requests_application_cert( + self, _, __, _request_certificate, deployment_desc + ): + """Test on certificate relation created event.""" + deployment_desc.return_value = DeploymentDescription( + config=PeerClusterConfig(cluster_name="", init_hold=False, roles=[]), + start=StartMode.WITH_GENERATED_ROLES, + pending_directives=[], + typ=DeploymentType.OTHER, + app=App(model_uuid=self.charm.model.uuid, name=self.charm.app.name), + state=DeploymentState(value=State.ACTIVE), + ) + # Truststore password is required + self.secret_store.put_object( + Scope.APP, + CertType.APP_ADMIN.val, + {"truststore-password": "abc"}, + ) + event_mock = MagicMock() + + self.harness.set_leader(is_leader=True) + self.charm.tls._on_tls_relation_created(event_mock) + + self.assertEqual( + _request_certificate.mock_calls, + [ + mock.call(Scope.UNIT, CertType.UNIT_TRANSPORT), + mock.call(Scope.UNIT, CertType.UNIT_HTTP), + ], + ) + + @patch( + f"{BASE_LIB_PATH}.opensearch_peer_clusters.OpenSearchPeerClustersManager.deployment_desc" + ) + @patch("charms.opensearch.v0.opensearch_tls.OpenSearchTLS._request_certificate") + @patch("charm.OpenSearchOperatorCharm._put_or_update_internal_user_leader") + @patch("charm.OpenSearchOperatorCharm._purge_users") + def test_on_relation_created_non_admin(self, _, __, _request_certificate, deployment_desc): """Test on certificate relation created event.""" deployment_desc.return_value = DeploymentDescription( config=PeerClusterConfig(cluster_name="", init_hold=False, roles=[]), @@ -336,9 +399,16 @@ def test_on_certificate_invalidated(self, _, deployment_desc, request_certificat request_certificate_renewal.assert_called_once() + # Testing store_new_ca() function + + @patch(f"{PEER_CLUSTERS_MANAGER}.deployment_desc") @patch("charms.opensearch.v0.opensearch_tls.OpenSearchTLS._create_keystore_pwd_if_not_exists") @patch("charm.OpenSearchOperatorCharm._put_or_update_internal_user_leader") - def test_truststore_password_secret(self, _, _create_keystore_pwd_if_not_exists): + @patch("builtins.open", side_effect=unittest.mock.mock_open()) + def test_truststore_password_secret( + self, _, __, _create_keystore_pwd_if_not_exists, deployment_desc + ): + deployment_desc.return_value = self.deployment_descriptions["ok"] secret = {"key": "secret_12345"} self.harness.set_leader(is_leader=False) @@ -350,3 +420,1268 @@ def test_truststore_password_secret(self, _, _create_keystore_pwd_if_not_exists) self.charm.tls.store_new_ca(secret) _create_keystore_pwd_if_not_exists.assert_called_once() + + @patch(f"{PEER_CLUSTERS_MANAGER}.deployment_desc") + @patch("charms.opensearch.v0.opensearch_tls.OpenSearchTLS._create_keystore_pwd_if_not_exists") + @patch("charm.OpenSearchOperatorCharm._put_or_update_internal_user_leader") + @patch("builtins.open", side_effect=unittest.mock.mock_open()) + def test_truststore_password_secret_only_created_by_main_orchestrator( + self, _, __, _create_keystore_pwd_if_not_exists, deployment_desc + ): + deployment_desc.return_value = DeploymentDescription( + config=PeerClusterConfig(cluster_name="", init_hold=False, roles=[]), + start=StartMode.WITH_GENERATED_ROLES, + pending_directives=[], + typ=DeploymentType.OTHER, + app=App(model_uuid=self.charm.model.uuid, name=self.charm.app.name), + state=DeploymentState(value=State.ACTIVE), + ) + secret = {"key": "secret_12345"} + + self.harness.set_leader(is_leader=True) + self.charm.tls.store_new_ca(secret) + + _create_keystore_pwd_if_not_exists.assert_not_called() + + ########################################################################## + # Full workflow tests + ########################################################################## + + # NOTE: Syntax: parametrized has to be the outermost decorator + @parameterized.expand( + [ + (DeploymentType.MAIN_ORCHESTRATOR), + (DeploymentType.OTHER), + (DeploymentType.FAILOVER_ORCHESTRATOR), + ] + ) + @patch("charms.opensearch.v0.opensearch_tls.run_cmd") + @patch(f"{PEER_CLUSTERS_MANAGER}.deployment_desc") + # Mocks to avoid I/O + @patch("charms.opensearch.v0.opensearch_tls.OpenSearchTLS._read_stored_ca") + @patch("builtins.open", side_effect=unittest.mock.mock_open()) + def test_on_certificate_available_leader_app_cert_full_workflow( + self, + # NOTE: Syntax: parametrized parameter comes first + deployment_type, + _, + _read_stored_ca, + deployment_desc, + run_cmd, + ): + """New certificate received. + + The charm leader unit should save the new certificate both to + Juju secrets and to the keystore. + + Applies to: + - all deployments + - leader ONLY + """ + csr = "csr" + key = "key" + ca = "ca" + + new_cert = "new_cert" + new_chain = ["new_chain"] + + self.secret_store.put_object( + Scope.APP, + CertType.APP_ADMIN.val, + { + "csr": csr, + "key": key, + "ca-cert": ca, + "cert": "old_cert", + "keystore-password": "keystore_12345", + "truststore-password": "truststore_12345", + }, + ) + # Purposefully not adding unit certificates, to also trigger corner-case checks + + event_mock = MagicMock( + certificate_signing_request=csr, chain=new_chain, certificate=new_cert, ca=ca + ) + + # There was no change of the CA (certificate), the event matches the one on disk + _read_stored_ca.return_value = ca + + # Applies to ANY deployment type + deployment_desc.return_value = DeploymentDescription( + config=PeerClusterConfig(cluster_name="", init_hold=False, roles=[]), + start=StartMode.WITH_GENERATED_ROLES, + pending_directives=[], + typ=deployment_type, + app=App(model_uuid=self.charm.model.uuid, name=self.charm.app.name), + state=DeploymentState(value=State.ACTIVE), + ) + + self.harness.set_leader(is_leader=True) + + original_status_app = self.harness.model.app.status + original_status_unit = self.harness.model.unit.status + self.charm._restart_opensearch_event = MagicMock() + + self.charm.tls._on_certificate_available(event_mock) + + # The new cert is saved to the keystore + # NOTE on the leader node, the operation is redundant i.e. executed TWICE + # This is because the function that applies on normal units to save app certificate + # is executed on top of the mechanism that recognizes that the leader + # received a new app cert + assert run_cmd.call_count == 4 + + assert re.search( + "openssl pkcs12 -export .*-out " + "/var/snap/opensearch/current/etc/opensearch/certificates/app-admin.p12 .*-name app-admin", + run_cmd.call_args_list[0].args[0], + ) + assert ( + "sudo chmod +r /var/snap/opensearch/current/etc/opensearch/certificates/app-admin.p12" + in run_cmd.call_args_list[1].args[0] + ) + + assert self.harness.model.app.status == original_status_app + assert self.harness.model.unit.status == original_status_unit + + # The new certificate is now replacing the old one in Peer Relation secrets + assert self.secret_store.get_object(Scope.APP, CertType.APP_ADMIN.val) == { + "csr": csr, + "key": key, + "ca-cert": ca, + "cert": new_cert, + "chain": new_chain[0], + "truststore-password": "truststore_12345", + "keystore-password": "keystore_12345", + } + + # NOTE: Syntax: parametrized has to be the outermost decorator + @parameterized.expand( + itertools.product( + [ + (DeploymentType.MAIN_ORCHESTRATOR), + (DeploymentType.OTHER), + (DeploymentType.FAILOVER_ORCHESTRATOR), + ], + [True, False], + [CertType.UNIT_HTTP.val, CertType.UNIT_TRANSPORT.val], + ) + ) + @patch("charms.opensearch.v0.opensearch_tls.run_cmd") + @patch(f"{PEER_CLUSTERS_MANAGER}.deployment_desc") + # Mocks to avoid I/O + @patch("charms.opensearch.v0.opensearch_tls.OpenSearchTLS._read_stored_ca") + @patch("builtins.open", side_effect=unittest.mock.mock_open()) + def test_on_certificate_available_any_node_unit_cert_full_workflow( + self, + # NOTE: Syntax: parametrized parameter comes first + deployment_type, + leader, + cert_type, + _, + _read_stored_ca, + deployment_desc, + run_cmd, + ): + """New *unit* certificate received. + + At this point the charm leader unit should save the new certificate both to + Juju secrets and to the keystore. + + Applies to: + - all deployments + - all units + """ + csr = "csr" + key = "key" + ca = "ca" + keystore_password = "keystore_12345" + + new_cert = "new_cert" + new_chain = ["new_chain"] + + self.secret_store.put_object( + Scope.APP, + CertType.APP_ADMIN.val, + { + "csr": csr, + "key": key, + "ca-cert": ca, + "cert": "old_cert", + "keystore-password": keystore_password, + "truststore-password": "truststore_12345", + }, + ) + self.secret_store.put_object( + Scope.UNIT, + CertType.UNIT_TRANSPORT, + { + "csr": f"{CertType.UNIT_TRANSPORT.val}-csr", + "truststore-password": "truststore_12345", + "keystore-password": keystore_password, + "key": key, + "ca-cert": ca, + "cert": "old_cert", + }, + ) + + self.secret_store.put_object( + Scope.UNIT, + CertType.UNIT_HTTP, + { + "csr": f"{CertType.UNIT_HTTP.val}-csr", + "truststore-password": "truststore_12345", + "keystore-password": keystore_password, + "key": key, + "ca-cert": ca, + "cert": "old_cert", + }, + ) + + event_mock = MagicMock( + certificate_signing_request=f"{cert_type}-csr", + chain=new_chain, + certificate=new_cert, + ca=ca, + ) + + # There was no change of the CA (certificate), the event matches the one on disk + _read_stored_ca.return_value = ca + + # Applies to ANY deployment type + deployment_desc.return_value = DeploymentDescription( + config=PeerClusterConfig(cluster_name="", init_hold=False, roles=[]), + start=StartMode.WITH_GENERATED_ROLES, + pending_directives=[], + typ=deployment_type, + app=App(model_uuid=self.charm.model.uuid, name=self.charm.app.name), + state=DeploymentState(value=State.ACTIVE), + ) + + self.harness.set_leader(is_leader=leader) + + original_status_unit = self.harness.model.unit.status + self.charm._restart_opensearch_event = MagicMock() + + self.charm.tls._on_certificate_available(event_mock) + + # The new cert is saved to the keystore + if self.charm.unit.is_leader(): + assert run_cmd.call_count == 2 + else: + assert run_cmd.call_count == 4 + + assert re.search( + "openssl pkcs12 -export .*-out " + f"/var/snap/opensearch/current/etc/opensearch/certificates/{cert_type}.p12 .*-name {cert_type}", + run_cmd.call_args_list[0].args[0], + ) + assert ( + f"sudo chmod +r /var/snap/opensearch/current/etc/opensearch/certificates/{cert_type}.p12" + in run_cmd.call_args_list[1].args[0] + ) + + assert self.harness.model.unit.status == original_status_unit + + # The new certificate is now replacing the old one in Peer Relation secrets + assert self.secret_store.get_object(Scope.UNIT, cert_type) == { + "csr": f"{cert_type}-csr", + "key": key, + "ca-cert": ca, + "cert": new_cert, + "chain": new_chain[0], + "keystore-password": keystore_password, + "truststore-password": "truststore_12345", + } + + ########################################################################## + # Tests below verify to the CA rotation cycle + ########################################################################## + + # NOTE: Syntax: parametrized has to be the outermost decorator + @parameterized.expand( + [ + (DeploymentType.MAIN_ORCHESTRATOR), + (DeploymentType.OTHER), + (DeploymentType.FAILOVER_ORCHESTRATOR), + ] + ) + @patch("charms.opensearch.v0.opensearch_tls.run_cmd") + @patch("charms.opensearch.v0.opensearch_tls.OpenSearchTLS._read_stored_ca") + @patch(f"{PEER_CLUSTERS_MANAGER}.deployment_desc") + # Mocks to avoid I/O + @patch("builtins.open", side_effect=unittest.mock.mock_open()) + def test_on_certificate_available_ca_rotation_first_stage_any_cluster_leader( + self, + # NOTE: Syntax: parametrized parameter comes first + deployment_type, + _, + deployment_desc, + _read_stored_ca, + run_cmd, + ): + """Test CA rotation 1st stage. + + At this point the charm already is receiving a new CA cert from the + 'self-signed-certificates' charm. + Note: there is no preceding action on any of the involved parties to trigger that. + The new CA cert may be received due to a CA change, CA cert expiration, etc. + The 'self-signed-certificates' operator sends no signal/notification but simply adds + the new CA certificate to a 'certificate-available' event. + + On this event, the Opensearch charm should: + - save the new CA cert to truststore ALONGSIDE the old one that receives a new alias + - set the 'tls_ca_renewing' flag in the peer databag + - trigger a service restart + - set the charm state to 'maintenance', indicating CA certificate rotation + + NOTE: The 'certificate-available' event also contains a new cert and chain. These are + kind of "useless", as will need to request new ones matching the new CA cert. + Not to modify existing workflows, they are saved to the secret but NOT to the disk. + (The inconsistency is temporary, while the charm is in a maintenance mode anyway.) + + Applies to + - any deployment types + - leader ONLY + - normal units are passive, see test later + """ + old_csr = "old_csr" + + new_cert = "new_cert" + new_chain = ["new_chain"] + new_ca = "new_ca" + + self.secret_store.put_object( + Scope.APP, + CertType.APP_ADMIN.val, + { + "csr": old_csr, + "keystore-password": "keystore_12345", + "truststore-password": "truststore_12345", + "ca-cert": "old_ca_cert", + "cert": "old_cert", + }, + ) + + # NOTE: The event is issued with the old csr, i.e. the identifier of + # the ongoing transaction. A new csr will be generated and saved in the second step + event_mock = MagicMock( + certificate_signing_request=old_csr, chain=new_chain, certificate=new_cert, ca=new_ca + ) + + # The CA stored in the keystore is still the old one + _read_stored_ca.return_value = "old_ca" + + # Applies to ANY deployment type + deployment_desc.return_value = DeploymentDescription( + config=PeerClusterConfig(cluster_name="", init_hold=False, roles=[]), + start=StartMode.WITH_GENERATED_ROLES, + pending_directives=[], + typ=deployment_type, + app=App(model_uuid=self.charm.model.uuid, name=self.charm.app.name), + state=DeploymentState(value=State.ACTIVE), + ) + + self.charm._restart_opensearch_event = MagicMock() + + self.harness.set_leader(is_leader=True) + original_status = self.harness.model.unit.status + + self.charm.tls._on_certificate_available(event_mock) + + # Old CA cert is saved with corresponding alias, new new CA cert added to keystore + assert run_cmd.call_count == 3 + assert re.search( + "keytool *-changealias *-alias ca *-destalias old-ca", + run_cmd.call_args_list[0].args[0], + ) + assert re.search("keytool *-importcert.* *-alias ca", run_cmd.call_args_list[1].args[0]) + assert ( + "chmod +r /var/snap/opensearch/current/etc/opensearch/certificates/ca.p12" + in run_cmd.call_args_list[2].args[0] + ) + # NOTE: The new cert and chain are NOT saved into the keystore (disk) + + # Set flag, set status, restart + assert ( + self.harness.get_relation_data(self.rel_id, "opensearch/0")["tls_ca_renewing"] + == "True" + ) + assert isinstance(self.harness.model.unit.status, MaintenanceStatus) + assert self.harness.model.unit.status.message == TLSCaRotation + assert self.harness.model.unit.status, MaintenanceStatus != original_status + self.charm._restart_opensearch_event.emit.assert_called_once() + + # The new certificate is now replacing the old one in Peer Relation secrets + # NOTE: INCONSISTENCY: The new cert and chain ARE saved into the secret + assert self.secret_store.get_object(Scope.APP, CertType.APP_ADMIN.val) == { + "csr": old_csr, + "cert": new_cert, + "chain": new_chain[0], + "truststore-password": "truststore_12345", + "keystore-password": "keystore_12345", + "ca-cert": new_ca, + } + + @parameterized.expand( + [ + (DeploymentType.MAIN_ORCHESTRATOR), + (DeploymentType.OTHER), + (DeploymentType.FAILOVER_ORCHESTRATOR), + ] + ) + @patch("charms.opensearch.v0.opensearch_tls.run_cmd") + @patch("charms.opensearch.v0.opensearch_tls.OpenSearchTLS._read_stored_ca") + @patch(f"{PEER_CLUSTERS_MANAGER}.deployment_desc") + def test_on_certificate_available_ca_rotation_first_stage_any_cluster_non_leader( + # NOTE: Syntax: parametrized parameter comes first + self, + deployment_type, + deployment_desc, + _read_stored_ca, + run_cmd, + ): + """'certificate-available' with an app cert and/or a CA cert. + + ONLY the leader takes action. + """ + csr = "old_csr" + cert = "new_cert" + chain = ["new_chain"] + ca = "new_ca" + + self.secret_store.put_object( + Scope.APP, + CertType.APP_ADMIN.val, + { + "csr": csr, + "keystore-password": "keystore_12345", + "truststore-password": "truststore_12345", + "ca-cert": "old_ca_cert", + "cert": "old_cert", + }, + ) + + event_mock = MagicMock( + certificate_signing_request=csr, chain=chain, certificate=cert, ca=ca + ) + + _read_stored_ca.return_value = "stored_ca" + + # Applies to ANY deployment type + deployment_desc.return_value = DeploymentDescription( + config=PeerClusterConfig(cluster_name="", init_hold=False, roles=[]), + start=StartMode.WITH_GENERATED_ROLES, + pending_directives=[], + typ=deployment_type, + app=App(model_uuid=self.charm.model.uuid, name=self.charm.app.name), + state=DeploymentState(value=State.ACTIVE), + ) + + self.harness.set_leader(is_leader=False) + original_status = self.harness.model.unit.status + self.charm._restart_opensearch_event = MagicMock() + + self.charm.tls._on_certificate_available(event_mock) + + # No action taken, no change on status or certificates + assert run_cmd.call_count == 0 + assert self.harness.model.unit.status == original_status + self.charm._restart_opensearch_event.emit.assert_not_called() + assert self.secret_store.get_object(Scope.APP, CertType.APP_ADMIN.val) == { + "csr": csr, + "keystore-password": "keystore_12345", + "truststore-password": "truststore_12345", + "ca-cert": "old_ca_cert", + "cert": "old_cert", + } + + # Mocks on functions we want to investigate + # NOTE: Syntax: parametrized has to be the outermost decorator + @parameterized.expand( + [ + (DeploymentType.MAIN_ORCHESTRATOR), + (DeploymentType.OTHER), + (DeploymentType.FAILOVER_ORCHESTRATOR), + ] + ) + @patch("charms.opensearch.v0.opensearch_tls.generate_csr") + @patch( + "charms.tls_certificates_interface.v3.tls_certificates.TLSCertificatesRequiresV3.request_certificate_renewal" + ) + @patch( + "charms.tls_certificates_interface.v3.tls_certificates.TLSCertificatesRequiresV3.request_certificate_revocation" + ) + @patch( + "charms.tls_certificates_interface.v3.tls_certificates.TLSCertificatesRequiresV3.request_certificate_creation" + ) + @patch("charms.opensearch.v0.opensearch_tls.OpenSearchTLS._read_stored_ca") + @patch(f"{PEER_CLUSTERS_MANAGER}.deployment_desc") + # Necessary mocks to simulate a smotth startup + @patch("machine_upgrade.Upgrade") + @patch("charm.OpenSearchOperatorCharm._put_or_update_internal_user_leader") + @patch("socket.socket.connect") + @responses.activate + def test_on_certificate_available_ca_rotation_second_stage_any_cluster_leader( + self, + # NOTE: Syntax: parametrized parameter comes first + deployment_type, + _, + __, + upgrade, + deployment_desc, + _read_stored_ca, + create_cert, + revoke_cert, + renew_cert, + generate_csr, + ): + """Test CA rotation 2nd stage. + + At this point the charm already has the new CA cert stored locally + (with the old CA cert also being kept around) and a service restart + was supposed to take place. + + After the restart + - old certificates have to be invalidated + - unit certificates have to be renewed using the new CA cert + - to signify the above being completed, the 'tls_ca_renewed' flag is set in the databag. + + Applies to + - any deployment types + - LEADER ONLY + """ + # Units had their certificates already + old_csr = "old_csr" + old_key = create_utf8_encoded_private_key() + old_subject = "old_subject" + keystore_password = "keystore_12345" + + new_ca = "new_ca" + + self.secret_store.put_object( + Scope.APP, + CertType.APP_ADMIN.val, + { + "csr": old_csr, + "keystore-password": keystore_password, + "truststore-password": "truststore_12345", + "ca-cert": new_ca, + "key": old_key, + "subject": old_subject, + }, + ) + self.secret_store.put_object( + Scope.UNIT, + CertType.UNIT_TRANSPORT.val, + { + "keystore-password": keystore_password, + "csr": "csr-transport", + "key": "key-transport", + }, + ) + self.secret_store.put_object( + Scope.UNIT, + CertType.UNIT_HTTP.val, + {"keystore-password": keystore_password, "csr": "csr-http", "key": "key-http"}, + ) + + # Leader ONLY + with self.harness.hooks_disabled(): + self.harness.set_leader(is_leader=True) + self.harness.update_relation_data( + self.rel_id, "opensearch", {"security_index_initialised": "True"} + ) + + # We passed the 1st stage of the certificate renewalV + self.harness.update_relation_data( + self.rel_id, "opensearch/0", {"tls_ca_renewing": "True"} + ) + + # Applies to ANY deployment type + deployment_desc.return_value = DeploymentDescription( + config=PeerClusterConfig(cluster_name="", init_hold=False, roles=[]), + start=StartMode.WITH_GENERATED_ROLES, + pending_directives=[], + typ=deployment_type, + app=App(model_uuid=self.charm.model.uuid, name=self.charm.app.name), + state=DeploymentState(value=State.ACTIVE), + ) + upgrade_mock = MagicMock(app_status=ActiveStatus()) + upgrade_mock.get_unit_juju_status.return_value = ActiveStatus() + upgrade.return_value = upgrade_mock + + mock_response_root(self.charm.unit_name, self.charm.opensearch.host) + mock_response_nodes(self.charm.unit_name, self.charm.opensearch.host) + mock_response_lock_not_requested("1.1.1.1") + mock_response_health_green("1.1.1.1") + event = MagicMock(after_upgrade=False) + original_status = self.harness.model.unit.status + + self.charm._post_start_init(event) + + # 'tls_ca_renewed' flag is set, new unit certificates were requested + assert ( + self.harness.get_relation_data(self.rel_id, "opensearch/0")["tls_ca_renewed"] == "True" + ) + + new_app_admin_secret = self.secret_store.get_object(Scope.APP, CertType.APP_ADMIN.val) + + assert new_app_admin_secret["csr"] != old_csr + assert new_app_admin_secret["ca-cert"] == new_ca + assert new_app_admin_secret["key"] == old_key + assert new_app_admin_secret["subject"] != old_subject + + assert generate_csr.call_count == 3 + assert revoke_cert.called_with( + private_key=bytes(new_app_admin_secret["csr"], encoding="utf-8") + ) + assert revoke_cert.called_with(private_key=b"key-http") + assert revoke_cert.called_with(private_key=b"key-transport") + + assert create_cert.call_count == 1 + assert create_cert.called_with(certificate_signing_request=new_app_admin_secret["csr"]) + + assert revoke_cert.call_count == 2 + assert revoke_cert.called_with(b"csr-http") + assert revoke_cert.called_with(b"csr-transport") + + assert renew_cert.call_count == 2 + assert renew_cert.called_with(old_certificate_signing_renew=b"csr-http") + assert renew_cert.called_with(old_certificate_signing_renew=b"csr-transport") + + assert self.harness.model.unit.status.message == TLSNotFullyConfigured + assert self.harness.model.unit.status, MaintenanceStatus != original_status + + # Mocks on functions we want to investigate + # NOTE: Syntax: parametrized has to be the outermost decorator + @parameterized.expand( + [ + (DeploymentType.MAIN_ORCHESTRATOR), + (DeploymentType.OTHER), + (DeploymentType.FAILOVER_ORCHESTRATOR), + ] + ) + @patch("charms.opensearch.v0.opensearch_tls.generate_csr") + @patch( + "charms.tls_certificates_interface.v3.tls_certificates.TLSCertificatesRequiresV3.request_certificate_renewal" + ) + @patch( + "charms.tls_certificates_interface.v3.tls_certificates.TLSCertificatesRequiresV3.request_certificate_revocation" + ) + @patch(f"{PEER_CLUSTERS_MANAGER}.deployment_desc") + # Mocks to avoid I/O + @patch("charms.opensearch.v0.opensearch_tls.OpenSearchTLS._read_stored_ca") + # Necessary mocks to simulate a smooth startup + @patch("machine_upgrade.Upgrade") + @patch("socket.socket.connect") + @responses.activate + def test_on_certificate_available_ca_rotation_second_stage_any_cluster_non_leader( + self, + # NOTE: Syntax: parametrized parameter comes first + deployment_type, + _, + upgrade, + _read_stored_ca, + deployment_desc, + revoke_cert, + renew_cert, + generate_csr, + ): + """Test CA rotation 2nd stage. + + At this point the charm already has the new CA cert stored locally + (with the old CA cert also being kept around) and a service restart + was supposed to take place. + + After the restart, unit certificates have to be renewed, + and the 'tls_ca_renewed' flag has to be set in the databag. + + Applies to + - any deployment types + - any units + """ + # Units had their certificates already + csr = "old_csr" + ca = "new_ca" + keystore_password = "keystore_12345" + + csr_http_old = "csr-http-old" + csr_transport_old = "csr-transport-old" + + self.secret_store.put_object( + Scope.APP, + CertType.APP_ADMIN.val, + { + "csr": csr, + "truststore-password": "truststore_12345", + "keystore-password": keystore_password, + "ca-cert": ca, + }, + ) + self.secret_store.put_object( + Scope.UNIT, + CertType.UNIT_TRANSPORT.val, + { + "keystore-password": keystore_password, + "csr": csr_transport_old, + "key": "key-transport", + }, + ) + self.secret_store.put_object( + Scope.UNIT, + CertType.UNIT_HTTP.val, + {"keystore-password": keystore_password, "csr": csr_http_old, "key": "key-http"}, + ) + + # Emphasizing: NON-leader + self.harness.set_leader(is_leader=False) + with self.harness.hooks_disabled(): + self.harness.update_relation_data( + self.rel_id, "opensearch", {"security_index_initialised": "True"} + ) + + # We passed the 1st stage of the certificate renewalV + self.harness.update_relation_data( + self.rel_id, "opensearch/0", {"tls_ca_renewing": "True"} + ) + + # Applies to ANY deployment type + deployment_desc.return_value = DeploymentDescription( + config=PeerClusterConfig(cluster_name="", init_hold=False, roles=[]), + start=StartMode.WITH_GENERATED_ROLES, + pending_directives=[], + typ=deployment_type, + app=App(model_uuid=self.charm.model.uuid, name=self.charm.app.name), + state=DeploymentState(value=State.ACTIVE), + ) + upgrade_mock = MagicMock(app_status=ActiveStatus()) + upgrade_mock.get_unit_juju_status.return_value = ActiveStatus() + upgrade.return_value = upgrade_mock + + mock_response_root(self.charm.unit_name, self.charm.opensearch.host) + mock_response_nodes(self.charm.unit_name, self.charm.opensearch.host) + mock_response_lock_not_requested("1.1.1.1") + mock_response_health_green("1.1.1.1") + event = MagicMock(after_upgrade=False) + original_status = self.harness.model.unit.status + + self.charm._post_start_init(event) + + # 'tls_ca_renewed' flag is set, new unit certificates were requested + assert ( + self.harness.get_relation_data(self.rel_id, "opensearch/0")["tls_ca_renewed"] == "True" + ) + # Note that the old flag is left intact + assert ( + self.harness.get_relation_data(self.rel_id, "opensearch/0")["tls_ca_renewing"] + == "True" + ) + + assert revoke_cert.call_count == 2 + assert revoke_cert.called_with(b"csr-http") + assert revoke_cert.called_with(b"csr-transport") + + assert renew_cert.call_count == 2 + assert renew_cert.called_with(old_certificate_signing_renew=b"csr-http") + assert renew_cert.called_with(old_certificate_signing_renew=b"csr-transport") + + assert ( + self.secret_store.get_object(Scope.UNIT, CertType.UNIT_HTTP.val)["csr"] != csr_http_old + ) + assert ( + self.secret_store.get_object(Scope.UNIT, CertType.UNIT_TRANSPORT.val)["csr"] + != csr_transport_old + ) + + assert self.harness.model.unit.status.message == TLSNotFullyConfigured + assert self.harness.model.unit.status, MaintenanceStatus != original_status + + # Mocks to investigate/compare/alter + # NOTE: Syntax: parametrized has to be the outermost decorator + @parameterized.expand( + [ + (DeploymentType.MAIN_ORCHESTRATOR), + (DeploymentType.OTHER), + (DeploymentType.FAILOVER_ORCHESTRATOR), + ] + ) + @patch("charms.opensearch.v0.opensearch_tls.run_cmd") + @patch(f"{PEER_CLUSTERS_MANAGER}.deployment_desc") + # Mocks to avoid I/O + @patch("charms.opensearch.v0.opensearch_tls.OpenSearchTLS._read_stored_ca") + @patch("builtins.open", side_effect=unittest.mock.mock_open()) + def test_on_certificate_available_ca_rotation_third_stage_leader_cert_app( + self, + # NOTE: Syntax: parametrized parameter comes first + deployment_type, + _, + _read_stored_ca, + deployment_desc, + run_cmd, + ): + """Test CA rotation 3rd stage -- *app* certificate. + + At this point, the new CA has been already saved to the keystore. + The charm receives the new app certificate. The leader unit has to save it. + + Applies to: + + """ + cert = "new_cert" + chain = ["new_chain"] + csr = "old_csr" + ca = "new_ca" + key = "key" + keystore_password = "keystore_12345" + + self.secret_store.put_object( + Scope.APP, + CertType.APP_ADMIN.val, + { + "csr": csr, + "truststore-password": "truststore_12345", + "keystore-password": keystore_password, + "ca-cert": ca, + "key": key, + }, + ) + + event_mock = MagicMock( + certificate_signing_request=csr, chain=chain, certificate=cert, ca=ca + ) + + # The new CA cert has been saved to the keystore earlier + def mock_stored_ca(alias: str | None = None): + if alias == "old-ca": + return "old_ca_cert" + return ca + + _read_stored_ca.side_effect = mock_stored_ca + + # Applies to ANY deployment type + deployment_desc.return_value = DeploymentDescription( + config=PeerClusterConfig(cluster_name="", init_hold=False, roles=[]), + start=StartMode.WITH_GENERATED_ROLES, + pending_directives=[], + typ=deployment_type, + app=App(model_uuid=self.charm.model.uuid, name=self.charm.app.name), + state=DeploymentState(value=State.ACTIVE), + ) + + self.charm._restart_opensearch_event = MagicMock() + self.harness.model.unit.status = MaintenanceStatus() + original_status = self.harness.model.unit.status + + with self.harness.hooks_disabled(): + self.harness.set_leader(is_leader=True) + self.harness.update_relation_data( + self.rel_id, "opensearch", {"security_index_initialised": "True"} + ) + + # We passed the 1st stage of the certificate renewalV + self.harness.update_relation_data( + self.rel_id, "opensearch/0", {"tls_ca_renewing": "True", "tls_ca_renewed": "True"} + ) + + self.charm.tls._on_certificate_available(event_mock) + + # NOTE: Currently store_new_tls_resources() is invoked twice for 'app-admin' cert! + assert run_cmd.call_count == 4 + + # Exporting new certs + assert re.search( + "openssl pkcs12 -export .* -out " + "/var/snap/opensearch/current/etc/opensearch/certificates/app-admin.p12 .* -name app-admin", + run_cmd.call_args_list[0].args[0], + ) + assert ( + "chmod +r /var/snap/opensearch/current/etc/opensearch/certificates/app-admin.p12" + in run_cmd.call_args_list[1].args[0] + ) + assert ( + self.harness.get_relation_data(self.rel_id, "opensearch/0")["tls_ca_renewed"] == "True" + ) + # Note that the old flag is left intact + assert ( + self.harness.get_relation_data(self.rel_id, "opensearch/0")["tls_ca_renewing"] + == "True" + ) + + assert self.secret_store.get_object(Scope.APP, CertType.APP_ADMIN.val) == { + "csr": csr, + "cert": cert, + "chain": chain[0], + "truststore-password": "truststore_12345", + "keystore-password": "keystore_12345", + "key": key, + "ca-cert": ca, + } + + assert self.harness.model.unit.status.message == "" + assert self.harness.model.unit.status, MaintenanceStatus != original_status + + # Mocks to investigate/compare/alter + # NOTE: Syntax: parametrized has to be the outermost decorator + @parameterized.expand( + list( + itertools.product( + [ + (DeploymentType.MAIN_ORCHESTRATOR), + (DeploymentType.OTHER), + (DeploymentType.FAILOVER_ORCHESTRATOR), + ], + [True, False], + [CertType.UNIT_HTTP.val, CertType.UNIT_TRANSPORT.val], + ) + ) + ) + @patch("charms.opensearch.v0.opensearch_tls.OpenSearchTLS.reload_tls_certificates") + @patch("charms.opensearch.v0.opensearch_tls.run_cmd") + @patch(f"{PEER_CLUSTERS_MANAGER}.deployment_desc") + # Mocks to avoid I/O + @patch("charms.opensearch.v0.opensearch_tls.OpenSearchTLS._read_stored_ca") + @patch("charms.opensearch.v0.opensearch_tls.exists", return_value=True) + @patch("opensearch.OpenSearchSnap.write_file") + @patch("builtins.open", side_effect=unittest.mock.mock_open()) + @patch("socket.socket.connect") + @responses.activate + def test_on_certificate_available_ca_rotation_third_stage_any_unit_cert_unit( + self, + # NOTE: Syntax: parametrized parameter comes first + deployment_type, + leader, + cert_type, + _, + __, + ___, + _____, + _read_stored_ca, + deployment_desc, + run_cmd, + reload_tls_certificates, + ): + """Test CA rotation 3rd stage -- *unit* certificate. + + At this point, the new CA has been already saved to the keystore. + The charm receives a new unit certificate in the 'certificate-available' event. + The unit has to + 1. save the new certificate + 2. if it was the last one to be updated: remove CA renewal flags + 3. if it was the last one updated: remove CA from keystore + + Applies to: + - all deployments + - all units + """ + cert = "new_cert" + chain = ["new_chain"] + ca = "new_ca" + key = "key" + keystore_password = "keystore_12345" + + self.secret_store.put_object( + Scope.APP, + CertType.APP_ADMIN.val, + { + "csr": "new_csr", + "keystore-password": keystore_password, + "truststore-password": "truststore_12345", + "ca-cert": ca, + "cert": "cert", + "key": "new_key", + "subject": "new_subject", + "chain": chain, + }, + ) + + self.secret_store.put_object( + Scope.UNIT, + CertType.UNIT_TRANSPORT, + { + "csr": f"{CertType.UNIT_TRANSPORT.val}-csr-new", + "truststore-password": "truststore_12345", + "keystore-password": keystore_password, + "key": key, + "ca-cert": ca, + "cert": "old_cert", + }, + ) + + self.secret_store.put_object( + Scope.UNIT, + CertType.UNIT_HTTP, + { + "csr": f"{CertType.UNIT_HTTP.val}-csr-new", + "truststore-password": "truststore_12345", + "keystore-password": keystore_password, + "key": key, + "ca-cert": ca, + "cert": "old_cert", + }, + ) + + # The event is addressing the transaction identified by the new csr + # for the corresponding cert type defined by the test parameter + event_mock = MagicMock( + certificate_signing_request=f"{cert_type}-csr-new", + chain=chain, + certificate=cert, + ca=ca, + ) + + # The new CA cert has been saved to the keystore earlier + _read_stored_ca.return_value = ca + + # Applies to ANY deployment type + deployment_desc.return_value = DeploymentDescription( + config=PeerClusterConfig(cluster_name="", init_hold=False, roles=[]), + start=StartMode.WITH_GENERATED_ROLES, + pending_directives=[], + typ=deployment_type, + app=App(model_uuid=self.charm.model.uuid, name=self.charm.app.name), + state=DeploymentState(value=State.ACTIVE), + ) + + self.charm._restart_opensearch_event = MagicMock() + self.harness.model.unit.status = MaintenanceStatus() + + with self.harness.hooks_disabled(): + self.harness.set_leader(is_leader=leader) + self.harness.update_relation_data( + self.rel_id, + "opensearch", + {"security_index_initialised": "True", "admin_user_initialized": "True"}, + ) + + # We passed the 1st stage of the certificate renewalV + self.harness.update_relation_data( + self.rel_id, "opensearch/0", {"tls_ca_renewing": "True", "tls_ca_renewed": "True"} + ) + + reload_tls_certificates.side_effect = None + mock_response_put_transport_cert("1.1.1.1") + mock_response_put_http_cert("1.1.1.1") + original_status = self.harness.model.unit.status + + self.charm.tls._on_certificate_available(event_mock) + + # Saving new cert, cleaning up CA renewal flag, removing old CA from keystore + # Note: the high number of operations come from the fact that on each certificate received + # the 'issuer' is checked on each certificate that is saved on the disk. + if self.charm.unit.is_leader(): + assert run_cmd.call_count == 14 + else: + assert run_cmd.call_count == 16 + + assert re.search( + "openssl pkcs12 -export .* -out " + f"/var/snap/opensearch/current/etc/opensearch/certificates/{cert_type}.p12 .* -name {cert_type}", + run_cmd.call_args_list[0].args[0], + ) + assert ( + f"chmod +r /var/snap/opensearch/current/etc/opensearch/certificates/{cert_type}.p12" + in run_cmd.call_args_list[1].args[0] + ) + assert re.search("keytool .*-delete .*-alias old-ca", run_cmd.call_args_list[-1].args[0]) + + assert "tls_ca_renewing" not in self.harness.get_relation_data(self.rel_id, "opensearch/0") + assert "tls_ca_renewed" not in self.harness.get_relation_data(self.rel_id, "opensearch/0") + + assert self.harness.model.unit.status.message == "" + assert self.harness.model.unit.status, MaintenanceStatus != original_status + + # Additional potential phases of the workflow + + # Mock to investigate/compare/alter + @parameterized.expand( + list( + itertools.product( + [ + (DeploymentType.MAIN_ORCHESTRATOR), + (DeploymentType.OTHER), + (DeploymentType.FAILOVER_ORCHESTRATOR), + ], + [True, False], + ) + ) + ) + @patch("charms.opensearch.v0.opensearch_tls.run_cmd") + @patch(f"{PEER_CLUSTERS_MANAGER}.deployment_desc") + # Mock to avoid I/O + @patch("charms.opensearch.v0.opensearch_tls.OpenSearchTLS._read_stored_ca") + @patch("builtins.open", side_effect=unittest.mock.mock_open()) + def test_on_certificate_available_rotation_ongoing_on_this_unit( + # NOTE: Syntax: parametrized parameter comes first + self, + deployment_type, + leader, + _, + _read_stored_ca, + deployment_desc, + run_cmd, + ): + """Additional 'certificate-available' event while processing CA rotation. + + In case any stage of a CA cert rotation is being processed, + further 'certificate-available' events are deferred. + + Applies to: + - any deployment + - any unit + """ + csr = "old_csr" + cert = "new_cert" + chain = ["new_chain"] + ca = "new_ca" + + self.secret_store.put_object( + Scope.APP, + CertType.APP_ADMIN.val, + { + "csr": csr, + "keystore-password": "keystore_12345", + "truststore-password": "truststore_12345", + "ca-cert": "old_ca_cert", + "cert": "old_cert", + }, + ) + + _read_stored_ca.return_value = "stored_ca" + + # Applies to ANY deployment type + deployment_desc.return_value = DeploymentDescription( + config=PeerClusterConfig(cluster_name="", init_hold=False, roles=[]), + start=StartMode.WITH_GENERATED_ROLES, + pending_directives=[], + typ=deployment_type, + app=App(model_uuid=self.charm.model.uuid, name=self.charm.app.name), + state=DeploymentState(value=State.ACTIVE), + ) + + event_mock = MagicMock( + certificate_signing_request=csr, chain=chain, certificate=cert, ca=ca + ) + self.charm.on.certificate_available = MagicMock() + + self.harness.set_leader(is_leader=leader) + original_status = self.harness.model.unit.status + + # This unit is within the process of certificate renewal + with self.harness.hooks_disabled(): + self.harness.update_relation_data( + self.rel_id, f"{self.charm.unit.name}", {"tls_ca_renewing": "True"} + ) + + self.charm.tls._on_certificate_available(event_mock) + + # No action taken, no change on status or certificates + assert run_cmd.call_count == 0 + assert self.harness.model.unit.status == original_status + self.charm.on.certificate_available.defer.called_once() + assert self.secret_store.get_object(Scope.APP, CertType.APP_ADMIN.val) == { + "csr": csr, + "keystore-password": "keystore_12345", + "truststore-password": "truststore_12345", + "ca-cert": "old_ca_cert", + "cert": "old_cert", + } + + # Mock to investigate/compare/alter + @parameterized.expand( + list( + itertools.product( + [ + (DeploymentType.MAIN_ORCHESTRATOR), + (DeploymentType.OTHER), + (DeploymentType.FAILOVER_ORCHESTRATOR), + ], + [True, False], + ) + ) + ) + @patch("charms.opensearch.v0.opensearch_tls.run_cmd") + @patch(f"{PEER_CLUSTERS_MANAGER}.deployment_desc") + # Mock to avoid I/O + @patch("charms.opensearch.v0.opensearch_tls.OpenSearchTLS._read_stored_ca") + @patch("builtins.open", side_effect=unittest.mock.mock_open()) + def test_on_certificate_available_rotation_ongoing_on_another_unit( + # NOTE: Syntax: parametrized parameter comes first + self, + deployment_type, + leader, + _, + _read_stored_ca, + deployment_desc, + run_cmd, + ): + """Additional 'certificate-available' event while processing CA rotation. + + In case any stage of a CA cert rotation is being processed, + further 'certificate-available' events are deferred. + + Applies to: + - any deployment + - any unit + """ + csr = "old_csr" + cert = "new_cert" + chain = ["new_chain"] + ca = "new_ca" + + self.secret_store.put_object( + Scope.APP, + CertType.APP_ADMIN.val, + { + "csr": csr, + "keystore-password": "keystore_12345", + "truststore-password": "truststore_12345", + "ca-cert": "old_ca_cert", + "cert": "old_cert", + }, + ) + + _read_stored_ca.return_value = "stored_ca" + + # Applies to ANY deployment type + deployment_desc.return_value = DeploymentDescription( + config=PeerClusterConfig(cluster_name="", init_hold=False, roles=[]), + start=StartMode.WITH_GENERATED_ROLES, + pending_directives=[], + typ=deployment_type, + app=App(model_uuid=self.charm.model.uuid, name=self.charm.app.name), + state=DeploymentState(value=State.ACTIVE), + ) + + event_mock = MagicMock( + certificate_signing_request=csr, chain=chain, certificate=cert, ca=ca + ) + self.charm.on.certificate_available = MagicMock() + + self.harness.set_leader(is_leader=leader) + original_status = self.harness.model.unit.status + + # This unit has updated CA certificate + # but another unit of the cluster is still within the process + self.harness.add_relation_unit(self.rel_id, f"{self.charm.app.name}/1") + with self.harness.hooks_disabled(): + self.harness.update_relation_data( + self.rel_id, f"{self.charm.app.name}/0", {"tls_ca_renewed": "True"} + ) + self.harness.update_relation_data( + self.rel_id, f"{self.charm.app.name}/1", {"tls_ca_renewing": "True"} + ) + + self.charm.tls._on_certificate_available(event_mock) + + # No action taken, no change on status or certificates + assert run_cmd.call_count == 0 + assert self.harness.model.unit.status == original_status + self.charm.on.certificate_available.defer.called_once() + assert self.secret_store.get_object(Scope.APP, CertType.APP_ADMIN.val) == { + "csr": csr, + "keystore-password": "keystore_12345", + "truststore-password": "truststore_12345", + "ca-cert": "old_ca_cert", + "cert": "old_cert", + } diff --git a/tests/unit/resources/config/opensearch.yml b/tests/unit/resources/config/opensearch.yml index d3c49b9d2..05663e2c5 100644 --- a/tests/unit/resources/config/opensearch.yml +++ b/tests/unit/resources/config/opensearch.yml @@ -86,12 +86,11 @@ # Require explicit names when deleting indices: # #action.destructive_requires_name: true -# -node.attr.app_id: 617e5f02-5be5-4e25-85f0-276b2347a5ad/opensearch +node.attr.app_id: opensearch node.attr.temp: hot -node.roles: +node.roles: - data - ingest - ml +- coordinating_only - cluster_manager - diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 028b1749b..4e31f8e57 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -59,5 +59,5 @@ def test_store_tls_resources(self, grp_getgrnam, pwd_getpwnam, os_chown): }, ) - self.assertTrue(exists(f"{tmp_dir}/chain.pem")) + self.assertFalse(exists(f"{tmp_dir}/chain.pem")) self.assertTrue(exists(f"{tmp_dir}/unit-transport.p12"))