From 06645a8358652ab7ac5882652432893f98c530dd Mon Sep 17 00:00:00 2001
From: Thore Sommer
Date: Mon, 30 Aug 2021 12:23:14 +0200
Subject: [PATCH] verifier: move to new failure architecture

We only send notifications if an event with a higher severity level than
the currently stored one was generated, and we save the severity level
(as an integer) and the id of the first event that generated a failure
at that level in the database.

The current design assumes that no failures occur on the first
validation after a device is added; that first successful validation
marks the device as in use so that it receives revocation notifications.

If an irrecoverable failure is generated, we stop polling the agent.

Part of enhancement proposal keylime/enhancements#48

Signed-off-by: Thore Sommer
---
 keylime/cloud_verifier_common.py              | 80 ++++++++++++-------
 keylime/cloud_verifier_tornado.py             | 78 +++++++++++-------
 keylime/db/verifier_db.py                     |  2 +
 ...9_add_fields_for_revocation_context_to_.py | 45 +++++++++++
 4 files changed, 149 insertions(+), 56 deletions(-)
 create mode 100644 keylime/migrations/versions/257fe0f0c039_add_fields_for_revocation_context_to_.py
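
[Note: commentary, not part of the patch. The sketch below illustrates
the semantics of the keylime.failure API as they can be inferred from
the hunks in this patch; the event ids and contexts shown are made-up
examples.]

    from keylime.failure import Failure, Component

    failure = Failure(Component.QUOTE_VALIDATION)
    assert not failure  # an empty Failure is falsy, i.e. validation succeeded

    # add_event() records an event id, a context (string or dict) and a
    # flag saying whether the failure is recoverable (example values).
    failure.add_event("example_recoverable", {"message": "soft mismatch"}, True)
    failure.add_event("example_fatal", "hard validation error", False)

    # merge() folds in the Failure returned by a sub-check such as check_quote().
    failure.merge(Failure(Component.INTERNAL, ["verifier"]))

    if failure:  # at least one event was recorded
        worst = failure.highest_severity_event
        print(worst.event_id, failure.highest_severity.severity)
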
diff --git a/keylime/cloud_verifier_common.py b/keylime/cloud_verifier_common.py
index 0fd6f4acf..e2fa1171b 100644
--- a/keylime/cloud_verifier_common.py
+++ b/keylime/cloud_verifier_common.py
@@ -19,6 +19,7 @@
 from keylime import ca_util
 from keylime import revocation_notifier
 from keylime.agentstates import AgentAttestStates
+from keylime.failure import Failure, Component
 from keylime.tpm.tpm_main import tpm
 from keylime.tpm.tpm_abstract import TPM_Utilities
 from keylime.common import algorithms
@@ -133,11 +134,12 @@ def init_mtls(section='cloud_verifier', generatedir='cv_ca'):
     return context
 
 
-def process_quote_response(agent, json_response, agentAttestState):
+def process_quote_response(agent, json_response, agentAttestState) -> Failure:
     """Validates the response from the Cloud agent.
     This method invokes an Registrar Server call to register, and then check the quote.
     """
+    failure = Failure(Component.QUOTE_VALIDATION)
     received_public_key = None
     quote = None
 
     # in case of failure in response content do not continue
@@ -157,9 +159,11 @@ def process_quote_response(agent, json_response, agentAttestState):
         logger.debug("received ima_measurement_list_entry: %d", ima_measurement_list_entry)
         logger.debug("received boottime: %s", boottime)
         logger.debug("received boot log %s", (mb_measurement_list is not None))
-    except Exception:
-        return None
+    except Exception as e:
+        failure.add_event("invalid_data", {"message": "parsing agent's get quote response failed", "data": e}, False)
+        return failure
 
+    # TODO: Are those separate failures?
     if not isinstance(ima_measurement_list_entry, int):
         raise Exception("ima_measurement_list_entry parameter must be an integer")
 
@@ -170,7 +174,8 @@ def process_quote_response(agent, json_response, agentAttestState):
     if received_public_key is None:
         if agent.get('public_key', "") == "" or agent.get('b64_encrypted_V', "") == "":
             logger.error("agent did not provide public key and no key or encrypted_v was cached at CV")
-            return False
+            failure.add_event("no_pubkey", "agent did not provide public key and no key or encrypted_v was cached at CV", False)
+            return failure
         agent['provide_V'] = False
         received_public_key = agent['public_key']
 
@@ -180,7 +185,8 @@ def process_quote_response(agent, json_response, agentAttestState):
             "cloud_verifier", "registrar_port"), agent['agent_id'])
         if registrar_data is None:
             logger.warning("AIK not found in registrar, quote not validated")
-            return False
+            failure.add_event("no_aik", "AIK not found in registrar, quote not validated", False)
+            return failure
         agent['registrar_data'] = registrar_data
 
     hash_alg = json_response.get('hash_alg')
@@ -194,38 +200,50 @@ def process_quote_response(agent, json_response, agentAttestState):
 
     # Ensure hash_alg is in accept_tpm_hash_alg list
     if not algorithms.is_accepted(hash_alg, agent['accept_tpm_hash_algs']):
-        raise Exception(
-            "TPM Quote is using an unaccepted hash algorithm: %s" % hash_alg)
+        logger.error(f"TPM Quote is using an unaccepted hash algorithm: {hash_alg}")
+        failure.add_event("invalid_hash_alg",
+                          {"message": f"TPM Quote is using an unaccepted hash algorithm: {hash_alg}", "data": hash_alg},
+                          False)
+        return failure
 
     # Ensure enc_alg is in accept_tpm_encryption_algs list
     if not algorithms.is_accepted(enc_alg, agent['accept_tpm_encryption_algs']):
-        raise Exception(
-            "TPM Quote is using an unaccepted encryption algorithm: %s" % enc_alg)
+        logger.error(f"TPM Quote is using an unaccepted encryption algorithm: {enc_alg}")
+        failure.add_event("invalid_enc_alg",
+                          {"message": f"TPM Quote is using an unaccepted encryption algorithm: {enc_alg}", "data": enc_alg},
+                          False)
+        return failure
 
     # Ensure sign_alg is in accept_tpm_encryption_algs list
     if not algorithms.is_accepted(sign_alg, agent['accept_tpm_signing_algs']):
-        raise Exception(
-            "TPM Quote is using an unaccepted signing algorithm: %s" % sign_alg)
+        logger.error(f"TPM Quote is using an unaccepted signing algorithm: {sign_alg}")
+        failure.add_event("invalid_sign_alg",
+                          {"message": f"TPM Quote is using an unaccepted signing algorithm: {sign_alg}", "data": sign_alg},
+                          False)
+        return failure
 
     if ima_measurement_list_entry == 0:
         agentAttestState.reset_ima_attestation()
     elif ima_measurement_list_entry != agentAttestState.get_next_ima_ml_entry():
         # If we requested a particular entry number then the agent must return either
         # starting at 0 (handled above) or with the requested number.
-        raise Exception(
-            "Agent did not respond with requested next IMA measurement list entry %d but started at %d" %
-            (agentAttestState.get_next_ima_ml_entry(), ima_measurement_list_entry))
+        logger.error("Agent did not respond with requested next IMA measurement list entry "
+                     f"{agentAttestState.get_next_ima_ml_entry()} but started at {ima_measurement_list_entry}")
+        failure.add_event("invalid_ima_entry_nb",
+                          {"message": "Agent did not respond with requested next IMA measurement list entry",
+                           "got": ima_measurement_list_entry, "expected": agentAttestState.get_next_ima_ml_entry()},
+                          False)
     elif not agentAttestState.is_expected_boottime(boottime):
         # agent sent a list not starting at 0 and provided a boottime that doesn't
         # match the expected boottime, so it must have been rebooted; we would fail
         # attestation this time so we retry with a full attestation next time.
         agentAttestState.reset_ima_attestation()
-        return True
+        return failure
 
     agentAttestState.set_boottime(boottime)
 
     ima_keyring = ima_file_signatures.ImaKeyring.from_string(agent['ima_sign_verification_keys'])
-    validQuote = get_tpm_instance().check_quote(
+    quote_validation_failure = get_tpm_instance().check_quote(
         agentAttestState,
         agent['nonce'],
         received_public_key,
@@ -238,21 +256,21 @@ def process_quote_response(agent, json_response, agentAttestState):
         ima_keyring,
         mb_measurement_list,
         agent['mb_refstate'])
-    if not validQuote:
-        return False
+    failure.merge(quote_validation_failure)
 
-    # set a flag so that we know that the agent was verified once.
-    # we only issue notifications for agents that were at some point good
-    agent['first_verified'] = True
+    if not failure:
+        # set a flag so that we know that the agent was verified once.
+        # we only issue notifications for agents that were at some point good
+        agent['first_verified'] = True
 
-    # has public key changed? if so, clear out b64_encrypted_V, it is no longer valid
-    if received_public_key != agent.get('public_key', ""):
-        agent['public_key'] = received_public_key
-        agent['b64_encrypted_V'] = ""
-        agent['provide_V'] = True
+        # has public key changed? if so, clear out b64_encrypted_V, it is no longer valid
+        if received_public_key != agent.get('public_key', ""):
+            agent['public_key'] = received_public_key
+            agent['b64_encrypted_V'] = ""
+            agent['provide_V'] = True
 
     # ok we're done
-    return validQuote
+    return failure
 
 
 def prepare_v(agent):
@@ -333,6 +351,8 @@ def process_get_status(agent):
                 'verifier_id' : agent.verifier_id,
                 'verifier_ip' : agent.verifier_ip,
                 'verifier_port' : agent.verifier_port,
+                'severity_level': agent.severity_level,
+                'last_event_id': agent.last_event_id
                 }
     return response
 
@@ -340,7 +360,7 @@ def process_get_status(agent):
 
 
 # sign a message with revocation key. telling of verification problem
-def notify_error(agent, msgtype='revocation'):
+def notify_error(agent, msgtype='revocation', event=None):
     if not config.getboolean('cloud_verifier', 'revocation_notifier'):
         return
 
@@ -353,6 +373,10 @@ def notify_error(agent, msgtype='revocation'):
                   'vtpm_policy': agent['vtpm_policy'],
                   'meta_data': agent['meta_data'],
                   'event_time': time.asctime()}
+    if event:
+        revocation['event_id'] = event.event_id
+        revocation['severity_label'] = event.severity_label.name
+        revocation['context'] = event.context
 
     tosend = {'msg': json.dumps(revocation).encode('utf-8')}
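
[Note: commentary, not part of the patch. With the new event parameter,
notify_error() enriches the revocation message that is published through
the revocation notifier. A subscriber would receive a payload shaped
roughly like the dict below; only the last three keys are new, and all
values here are hypothetical.]

    revocation = {
        # ... pre-existing fields such as 'tpm_policy', 'vtpm_policy',
        # 'meta_data' (see the hunk above) ...
        'event_time': 'Mon Aug 30 12:23:14 2021',
        # added by this patch when an event is passed:
        'event_id': 'invalid_hash_alg',   # id of the highest-severity event
        'severity_label': 'emergency',    # name of its severity label
        'context': '{"message": "TPM Quote is using an unaccepted hash algorithm: sha1"}',
    }
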
diff --git a/keylime/cloud_verifier_tornado.py b/keylime/cloud_verifier_tornado.py
index f13e0fc45..1b4f6e798 100644
--- a/keylime/cloud_verifier_tornado.py
+++ b/keylime/cloud_verifier_tornado.py
@@ -27,6 +27,7 @@
 from keylime import tornado_requests
 from keylime import api_version as keylime_api_version
 from keylime.ima_ast import START_HASH
+from keylime.failure import MAX_SEVERITY_LABEL, Failure, Component
 
 logger = keylime_logging.init_logging('cloudverifier')
 
@@ -702,6 +703,7 @@ def data_received(self, chunk):
 
 
 async def invoke_get_quote(agent, need_pubkey):
+    failure = Failure(Component.INTERNAL, ["verifier"])
     if agent is None:
         raise Exception("agent deleted while being processed")
     params = cloud_verifier_common.prepare_get_quote(agent)
@@ -724,7 +726,8 @@ async def invoke_get_quote(agent, need_pubkey):
         else:
             # catastrophic error, do not continue
             logger.critical("Unexpected Get Quote response error for cloud agent %s, Error: %s", agent['agent_id'], response.status_code)
-            asyncio.ensure_future(process_agent(agent, states.FAILED))
+            failure.add_event("no_quote", "Unexpected Get Quote response from agent", False)
+            asyncio.ensure_future(process_agent(agent, states.FAILED, failure))
     else:
         try:
             json_response = json.loads(response.body)
@@ -733,13 +736,14 @@ async def invoke_get_quote(agent, need_pubkey):
             if 'provide_V' not in agent :
                 agent['provide_V'] = True
             agentAttestState = get_AgentAttestStates().get_by_agent_id(agent['agent_id'])
-            if cloud_verifier_common.process_quote_response(agent, json_response['results'], agentAttestState):
+            failure = cloud_verifier_common.process_quote_response(agent, json_response['results'], agentAttestState)
+            if not failure:
                 if agent['provide_V']:
                     asyncio.ensure_future(process_agent(agent, states.PROVIDE_V))
                 else:
                     asyncio.ensure_future(process_agent(agent, states.GET_QUOTE))
             else:
-                asyncio.ensure_future(process_agent(agent, states.INVALID_QUOTE))
+                asyncio.ensure_future(process_agent(agent, states.INVALID_QUOTE, failure))
 
             # store the attestation state
             store_attestation_state(agentAttestState)
@@ -749,6 +753,7 @@ async def invoke_get_quote(agent, need_pubkey):
 
 
 async def invoke_provide_v(agent):
+    failure = Failure(Component.INTERNAL, ["verifier"])
     if agent is None:
         raise Exception("Agent deleted while being processed")
     try:
@@ -769,18 +774,19 @@ async def invoke_provide_v(agent):
         else:
             # catastrophic error, do not continue
             logger.critical("Unexpected Provide V response error for cloud agent %s, Error: %s", agent['agent_id'], response.status_code)
-            asyncio.ensure_future(process_agent(agent, states.FAILED))
+            failure.add_event("no_v", {"message": "Unexpected provide V response", "data": response.error}, False)
+            asyncio.ensure_future(process_agent(agent, states.FAILED, failure))
     else:
         asyncio.ensure_future(process_agent(agent, states.GET_QUOTE))
 
 
-async def process_agent(agent, new_operational_state):
+async def process_agent(agent, new_operational_state,
+                        failure=Failure(Component.INTERNAL, ["verifier"])):
     # Convert to dict if the agent arg is a db object
     if not isinstance(agent, dict):
         agent = _from_db_obj(agent)
 
     session = get_session()
-    try:
+    try:  # pylint: disable=R1702
         main_agent_operational_state = agent['operational_state']
         try:
             stored_agent = session.query(VerfierMain).filter_by(
@@ -808,24 +814,28 @@
         # If failed during processing, log regardless and drop it on the floor
         # The administration application (tenant) can GET the status and act accordingly (delete/retry/etc).
         if new_operational_state in (states.FAILED, states.INVALID_QUOTE):
-            agent['operational_state'] = new_operational_state
-
-            # issue notification for invalid quotes
-            if new_operational_state == states.INVALID_QUOTE:
-                cloud_verifier_common.notify_error(agent)
-
-            if agent['pending_event'] is not None:
-                tornado.ioloop.IOLoop.current().remove_timeout(
-                    agent['pending_event'])
-            for key in exclude_db:
-                if key in agent:
-                    del agent[key]
-            session.query(VerfierMain).filter_by(
-                agent_id=agent['agent_id']).update(agent)
-            session.commit()
-
-            logger.warning("Agent %s failed, stopping polling", agent['agent_id'])
-            return
+            assert failure, "States FAILED and INVALID_QUOTE should only be reached with a failure message"
+
+            if agent.get('severity_level') is None or agent['severity_level'] < failure.highest_severity.severity:
+                agent['severity_level'] = failure.highest_severity.severity
+                agent['last_event_id'] = failure.highest_severity_event.event_id
+                agent['operational_state'] = new_operational_state
+
+                # issue notification for invalid quotes
+                if new_operational_state == states.INVALID_QUOTE:
+                    cloud_verifier_common.notify_error(agent, event=failure.highest_severity_event)
+
+                # When the failure is irrecoverable we stop polling the agent
+                if not failure.recoverable or failure.highest_severity == MAX_SEVERITY_LABEL:
+                    if agent['pending_event'] is not None:
+                        tornado.ioloop.IOLoop.current().remove_timeout(
+                            agent['pending_event'])
+                    for key in exclude_db:
+                        if key in agent:
+                            del agent[key]
+                    session.query(VerfierMain).filter_by(
+                        agent_id=agent['agent_id']).update(agent)
+                    session.commit()
 
         # propagate all state, but remove none DB keys first (using exclude_db)
         try:
@@ -840,6 +850,16 @@
         except SQLAlchemyError as e:
             logger.error('SQLAlchemy Error: %s', e)
 
+        # If the agent was in a failed state, check whether we stop polling
+        # or add it again to the event loop
+        if new_operational_state in [states.FAILED, states.INVALID_QUOTE]:
+            if not failure.recoverable or failure.highest_severity == MAX_SEVERITY_LABEL:
+                logger.warning("Agent %s failed, stopping polling", agent['agent_id'])
+                return
+
+            await invoke_get_quote(agent, False)
+            return
+
         # if new, get a quote
         if (main_agent_operational_state == states.START and
                 new_operational_state == states.GET_QUOTE):
@@ -876,12 +896,13 @@
                 new_operational_state == states.GET_QUOTE_RETRY):
             if agent['num_retries'] >= maxr:
                 logger.warning("Agent %s was not reachable for quote in %d tries, setting state to FAILED", agent['agent_id'], maxr)
+                failure.add_event("not_reachable", "agent was not reachable from verifier", False)
                 if agent['first_verified']:  # only notify on previously good agents
                     cloud_verifier_common.notify_error(
-                        agent, msgtype='comm_error')
+                        agent, msgtype='comm_error', event=failure.highest_severity_event)
                 else:
                     logger.debug("Communication error for new agent. No notification will be sent")
-                await process_agent(agent, states.FAILED)
+                await process_agent(agent, states.FAILED, failure)
             else:
                 agent['operational_state'] = states.GET_QUOTE
                 cb = functools.partial(invoke_get_quote, agent, True)
@@ -894,9 +915,10 @@
                 new_operational_state == states.PROVIDE_V_RETRY):
             if agent['num_retries'] >= maxr:
                 logger.warning("Agent %s was not reachable to provide v in %d tries, setting state to FAILED", agent['agent_id'], maxr)
+                failure.add_event("not_reachable_v", "agent was not reachable to provide V", False)
                 cloud_verifier_common.notify_error(
-                    agent, msgtype='comm_error')
-                await process_agent(agent, states.FAILED)
+                    agent, msgtype='comm_error', event=failure.highest_severity_event)
+                await process_agent(agent, states.FAILED, failure)
             else:
                 agent['operational_state'] = states.PROVIDE_V
                 cb = functools.partial(invoke_provide_v, agent)
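
[Note: commentary, not part of the patch. All retry paths above hand
their Failure to process_agent(), so the decision whether to keep
polling an agent is made in a single place. Condensed:]

    from keylime.failure import MAX_SEVERITY_LABEL

    def should_stop_polling(failure):
        # Mirrors the condition used twice in process_agent() above:
        # polling stops for irrecoverable failures or at maximum severity.
        return not failure.recoverable or failure.highest_severity == MAX_SEVERITY_LABEL
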
new agent. No notification will be sent") - await process_agent(agent, states.FAILED) + await process_agent(agent, states.FAILED, failure) else: agent['operational_state'] = states.GET_QUOTE cb = functools.partial(invoke_get_quote, agent, True) @@ -894,9 +915,10 @@ async def process_agent(agent, new_operational_state): new_operational_state == states.PROVIDE_V_RETRY): if agent['num_retries'] >= maxr: logger.warning("Agent %s was not reachable to provide v in %d tries, setting state to FAILED", agent['agent_id'], maxr) + failure.add_event("not_reachable_v", "agent was not reachable to provide V", False) cloud_verifier_common.notify_error( - agent, msgtype='comm_error') - await process_agent(agent, states.FAILED) + agent, msgtype='comm_error', event=failure.highest_severity_event) + await process_agent(agent, states.FAILED, failure) else: agent['operational_state'] = states.PROVIDE_V cb = functools.partial(invoke_provide_v, agent) diff --git a/keylime/db/verifier_db.py b/keylime/db/verifier_db.py index e01f13257..f5ddd6c86 100644 --- a/keylime/db/verifier_db.py +++ b/keylime/db/verifier_db.py @@ -46,6 +46,8 @@ class VerfierMain(Base): ima_pcrs = Column(JSONPickleType(pickler=json)) pcr10 = Column(LargeBinary) next_ima_ml_entry = Column(Integer) + severity_level = Column(Integer, nullable=True) + last_event_id = Column(String(200), nullable=True) class VerifierAllowlist(Base): diff --git a/keylime/migrations/versions/257fe0f0c039_add_fields_for_revocation_context_to_.py b/keylime/migrations/versions/257fe0f0c039_add_fields_for_revocation_context_to_.py new file mode 100644 index 000000000..c5cb3e466 --- /dev/null +++ b/keylime/migrations/versions/257fe0f0c039_add_fields_for_revocation_context_to_.py @@ -0,0 +1,45 @@ +"""Add fields for revocation context to verifier + +Revision ID: 257fe0f0c039 +Revises: f35cdd35eb83 +Create Date: 2021-08-20 12:42:30.427138 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = '257fe0f0c039' +down_revision = 'f35cdd35eb83' +branch_labels = None +depends_on = None + + +def upgrade(engine_name): + globals()["upgrade_%s" % engine_name]() + + +def downgrade(engine_name): + globals()["downgrade_%s" % engine_name]() + + + + + +def upgrade_registrar(): + pass + + +def downgrade_registrar(): + pass + + +def upgrade_cloud_verifier(): + op.add_column('verifiermain', sa.Column('severity_level', sa.Integer)) + op.add_column('verifiermain', sa.Column('last_event_id', sa.String)) + + +def downgrade_cloud_verifier(): + op.drop_column('verifiermain', 'severity_level') + op.drop_column('verifiermain', sa.Column('last_event_id', sa.Integer))