verifier: move to new failure architecture
We only send notifications if an event with a higher severity level was
generated, and we save the severity level (as an integer) and the id of the
first event that generated the failure at that level in the database.

The current design assumes that no failures occur on the first validation
after a device is added; that first validation marks the device as in use so
that it receives revocation notifications.

If an irrecoverable failure is generated, we stop polling the agent.
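
The sketch below illustrates the failure-accumulation pattern this change
builds on. The class and attribute names follow keylime/failure.py as used in
the diff, but the bodies are simplified assumptions for illustration, not the
module's actual implementation:

# Illustrative sketch only: the real classes live in keylime/failure.py;
# the severity bookkeeping and recoverability semantics here are assumptions.
from typing import List, Optional

class SeverityLabel:
    def __init__(self, name: str, severity: int):
        self.name = name
        self.severity = severity  # persisted as an integer in the verifier DB

class Event:
    def __init__(self, event_id: str, context, recoverable: bool,
                 severity_label: SeverityLabel):
        self.event_id = event_id
        self.context = context  # a plain string or {"message": ..., "data": ...}
        self.recoverable = recoverable
        self.severity_label = severity_label

class Failure:
    def __init__(self, component, sub_components=None):
        self.component = component
        self.sub_components = sub_components or []
        self.events: List[Event] = []
        self.recoverable = True
        self.highest_severity: Optional[SeverityLabel] = None
        self.highest_severity_event: Optional[Event] = None

    def add_event(self, event_id, context, recoverable, severity=1):
        event = Event(event_id, context, recoverable,
                      SeverityLabel("generic", severity))
        self.events.append(event)
        if not recoverable:
            self.recoverable = False
        # keep the first event that reached the highest severity seen so far
        if self.highest_severity is None or severity > self.highest_severity.severity:
            self.highest_severity = event.severity_label
            self.highest_severity_event = event

    def merge(self, other: "Failure"):
        # fold in sub-failures (e.g. from check_quote) instead of short-circuiting
        for event in other.events:
            self.add_event(event.event_id, event.context, event.recoverable,
                           event.severity_label.severity)

    def __bool__(self):
        # truthy once any event was recorded, so "if not failure:" means success
        return len(self.events) > 0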

Part of enhancement proposal keylime/enhancements#48

Signed-off-by: Thore Sommer <[email protected]>
THS-on committed Sep 13, 2021
1 parent dd212a6 commit 06645a8
Showing 4 changed files with 149 additions and 56 deletions.
80 changes: 52 additions & 28 deletions keylime/cloud_verifier_common.py
@@ -19,6 +19,7 @@
from keylime import ca_util
from keylime import revocation_notifier
from keylime.agentstates import AgentAttestStates
from keylime.failure import Failure, Component
from keylime.tpm.tpm_main import tpm
from keylime.tpm.tpm_abstract import TPM_Utilities
from keylime.common import algorithms
@@ -133,11 +134,12 @@ def init_mtls(section='cloud_verifier', generatedir='cv_ca'):
return context


def process_quote_response(agent, json_response, agentAttestState):
def process_quote_response(agent, json_response, agentAttestState) -> Failure:
"""Validates the response from the Cloud agent.
This method invokes a Registrar Server call to register, and then checks the quote.
"""
failure = Failure(Component.QUOTE_VALIDATION)
received_public_key = None
quote = None
# in case of failure in response content do not continue
@@ -157,9 +159,11 @@ def process_quote_response(agent, json_response, agentAttestState):
logger.debug("received ima_measurement_list_entry: %d", ima_measurement_list_entry)
logger.debug("received boottime: %s", boottime)
logger.debug("received boot log %s", (mb_measurement_list is not None))
except Exception:
return None
except Exception as e:
failure.add_event("invalid_data", {"message": "parsing agents get quote respone failed", "data": e}, False)
return failure

# TODO: Are those separate failures?
if not isinstance(ima_measurement_list_entry, int):
raise Exception("ima_measurement_list_entry parameter must be an integer")

@@ -170,7 +174,8 @@
if received_public_key is None:
if agent.get('public_key', "") == "" or agent.get('b64_encrypted_V', "") == "":
logger.error("agent did not provide public key and no key or encrypted_v was cached at CV")
return False
failure.add_event("no_pubkey", "agent did not provide public key and no key or encrypted_v was cached at CV", False)
return failure
agent['provide_V'] = False
received_public_key = agent['public_key']

@@ -180,7 +185,8 @@
"cloud_verifier", "registrar_port"), agent['agent_id'])
if registrar_data is None:
logger.warning("AIK not found in registrar, quote not validated")
return False
failure.add_event("no_aik", "AIK not found in registrar, quote not validated", False)
return failure
agent['registrar_data'] = registrar_data

hash_alg = json_response.get('hash_alg')
@@ -194,38 +200,50 @@

# Ensure hash_alg is in accept_tpm_hash_alg list
if not algorithms.is_accepted(hash_alg, agent['accept_tpm_hash_algs']):
raise Exception(
"TPM Quote is using an unaccepted hash algorithm: %s" % hash_alg)
logger.error(f"TPM Quote is using an unaccepted hash algorithm: {hash_alg}")
failure.add_event("invalid_hash_alg",
{"message": f"TPM Quote is using an unaccepted hash algorithm: {hash_alg}", "data": hash_alg},
False)
return failure

# Ensure enc_alg is in accept_tpm_encryption_algs list
if not algorithms.is_accepted(enc_alg, agent['accept_tpm_encryption_algs']):
raise Exception(
"TPM Quote is using an unaccepted encryption algorithm: %s" % enc_alg)
logger.error(f"TPM Quote is using an unaccepted encryption algorithm: {enc_alg}")
failure.add_event("invalid_enc_alg",
{"message": f"TPM Quote is using an unaccepted encryption algorithm: {enc_alg}", "data": enc_alg},
False)
return failure

# Ensure sign_alg is in accept_tpm_signing_algs list
if not algorithms.is_accepted(sign_alg, agent['accept_tpm_signing_algs']):
raise Exception(
"TPM Quote is using an unaccepted signing algorithm: %s" % sign_alg)
logger.error(f"TPM Quote is using an unaccepted signing algorithm: {sign_alg}")
failure.add_event("invalid_sign_alg",
{"message": f"TPM Quote is using an unaccepted signing algorithm: {sign_alg}", "data": {sign_alg}},
False)
return failure

if ima_measurement_list_entry == 0:
agentAttestState.reset_ima_attestation()
elif ima_measurement_list_entry != agentAttestState.get_next_ima_ml_entry():
# If we requested a particular entry number then the agent must return either
# starting at 0 (handled above) or with the requested number.
raise Exception(
"Agent did not respond with requested next IMA measurement list entry %d but started at %d" %
(agentAttestState.get_next_ima_ml_entry(), ima_measurement_list_entry))
logger.error("Agent did not respond with requested next IMA measurement list entry "
f"{agentAttestState.get_next_ima_ml_entry()} but started at {ima_measurement_list_entry}")
failure.add_event("invalid_ima_entry_nb",
{"message": "Agent did not respond with requested next IMA measurement list entry",
"got": ima_measurement_list_entry, "expected": agentAttestState.get_next_ima_ml_entry()},
False)
elif not agentAttestState.is_expected_boottime(boottime):
# agent sent a list not starting at 0 and provided a boottime that doesn't
# match the expected boottime, so it must have been rebooted; we would fail
# attestation this time so we retry with a full attestation next time.
agentAttestState.reset_ima_attestation()
return True
return failure

agentAttestState.set_boottime(boottime)

ima_keyring = ima_file_signatures.ImaKeyring.from_string(agent['ima_sign_verification_keys'])
validQuote = get_tpm_instance().check_quote(
quote_validation_failure = get_tpm_instance().check_quote(
agentAttestState,
agent['nonce'],
received_public_key,
@@ -238,21 +256,21 @@ def process_quote_response(agent, json_response, agentAttestState):
ima_keyring,
mb_measurement_list,
agent['mb_refstate'])
if not validQuote:
return False
failure.merge(quote_validation_failure)

# set a flag so that we know that the agent was verified once.
# we only issue notifications for agents that were at some point good
agent['first_verified'] = True
if not failure:
# set a flag so that we know that the agent was verified once.
# we only issue notifications for agents that were at some point good
agent['first_verified'] = True

# has public key changed? if so, clear out b64_encrypted_V, it is no longer valid
if received_public_key != agent.get('public_key', ""):
agent['public_key'] = received_public_key
agent['b64_encrypted_V'] = ""
agent['provide_V'] = True
# has public key changed? if so, clear out b64_encrypted_V, it is no longer valid
if received_public_key != agent.get('public_key', ""):
agent['public_key'] = received_public_key
agent['b64_encrypted_V'] = ""
agent['provide_V'] = True

# ok we're done
return validQuote
return failure


def prepare_v(agent):
@@ -333,14 +351,16 @@ def process_get_status(agent):
'verifier_id' : agent.verifier_id,
'verifier_ip' : agent.verifier_ip,
'verifier_port' : agent.verifier_port,
'severity_level': agent.severity_level,
'last_event_id': agent.last_event_id
}
return response


# sign a message with the revocation key, telling of a verification problem


def notify_error(agent, msgtype='revocation'):
def notify_error(agent, msgtype='revocation', event=None):
if not config.getboolean('cloud_verifier', 'revocation_notifier'):
return

@@ -353,6 +373,10 @@ def notify_error(agent, msgtype='revocation'):
'vtpm_policy': agent['vtpm_policy'],
'meta_data': agent['meta_data'],
'event_time': time.asctime()}
if event:
revocation['event_id'] = event.event_id
revocation['severity_label'] = event.severity_label.name
revocation['context'] = event.context

tosend = {'msg': json.dumps(revocation).encode('utf-8')}

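To make the new return contract concrete: process_quote_response() above now
returns a Failure that accumulates events instead of short-circuiting with
True/False/None. A toy walk-through using the illustrative sketch from the
commit message section (the event names mirror ones added in this file; this
is not keylime.failure itself):

# Toy walk-through of the event conventions in process_quote_response(),
# built on the illustrative Failure sketch above.
failure = Failure("quote_validation")

# context can be a plain string ...
failure.add_event("no_aik", "AIK not found in registrar, quote not validated",
                  False)

# ... or a dict that separates the human-readable message from machine data
failure.add_event("invalid_hash_alg",
                  {"message": "TPM Quote is using an unaccepted hash algorithm: sha1",
                   "data": "sha1"},
                  False, severity=2)

assert bool(failure)            # events were recorded, so the quote is not trusted
assert not failure.recoverable  # one irrecoverable event poisons the whole Failure
print(failure.highest_severity_event.event_id)  # -> invalid_hash_alg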
78 changes: 50 additions & 28 deletions keylime/cloud_verifier_tornado.py
@@ -27,6 +27,7 @@
from keylime import tornado_requests
from keylime import api_version as keylime_api_version
from keylime.ima_ast import START_HASH
from keylime.failure import MAX_SEVERITY_LABEL, Failure, Component

logger = keylime_logging.init_logging('cloudverifier')

@@ -702,6 +703,7 @@ def data_received(self, chunk):


async def invoke_get_quote(agent, need_pubkey):
failure = Failure(Component.INTERNAL, ["verifier"])
if agent is None:
raise Exception("agent deleted while being processed")
params = cloud_verifier_common.prepare_get_quote(agent)
@@ -724,7 +726,8 @@ async def invoke_get_quote(agent, need_pubkey):
else:
# catastrophic error, do not continue
logger.critical("Unexpected Get Quote response error for cloud agent %s, Error: %s", agent['agent_id'], response.status_code)
asyncio.ensure_future(process_agent(agent, states.FAILED))
failure.add_event("no_quote", "Unexpected Get Quote reponse from agent", False)
asyncio.ensure_future(process_agent(agent, states.FAILED, failure))
else:
try:
json_response = json.loads(response.body)
@@ -733,13 +736,14 @@
if 'provide_V' not in agent :
agent['provide_V'] = True
agentAttestState = get_AgentAttestStates().get_by_agent_id(agent['agent_id'])
if cloud_verifier_common.process_quote_response(agent, json_response['results'], agentAttestState):
failure = cloud_verifier_common.process_quote_response(agent, json_response['results'], agentAttestState)
if not failure:
if agent['provide_V']:
asyncio.ensure_future(process_agent(agent, states.PROVIDE_V))
else:
asyncio.ensure_future(process_agent(agent, states.GET_QUOTE))
else:
asyncio.ensure_future(process_agent(agent, states.INVALID_QUOTE))
asyncio.ensure_future(process_agent(agent, states.INVALID_QUOTE, failure))

# store the attestation state
store_attestation_state(agentAttestState)
@@ -749,6 +753,7 @@


async def invoke_provide_v(agent):
failure = Failure(Component.INTERNAL, ["verifier"])
if agent is None:
raise Exception("Agent deleted while being processed")
try:
@@ -769,18 +774,19 @@
else:
# catastrophic error, do not continue
logger.critical("Unexpected Provide V response error for cloud agent %s, Error: %s", agent['agent_id'], response.status_code)
asyncio.ensure_future(process_agent(agent, states.FAILED))
failure.add_event("no_v", {"message": "Unexpected provide V response", "data": response.error}, False)
asyncio.ensure_future(process_agent(agent, states.FAILED, failure))
else:
asyncio.ensure_future(process_agent(agent, states.GET_QUOTE))


async def process_agent(agent, new_operational_state):
async def process_agent(agent, new_operational_state, failure=None):
# create a fresh Failure per call instead of sharing a mutable default argument
failure = failure if failure is not None else Failure(Component.INTERNAL, ["verifier"])
# Convert to dict if the agent arg is a db object
if not isinstance(agent, dict):
agent = _from_db_obj(agent)

session = get_session()
try:
try: # pylint: disable=R1702
main_agent_operational_state = agent['operational_state']
try:
stored_agent = session.query(VerfierMain).filter_by(
@@ -808,24 +814,28 @@ async def process_agent(agent, new_operational_state):
# If failed during processing, log regardless and drop it on the floor
# The administration application (tenant) can GET the status and act accordingly (delete/retry/etc).
if new_operational_state in (states.FAILED, states.INVALID_QUOTE):
agent['operational_state'] = new_operational_state

# issue notification for invalid quotes
if new_operational_state == states.INVALID_QUOTE:
cloud_verifier_common.notify_error(agent)

if agent['pending_event'] is not None:
tornado.ioloop.IOLoop.current().remove_timeout(
agent['pending_event'])
for key in exclude_db:
if key in agent:
del agent[key]
session.query(VerfierMain).filter_by(
agent_id=agent['agent_id']).update(agent)
session.commit()

logger.warning("Agent %s failed, stopping polling", agent['agent_id'])
return
assert failure, "States FAILED and INVALID QUOTE should only be reached with a failure message"

if agent.get('severity_level') is None or agent['severity_level'] < failure.highest_severity.severity:
agent['severity_level'] = failure.highest_severity.severity
agent['last_event_id'] = failure.highest_severity_event.event_id
agent['operational_state'] = new_operational_state

# issue notification for invalid quotes
if new_operational_state == states.INVALID_QUOTE:
cloud_verifier_common.notify_error(agent, event=failure.highest_severity_event)

# When the failure is irrecoverable we stop polling the agent
if not failure.recoverable or failure.highest_severity == MAX_SEVERITY_LABEL:
if agent['pending_event'] is not None:
tornado.ioloop.IOLoop.current().remove_timeout(
agent['pending_event'])
for key in exclude_db:
if key in agent:
del agent[key]
session.query(VerfierMain).filter_by(
agent_id=agent['agent_id']).update(agent)
session.commit()

# propagate all state, but remove non-DB keys first (using exclude_db)
try:
@@ -840,6 +850,16 @@
except SQLAlchemyError as e:
logger.error('SQLAlchemy Error: %s', e)

# If agent was in a failed state we check if we either stop polling
# or just add it again to the event loop
if new_operational_state in [states.FAILED, states.INVALID_QUOTE]:
if not failure.recoverable or failure.highest_severity == MAX_SEVERITY_LABEL:
logger.warning("Agent %s failed, stopping polling", agent['agent_id'])
return

await invoke_get_quote(agent, False)
return

# if new, get a quote
if (main_agent_operational_state == states.START and
new_operational_state == states.GET_QUOTE):
@@ -876,12 +896,13 @@
new_operational_state == states.GET_QUOTE_RETRY):
if agent['num_retries'] >= maxr:
logger.warning("Agent %s was not reachable for quote in %d tries, setting state to FAILED", agent['agent_id'], maxr)
failure.add_event("not_reachable", "agent was not reachable from verifier", False)
if agent['first_verified']: # only notify on previously good agents
cloud_verifier_common.notify_error(
agent, msgtype='comm_error')
agent, msgtype='comm_error', event=failure.highest_severity_event)
else:
logger.debug("Communication error for new agent. No notification will be sent")
await process_agent(agent, states.FAILED)
await process_agent(agent, states.FAILED, failure)
else:
agent['operational_state'] = states.GET_QUOTE
cb = functools.partial(invoke_get_quote, agent, True)
@@ -894,9 +915,10 @@
new_operational_state == states.PROVIDE_V_RETRY):
if agent['num_retries'] >= maxr:
logger.warning("Agent %s was not reachable to provide v in %d tries, setting state to FAILED", agent['agent_id'], maxr)
failure.add_event("not_reachable_v", "agent was not reachable to provide V", False)
cloud_verifier_common.notify_error(
agent, msgtype='comm_error')
await process_agent(agent, states.FAILED)
agent, msgtype='comm_error', event=failure.highest_severity_event)
await process_agent(agent, states.FAILED, failure)
else:
agent['operational_state'] = states.PROVIDE_V
cb = functools.partial(invoke_provide_v, agent)
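The decision process_agent() now makes — update the stored severity only when
it is exceeded, notify on invalid quotes, and stop polling on irrecoverable or
maximum-severity failures — can be isolated as below. This is a simplification
under the assumptions of the earlier sketch (MAX_SEVERITY stands in for
keylime.failure.MAX_SEVERITY_LABEL); the real coroutine also persists the
agent record and re-schedules work on the tornado IO loop:

MAX_SEVERITY = 10  # assumed stand-in for the MAX_SEVERITY_LABEL threshold

def record_failure(agent: dict, failure: Failure):
    """Return (notify, stop_polling) and update the agent record in place."""
    stored = agent.get('severity_level')
    notify = stored is None or failure.highest_severity.severity > stored
    if notify:
        # only a strictly higher severity updates the record and triggers
        # a notification (mirrors the commit message)
        agent['severity_level'] = failure.highest_severity.severity
        agent['last_event_id'] = failure.highest_severity_event.event_id
    # irrecoverable (or maximum-severity) failures end the polling loop
    stop_polling = (not failure.recoverable
                    or failure.highest_severity.severity >= MAX_SEVERITY)
    return notify, stop_polling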
2 changes: 2 additions & 0 deletions keylime/db/verifier_db.py
@@ -46,6 +46,8 @@ class VerfierMain(Base):
ima_pcrs = Column(JSONPickleType(pickler=json))
pcr10 = Column(LargeBinary)
next_ima_ml_entry = Column(Integer)
severity_level = Column(Integer, nullable=True)
last_event_id = Column(String(200), nullable=True)


class VerifierAllowlist(Base):
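Because VerfierMain gains two nullable columns, an existing verifier database
needs a schema migration before this code can run against it. A minimal
Alembic sketch, offered as an assumption rather than part of this commit — the
revision ids are placeholders and the table name 'verfiermain' is assumed:

# Hypothetical Alembic migration for the two new columns; revision ids and
# table name are placeholders/assumptions, not part of this commit.
import sqlalchemy as sa
from alembic import op

revision = 'xxxxxxxxxxxx'       # placeholder
down_revision = 'yyyyyyyyyyyy'  # placeholder

def upgrade():
    op.add_column('verfiermain',
                  sa.Column('severity_level', sa.Integer(), nullable=True))
    op.add_column('verfiermain',
                  sa.Column('last_event_id', sa.String(200), nullable=True))

def downgrade():
    op.drop_column('verfiermain', 'last_event_id')
    op.drop_column('verfiermain', 'severity_level')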
