Skip to content
This repository has been archived by the owner on Jul 1, 2021. It is now read-only.

Validator client doesn't crash due to unresponsive beacon node. #1743

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 9 additions & 8 deletions eth2/validator_client/beacon_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
Duty,
DutyType,
)
from trinity.exceptions import BeaconNodeRequestFailure

SYNCING_POLL_INTERVAL = 10 # seconds
CONNECTION_RETRY_INTERVAL = 1 # second(s)
Expand All @@ -47,19 +48,19 @@ async def _get_json(session: Session, url: str, params: Any = None) -> Any:
params = {}
try:
return (await session.get(url, params=params)).json()
except OSError:
raise
except Exception as e:
logger.exception(e)
except OSError as err:
raise BeaconNodeRequestFailure(err)
except Exception as err:
logger.exception(err)


async def _post_json(session: Session, url: str, json: Any) -> None:
try:
await session.post(url, json=json)
except OSError:
raise
except Exception as e:
logger.exception(e)
except OSError as err:
raise BeaconNodeRequestFailure(err)
except Exception as err:
logger.exception(err)


async def _get_node_version(session: Session, url: str) -> str:
Expand Down
57 changes: 35 additions & 22 deletions eth2/validator_client/duty_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from eth2.validator_client.duty import AttestationDuty, Duty, DutyType
from eth2.validator_client.duty_store import DutyStore
from eth2.validator_client.typing import RandaoProvider, ResolvedDuty
from trinity.exceptions import BeaconNodeRequestFailure

logger = logging.getLogger("eth2.validator_client.duty_scheduler")

Expand All @@ -22,13 +23,19 @@ async def resolve_duty(
) -> None:
if duty.duty_type == DutyType.Attestation:
duty = cast(AttestationDuty, duty)
attestation = await beacon_node.fetch_attestation(
duty.validator_public_key,
duty.tick_for_execution.slot,
duty.committee_index,
)
if attestation:
await resolved_duties.send((duty, attestation))

try:
attestation = await beacon_node.fetch_attestation(
duty.validator_public_key,
duty.tick_for_execution.slot,
duty.committee_index,
)
except BeaconNodeRequestFailure as err:
logger.warning("could not fetch attestation from beacon node: %s", err)
else:
if attestation:
await resolved_duties.send((duty, attestation))

elif duty.duty_type == DutyType.BlockProposal:
randao_reveal = randao_provider(
duty.validator_public_key, duty.tick_for_execution.epoch
Expand Down Expand Up @@ -69,23 +76,29 @@ async def _fetch_latest_duties(
current_epoch = tick.epoch
next_epoch = Epoch(current_epoch + 1)

current_duties = await beacon_node.fetch_duties(
tick, validator_public_keys, current_epoch
)
upcoming_duties = await beacon_node.fetch_duties(
tick, validator_public_keys, next_epoch
)
latest_duties = cast(Tuple[Duty, ...], current_duties) + cast(
Tuple[Duty, ...], upcoming_duties
)
if not latest_duties:
return
try:
current_duties = await beacon_node.fetch_duties(
tick, validator_public_keys, current_epoch
)
upcoming_duties = await beacon_node.fetch_duties(
tick, validator_public_keys, next_epoch
)
except BeaconNodeRequestFailure as err:
logger.warning(
"could not fetch latest duties from beacon node at %s: %s", tick, err
)
else:
latest_duties = cast(Tuple[Duty, ...], current_duties) + cast(
Tuple[Duty, ...], upcoming_duties
)
if not latest_duties:
return

logger.debug("%s: found %d duties", tick, len(latest_duties))
logger.debug("%s: found %d duties", tick, len(latest_duties))

# TODO manage duties correctly, accounting for re-orgs, etc.
# NOTE: the naive strategy is likely "last write wins"
await duty_store.add_duties(*latest_duties)
# TODO manage duties correctly, accounting for re-orgs, etc.
# NOTE: the naive strategy is likely "last write wins"
await duty_store.add_duties(*latest_duties)


async def schedule_and_dispatch_duties_at_tick(
Expand Down
6 changes: 5 additions & 1 deletion eth2/validator_client/signatory.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from eth2.validator_client.abc import BeaconNodeAPI, SignatoryDatabaseAPI
from eth2.validator_client.duty import Duty, DutyType
from eth2.validator_client.typing import PrivateKeyProvider
from trinity.exceptions import BeaconNodeRequestFailure

logger = logging.getLogger("eth2.validator_client.signatory")

Expand Down Expand Up @@ -88,4 +89,7 @@ async def sign_and_broadcast_operation_if_valid(
duty,
humanize_bytes(operation_with_signature.hash_tree_root),
)
await beacon_node.publish(duty, operation_with_signature)
try:
await beacon_node.publish(duty, operation_with_signature)
except BeaconNodeRequestFailure as err:
logger.warning("could not publish opperation to beacon node: %s", err)
7 changes: 7 additions & 0 deletions trinity/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,3 +148,10 @@ class MetricsReportingError(BaseTrinityError):
Raised when there is an error while reporting metrics.
"""
pass


class BeaconNodeRequestFailure(BaseTrinityError):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

there may be some python conventions i'm missing but i'd think we want to define an exception as close to its possible source of creation as possible w/in the broader module/file hierarchy.... if we moved this concern to be entirely inside the scope of the BeaconNode then we can even keep this exception private to that file/module

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the general reason to do this is that i think it tends towards better maintainability and discoverability of the code base over time -- imagine you are exploring the trinity codebase and you suddenly have N exception types that tie you across not only different parts of the client stack, but different clients! it would quickly overflow my cognitive stack :)

"""
Raised when a request made to a Beacon Node fails.
"""
pass