Skip to content

Commit

Permalink
Merge pull request #1098 from skalenetwork/monitor-stuck
Browse files: browse the repository at this point in the history
Fix stuck monitor recovery. Avoid DB-related deadlocks.
  • Loading branch information
DmytroNazarenko authored Oct 17, 2024
2 parents ad492d3 + 0ac7970 commit d2ba47f
Show file tree
Hide file tree
Showing 24 changed files with 825 additions and 443 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name: Test
on: [push, pull_request]
on: [push]
env:
ETH_PRIVATE_KEY: ${{ secrets.ETH_PRIVATE_KEY }}
SCHAIN_TYPE: ${{ secrets.SCHAIN_TYPE }}
Expand Down
2 changes: 1 addition & 1 deletion core/schains/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
from core.schains.dkg.utils import get_secret_key_share_filepath
from core.schains.firewall.types import IRuleController
from core.schains.ima import get_ima_time_frame, get_migration_ts as get_ima_migration_ts
from core.schains.process_manager_helper import is_monitor_process_alive
from core.schains.process import is_monitor_process_alive
from core.schains.rpc import (
check_endpoint_alive,
check_endpoint_blocks,
Expand Down
24 changes: 12 additions & 12 deletions core/schains/cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
get_node_ips_from_config,
get_own_ip_from_config,
)
from core.schains.process_manager_helper import terminate_schain_process
from core.schains.process import ProcessReport, terminate_process
from core.schains.runner import get_container_name, is_exited
from core.schains.external_config import ExternalConfig
from core.schains.types import ContainerType
Expand Down Expand Up @@ -109,12 +109,15 @@ def monitor(skale, node_config, dutils=None):

for schain_name in schains_on_node:
if schain_name not in schain_names_on_contracts:
logger.warning(f'sChain {schain_name} was found on node, but not on contracts: \
{schain_names_on_contracts}, going to remove it!')
logger.warning(
'%s was found on node, but not on contracts: %s, trying to cleanup',
schain_name,
schain_names_on_contracts,
)
try:
ensure_schain_removed(skale, schain_name, node_config.id, dutils=dutils)
except Exception:
logger.exception(f'sChain removal {schain_name} failed')
logger.exception('%s removal failed', schain_name)
logger.info('Cleanup procedure finished')


Expand Down Expand Up @@ -185,9 +188,10 @@ def remove_schain(
msg: str,
dutils: Optional[DockerUtils] = None,
) -> None:
schain_record = upsert_schain_record(schain_name)
logger.warning(msg)
terminate_schain_process(schain_record)
report = ProcessReport(name=schain_name)
if report.is_exist():
terminate_process(report.pid)

delete_bls_keys(skale, schain_name)
sync_agent_ranges = get_sync_agent_ranges(skale)
Expand Down Expand Up @@ -240,9 +244,7 @@ def cleanup_schain(
)
check_status = checks.get_all()
if check_status['skaled_container'] or is_exited(
schain_name,
container_type=ContainerType.schain,
dutils=dutils
schain_name, container_type=ContainerType.schain, dutils=dutils
):
remove_schain_container(schain_name, dutils=dutils)
if check_status['volume']:
Expand All @@ -259,9 +261,7 @@ def cleanup_schain(
rc.cleanup()
if estate is not None and estate.ima_linked:
if check_status.get('ima_container', False) or is_exited(
schain_name,
container_type=ContainerType.ima,
dutils=dutils
schain_name, container_type=ContainerType.ima, dutils=dutils
):
remove_ima_container(schain_name, dutils=dutils)
if check_status['config_dir']:
Expand Down
3 changes: 2 additions & 1 deletion core/schains/monitor/action.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ def dkg(self) -> bool:
def upstream_config(self) -> bool:
with self.statsd_client.timer(f'admin.action.upstream_config.{no_hyphens(self.name)}'):
logger.info(
'Creating new upstream_config rotation_id: %s, stream: %s',
'Generating new upstream_config rotation_id: %s, stream: %s',
self.rotation_data.get('rotation_id'), self.stream_version
)
new_config = create_new_upstream_config(
Expand All @@ -229,6 +229,7 @@ def upstream_config(self) -> bool:
result = False
if not self.cfm.upstream_config_exists() or \
new_config != self.cfm.latest_upstream_config:
logger.info('Saving new config')
rotation_id = self.rotation_data['rotation_id']
logger.info(
'Saving new upstream config rotation_id: %d, ips: %s',
Expand Down
2 changes: 2 additions & 0 deletions core/schains/monitor/config_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ def run(self):
self.execute()
self.am.log_executed_blocks()
self.am._upd_last_seen()
except Exception as e:
logger.info('Config monitor type failed %s', typename, exc_info=e)
finally:
logger.info('Config monitor type finished %s', typename)

Expand Down
Loading

0 comments on commit d2ba47f

Please sign in to comment.