Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix stuck monitor recovery. Avoid DB-related deadlocks. #1098

Merged
merged 37 commits into v2.8.x from monitor-stuck
Oct 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
4689c00
Add Pipeline namedtuple
badrogger Jul 25, 2024
438d062
Increase log file size
badrogger Jul 30, 2024
ea46230
Fix skaled monitor
badrogger Jul 30, 2024
b5a7bfe
Switch to new pipeline runner in process_manager
badrogger Jul 30, 2024
a74f381
Add ProcessReport
badrogger Aug 2, 2024
c40b4db
Add process_manager tests
badrogger Aug 6, 2024
54d1201
Remove redundant steps in test cleanup
badrogger Aug 6, 2024
396a846
Remove DKG_TIMEOUT_COEFFICIENT duplicate
badrogger Aug 6, 2024
bd8d7cd
Update IMA agent versions in container.json
badrogger Aug 7, 2024
563a677
Fix IMA migration tests
badrogger Aug 7, 2024
da43817
Fix routes tests
badrogger Aug 7, 2024
6fdce4e
Remove locking from SChainRecord
badrogger Aug 7, 2024
6736628
Switch to info log level for gunicorn
badrogger Aug 7, 2024
d9a3ccd
Remove unused Runner class
badrogger Aug 28, 2024
dd8ab4f
Merge branch 'v2.8.x' into monitor-stuck
badrogger Aug 28, 2024
63a80d9
Fix new chain
badrogger Aug 29, 2024
7d1bdc2
Fix cleaner
badrogger Sep 10, 2024
ee9653c
Fix terminate_process usage
badrogger Sep 13, 2024
ba6a624
Merge branch 'v2.8.x' into monitor-stuck
badrogger Sep 19, 2024
87b0915
Fix process_manager
badrogger Sep 19, 2024
5138285
Fix start_monitor arguments type_hints
badrogger Sep 19, 2024
fca4ac6
Rework monitor execution strategy
badrogger Sep 23, 2024
0960d5a
Merge branch 'v2.8.x' into monitor-stuck
badrogger Sep 23, 2024
ccc6bde
Wrap pipelines into Tasks classes
badrogger Sep 24, 2024
3a7d3d8
Fix tests
badrogger Sep 25, 2024
e4471d0
Fix process_manager_tests
badrogger Sep 26, 2024
967ccdb
Fix process_manager_test
badrogger Sep 26, 2024
7a203dc
Remove unused code
badrogger Sep 26, 2024
d8947ed
Remove unused pipeline module
badrogger Sep 27, 2024
5665cc8
Reduce task sleeping interval. Improve logs.
badrogger Sep 27, 2024
d81affc
Merge branch 'v2.8.x' into monitor-stuck
badrogger Sep 27, 2024
f7fa831
Remove unused classes
badrogger Sep 30, 2024
812a579
Add additional monitor tasks tests
badrogger Oct 1, 2024
0fa074c
Fix linter
badrogger Oct 1, 2024
69d5d2c
Merge branch 'v2.8.x' into monitor-stuck
badrogger Oct 9, 2024
be712d6
Remove unused code
badrogger Oct 16, 2024
0ac7970
Run tests only for push
badrogger Oct 16, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name: Test
on: [push, pull_request]
on: [push]
env:
ETH_PRIVATE_KEY: ${{ secrets.ETH_PRIVATE_KEY }}
SCHAIN_TYPE: ${{ secrets.SCHAIN_TYPE }}
Expand Down
2 changes: 1 addition & 1 deletion core/schains/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
from core.schains.dkg.utils import get_secret_key_share_filepath
from core.schains.firewall.types import IRuleController
from core.schains.ima import get_ima_time_frame, get_migration_ts as get_ima_migration_ts
from core.schains.process_manager_helper import is_monitor_process_alive
from core.schains.process import is_monitor_process_alive
from core.schains.rpc import (
check_endpoint_alive,
check_endpoint_blocks,
Expand Down
24 changes: 12 additions & 12 deletions core/schains/cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
get_node_ips_from_config,
get_own_ip_from_config,
)
from core.schains.process_manager_helper import terminate_schain_process
from core.schains.process import ProcessReport, terminate_process
from core.schains.runner import get_container_name, is_exited
from core.schains.external_config import ExternalConfig
from core.schains.types import ContainerType
Expand Down Expand Up @@ -109,12 +109,15 @@

for schain_name in schains_on_node:
if schain_name not in schain_names_on_contracts:
logger.warning(f'sChain {schain_name} was found on node, but not on contracts: \
{schain_names_on_contracts}, going to remove it!')
logger.warning(
'%s was found on node, but not on contracts: %s, trying to cleanup',
schain_name,
schain_names_on_contracts,
)
try:
ensure_schain_removed(skale, schain_name, node_config.id, dutils=dutils)
except Exception:
logger.exception(f'sChain removal {schain_name} failed')
logger.exception('%s removal failed', schain_name)
logger.info('Cleanup procedure finished')


Expand Down Expand Up @@ -185,9 +188,10 @@
msg: str,
dutils: Optional[DockerUtils] = None,
) -> None:
schain_record = upsert_schain_record(schain_name)
logger.warning(msg)
terminate_schain_process(schain_record)
report = ProcessReport(name=schain_name)
if report.is_exist():
terminate_process(report.pid)

Check warning on line 194 in core/schains/cleaner.py

View check run for this annotation

Codecov / codecov/patch

core/schains/cleaner.py#L194

Added line #L194 was not covered by tests

delete_bls_keys(skale, schain_name)
sync_agent_ranges = get_sync_agent_ranges(skale)
Expand Down Expand Up @@ -240,9 +244,7 @@
)
check_status = checks.get_all()
if check_status['skaled_container'] or is_exited(
schain_name,
container_type=ContainerType.schain,
dutils=dutils
schain_name, container_type=ContainerType.schain, dutils=dutils
):
remove_schain_container(schain_name, dutils=dutils)
if check_status['volume']:
Expand All @@ -259,9 +261,7 @@
rc.cleanup()
if estate is not None and estate.ima_linked:
if check_status.get('ima_container', False) or is_exited(
schain_name,
container_type=ContainerType.ima,
dutils=dutils
schain_name, container_type=ContainerType.ima, dutils=dutils
):
remove_ima_container(schain_name, dutils=dutils)
if check_status['config_dir']:
Expand Down
3 changes: 2 additions & 1 deletion core/schains/monitor/action.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ def dkg(self) -> bool:
def upstream_config(self) -> bool:
with self.statsd_client.timer(f'admin.action.upstream_config.{no_hyphens(self.name)}'):
logger.info(
'Creating new upstream_config rotation_id: %s, stream: %s',
'Generating new upstream_config rotation_id: %s, stream: %s',
self.rotation_data.get('rotation_id'), self.stream_version
)
new_config = create_new_upstream_config(
Expand All @@ -229,6 +229,7 @@ def upstream_config(self) -> bool:
result = False
if not self.cfm.upstream_config_exists() or \
new_config != self.cfm.latest_upstream_config:
logger.info('Saving new config')
rotation_id = self.rotation_data['rotation_id']
logger.info(
'Saving new upstream config rotation_id: %d, ips: %s',
Expand Down
2 changes: 2 additions & 0 deletions core/schains/monitor/config_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@
self.execute()
self.am.log_executed_blocks()
self.am._upd_last_seen()
except Exception as e:
logger.info('Config monitor type failed %s', typename, exc_info=e)

Check warning on line 49 in core/schains/monitor/config_monitor.py

View check run for this annotation

Codecov / codecov/patch

core/schains/monitor/config_monitor.py#L48-L49

Added lines #L48 - L49 were not covered by tests
finally:
logger.info('Config monitor type finished %s', typename)

Expand Down
Loading
Loading