From 4689c00fa9f34ffd30ad969601846620d59562e0 Mon Sep 17 00:00:00 2001 From: badrogger Date: Thu, 25 Jul 2024 18:37:43 +0000 Subject: [PATCH 01/32] Add Pipeline namedtuple --- core/schains/monitor/main.py | 88 ++++++++++++++++++++++++++---- tests/schains/monitor/main_test.py | 29 +++++++++- 2 files changed, 105 insertions(+), 12 deletions(-) diff --git a/core/schains/monitor/main.py b/core/schains/monitor/main.py index 58010d347..1892769db 100644 --- a/core/schains/monitor/main.py +++ b/core/schains/monitor/main.py @@ -18,13 +18,15 @@ # along with this program. If not, see . import functools -import time -import random import logging -from typing import Dict +import queue +import random +import sys +import threading +import time +from typing import Callable, Dict, List, NamedTuple, Optional from concurrent.futures import Future, ThreadPoolExecutor from importlib import reload -from typing import List, Optional from skale import Skale, SkaleIma from web3._utils import request as web3_request @@ -56,10 +58,17 @@ SKALED_PIPELINE_SLEEP = 2 CONFIG_PIPELINE_SLEEP = 3 +STUCK_TIMEOUT = 60 * 60 * 3 +SHUTDOWN_INTERVAL = 60 * 10 logger = logging.getLogger(__name__) +class Pipeline(NamedTuple): + name: str + job: Callable + + def run_config_pipeline( skale: Skale, skale_ima: SkaleIma, schain: Dict, node_config: NodeConfig, stream_version: str ) -> None: @@ -105,9 +114,7 @@ def run_config_pipeline( if SYNC_NODE: logger.info( - 'Sync node last_dkg_successful %s, rotation_data %s', - last_dkg_successful, - rotation_data + 'Sync node last_dkg_successful %s, rotation_data %s', last_dkg_successful, rotation_data ) mon = SyncConfigMonitor(config_am, config_checks) else: @@ -151,18 +158,18 @@ def run_skaled_pipeline( econfig=ExternalConfig(name), dutils=dutils, ) - status = skaled_checks.get_all(log=False, expose=True) + check_status = skaled_checks.get_all(log=False, expose=True) automatic_repair = get_automatic_repair_option() - api_status = get_api_checks_status(status=status, allowed=TG_ALLOWED_CHECKS) + api_status = get_api_checks_status(status=check_status, allowed=TG_ALLOWED_CHECKS) notify_checks(name, node_config.all(), api_status) - logger.info('Skaled status: %s', status) + logger.info('Skaled check status: %s', check_status) logger.info('Upstream config %s', skaled_am.upstream_config_path) mon = get_skaled_monitor( action_manager=skaled_am, - status=status, + check_status=check_status, schain_record=schain_record, skaled_status=skaled_status, automatic_repair=automatic_repair, @@ -286,3 +293,62 @@ def run_monitor_for_schain( if once: return False post_monitor_sleep() + + +def run_pipelines( + pipelines: list[Pipeline], + once: bool = False, + stuck_timeout: int = STUCK_TIMEOUT, + shutdown_interval: int = SHUTDOWN_INTERVAL, +) -> None: + init_ts = time.time() + + heartbeat_queues = [queue.Queue() for _ in range(len(pipelines))] + terminating_events = [threading.Event() for _ in range(len(pipelines))] + heartbeat_ts = [init_ts for _ in range(len(pipelines))] + + threads = [ + threading.Thread( + name=pipeline.name, + target=keep_pipeline, args=[heartbeat_queue, terminating_event, pipeline.job], + daemon=True + ) + for heartbeat_queue, terminating_event, pipeline in zip( + heartbeat_queues, terminating_events, pipelines + ) + ] + + for th in threads: + th.start() + + stuck = False + while not stuck: + for pindex, heartbeat_queue in enumerate(heartbeat_queues): + if not heartbeat_queue.empty(): + heartbeat_ts[pindex] = heartbeat_queue.get() + if time.time() - heartbeat_ts[pindex] > stuck_timeout: + 
+                logger.info('Pipeline %d of %d is stuck', pindex, len(pipelines)) + stuck = True + break + if once and all(ts > init_ts for ts in heartbeat_ts): + logger.info('Successfully completed required one run. Shutting down the process') + break + + logger.info('Terminating all pipelines') + for event in terminating_events: + event.set() + if stuck: + logger.info('Waiting for graceful completion interval') + time.sleep(shutdown_interval) + logger.info('Stuck pipeline was detected') + sys.exit(1) + + +def keep_pipeline( + heartbeat_queue: queue.Queue, terminating_event: threading.Event, pipeline: Callable +) -> None: + while not terminating_event.is_set(): + logger.info('Running pipeline') + pipeline() + heartbeat_queue.put(time.time()) + post_monitor_sleep() diff --git a/tests/schains/monitor/main_test.py b/tests/schains/monitor/main_test.py index 3c094ab4b..91da5b75c 100644 --- a/tests/schains/monitor/main_test.py +++ b/tests/schains/monitor/main_test.py @@ -1,11 +1,14 @@ +import functools import mock +import logging +import time from concurrent.futures import ThreadPoolExecutor import pytest from core.schains.firewall.types import IpRange from core.schains.firewall.utils import get_sync_agent_ranges -from core.schains.monitor.main import run_monitor_for_schain +from core.schains.monitor.main import Pipeline, run_monitor_for_schain, run_pipelines from core.schains.task import Task from tools.helper import is_node_part_of_chain @@ -91,3 +94,27 @@ def test_run_monitor_for_schain_left( once=True ) keep_tasks_running_mock.assert_not_called() + + +def test_run_pipelines(): + def simple_pipeline(index: int): + logging.info('Running simple pipeline %d', index) + time.sleep(1) + logging.info('Finishing simple pipeline %d', index) + + def stuck_pipeline(index: int): + logging.info('Running stuck pipeline %d', index) + while True: + logging.info('Stuck pipeline %d beat', index) + time.sleep(2) + + run_pipelines([ + Pipeline(name='healthy0', job=functools.partial(simple_pipeline, index=0)), + Pipeline(name='healthy1', job=functools.partial(simple_pipeline, index=1)), + ], once=True, stuck_timeout=5, shutdown_interval=10) + + with pytest.raises(SystemExit): + run_pipelines([ + Pipeline(name='healthy', job=functools.partial(simple_pipeline, index=0)), + Pipeline(name='stuck', job=functools.partial(stuck_pipeline, index=1)) + ], stuck_timeout=5, shutdown_interval=10) From 438d0626a8f761fe0edd9490416433e2c8cdd86f Mon Sep 17 00:00:00 2001 From: badrogger Date: Tue, 30 Jul 2024 17:11:28 +0000 Subject: [PATCH 02/32] Increase log file size --- tools/configs/logs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/configs/logs.py b/tools/configs/logs.py index d21c8da41..0d9205d81 100644 --- a/tools/configs/logs.py +++ b/tools/configs/logs.py @@ -42,10 +42,10 @@ REMOVED_CONTAINERS_FOLDER_NAME ) -LOG_FILE_SIZE_MB = 40 +LOG_FILE_SIZE_MB = 100 LOG_FILE_SIZE_BYTES = LOG_FILE_SIZE_MB * 1000000 -LOG_BACKUP_COUNT = 10 +LOG_BACKUP_COUNT = 20 ADMIN_LOG_FORMAT = '[%(asctime)s %(levelname)s][%(process)d][%(processName)s][%(threadName)s] - %(name)s:%(lineno)d - %(message)s' # noqa API_LOG_FORMAT = '[%(asctime)s] %(process)d %(levelname)s %(url)s %(module)s: %(message)s' # noqa From ea46230d39e62461553e2f5b633c20130a294161 Mon Sep 17 00:00:00 2001 From: badrogger Date: Tue, 30 Jul 2024 17:33:16 +0000 Subject: [PATCH 03/32] Fix skaled monitor --- core/schains/monitor/skaled_monitor.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git
a/core/schains/monitor/skaled_monitor.py b/core/schains/monitor/skaled_monitor.py index aac40e16a..58b43b06d 100644 --- a/core/schains/monitor/skaled_monitor.py +++ b/core/schains/monitor/skaled_monitor.py @@ -297,7 +297,7 @@ def no_config(status: Dict) -> bool: def get_skaled_monitor( action_manager: SkaledActionManager, - status: Dict, + check_status: Dict, schain_record: SChainRecord, skaled_status: SkaledStatus, automatic_repair: bool = True @@ -309,32 +309,32 @@ def get_skaled_monitor( mon_type: Type[BaseSkaledMonitor] = RegularSkaledMonitor if SYNC_NODE: - if no_config(status): + if no_config(check_status): mon_type = NoConfigSkaledMonitor - if is_recreate_mode(status, schain_record): + if is_recreate_mode(check_status, schain_record): mon_type = RecreateSkaledMonitor - elif is_config_update_time(status, skaled_status): + elif is_config_update_time(check_status, skaled_status): mon_type = UpdateConfigSkaledMonitor - elif is_reload_group_mode(status, action_manager.upstream_finish_ts): + elif is_reload_group_mode(check_status, action_manager.upstream_finish_ts): mon_type = ReloadGroupSkaledMonitor - elif is_reload_ip_mode(status, action_manager.econfig.reload_ts): + elif is_reload_ip_mode(check_status, action_manager.econfig.reload_ts): mon_type = ReloadIpSkaledMonitor return mon_type - if no_config(status): + if no_config(check_status): mon_type = NoConfigSkaledMonitor elif is_backup_mode(schain_record): mon_type = BackupSkaledMonitor - elif is_repair_mode(schain_record, status, skaled_status, automatic_repair): + elif is_repair_mode(schain_record, check_status, skaled_status, automatic_repair): mon_type = RepairSkaledMonitor - elif is_recreate_mode(status, schain_record): + elif is_recreate_mode(check_status, schain_record): mon_type = RecreateSkaledMonitor elif is_new_node_mode(schain_record, action_manager.finish_ts): mon_type = NewNodeSkaledMonitor - elif is_config_update_time(status, skaled_status): + elif is_config_update_time(check_status, skaled_status): mon_type = UpdateConfigSkaledMonitor - elif is_reload_group_mode(status, action_manager.upstream_finish_ts): + elif is_reload_group_mode(check_status, action_manager.upstream_finish_ts): mon_type = ReloadGroupSkaledMonitor - elif is_reload_ip_mode(status, action_manager.econfig.reload_ts): + elif is_reload_ip_mode(check_status, action_manager.econfig.reload_ts): mon_type = ReloadIpSkaledMonitor return mon_type From b5a7bfe970af632bbeda27de4bea59687e0894cd Mon Sep 17 00:00:00 2001 From: badrogger Date: Tue, 30 Jul 2024 17:37:55 +0000 Subject: [PATCH 04/32] Switch to new pipeline runner in process_manager --- core/schains/monitor/main.py | 123 +++++++++++++------------------- core/schains/process_manager.py | 9 +-- 2 files changed, 55 insertions(+), 77 deletions(-) diff --git a/core/schains/monitor/main.py b/core/schains/monitor/main.py index 1892769db..af650f0db 100644 --- a/core/schains/monitor/main.py +++ b/core/schains/monitor/main.py @@ -21,11 +21,9 @@ import logging import queue import random -import sys import threading import time -from typing import Callable, Dict, List, NamedTuple, Optional -from concurrent.futures import Future, ThreadPoolExecutor +from typing import Callable, Dict, NamedTuple, Optional from importlib import reload from skale import Skale, SkaleIma @@ -40,7 +38,6 @@ from core.schains.monitor import get_skaled_monitor, RegularConfigMonitor, SyncConfigMonitor from core.schains.monitor.action import ConfigActionManager, SkaledActionManager from core.schains.external_config import ExternalConfig, 
ExternalState -from core.schains.task import keep_tasks_running, Task from core.schains.config.static_params import get_automatic_repair_option from core.schains.skaled_status import get_skaled_status from core.node import get_current_nodes @@ -58,7 +55,7 @@ SKALED_PIPELINE_SLEEP = 2 CONFIG_PIPELINE_SLEEP = 3 -STUCK_TIMEOUT = 60 * 60 * 3 +STUCK_TIMEOUT = 60 * 60 * 2 SHUTDOWN_INTERVAL = 60 * 10 logger = logging.getLogger(__name__) @@ -189,20 +186,19 @@ def post_monitor_sleep(): time.sleep(schain_monitor_sleep) -def create_and_execute_tasks( - skale, - schain, +def create_and_execute_pipelines( + skale: Skale, + schain: dict, node_config: NodeConfig, skale_ima: SkaleIma, - stream_version, - schain_record, - executor, - futures, - dutils, -): + schain_record: SChainRecord, + dutils: Optional[DockerUtils] = None, +) -> bool: reload(web3_request) name = schain['name'] + stream_version = get_skale_node_version() + is_rotation_active = skale.node_rotation.is_rotation_active(name) leaving_chain = not SYNC_NODE and not is_node_part_of_chain(skale, name, node_config.id) @@ -222,13 +218,13 @@ def create_and_execute_tasks( statsd_client.incr(f'admin.schain.monitor.{no_hyphens(name)}') statsd_client.gauge(f'admin.schain.monitor_last_seen.{no_hyphens(name)}', monitor_last_seen_ts) - tasks = [] + pipelines = [] if not leaving_chain: - logger.info('Adding config task to the pool') - tasks.append( - Task( - f'{name}-config', - functools.partial( + logger.info('Adding config pipelines to the pool') + pipelines.append( + Pipeline( + name='config', + job=functools.partial( run_config_pipeline, skale=skale, skale_ima=skale_ima, @@ -236,7 +232,6 @@ def create_and_execute_tasks( node_config=node_config, stream_version=stream_version, ), - sleep=CONFIG_PIPELINE_SLEEP, ) ) if schain_record.config_version != stream_version or ( @@ -244,55 +239,26 @@ def create_and_execute_tasks( ): ConfigFileManager(name).remove_skaled_config() else: - logger.info('Adding skaled task to the pool') - tasks.append( - Task( - f'{name}-skaled', - functools.partial( + logger.info('Adding skaled pipeline to the pool') + pipelines.append( + Pipeline( + name='skaled', + job=functools.partial( run_skaled_pipeline, skale=skale, schain=schain, node_config=node_config, dutils=dutils, ), - sleep=SKALED_PIPELINE_SLEEP, ) ) - if len(tasks) == 0: - logger.warning('No tasks to run') - keep_tasks_running(executor, tasks, futures) - + if len(pipelines) == 0: + logger.warning('No pipelines to run') + return False -def run_monitor_for_schain( - skale, skale_ima, node_config: NodeConfig, schain, dutils=None, once=False -): - stream_version = get_skale_node_version() - tasks_number = 2 - with ThreadPoolExecutor(max_workers=tasks_number, thread_name_prefix='T') as executor: - futures: List[Optional[Future]] = [None for i in range(tasks_number)] - while True: - schain_record = SChainRecord.get_by_name(schain['name']) - try: - create_and_execute_tasks( - skale, - schain, - node_config, - skale_ima, - stream_version, - schain_record, - executor, - futures, - dutils, - ) - if once: - return True - post_monitor_sleep() - except Exception: - logger.exception('Monitor iteration failed') - if once: - return False - post_monitor_sleep() + run_pipelines(pipelines) + return True def run_pipelines( @@ -310,8 +276,8 @@ def run_pipelines( threads = [ threading.Thread( name=pipeline.name, - target=keep_pipeline, args=[heartbeat_queue, terminating_event, pipeline.job], - daemon=True + target=keep_pipeline, + args=[heartbeat_queue, terminating_event, pipeline.job] ) for 
heartbeat_queue, terminating_event, pipeline in zip( heartbeat_queues, terminating_events, pipelines ) ] @@ -326,29 +292,40 @@ for th in threads: th.start() stuck = False while not stuck: for pindex, heartbeat_queue in enumerate(heartbeat_queues): if not heartbeat_queue.empty(): heartbeat_ts[pindex] = heartbeat_queue.get() - if time.time() - heartbeat_ts[pindex] > stuck_timeout: - logger.info('Pipeline %d of %d is stuck', pindex, len(pipelines)) + ts = time.time() + if ts - heartbeat_ts[pindex] > stuck_timeout: + logger.warning( + '%s pipeline is stuck (last heartbeat %d)', + pipelines[pindex].name, + heartbeat_ts[pindex], + ) stuck = True break if once and all(ts > init_ts for ts in heartbeat_ts): - logger.info('Successfully completed required one run. Shutting down the process') + logger.info('Successfully completed requested single run') break logger.info('Terminating all pipelines') for event in terminating_events: - event.set() + if not event.is_set(): + event.set() if stuck: - logger.info('Waiting for graceful completion interval') - time.sleep(shutdown_interval) - logger.info('Stuck pipeline was detected') - sys.exit(1) + logger.info('Joining threads with timeout') + for thread in threads: + thread.join(timeout=shutdown_interval) + logger.warning('Stuck pipeline was detected') + logger.info('Finishing with pipelines') def keep_pipeline( - heartbeat_queue: queue.Queue, terminating_event: threading.Event, pipeline: Callable + heartbeat_queue: queue.Queue, terminate: threading.Event, pipeline: Callable ) -> None: - while not terminating_event.is_set(): + while not terminate.is_set(): logger.info('Running pipeline') - pipeline() + try: + pipeline() + except Exception: + logger.exception('Pipeline run failed') + terminate.set() heartbeat_queue.put(time.time()) post_monitor_sleep() diff --git a/core/schains/process_manager.py b/core/schains/process_manager.py index fddaa6a4d..190f47c75 100644 --- a/core/schains/process_manager.py +++ b/core/schains/process_manager.py @@ -24,7 +24,7 @@ from skale import Skale -from core.schains.monitor.main import run_monitor_for_schain +from core.schains.monitor.main import create_and_execute_pipelines from core.schains.notifications import notify_if_not_enough_balance from core.schains.process_manager_helper import ( terminate_stuck_schain_process, is_monitor_process_alive, terminate_process @@ -77,12 +77,13 @@ def run_pm_schain(skale, skale_ima, node_config, schain: Dict) -> None: logger.info(f'{log_prefix} PID {schain_record.monitor_id} is not running, spawning...') process = Process( name=schain['name'], - target=run_monitor_for_schain, + target=create_and_execute_pipelines, args=( skale, - skale_ima, + schain, node_config, - schain + skale_ima, + schain_record ) ) process.start() From a74f381af49c577f92736f971a8b76635c7d7c20 Mon Sep 17 00:00:00 2001 From: badrogger Date: Fri, 2 Aug 2024 13:36:28 +0000 Subject: [PATCH 05/32] Add ProcessReport --- core/schains/checks.py | 2 +- core/schains/cleaner.py | 2 +- core/schains/monitor/main.py | 98 +----------- core/schains/monitor/pipeline.py | 148 ++++++++++++++++++ .../{process_manager_helper.py => process.py} | 74 +++++---- core/schains/process_manager.py | 18 ++- tests/schains/monitor/main_test.py | 69 ++------ tests/schains/monitor/process_test.py | 34 ++++ 8 files changed, 263 insertions(+), 182 deletions(-) create mode 100644 core/schains/monitor/pipeline.py rename core/schains/{process_manager_helper.py => process.py} (64%) create mode 100644 tests/schains/monitor/process_test.py diff --git a/core/schains/checks.py b/core/schains/checks.py index
8f4b0d9ee..0202ed22d 100644 --- a/core/schains/checks.py +++ b/core/schains/checks.py @@ -41,7 +41,7 @@ from core.schains.dkg.utils import get_secret_key_share_filepath from core.schains.firewall.types import IRuleController from core.schains.ima import get_ima_time_frame, get_migration_ts as get_ima_migration_ts -from core.schains.process_manager_helper import is_monitor_process_alive +from core.schains.process import is_monitor_process_alive from core.schains.rpc import ( check_endpoint_alive, check_endpoint_blocks, diff --git a/core/schains/cleaner.py b/core/schains/cleaner.py index 985089db6..0be1454b1 100644 --- a/core/schains/cleaner.py +++ b/core/schains/cleaner.py @@ -35,7 +35,7 @@ get_node_ips_from_config, get_own_ip_from_config ) -from core.schains.process_manager_helper import terminate_schain_process +from core.schains.process import terminate_schain_process from core.schains.runner import get_container_name, is_exited from core.schains.external_config import ExternalConfig from core.schains.types import ContainerType diff --git a/core/schains/monitor/main.py b/core/schains/monitor/main.py index af650f0db..52922e3bf 100644 --- a/core/schains/monitor/main.py +++ b/core/schains/monitor/main.py @@ -19,11 +19,7 @@ import functools import logging -import queue -import random -import threading -import time -from typing import Callable, Dict, NamedTuple, Optional +from typing import Dict, Optional from importlib import reload from skale import Skale, SkaleIma @@ -33,12 +29,14 @@ from core.node_config import NodeConfig from core.schains.checks import ConfigChecks, get_api_checks_status, TG_ALLOWED_CHECKS, SkaledChecks from core.schains.config.file_manager import ConfigFileManager +from core.schains.config.static_params import get_automatic_repair_option from core.schains.firewall import get_default_rule_controller from core.schains.firewall.utils import get_sync_agent_ranges +from core.schains.external_config import ExternalConfig, ExternalState from core.schains.monitor import get_skaled_monitor, RegularConfigMonitor, SyncConfigMonitor from core.schains.monitor.action import ConfigActionManager, SkaledActionManager -from core.schains.external_config import ExternalConfig, ExternalState -from core.schains.config.static_params import get_automatic_repair_option +from core.schains.monitor.pipeline import Pipeline, run_pipelines +from core.schains.process import ProcessReport from core.schains.skaled_status import get_skaled_status from core.node import get_current_nodes @@ -53,19 +51,12 @@ MIN_SCHAIN_MONITOR_SLEEP_INTERVAL = 20 MAX_SCHAIN_MONITOR_SLEEP_INTERVAL = 40 -SKALED_PIPELINE_SLEEP = 2 -CONFIG_PIPELINE_SLEEP = 3 STUCK_TIMEOUT = 60 * 60 * 2 SHUTDOWN_INTERVAL = 60 * 10 logger = logging.getLogger(__name__) -class Pipeline(NamedTuple): - name: str - job: Callable - - def run_config_pipeline( skale: Skale, skale_ima: SkaleIma, schain: Dict, node_config: NodeConfig, stream_version: str ) -> None: @@ -178,20 +169,13 @@ def run_skaled_pipeline( mon(skaled_am, skaled_checks).run() -def post_monitor_sleep(): - schain_monitor_sleep = random.randint( - MIN_SCHAIN_MONITOR_SLEEP_INTERVAL, MAX_SCHAIN_MONITOR_SLEEP_INTERVAL - ) - logger.info('Monitor iteration completed, sleeping for %d', schain_monitor_sleep) - time.sleep(schain_monitor_sleep) - - def create_and_execute_pipelines( skale: Skale, schain: dict, node_config: NodeConfig, skale_ima: SkaleIma, schain_record: SChainRecord, + process_report: ProcessReport, dutils: Optional[DockerUtils] = None, ) -> bool: reload(web3_request) @@ -259,73 
+243,3 @@ create_and_execute_pipelines( run_pipelines(pipelines) return True - - -def run_pipelines( - pipelines: list[Pipeline], - once: bool = False, - stuck_timeout: int = STUCK_TIMEOUT, - shutdown_interval: int = SHUTDOWN_INTERVAL, -) -> None: - init_ts = time.time() - - heartbeat_queues = [queue.Queue() for _ in range(len(pipelines))] - terminating_events = [threading.Event() for _ in range(len(pipelines))] - heartbeat_ts = [init_ts for _ in range(len(pipelines))] - - threads = [ - threading.Thread( - name=pipeline.name, - target=keep_pipeline, - args=[heartbeat_queue, terminating_event, pipeline.job] - ) - for heartbeat_queue, terminating_event, pipeline in zip( - heartbeat_queues, terminating_events, pipelines - ) - ] - - for th in threads: - th.start() - - stuck = False - while not stuck: - for pindex, heartbeat_queue in enumerate(heartbeat_queues): - if not heartbeat_queue.empty(): - heartbeat_ts[pindex] = heartbeat_queue.get() - ts = time.time() - if ts - heartbeat_ts[pindex] > stuck_timeout: - logger.warning( - '%s pipeline is stuck (last heartbeat %d)', - pipelines[pindex].name, - heartbeat_ts[pindex], - ) - stuck = True - break - if once and all(ts > init_ts for ts in heartbeat_ts): - logger.info('Successfully completed requested single run') - break - - logger.info('Terminating all pipelines') - for event in terminating_events: - if not event.is_set(): - event.set() - if stuck: - logger.info('Joining threads with timeout') - for thread in threads: - thread.join(timeout=shutdown_interval) - logger.warning('Stuck pipeline was detected') - logger.info('Finishing with pipelines') - - -def keep_pipeline( - heartbeat_queue: queue.Queue, terminate: threading.Event, pipeline: Callable -) -> None: - while not terminate.is_set(): - logger.info('Running pipeline') - try: - pipeline() - except Exception: - logger.exception('Pipeline run failed') - terminate.set() - heartbeat_queue.put(time.time()) - post_monitor_sleep() diff --git a/core/schains/monitor/pipeline.py b/core/schains/monitor/pipeline.py new file mode 100644 index 000000000..16f738cea --- /dev/null +++ b/core/schains/monitor/pipeline.py @@ -0,0 +1,148 @@ +import logging +import queue +import random +import sys +import threading +import time + +from typing import Callable, NamedTuple +from core.schains.config.directory import schain_config_dir + +logger = logging.getLogger(__name__) + + +MIN_SCHAIN_MONITOR_SLEEP_INTERVAL = 20 +MAX_SCHAIN_MONITOR_SLEEP_INTERVAL = 40 + +SKALED_PIPELINE_SLEEP = 2 +CONFIG_PIPELINE_SLEEP = 3 +STUCK_TIMEOUT = 60 * 60 * 2 +SHUTDOWN_INTERVAL = 60 * 10 + + +class Pipeline(NamedTuple): + name: str + job: Callable + + +# class Runner: +# def __init__( +# self, +# pipelines: list[Pipeline], +# reporting_queue: queue.Queue, +# stuck_timeout: int = STUCK_TIMEOUT, +# shutdown_interval: int = SHUTDOWN_INTERVAL +# ) -> None: +# self.pipelines = pipelines +# self.reporting_queue = reporting_queue +# self.stuck_timeout = stuck_timeout +# self.shutdown_interval = shutdown_interval +# self.number = len(pipelines) +# +# def make_threads(self) -> None: +# init_ts = time.time() +# heartbeat_queues = [queue.Queue() for _ in range(self.number)] +# terminating_events = [threading.Event() for _ in range(self.number)] +# heartbeat_ts = [init_ts for _ in range(self.number)] +# +# return [ +# threading.Thread( +# name=pipeline.name, +# target=keep_pipeline, +# args=[heartbeat_queue, terminating_event, pipeline.job], +# ) +# for heartbeat_queue, terminating_event, pipeline in zip( +# heartbeat_queues, terminating_events, self.pipelines +# ) +# ] +# +# def run_threads(self, threads: list[threading.Thread]) -> None: +# for th in threads: +# th.start() +# +# def run(self, once: bool = False) -> None: +# threads = self.make_threads() +# self.run_threads(threads) + + +def run_pipelines( + pipelines: list[Pipeline], + process_report: ProcessReport, + once: bool = False, + stuck_timeout: int = STUCK_TIMEOUT, + shutdown_interval: int = SHUTDOWN_INTERVAL, +) -> None: + init_ts = time.time() + + heartbeat_queues = [queue.Queue() for _ in range(len(pipelines))] + terminating_events = [threading.Event() for _ in range(len(pipelines))] + heartbeat_ts = [init_ts for _ in range(len(pipelines))] + + threads = [ + threading.Thread( + name=pipeline.name, + target=keep_pipeline, + args=[heartbeat_queue, terminating_event, pipeline.job], + ) + for heartbeat_queue, terminating_event, pipeline in zip( + heartbeat_queues, terminating_events, pipelines + ) + ] + + for th in threads: + th.start() + + stuck = False + while not stuck: + for pindex, heartbeat_queue in enumerate(heartbeat_queues): + if not heartbeat_queue.empty(): + heartbeat_ts[pindex] = heartbeat_queue.get() + ts = int(time.time()) + if ts - heartbeat_ts[pindex] > stuck_timeout: + logger.warning( + '%s pipeline is stuck (last heartbeat %d)', + pipelines[pindex].name, + heartbeat_ts[pindex], + ) + stuck = True + break + if once and all(ts > init_ts for ts in heartbeat_ts): + logger.info('Successfully completed requested single run') + break + ts = int(time.time()) + process_report.ts = ts + + logger.info('Terminating all pipelines') + for event in terminating_events: + if not event.is_set(): + event.set() + if stuck: + logger.info('Joining threads with timeout') + for thread in threads: + thread.join(timeout=shutdown_interval) + process_report.ts = 0 + logger.warning('Stuck pipeline was detected') + + logger.info('Finishing with pipelines') + + +def keep_pipeline( + reporting_queue: queue.Queue, terminate: threading.Event, pipeline: Callable +) -> None: + while not terminate.is_set(): + logger.info('Running pipeline') + try: + pipeline() + except Exception: + logger.exception('Pipeline run failed') + terminate.set() + reporting_queue.put(time.time()) + sleep_for_a_while() + + +def sleep_for_a_while(): + schain_monitor_sleep = random.randint( + MIN_SCHAIN_MONITOR_SLEEP_INTERVAL, MAX_SCHAIN_MONITOR_SLEEP_INTERVAL + ) + logger.info('Monitor iteration completed, sleeping for %d', schain_monitor_sleep) + time.sleep(schain_monitor_sleep) diff --git a/core/schains/process_manager_helper.py b/core/schains/process.py similarity index 64% rename from core/schains/process_manager_helper.py rename to core/schains/process.py index 2128c7ba1..6e6317daf 100644 --- a/core/schains/process_manager_helper.py +++ b/core/schains/process.py @@ -19,12 +19,15 @@ import logging import os +import shutil import signal -from datetime import datetime +import json +import pathlib import psutil +from tools.configs.schains import SCHAINS_DIR_PATH from tools.helper import check_pid @@ -34,28 +37,6 @@ P_KILL_WAIT_TIMEOUT = 60 -def terminate_stuck_schain_process(skale, schain_record, schain): - """ - This function terminates the process if last_seen time is less than - DKG timeout * TIMEOUT_COEFFICIENT - """ - allowed_last_seen_time = _calc_allowed_last_seen_time(skale) - if not schain_record.monitor_last_seen: - logging.warning(f'schain: {schain["name"]}, monitor_last_seen is None, skipping...') - return - schain_monitor_last_seen = schain_record.monitor_last_seen.timestamp() - if
allowed_last_seen_time > schain_monitor_last_seen: - logger.warning(f'schain: {schain["name"]}, pid {schain_record.monitor_id} last seen is \ -{schain_monitor_last_seen}, while max allowed last_seen is {allowed_last_seen_time}, pid \ -{schain_record.monitor_id} will be terminated now!') - terminate_schain_process(schain_record) - - -def terminate_schain_process(schain_record): - log_msg = f'schain: {schain_record.name}' - terminate_process(schain_record.monitor_id, log_msg=log_msg) - - def terminate_process(pid, kill_timeout=P_KILL_WAIT_TIMEOUT, log_msg=''): log_prefix = f'pid: {pid} - ' if log_msg != '': @@ -79,14 +60,49 @@ def terminate_process(pid, kill_timeout=P_KILL_WAIT_TIMEOUT, log_msg=''): logging.exception(f'{log_prefix} - termination failed!') +def terminate_schain_process(schain_record): + log_msg = f'schain: {schain_record.name}' + terminate_process(schain_record.monitor_id, log_msg=log_msg) + + def is_monitor_process_alive(monitor_id): """Checks that provided monitor_id is inited and alive""" return monitor_id != 0 and check_pid(monitor_id) -def _calc_allowed_last_seen_time(skale): - dkg_timeout = skale.constants_holder.get_dkg_timeout() - allowed_diff = int(dkg_timeout * TIMEOUT_COEFFICIENT) - logger.info(f'dkg_timeout: {dkg_timeout}, TIMEOUT_COEFFICIENT: {TIMEOUT_COEFFICIENT}, \ -allowed_diff: {allowed_diff}') - return datetime.now().timestamp() - allowed_diff +class ProcessReport: + REPORT_FILENAME = 'process.json' + + def __init__(self, name: str) -> None: + self.path = pathlib.Path.joinpath(SCHAINS_DIR_PATH, name, self.REPORT_FILENAME) + + @property + def ts(self) -> int: + return self.read()['ts'] + + @property + def pid(self) -> int: + return self.read()['pid'] + + @property + def _tmp_path(self) -> str: + path = pathlib.Path(self.path) + return path.with_stem('.tmp.' + path.stem) + + def read(self) -> dict: + with open(self.path) as process_file: + data = json.load(process_file) + return data + + def _save_tmp(self, pid: int, ts: int) -> None: + data = {'pid': pid, 'ts': ts} + with open(self._tmp_path, 'w') as tmp_file: + json.dump(data, tmp_file) + + def _move(self) -> str: + if os.path.isfile(self._tmp_path): + shutil.move(self._tmp_path, self.path) + + def update(self, pid: int, ts: int) -> None: + self._save_tmp(pid=pid, ts=ts) + self._move() diff --git a/core/schains/process_manager.py b/core/schains/process_manager.py index 190f47c75..c8208a36d 100644 --- a/core/schains/process_manager.py +++ b/core/schains/process_manager.py @@ -17,8 +17,9 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . -import sys import logging +import sys +import time from typing import Dict from multiprocessing import Process @@ -26,8 +27,10 @@ from core.schains.monitor.main import create_and_execute_pipelines from core.schains.notifications import notify_if_not_enough_balance -from core.schains.process_manager_helper import ( - terminate_stuck_schain_process, is_monitor_process_alive, terminate_process +from core.schains.process import ( + is_monitor_process_alive, + terminate_process, + ProcessReport, ) from web.models.schain import upsert_schain_record, SChainRecord @@ -37,6 +40,9 @@ logger = logging.getLogger(__name__) +DKG_TIMEOUT_COEFFICIENT = 2.2 + + def pm_signal_handler(*args): """ This function is triggered when SIGTERM signal is received by the main process of the app.
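The hunk below replaces the old monitor_last_seen staleness check with one driven by ProcessReport: the monitor process periodically writes a heartbeat timestamp into process.json, and the process manager treats the monitor as stuck once that heartbeat is older than the DKG timeout scaled by DKG_TIMEOUT_COEFFICIENT (2.2). A minimal sketch of that check using the names from this patch; the standalone helper function is illustrative, not part of the diff:

import time

def monitor_is_stuck(report_ts: int, dkg_timeout: int, coefficient: float = 2.2) -> bool:
    # A heartbeat older than dkg_timeout * coefficient means the monitor
    # process has most likely hung; run_pm_schain then terminates it by PID.
    allowed_diff = int(dkg_timeout * coefficient)
    return int(time.time()) - report_ts > allowed_diff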
@@ -68,9 +74,13 @@ def run_process_manager(skale, skale_ima, node_config): def run_pm_schain(skale, skale_ima, node_config, schain: Dict) -> None: schain_record = upsert_schain_record(schain['name']) + process_report = ProcessReport(schain['name']) log_prefix = f'sChain {schain["name"]} -' # todo - move to logger formatter - terminate_stuck_schain_process(skale, schain_record, schain) + dkg_timeout = skale.constants_holder.get_dkg_timeout() + allowed_diff = int(dkg_timeout * DKG_TIMEOUT_COEFFICIENT) + if int(time.time()) - process_report.ts > allowed_diff: + terminate_process(process_report.pid) monitor_process_alive = is_monitor_process_alive(schain_record.monitor_id) if not monitor_process_alive: diff --git a/tests/schains/monitor/main_test.py b/tests/schains/monitor/main_test.py index 91da5b75c..0dcdcb3ef 100644 --- a/tests/schains/monitor/main_test.py +++ b/tests/schains/monitor/main_test.py @@ -1,18 +1,15 @@ import functools -import mock import logging import time -from concurrent.futures import ThreadPoolExecutor +from multiprocessing import Process import pytest from core.schains.firewall.types import IpRange from core.schains.firewall.utils import get_sync_agent_ranges -from core.schains.monitor.main import Pipeline, run_monitor_for_schain, run_pipelines -from core.schains.task import Task +from core.schains.monitor.main import Pipeline, run_pipelines from tools.helper import is_node_part_of_chain -from web.models.schain import upsert_schain_record @pytest.fixture @@ -52,50 +49,6 @@ def test_is_node_part_of_chain(skale, schain_on_contracts, node_config): assert not chain_on_node -def test_run_monitor_for_schain( - skale, - skale_ima, - schain_on_contracts, - node_config, - schain_db, - dutils -): - with mock.patch('core.schains.monitor.main.keep_tasks_running') as keep_tasks_running_mock: - run_monitor_for_schain( - skale, - skale_ima, - node_config, - schain={'name': schain_db, 'partOfNode': 0, 'generation': 0}, - dutils=dutils, - once=True - ) - assert isinstance(keep_tasks_running_mock.call_args[0][0], ThreadPoolExecutor) - assert isinstance(keep_tasks_running_mock.call_args[0][1][0], Task) - assert isinstance(keep_tasks_running_mock.call_args[0][1][1], Task) - assert keep_tasks_running_mock.call_args[0][2] == [None, None] - - -def test_run_monitor_for_schain_left( - skale, - skale_ima, - node_config, - schain_db, - dutils -): - schain_not_exists = 'not-on-node' - upsert_schain_record(schain_not_exists) - with mock.patch('core.schains.monitor.main.keep_tasks_running') as keep_tasks_running_mock: - run_monitor_for_schain( - skale, - skale_ima, - node_config, - schain={'name': schain_not_exists, 'partOfNode': 0, 'generation': 0}, - dutils=dutils, - once=True - ) - keep_tasks_running_mock.assert_not_called() - - def test_run_pipelines(): def simple_pipeline(index: int): logging.info('Running simple pipeline %d', index) @@ -108,13 +61,19 @@ def stuck_pipeline(index: int): logging.info('Stuck pipeline %d beat', index) time.sleep(2) - run_pipelines([ + target = functools.partial(run_pipelines, pipelines=[ Pipeline(name='healthy0', job=functools.partial(simple_pipeline, index=0)), Pipeline(name='healthy1', job=functools.partial(simple_pipeline, index=1)), ], once=True, stuck_timeout=5, shutdown_interval=10) + monitor_process = Process(target=target) + monitor_process.start() + monitor_process.join() + + run_pipelines([ + Pipeline(name='healthy', job=functools.partial(simple_pipeline, index=0)), + Pipeline(name='stuck', job=functools.partial(stuck_pipeline, index=1)) + ], 
stuck_timeout=5, shutdown_interval=10) - with pytest.raises(SystemExit): - run_pipelines([ - Pipeline(name='healthy', job=functools.partial(simple_pipeline, index=0)), - Pipeline(name='stuck', job=functools.partial(stuck_pipeline, index=1)) - ], stuck_timeout=5, shutdown_interval=10) + monitor_process = Process(target=target) + monitor_process.start() + monitor_process.join(timeout=50) diff --git a/tests/schains/monitor/process_test.py b/tests/schains/monitor/process_test.py new file mode 100644 index 000000000..a46572744 --- /dev/null +++ b/tests/schains/monitor/process_test.py @@ -0,0 +1,34 @@ +import os +import shutil +import time +from pathlib import Path + +import pytest + +from core.schains.process import ProcessReport + +from tools.configs.schains import SCHAINS_DIR_PATH + + +@pytest.fixture +def tmp_dir(): + schain_name = 'test' + path = os.path.join(SCHAINS_DIR_PATH, schain_name) + Path(path).mkdir() + try: + yield path + finally: + shutil.rmtree(path, ignore_errors=True) + + +def test_process_report(tmp_dir): + path = os.path.join(tmp_dir, 'process.json') + report = ProcessReport(path) + with pytest.raises(FileNotFoundError): + assert report.ts == 0 + + ts = int(time.time()) + pid = 10 + report.update(pid=pid, ts=ts) + assert report.ts == ts + assert report.pid == pid From c40b4db18913409d73563475d4a7825ad8406349 Mon Sep 17 00:00:00 2001 From: badrogger Date: Tue, 6 Aug 2024 18:15:05 +0000 Subject: [PATCH 06/32] Add process_manager tests --- core/schains/monitor/main.py | 17 +++-- core/schains/monitor/pipeline.py | 3 +- core/schains/process.py | 34 ++++++++-- core/schains/process_manager.py | 94 +++++++++++++-------------- tests/schains/monitor/main_test.py | 51 ++++++++++++--- tests/schains/monitor/process_test.py | 10 ++- tests/schains/process_manager_test.py | 75 +++++++++++++++++++++ 7 files changed, 204 insertions(+), 80 deletions(-) create mode 100644 tests/schains/process_manager_test.py diff --git a/core/schains/monitor/main.py b/core/schains/monitor/main.py index 52922e3bf..ffefdad46 100644 --- a/core/schains/monitor/main.py +++ b/core/schains/monitor/main.py @@ -19,7 +19,7 @@ import functools import logging -from typing import Dict, Optional +from typing import Optional from importlib import reload from skale import Skale, SkaleIma @@ -45,7 +45,7 @@ from tools.notifications.messages import notify_checks from tools.helper import is_node_part_of_chain, no_hyphens from tools.resources import get_statsd_client -from web.models.schain import SChainRecord +from web.models.schain import SChainRecord, upsert_schain_record MIN_SCHAIN_MONITOR_SLEEP_INTERVAL = 20 @@ -53,12 +53,13 @@ STUCK_TIMEOUT = 60 * 60 * 2 SHUTDOWN_INTERVAL = 60 * 10 +DKG_TIMEOUT_COEFFICIENT = 2.2 logger = logging.getLogger(__name__) def run_config_pipeline( - skale: Skale, skale_ima: SkaleIma, schain: Dict, node_config: NodeConfig, stream_version: str + skale: Skale, skale_ima: SkaleIma, schain: dict, node_config: NodeConfig, stream_version: str ) -> None: name = schain['name'] schain_record = SChainRecord.get_by_name(name) @@ -119,7 +120,7 @@ def run_config_pipeline( def run_skaled_pipeline( - skale: Skale, schain: Dict, node_config: NodeConfig, dutils: DockerUtils + skale: Skale, schain: dict, node_config: NodeConfig, dutils: DockerUtils ) -> None: name = schain['name'] schain_record = SChainRecord.get_by_name(name) @@ -169,12 +170,11 @@ def run_skaled_pipeline( mon(skaled_am, skaled_checks).run() -def create_and_execute_pipelines( +def start_monitor( skale: Skale, schain: dict, node_config: NodeConfig, 
skale_ima: SkaleIma, - schain_record: SChainRecord, process_report: ProcessReport, dutils: Optional[DockerUtils] = None, ) -> bool: @@ -182,6 +182,7 @@ def create_and_execute_pipelines( name = schain['name'] stream_version = get_skale_node_version() + schain_record = upsert_schain_record(name) is_rotation_active = skale.node_rotation.is_rotation_active(name) @@ -205,6 +206,8 @@ def create_and_execute_pipelines( pipelines = [] if not leaving_chain: logger.info('Adding config pipelines to the pool') + dkg_timeout = skale.constants_holder.get_dkg_timeout() + stuck_timeout = int(dkg_timeout * DKG_TIMEOUT_COEFFICIENT) pipelines.append( Pipeline( name='config', @@ -241,5 +244,5 @@ def create_and_execute_pipelines( logger.warning('No pipelines to run') return False - run_pipelines(pipelines) + run_pipelines(pipelines=pipelines, process_report=process_report) return True diff --git a/core/schains/monitor/pipeline.py b/core/schains/monitor/pipeline.py index 16f738cea..cf9f73ae7 100644 --- a/core/schains/monitor/pipeline.py +++ b/core/schains/monitor/pipeline.py @@ -1,12 +1,11 @@ import logging import queue import random -import sys import threading import time from typing import Callable, NamedTuple -from core.schains.config.directory import schain_config_dir +from core.schains.process import ProcessReport logger = logging.getLogger(__name__) diff --git a/core/schains/process.py b/core/schains/process.py index 6e6317daf..734bd7d8a 100644 --- a/core/schains/process.py +++ b/core/schains/process.py @@ -74,35 +74,55 @@ class ProcessReport: REPORT_FILENAME = 'process.json' def __init__(self, name: str) -> None: - self.path = pathlib.Path.joinpath(SCHAINS_DIR_PATH, name, self.REPORT_FILENAME) + self.path = pathlib.Path(SCHAINS_DIR_PATH).joinpath(name, self.REPORT_FILENAME) + + def is_exist(self) -> bool: + return os.path.isfile(self.path) @property def ts(self) -> int: return self.read()['ts'] + @ts.setter + def ts(self, value: int) -> None: + report = {} + if self.is_exist(): + report = self.read() + report['ts'] = value + self._save_tmp(report) + self._move() + @property def pid(self) -> int: return self.read()['pid'] + @pid.setter + def pid(self, value: int) -> None: + report = {} + if self.is_exist(): + report = self.read() + report['pid'] = value + self._save_tmp(report) + self._move() + @property def _tmp_path(self) -> str: - path = pathlib.Path(self.path) - return path.with_stem('.tmp.' + path.stem) + return self.path.with_stem('.tmp.' + self.path.stem) def read(self) -> dict: with open(self.path) as process_file: data = json.load(process_file) return data - def _save_tmp(self, pid: int, ts: int) -> None: - data = {'pid': pid, 'ts': ts} + def _save_tmp(self, report: dict) -> None: with open(self._tmp_path, 'w') as tmp_file: - json.dump(data, tmp_file) + json.dump(report, tmp_file) def _move(self) -> str: if os.path.isfile(self._tmp_path): shutil.move(self._tmp_path, self.path) def update(self, pid: int, ts: int) -> None: - self._save_tmp(pid=pid, ts=ts) + report = {'pid': pid, 'ts': ts} + self._save_tmp(report=report) self._move() diff --git a/core/schains/process_manager.py b/core/schains/process_manager.py index c8208a36d..549f492cd 100644 --- a/core/schains/process_manager.py +++ b/core/schains/process_manager.py @@ -18,14 +18,14 @@ # along with this program. If not, see . 
import logging -import sys import time -from typing import Dict from multiprocessing import Process +from typing import Optional -from skale import Skale +from skale import Skale, SkaleIma +from core.node_config import NodeConfig -from core.schains.monitor.main import create_and_execute_pipelines +from core.schains.monitor.main import start_monitor from core.schains.notifications import notify_if_not_enough_balance from core.schains.process import ( is_monitor_process_alive, terminate_process, ProcessReport, ) -from web.models.schain import upsert_schain_record, SChainRecord from tools.str_formatters import arguments_list_string @@ -37,24 +37,7 @@ logger = logging.getLogger(__name__) -def pm_signal_handler(*args): - """ - This function is triggered when SIGTERM signal is received by the main process of the app. - The purpose of the process manager signal handler is to forward SIGTERM signal to all sChain - processes so they can gracefully save DKG results before - """ - records = SChainRecord.select() - print(f'schain_records: {len(records)}') - print(f'schain_records: {records}') - for r in records: - logger.warning(f'Sending SIGTERM to {r.name}, {r.monitor_id}') - terminate_process(r.monitor_id) - logger.warning('All sChain processes stopped, exiting...') - sys.exit(0) - - -def run_process_manager(skale, skale_ima, node_config): - # signal.signal(signal.SIGTERM, pm_signal_handler) +def run_process_manager(skale: Skale, skale_ima: SkaleIma, node_config: NodeConfig) -> None: logger.info('Process manager started') node_id = node_config.id node_info = node_config.all() @@ -72,35 +54,42 @@ logger.info('Process manager procedure finished') -def run_pm_schain(skale, skale_ima, node_config, schain: Dict) -> None: - schain_record = upsert_schain_record(schain['name']) - process_report = ProcessReport(schain['name']) - log_prefix = f'sChain {schain["name"]} -' # todo - move to logger formatter +def run_pm_schain( + skale: Skale, + skale_ima: SkaleIma, + node_config: NodeConfig, + schain: dict, + timeout: Optional[int] = None, +) -> None: + log_prefix = f'sChain {schain["name"]} -' - dkg_timeout = skale.constants_holder.get_dkg_timeout() - allowed_diff = int(dkg_timeout * DKG_TIMEOUT_COEFFICIENT) - if int(time.time()) - process_report.ts > allowed_diff: - terminate_process(process_report.pid) - monitor_process_alive = is_monitor_process_alive(schain_record.monitor_id) + if timeout is not None: + allowed_diff = timeout + else: + dkg_timeout = skale.constants_holder.get_dkg_timeout() + allowed_diff = timeout or int(dkg_timeout * DKG_TIMEOUT_COEFFICIENT) - if not monitor_process_alive: - logger.info(f'{log_prefix} PID {schain_record.monitor_id} is not running, spawning...') + process_report = ProcessReport(schain['name']) + init_ts = int(time.time()) + if process_report.is_exist(): + if init_ts - process_report.ts > allowed_diff: + logger.info('%s Terminating process: PID = %d', log_prefix, process_report.pid) + terminate_process(process_report.pid) + else: + pid = process_report.pid + logger.info('%s Process is running: PID = %d', log_prefix, pid) + + if not process_report.is_exist() or not is_monitor_process_alive(process_report.pid): + process_report.ts = init_ts process = Process( name=schain['name'], - target=create_and_execute_pipelines, - args=( - skale, - schain, - node_config, - skale_ima, - schain_record - ) + target=start_monitor, + args=(skale, schain, node_config, skale_ima, process_report), ) process.start() -
schain_record.set_monitor_id(process.ident) - logger.info(f'{log_prefix} Process started: PID = {process.ident}') - else: - logger.info(f'{log_prefix} Process is running: PID = {schain_record.monitor_id}') + pid = process.ident + process_report.pid = pid + logger.info('%s Process started: PID = %d', log_prefix, pid) def fetch_schains_to_monitor(skale: Skale, node_id: int) -> list: @@ -114,9 +103,16 @@ def fetch_schains_to_monitor(skale: Skale, node_id: int) -> list: active_schains = list(filter(lambda schain: schain['active'], schains)) schains_holes = len(schains) - len(active_schains) logger.info( - arguments_list_string({'Node ID': node_id, 'sChains on node': active_schains, - 'Number of sChains on node': len(active_schains), - 'Empty sChain structs': schains_holes}, 'Monitoring sChains')) + arguments_list_string( + { + 'Node ID': node_id, + 'sChains on node': active_schains, + 'Number of sChains on node': len(active_schains), + 'Empty sChain structs': schains_holes, + }, + 'Monitoring sChains', + ) + ) return active_schains diff --git a/tests/schains/monitor/main_test.py b/tests/schains/monitor/main_test.py index 0dcdcb3ef..da003fc33 100644 --- a/tests/schains/monitor/main_test.py +++ b/tests/schains/monitor/main_test.py @@ -1,5 +1,8 @@ import functools import logging +import os +import pathlib +import shutil import time from multiprocessing import Process @@ -8,10 +11,21 @@ from core.schains.firewall.types import IpRange from core.schains.firewall.utils import get_sync_agent_ranges from core.schains.monitor.main import Pipeline, run_pipelines - +from core.schains.process import ProcessReport, terminate_process +from tools.configs.schains import SCHAINS_DIR_PATH from tools.helper import is_node_part_of_chain +@pytest.fixture +def tmp_dir(_schain_name): + path = os.path.join(SCHAINS_DIR_PATH, _schain_name) + pathlib.Path(path).mkdir() + try: + yield path + finally: + shutil.rmtree(path, ignore_errors=True) + + @pytest.fixture def sync_ranges(skale): skale.sync_manager.grant_sync_manager_role(skale.wallet.address) @@ -49,7 +63,7 @@ def test_is_node_part_of_chain(skale, schain_on_contracts, node_config): assert not chain_on_node -def test_run_pipelines(): +def test_run_pipelines(tmp_dir, _schain_name): def simple_pipeline(index: int): logging.info('Running simple pipeline %d', index) time.sleep(1) @@ -61,19 +75,38 @@ def stuck_pipeline(index: int): logging.info('Stuck pipeline %d beat', index) time.sleep(2) + process_report = ProcessReport(name=_schain_name) + target = functools.partial(run_pipelines, pipelines=[ Pipeline(name='healthy0', job=functools.partial(simple_pipeline, index=0)), Pipeline(name='healthy1', job=functools.partial(simple_pipeline, index=1)), - ], once=True, stuck_timeout=5, shutdown_interval=10) + ], process_report=process_report, once=True, stuck_timeout=5, shutdown_interval=10) + + terminated = False monitor_process = Process(target=target) - monitor_process.start() - monitor_process.join() + try: + monitor_process.start() + monitor_process.join() + finally: + if monitor_process.is_alive(): + terminated = True + terminate_process(monitor_process.ident) + assert not terminated - run_pipelines([ + target = functools.partial(run_pipelines, pipelines=[ Pipeline(name='healthy', job=functools.partial(simple_pipeline, index=0)), Pipeline(name='stuck', job=functools.partial(stuck_pipeline, index=1)) - ], stuck_timeout=5, shutdown_interval=10) + ], process_report=process_report, stuck_timeout=5, shutdown_interval=10) monitor_process = Process(target=target) - 
monitor_process.start() - monitor_process.join(timeout=50) + terminated = False + + try: + monitor_process.start() + monitor_process.join(timeout=50) + finally: + if monitor_process.is_alive(): + terminated = True + terminate_process(monitor_process.ident) + + assert terminated diff --git a/tests/schains/monitor/process_test.py b/tests/schains/monitor/process_test.py index a46572744..d5b495d03 100644 --- a/tests/schains/monitor/process_test.py +++ b/tests/schains/monitor/process_test.py @@ -11,9 +11,8 @@ @pytest.fixture -def tmp_dir(): - schain_name = 'test' - path = os.path.join(SCHAINS_DIR_PATH, schain_name) +def tmp_dir(_schain_name): + path = os.path.join(SCHAINS_DIR_PATH, _schain_name) Path(path).mkdir() try: yield path @@ -21,9 +20,8 @@ def tmp_dir(): shutil.rmtree(path, ignore_errors=True) -def test_process_report(tmp_dir): - path = os.path.join(tmp_dir, 'process.json') - report = ProcessReport(path) +def test_process_report(_schain_name, tmp_dir): + report = ProcessReport(_schain_name) with pytest.raises(FileNotFoundError): assert report.ts == 0 diff --git a/tests/schains/process_manager_test.py b/tests/schains/process_manager_test.py new file mode 100644 index 000000000..80171e749 --- /dev/null +++ b/tests/schains/process_manager_test.py @@ -0,0 +1,75 @@ +import mock +import os +import pathlib +import shutil +import time + +import psutil +import pytest + +from core.schains.process import ProcessReport, terminate_process +from core.schains.process_manager import run_pm_schain +from tools.configs.schains import SCHAINS_DIR_PATH + + +@pytest.fixture +def tmp_dir(_schain_name): + path = os.path.join(SCHAINS_DIR_PATH, _schain_name) + pathlib.Path(path).mkdir() + try: + yield path + finally: + shutil.rmtree(path, ignore_errors=True) + + +def test_run_pm_schain(tmp_dir, skale, skale_ima, node_config, _schain_name): + def target_regular_mock(*args, **kwargs): + process_report = args[-1] + iterations = 5 + for _ in range(iterations): + process_report.ts = int(time.time()) + time.sleep(1) + + def target_stuck_mock(*args, **kwargs): + iterations = 10000 + for _ in range(iterations): + time.sleep(1) + + schain = {'name': _schain_name} + + timeout = 7 + + with mock.patch('core.schains.process_manager.start_monitor', target_regular_mock): + run_pm_schain(skale, skale_ima, node_config, schain, timeout=timeout) + pid = ProcessReport(_schain_name).pid + assert psutil.Process(pid).is_running() + + start_ts = int(time.time()) + + while int(time.time()) - start_ts < 2 * timeout: + time.sleep(1) + assert psutil.Process(pid).status() == 'zombie' + + with mock.patch('core.schains.process_manager.start_monitor', target_stuck_mock): + run_pm_schain(skale, skale_ima, node_config, schain, timeout=timeout) + + pid = ProcessReport(_schain_name).pid + + assert psutil.Process(pid).is_running() + + start_ts = int(time.time()) + + while int(time.time()) - start_ts < 2 * timeout: + try: + psutil.Process(pid).is_running() + except psutil.NoSuchProcess: + break + with mock.patch('core.schains.process_manager.start_monitor', target_stuck_mock): + run_pm_schain(skale, skale_ima, node_config, schain, timeout=timeout) + time.sleep(1) + + with pytest.raises(psutil.NoSuchProcess): + psutil.Process(pid).is_running() + + pid = ProcessReport(_schain_name).pid + terminate_process(pid) From 54d1201e871b19c950158c105a36462baa477889 Mon Sep 17 00:00:00 2001 From: badrogger Date: Tue, 6 Aug 2024 18:15:52 +0000 Subject: [PATCH 07/32] Remove redundant steps in test cleanup --- scripts/helper.sh | 4 +--- 1 file changed, 1 
insertion(+), 3 deletions(-) diff --git a/scripts/helper.sh b/scripts/helper.sh index b8f15641a..4e7ee18cc 100644 --- a/scripts/helper.sh +++ b/scripts/helper.sh @@ -31,11 +31,9 @@ export_test_env () { tests_cleanup () { export_test_env - docker rm -f skale_schain_test && docker volume rm test || true - sudo rm -r tests/skale-data/lib || true + rm -r tests/skale-data/lib || true rm tests/skale-data/node_data/node_config.json || true docker rm -f sgx-simulator || true - docker rm -f skale_schain_test1 skale_schain_test2 skale_schain_test3 || true find . -name \*.pyc -delete || true mkdir -p $SGX_CERTIFICATES_FOLDER || true rm -rf $SGX_CERTIFICATES_FOLDER/sgx.* || true From 396a846a83f8c7510de86da832ce478c90ca8005 Mon Sep 17 00:00:00 2001 From: badrogger Date: Tue, 6 Aug 2024 19:08:01 +0000 Subject: [PATCH 08/32] Remove DKG_TIMEOUT_COEFFICIENT duplicate --- core/schains/monitor/main.py | 9 +++++---- core/schains/process_manager.py | 5 +---- tools/configs/schains.py | 2 ++ 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/core/schains/monitor/main.py b/core/schains/monitor/main.py index ffefdad46..9196921bd 100644 --- a/core/schains/monitor/main.py +++ b/core/schains/monitor/main.py @@ -42,6 +42,7 @@ from tools.docker_utils import DockerUtils from tools.configs import SYNC_NODE +from tools.configs.schains import DKG_TIMEOUT_COEFFICIENT from tools.notifications.messages import notify_checks from tools.helper import is_node_part_of_chain, no_hyphens from tools.resources import get_statsd_client @@ -53,7 +54,6 @@ STUCK_TIMEOUT = 60 * 60 * 2 SHUTDOWN_INTERVAL = 60 * 10 -DKG_TIMEOUT_COEFFICIENT = 2.2 logger = logging.getLogger(__name__) @@ -184,6 +184,9 @@ def start_monitor( stream_version = get_skale_node_version() schain_record = upsert_schain_record(name) + dkg_timeout = skale.constants_holder.get_dkg_timeout() + stuck_timeout = int(dkg_timeout * DKG_TIMEOUT_COEFFICIENT) + is_rotation_active = skale.node_rotation.is_rotation_active(name) leaving_chain = not SYNC_NODE and not is_node_part_of_chain(skale, name, node_config.id) @@ -206,8 +209,6 @@ def start_monitor( pipelines = [] if not leaving_chain: logger.info('Adding config pipelines to the pool') - dkg_timeout = skale.constants_holder.get_dkg_timeout() - stuck_timeout = int(dkg_timeout * DKG_TIMEOUT_COEFFICIENT) pipelines.append( Pipeline( name='config', @@ -244,5 +245,5 @@ def start_monitor( logger.warning('No pipelines to run') return False - run_pipelines(pipelines=pipelines, process_report=process_report) + run_pipelines(pipelines=pipelines, process_report=process_report, stuck_timeout=stuck_timeout) return True diff --git a/core/schains/process_manager.py b/core/schains/process_manager.py index 549f492cd..a7785af1b 100644 --- a/core/schains/process_manager.py +++ b/core/schains/process_manager.py @@ -34,14 +34,11 @@ ) from tools.str_formatters import arguments_list_string - +from tools.configs.schains import DKG_TIMEOUT_COEFFICIENT logger = logging.getLogger(__name__) -DKG_TIMEOUT_COEFFICIENT = 2.2 - - def run_process_manager(skale: Skale, skale_ima: SkaleIma, node_config: NodeConfig) -> None: logger.info('Process manager started') node_id = node_config.id diff --git a/tools/configs/schains.py b/tools/configs/schains.py index 566709ca8..390275471 100644 --- a/tools/configs/schains.py +++ b/tools/configs/schains.py @@ -53,3 +53,5 @@ RPC_CHECK_TIMEOUT_STEP = 10 MAX_CONSENSUS_STORAGE_INF_VALUE = 1000000000000000000 + +DKG_TIMEOUT_COEFFICIENT = 2.2 From bd8d7cde4238d56aa766db063afb6fdffd8c78f1 Mon Sep 17 00:00:00 2001 From: 
badrogger Date: Wed, 7 Aug 2024 11:52:55 +0000 Subject: [PATCH 09/32] Update IMA agent versions in container.json --- tests/skale-data/config/containers.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/skale-data/config/containers.json b/tests/skale-data/config/containers.json index 3561d2539..3e0221bd6 100644 --- a/tests/skale-data/config/containers.json +++ b/tests/skale-data/config/containers.json @@ -31,8 +31,8 @@ }, "ima": { "name": "skalenetwork/ima", - "version": "2.0.0-develop.3", - "new_version": "2.0.0-beta.9", + "version": "2.1.0-beta.3", + "new_version": "2.1.0", "custom_args": {}, "args": { "restart_policy": { From 563a67751274e4c1bf305dc3325360bc3f9d73ff Mon Sep 17 00:00:00 2001 From: badrogger Date: Wed, 7 Aug 2024 17:00:24 +0000 Subject: [PATCH 10/32] Fix IMA migration tests --- tests/schains/monitor/action/skaled_action_test.py | 8 ++++---- tests/skale-data/config/containers.json | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/schains/monitor/action/skaled_action_test.py b/tests/schains/monitor/action/skaled_action_test.py index 3281ec6bd..22f59f876 100644 --- a/tests/schains/monitor/action/skaled_action_test.py +++ b/tests/schains/monitor/action/skaled_action_test.py @@ -282,7 +282,7 @@ def test_ima_container_action_from_scratch( container_name = containers[0].name assert container_name == f'skale_ima_{skaled_am.name}' image = dutils.get_container_image_name(container_name) - assert image == 'skalenetwork/ima:2.0.0-beta.9' + assert image == 'skalenetwork/ima:2.1.0' # @pytest.mark.skip('Docker API GA issues need to be resolved') @@ -304,8 +304,8 @@ def test_ima_container_action_image_pulling( container_name = containers[0].name assert container_name == f'skale_ima_{skaled_am.name}' image = dutils.get_container_image_name(container_name) - assert image == 'skalenetwork/ima:2.0.0-develop.3' - assert dutils.pulled('skalenetwork/ima:2.0.0-beta.9') + assert image == 'skalenetwork/ima:2.1.0-beta.3' + assert dutils.pulled('skalenetwork/ima:2.1.0') def test_ima_container_action_image_migration( @@ -325,7 +325,7 @@ def test_ima_container_action_image_migration( container_name = containers[0].name assert container_name == f'skale_ima_{skaled_am.name}' image = dutils.get_container_image_name(container_name) - assert image == 'skalenetwork/ima:2.0.0-beta.9' + assert image == 'skalenetwork/ima:2.1.0' def test_ima_container_action_time_frame_migration( diff --git a/tests/skale-data/config/containers.json b/tests/skale-data/config/containers.json index 3e0221bd6..41f3f514e 100644 --- a/tests/skale-data/config/containers.json +++ b/tests/skale-data/config/containers.json @@ -1,7 +1,7 @@ { "schain": { "name": "skalenetwork/schain", - "version": "3.16.1", + "version": "3.19.0", "custom_args": { "ulimits_list": [ { From da438179ece4e9bb945a776ce8c557316e262af0 Mon Sep 17 00:00:00 2001 From: badrogger Date: Wed, 7 Aug 2024 17:06:55 +0000 Subject: [PATCH 11/32] Fix routes tests --- tests/routes/schains_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/routes/schains_test.py b/tests/routes/schains_test.py index 329987985..e9dd2218e 100644 --- a/tests/routes/schains_test.py +++ b/tests/routes/schains_test.py @@ -172,8 +172,8 @@ def test_get_schain( def test_schain_containers_versions(skale_bp): - expected_skaled_version = '3.16.1' - expected_ima_version = '2.0.0-beta.9' + expected_skaled_version = '3.19.0' + expected_ima_version = '2.1.0' data = get_bp_data(skale_bp, get_api_url( BLUEPRINT_NAME, 
'container-versions')) assert data == { From 6fdce4e616e4742774a87728f4dd5bd73af18e14 Mon Sep 17 00:00:00 2001 From: badrogger Date: Wed, 7 Aug 2024 18:10:59 +0000 Subject: [PATCH 12/32] Remove locking from SChainRecord --- web/models/schain.py | 61 +++++++++++++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 21 deletions(-) diff --git a/web/models/schain.py b/web/models/schain.py index a7f67eb79..f60e97e72 100644 --- a/web/models/schain.py +++ b/web/models/schain.py @@ -17,12 +17,13 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . +import functools import logging -import threading +import time from datetime import datetime from peewee import (CharField, DateTimeField, - IntegrityError, IntegerField, BooleanField) + IntegrityError, IntegerField, BooleanField, OperationalError) from core.schains.dkg.structures import DKGStatus from web.models.base import BaseModel @@ -30,10 +31,31 @@ logger = logging.getLogger(__name__) DEFAULT_CONFIG_VERSION = '0.0.0' +RETRY_ATTEMPTS = 10 +TIMEOUTS = [2 ** p for p in range(RETRY_ATTEMPTS)] + + +def operational_error_retry(func): + @functools.wraps(func) + def wrapper(cls, *args, **kwargs): + result, error = None, None + for i, timeout in enumerate(TIMEOUTS): + try: + result = func(cls, *args, **kwargs) + except OperationalError as e: + logger.exception('DB operational error') + error = e + time.sleep(timeout) + else: + error = None + break + if error is not None: + raise error + return result + return wrapper class SChainRecord(BaseModel): - _lock = threading.Lock() name = CharField(unique=True) added_at = DateTimeField() dkg_status = IntegerField() @@ -70,6 +92,7 @@ def add(cls, name): return (None, err) @classmethod + @operational_error_retry def get_by_name(cls, name): return cls.get(cls.name == name) @@ -107,10 +130,6 @@ def to_dict(cls, record): 'failed_rpc_count': record.failed_rpc_count } - def upload(self, *args, **kwargs) -> None: - with SChainRecord._lock: - self.save(*args, **kwargs) - def dkg_started(self): self.set_dkg_status(DKGStatus.IN_PROGRESS) @@ -126,66 +145,66 @@ def dkg_done(self): def set_dkg_status(self, val: DKGStatus) -> None: logger.info(f'Changing DKG status for {self.name} to {val.name}') self.dkg_status = val - self.upload() + self.save() def set_deleted(self): self.is_deleted = True - self.upload() + self.save() def set_first_run(self, val): logger.info(f'Changing first_run for {self.name} to {val}') self.first_run = val - self.upload(only=[SChainRecord.first_run]) + self.save(only=[SChainRecord.first_run]) def set_backup_run(self, val): logger.info(f'Changing backup_run for {self.name} to {val}') self.backup_run = val - self.upload(only=[SChainRecord.backup_run]) + self.save(only=[SChainRecord.backup_run]) def set_repair_mode(self, value): logger.info(f'Changing repair_mode for {self.name} to {value}') self.repair_mode = value - self.upload() + self.save() def set_new_schain(self, value): logger.info(f'Changing new_schain for {self.name} to {value}') self.new_schain = value - self.upload() + self.save() def set_needs_reload(self, value): logger.info(f'Changing needs_reload for {self.name} to {value}') self.needs_reload = value - self.upload() + self.save() def set_monitor_last_seen(self, value): logger.info(f'Changing monitor_last_seen for {self.name} to {value}') self.monitor_last_seen = value - self.upload() + self.save() def set_monitor_id(self, value): logger.info(f'Changing monitor_id for {self.name} to {value}') self.monitor_id = 
value - self.upload() + self.save() def set_config_version(self, value): logger.info(f'Changing config_version for {self.name} to {value}') self.config_version = value - self.upload() + self.save() def set_restart_count(self, value: int) -> None: logger.info(f'Changing restart count for {self.name} to {value}') self.restart_count = value - self.upload() + self.save() def set_failed_rpc_count(self, value: int) -> None: logger.info(f'Changing failed rpc count for {self.name} to {value}') self.failed_rpc_count = value - self.upload() + self.save() def set_snapshot_from(self, value: str) -> None: logger.info(f'Changing snapshot from for {self.name} to {value}') self.snapshot_from = value - self.upload() + self.save() def reset_failed_counters(self) -> None: logger.info(f'Resetting failed counters for {self.name}') @@ -203,7 +222,7 @@ def is_dkg_done(self) -> bool: def set_sync_config_run(self, value): logger.info(f'Changing sync_config_run for {self.name} to {value}') self.sync_config_run = value - self.upload() + self.save() def is_dkg_unsuccessful(self) -> bool: return self.dkg_status in [ From 6736628c4a7fc321f9d6efdf526b325ccb38cb35 Mon Sep 17 00:00:00 2001 From: badrogger Date: Wed, 7 Aug 2024 18:11:34 +0000 Subject: [PATCH 13/32] Switch to info log level for gunicorn --- gunicorn.conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gunicorn.conf.py b/gunicorn.conf.py index e301f4843..51958e107 100644 --- a/gunicorn.conf.py +++ b/gunicorn.conf.py @@ -1,4 +1,4 @@ bind = "127.0.0.1:3007" workers = 2 timeout = 1000 -loglevel = "debug" +loglevel = "info" From d9a3ccd7ec376597dcd0b0f2aaa62e2af55cc4f6 Mon Sep 17 00:00:00 2001 From: badrogger Date: Wed, 28 Aug 2024 10:49:41 +0000 Subject: [PATCH 14/32] Remove unused Runner class --- core/schains/monitor/pipeline.py | 40 -------------------------------- 1 file changed, 40 deletions(-) diff --git a/core/schains/monitor/pipeline.py b/core/schains/monitor/pipeline.py index cf9f73ae7..eaf5c2ccf 100644 --- a/core/schains/monitor/pipeline.py +++ b/core/schains/monitor/pipeline.py @@ -24,46 +24,6 @@ class Pipeline(NamedTuple): job: Callable -# class Runner: -# def __init__( -# self, -# pipelines: list[Pipeline], -# reporting_queue: queue.Queue, -# stuck_timeout: int = STUCK_TIMEOUT, -# shutdown_interval: int = SHUTDOWN_INTERVAL -# ) -> None: -# self.pipelines = pipelines -# self.reporting_queue = reporting_queue -# self.stuck_timeout = stuck_timeout -# self.shutdown_interval = shutdown_interval -# self.number = len(pipelines) -# -# def make_threads(self) -> None: -# init_ts = time.time() -# heartbeat_queues = [queue.Queue() for _ in range(self.number)] -# terminating_events = [threading.Event() for _ in range(self.number)] -# heartbeat_ts = [init_ts for _ in range(self.number)] -# -# return [ -# threading.Thread( -# name=pipeline.name, -# target=keep_pipeline, -# args=[heartbeat_queue, terminating_event, pipeline.job], -# ) -# for heartbeat_queue, terminating_event, pipeline in zip( -# heartbeat_queues, terminating_events, self.pipelines -# ) -# ] -# -# def run_threads(self, threads: list[threading.Thread]) -> None: -# for th in threads: -# th.start() -# -# def run(self, once: bool = False) -> None: -# threads = self.make_threads() -# self.run_threads(threads) - - def run_pipelines( pipelines: list[Pipeline], process_report: ProcessReport, From 63a80d96a64fcd6512e552a8af397c3b0b32de72 Mon Sep 17 00:00:00 2001 From: badrogger Date: Thu, 29 Aug 2024 18:10:05 +0000 Subject: [PATCH 15/32] Fix new chain --- core/schains/process.py | 1 + 
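For reference, the backoff behaviour introduced by PATCH 12 is easy to exercise on its own. The sketch below is a simplified stand-alone rendition of the same pattern, not the module itself: a local OperationalError class stands in for peewee's, and the wrapper returns on the first success instead of carrying result/error variables. One consequence worth noting: with TIMEOUTS = [2 ** p for p in range(10)], a persistently failing query keeps the caller blocked for 1 + 2 + ... + 512 s, roughly 17 minutes, before the error is re-raised.

    import functools
    import logging
    import time

    logger = logging.getLogger(__name__)

    RETRY_ATTEMPTS = 10
    TIMEOUTS = [2 ** p for p in range(RETRY_ATTEMPTS)]  # 1, 2, 4, ..., 512 seconds


    class OperationalError(Exception):
        """Stand-in for peewee.OperationalError so the sketch needs no database."""


    def operational_error_retry(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            error = None
            for timeout in TIMEOUTS:
                try:
                    return func(*args, **kwargs)  # simplification: return on first success
                except OperationalError as e:
                    logger.exception('DB operational error, sleeping %ds', timeout)
                    error = e
                    time.sleep(timeout)
            raise error
        return wrapper


    @operational_error_retry
    def flaky_read(state={'failures_left': 2}):
        # Fails twice (as if the database file were locked), then succeeds.
        if state['failures_left'] > 0:
            state['failures_left'] -= 1
            raise OperationalError('database is locked')
        return 'row'


    print(flaky_read())  # sleeps 1 s + 2 s, then prints 'row'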
1 file changed, 1 insertion(+) diff --git a/core/schains/process.py b/core/schains/process.py index 734bd7d8a..a66290d03 100644 --- a/core/schains/process.py +++ b/core/schains/process.py @@ -75,6 +75,7 @@ class ProcessReport: def __init__(self, name: str) -> None: self.path = pathlib.Path(SCHAINS_DIR_PATH).joinpath(name, self.REPORT_FILENAME) + self.path.parent.mkdir(parents=True, exist_ok=True) def is_exist(self) -> bool: return os.path.isfile(self.path) From 7d1bdc297aa7ca1f16c8e68759f038e9cd6cda5e Mon Sep 17 00:00:00 2001 From: badrogger Date: Tue, 10 Sep 2024 16:07:20 +0000 Subject: [PATCH 16/32] Fix cleaner --- core/schains/cleaner.py | 7 ++-- core/schains/process.py | 74 ++++++++++++++++++--------------- core/schains/process_manager.py | 20 ++++----- 3 files changed, 54 insertions(+), 47 deletions(-) diff --git a/core/schains/cleaner.py b/core/schains/cleaner.py index f4545bcbc..2248c8a14 100644 --- a/core/schains/cleaner.py +++ b/core/schains/cleaner.py @@ -37,7 +37,7 @@ get_node_ips_from_config, get_own_ip_from_config, ) -from core.schains.process import terminate_schain_process +from core.schains.process import ProcessReport, terminate_process from core.schains.runner import get_container_name, is_exited from core.schains.external_config import ExternalConfig from core.schains.types import ContainerType @@ -185,9 +185,10 @@ def remove_schain( msg: str, dutils: Optional[DockerUtils] = None, ) -> None: - schain_record = upsert_schain_record(schain_name) logger.warning(msg) - terminate_schain_process(schain_record) + report = ProcessReport(name=schain_name) + if report.is_exist(): + terminate_process(report) delete_bls_keys(skale, schain_name) sync_agent_ranges = get_sync_agent_ranges(skale) diff --git a/core/schains/process.py b/core/schains/process.py index a66290d03..b655bff3e 100644 --- a/core/schains/process.py +++ b/core/schains/process.py @@ -26,7 +26,6 @@ import pathlib import psutil - from tools.configs.schains import SCHAINS_DIR_PATH from tools.helper import check_pid @@ -37,39 +36,6 @@ P_KILL_WAIT_TIMEOUT = 60 -def terminate_process(pid, kill_timeout=P_KILL_WAIT_TIMEOUT, log_msg=''): - log_prefix = f'pid: {pid} - ' - if log_msg != '': - log_prefix += f'{log_msg} - ' - if pid == 0: - logger.warning(f'{log_prefix} - pid is 0, skipping') - return - try: - logger.warning(f'{log_prefix} - going to terminate') - p = psutil.Process(pid) - os.kill(p.pid, signal.SIGTERM) - p.wait(timeout=kill_timeout) - logger.info(f'{log_prefix} was terminated') - except psutil.NoSuchProcess: - logger.info(f'{log_prefix} - no such process') - except psutil.TimeoutExpired: - logger.warning(f'{log_prefix} - timout expired, going to kill') - p.kill() - logger.info(f'{log_prefix} - process was killed') - except Exception: - logging.exception(f'{log_prefix} - termination failed!') - - -def terminate_schain_process(schain_record): - log_msg = f'schain: {schain_record.name}' - terminate_process(schain_record.monitor_id, log_msg=log_msg) - - -def is_monitor_process_alive(monitor_id): - """Checks that provided monitor_id is inited and alive""" - return monitor_id != 0 and check_pid(monitor_id) - - class ProcessReport: REPORT_FILENAME = 'process.json' @@ -127,3 +93,43 @@ def update(self, pid: int, ts: int) -> None: report = {'pid': pid, 'ts': ts} self._save_tmp(report=report) self._move() + + def cleanup(self) -> None: + os.remove(self.path) + + +def terminate_process( + process_report: ProcessReport, + kill_timeout: int = P_KILL_WAIT_TIMEOUT, + log_msg: str = '' +) -> None: + pid = process_report.pid + 
log_prefix = f'pid: {pid} - ' + + if log_msg != '': + log_prefix += f'{log_msg} - ' + if pid == 0: + logger.warning(f'{log_prefix} - pid is 0, skipping') + return + try: + logger.warning(f'{log_prefix} - going to terminate') + p = psutil.Process(pid) + os.kill(p.pid, signal.SIGTERM) + p.wait(timeout=kill_timeout) + logger.info(f'{log_prefix} was terminated') + except psutil.NoSuchProcess: + logger.info(f'{log_prefix} - no such process') + except psutil.TimeoutExpired: + logger.warning(f'{log_prefix} - timeout expired, going to kill') + p.kill() + logger.info(f'{log_prefix} - process was killed') + except Exception: + logger.exception(f'{log_prefix} - termination failed!') + return + logger.info(f'{log_prefix} - removing process report for {pid}') + process_report.cleanup() + + +def is_monitor_process_alive(monitor_pid: int) -> bool: + """Checks that provided monitor_id is inited and alive""" + return monitor_pid != 0 and check_pid(monitor_pid) diff --git a/core/schains/process_manager.py b/core/schains/process_manager.py index a7785af1b..b926db4b5 100644 --- a/core/schains/process_manager.py +++ b/core/schains/process_manager.py @@ -66,26 +66,26 @@ def run_pm_schain( dkg_timeout = skale.constants_holder.get_dkg_timeout() allowed_diff = timeout or int(dkg_timeout * DKG_TIMEOUT_COEFFICIENT) - process_report = ProcessReport(schain['name']) + report = ProcessReport(schain['name']) init_ts = int(time.time()) - if process_report.is_exist(): - if init_ts - process_report.ts > allowed_diff: - logger.info('%s Terminating process: PID = %d', log_prefix, process_report.pid) - terminate_process(process_report.pid) + if report.is_exist() and is_monitor_process_alive(report.pid): + if init_ts - report.ts > allowed_diff: + logger.info('%s Terminating process: PID = %d', log_prefix, report.pid) + terminate_process(report.pid) else: - pid = process_report.pid + pid = report.pid logger.info('%s Process is running: PID = %d', log_prefix, pid) - if not process_report.is_exist() or not is_monitor_process_alive(process_report.pid): - process_report.ts = init_ts + if not report.is_exist() or not is_monitor_process_alive(report.pid): + report.ts = init_ts process = Process( name=schain['name'], target=start_monitor, - args=(skale, schain, node_config, skale_ima, process_report), + args=(skale, schain, node_config, skale_ima, report), ) process.start() pid = process.ident - process_report.pid = pid + report.pid = pid logger.info('%s Process started: PID = %d', log_prefix, pid) From ee9653c7c25a089a56fbb73858b7e2866ee1d5e4 Mon Sep 17 00:00:00 2001 From: badrogger Date: Fri, 13 Sep 2024 12:29:16 +0000 Subject: [PATCH 17/32] Fix terminate_process usage --- core/schains/process_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/schains/process_manager.py b/core/schains/process_manager.py index b926db4b5..2f3cbbf1b 100644 --- a/core/schains/process_manager.py +++ b/core/schains/process_manager.py @@ -71,7 +71,7 @@ def run_pm_schain( if report.is_exist() and is_monitor_process_alive(report.pid): if init_ts - report.ts > allowed_diff: logger.info('%s Terminating process: PID = %d', log_prefix, report.pid) - terminate_process(report.pid) + terminate_process(report) else: pid = report.pid logger.info('%s Process is running: PID = %d', log_prefix, pid) From 87b0915279b741deaaf4026a98bcd8e56b198432 Mon Sep 17 00:00:00 2001 From: badrogger Date: Thu, 19 Sep 2024 16:09:36 +0000 Subject: [PATCH 18/32] Fix process_manager --- core/schains/process_manager.py | 2 +- 1 file changed, 1 insertion(+), 1
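Taken together, PATCH 16-18 settle on one watchdog rule: a monitor process is started when its report is missing or its PID is dead, terminated when the heartbeat timestamp is older than allowed_diff, and left alone otherwise. A stand-alone sketch of that decision, with hypothetical helper names as simplified stand-ins for ProcessReport and psutil (os.kill(pid, 0) is the usual POSIX liveness probe, presumably what check_pid wraps):

    import json
    import os
    import time

    def pid_alive(pid: int) -> bool:
        # Signal 0 checks existence/permissions without delivering a signal.
        if pid <= 0:
            return False
        try:
            os.kill(pid, 0)
        except ProcessLookupError:
            return False
        except PermissionError:
            return True  # process exists but belongs to another user
        return True

    def monitor_state(report_path: str, allowed_diff: int) -> str:
        """Classify a {'pid': ..., 'ts': ...} report the way run_pm_schain does."""
        if not os.path.isfile(report_path):
            return 'start'      # no report yet -> spawn a monitor process
        with open(report_path) as f:
            report = json.load(f)
        if not pid_alive(report['pid']):
            return 'start'      # stale report, owner is gone -> spawn again
        if int(time.time()) - report['ts'] > allowed_diff:
            return 'terminate'  # alive but heartbeat too old -> stuck, kill it
        return 'running'        # healthy, leave it alone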
deletion(-) diff --git a/core/schains/process_manager.py b/core/schains/process_manager.py index b07959713..d9e1f1027 100644 --- a/core/schains/process_manager.py +++ b/core/schains/process_manager.py @@ -59,7 +59,7 @@ def run_pm_schain( schain: SchainStructure, timeout: Optional[int] = None, ) -> None: - log_prefix = f'sChain {schain["name"]} -' + log_prefix = f'sChain {schain.name} -' if timeout is not None: allowed_diff = timeout From 51382854601d244687f96330de14b3b58719b67c Mon Sep 17 00:00:00 2001 From: badrogger Date: Thu, 19 Sep 2024 17:46:38 +0000 Subject: [PATCH 19/32] Fix start_monitor arguments type_hints --- core/schains/monitor/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/schains/monitor/main.py b/core/schains/monitor/main.py index 7835eb385..435c1a8cc 100644 --- a/core/schains/monitor/main.py +++ b/core/schains/monitor/main.py @@ -173,7 +173,7 @@ def run_skaled_pipeline( def start_monitor( skale: Skale, - schain: dict, + schain: SchainStructure, node_config: NodeConfig, skale_ima: SkaleIma, process_report: ProcessReport, From fca4ac615e54a50336bf53aab8e89bc9050288ee Mon Sep 17 00:00:00 2001 From: badrogger Date: Mon, 23 Sep 2024 19:19:43 +0000 Subject: [PATCH 20/32] Rework monitor execution strategy --- core/schains/monitor/main.py | 79 +++++++++++++++++++ core/schains/monitor/tasks.py | 119 +++++++++++++++++++++++++++++ core/schains/process.py | 14 +++- core/schains/process_manager.py | 4 +- tests/schains/monitor/main_test.py | 100 +++++++++++++++++++++--- 5 files changed, 302 insertions(+), 14 deletions(-) create mode 100644 core/schains/monitor/tasks.py diff --git a/core/schains/monitor/main.py b/core/schains/monitor/main.py index 435c1a8cc..5bc6183dd 100644 --- a/core/schains/monitor/main.py +++ b/core/schains/monitor/main.py @@ -248,3 +248,82 @@ def start_monitor( run_pipelines(pipelines=pipelines, process_report=process_report, stuck_timeout=stuck_timeout) return True + + +def start_jobs( + skale: Skale, + schain: SchainStructure, + node_config: NodeConfig, + skale_ima: SkaleIma, + process_report: ProcessReport, + dutils: Optional[DockerUtils] = None, +) -> bool: + reload(web3_request) + name = schain.name + + stream_version = get_skale_node_version() + schain_record = upsert_schain_record(name) + + dkg_timeout = skale.constants_holder.get_dkg_timeout() + stuck_timeout = int(dkg_timeout * DKG_TIMEOUT_COEFFICIENT) + + is_rotation_active = skale.node_rotation.is_rotation_active(name) + + leaving_chain = not SYNC_NODE and not is_node_part_of_chain(skale, name, node_config.id) + if leaving_chain and not is_rotation_active: + logger.info('Not on node (%d), finishing process', node_config.id) + return True + + logger.info( + 'sync_config_run %s, config_version %s, stream_version %s', + schain_record.sync_config_run, + schain_record.config_version, + stream_version, + ) + + statsd_client = get_statsd_client() + monitor_last_seen_ts = schain_record.monitor_last_seen.timestamp() + statsd_client.incr(f'admin.schain.monitor.{no_hyphens(name)}') + statsd_client.gauge(f'admin.schain.monitor_last_seen.{no_hyphens(name)}', monitor_last_seen_ts) + + job_builders = [] + if not leaving_chain: + logger.info('Adding config pipelines to the pool') + pipelines.append( + Pipeline( + name='config', + job=functools.partial( + run_config_pipeline, + skale=skale, + skale_ima=skale_ima, + schain=schain, + node_config=node_config, + stream_version=stream_version, + ), + ) + ) + if schain_record.config_version != stream_version or ( + schain_record.sync_config_run 
and schain_record.first_run + ): + ConfigFileManager(name).remove_skaled_config() + else: + logger.info('Adding skaled pipeline to the pool') + pipelines.append( + Pipeline( + name='skaled', + job=functools.partial( + run_skaled_pipeline, + skale=skale, + schain=schain, + node_config=node_config, + dutils=dutils, + ), + ) + ) + + if len(pipelines) == 0: + logger.warning('No pipelines to run') + return False + + run_pipelines(pipelines=pipelines, process_report=process_report, stuck_timeout=stuck_timeout) + return True diff --git a/core/schains/monitor/tasks.py b/core/schains/monitor/tasks.py new file mode 100644 index 000000000..8d9e5bc2d --- /dev/null +++ b/core/schains/monitor/tasks.py @@ -0,0 +1,119 @@ +import abc +import logging +import time +from concurrent.futures import Future, ThreadPoolExecutor +from typing import Callable, NamedTuple + +from core.schains.process import ProcessReport + + +logger = logging.getLogger(__name__) + + +STUCK_TIMEOUT = 60 * 60 * 2 +SHUTDOWN_INTERVAL = 60 * 10 + + +class Pipeline(NamedTuple): + name: str + job: Callable + + +class ITaskBuilder(metaclass=abc.ABCMeta): + @property + @abc.abstractmethod + def task_name(self) -> str: + pass + + @property + @abc.abstractmethod + def stuck_timeout(self) -> int: + pass + + @abc.abstractmethod + def build_task(self) -> Callable: + pass + + +def test_job1(): + for i in range(500): + if i % 3 == 1: + print('Job 1 OOO') + time.sleep(1) + + +def test_job2(): + for i in range(500): + if i % 3 == 1: + print('Job 2 YYY') + time.sleep(1) + + +class TestPipelineBuilder1: + def __init__(self) -> None: + self._task_name = 'task2' + self._stuck_timeout = 5 + + @property + def task_name(self) -> str: + return self._task_name + + @property + def stuck_timeout(self) -> str: + return self._stuck_timeout + + def build_task(self): + return [ + Pipeline(name='test0', job=test_job1()), + ] + + +class TestPipelineBuilder2: + def __init__(self) -> None: + self.task_name = 'task2' + self.stuck_timeout = 5 + + def build_task(self): + return [ + Pipeline(name='test1', job=test_job2()), + ] + + +class StuckMonitorError(Exception): + pass + + +def execute_tasks( + task_builders: list[ITaskBuilder], + process_report: ProcessReport, + once: bool = False, + shutdown_interval: int = SHUTDOWN_INTERVAL, +) -> None: + with ThreadPoolExecutor(max_workers=len(task_builders), thread_name_prefix='mon_') as executor: + stucked = [] + futures = [Future() for _ in task_builders] + start_ts = [0 for _ in task_builders] + while True: + for index, builder in enumerate(task_builders): + if not futures[index].running(): + job = builder.build_task() + start_ts[index] = int(time.time()) + futures[index] = executor.submit(job) + else: + if time.time() - start_ts[index] > builder.stuck_timeout: + canceled = futures[index].cancel() + if not canceled: + logger.warning('Stuck detected for job {builder.name}') + stucked.append(builder.name) + + logger.info('Sleeping before stopping executor') + time.sleep(shutdown_interval) + + if len(stucked) > 0: + executor.shutdown(wait=False) + logger.info('Subverting execution. 
Stucked %s', stucked) + process_report.ts = 0 + break + if once: + break + process_report.ts = int(time.time()) diff --git a/core/schains/process.py b/core/schains/process.py index b655bff3e..4b4173ad0 100644 --- a/core/schains/process.py +++ b/core/schains/process.py @@ -98,12 +98,22 @@ def cleanup(self) -> None: os.remove(self.path) -def terminate_process( +def shutdown_process( process_report: ProcessReport, kill_timeout: int = P_KILL_WAIT_TIMEOUT, log_msg: str = '' ) -> None: pid = process_report.pid + terminate_process(pid=pid, kill_timeout=kill_timeout, log_msg=log_msg) + logger.info(f'Removing process report for {pid}') + process_report.cleanup() + + +def terminate_process( + pid: int, + kill_timeout: int = P_KILL_WAIT_TIMEOUT, + log_msg: str = '' +) -> None: log_prefix = f'pid: {pid} - ' if log_msg != '': @@ -126,8 +136,6 @@ def terminate_process( except Exception: logger.exception(f'{log_prefix} - termination failed!') return - logger.info(f'{log_prefix} - removing process report for {pid}') - process_report.cleanup() def is_monitor_process_alive(monitor_pid: int) -> bool: diff --git a/core/schains/process_manager.py b/core/schains/process_manager.py index d9e1f1027..9527ea2bf 100644 --- a/core/schains/process_manager.py +++ b/core/schains/process_manager.py @@ -30,7 +30,7 @@ from core.schains.notifications import notify_if_not_enough_balance from core.schains.process import ( is_monitor_process_alive, - terminate_process, + shutdown_process, ProcessReport, ) @@ -72,7 +72,7 @@ def run_pm_schain( if report.is_exist() and is_monitor_process_alive(report.pid): if init_ts - report.ts > allowed_diff: logger.info('%s Terminating process: PID = %d', log_prefix, report.pid) - terminate_process(report) + shutdown_process(report) else: pid = report.pid logger.info('%s Process is running: PID = %d', log_prefix, pid) diff --git a/tests/schains/monitor/main_test.py b/tests/schains/monitor/main_test.py index da003fc33..36f7584e9 100644 --- a/tests/schains/monitor/main_test.py +++ b/tests/schains/monitor/main_test.py @@ -5,6 +5,7 @@ import shutil import time from multiprocessing import Process +from typing import Callable import pytest @@ -12,6 +13,7 @@ from core.schains.firewall.utils import get_sync_agent_ranges from core.schains.monitor.main import Pipeline, run_pipelines from core.schains.process import ProcessReport, terminate_process +from core.schains.monitor.tasks import execute_tasks, ITaskBuilder from tools.configs.schains import SCHAINS_DIR_PATH from tools.helper import is_node_part_of_chain @@ -42,7 +44,7 @@ def test_get_sync_agent_ranges(skale, sync_ranges): ranges = get_sync_agent_ranges(skale) assert ranges == [ IpRange(start_ip='127.0.0.1', end_ip='127.0.0.2'), - IpRange(start_ip='127.0.0.5', end_ip='127.0.0.7') + IpRange(start_ip='127.0.0.5', end_ip='127.0.0.7'), ] @@ -77,10 +79,17 @@ def stuck_pipeline(index: int): process_report = ProcessReport(name=_schain_name) - target = functools.partial(run_pipelines, pipelines=[ - Pipeline(name='healthy0', job=functools.partial(simple_pipeline, index=0)), - Pipeline(name='healthy1', job=functools.partial(simple_pipeline, index=1)), - ], process_report=process_report, once=True, stuck_timeout=5, shutdown_interval=10) + target = functools.partial( + run_pipelines, + pipelines=[ + Pipeline(name='healthy0', job=functools.partial(simple_pipeline, index=0)), + Pipeline(name='healthy1', job=functools.partial(simple_pipeline, index=1)), + ], + process_report=process_report, + once=True, + stuck_timeout=5, + shutdown_interval=10, + ) terminated = 
False monitor_process = Process(target=target) @@ -93,10 +102,83 @@ def stuck_pipeline(index: int): terminate_process(monitor_process.ident) assert not terminated - target = functools.partial(run_pipelines, pipelines=[ - Pipeline(name='healthy', job=functools.partial(simple_pipeline, index=0)), - Pipeline(name='stuck', job=functools.partial(stuck_pipeline, index=1)) - ], process_report=process_report, stuck_timeout=5, shutdown_interval=10) + target = functools.partial( + run_pipelines, + pipelines=[ + Pipeline(name='healthy', job=functools.partial(simple_pipeline, index=0)), + Pipeline(name='stuck', job=functools.partial(stuck_pipeline, index=1)), + ], + process_report=process_report, + stuck_timeout=5, + shutdown_interval=10, + ) + + monitor_process = Process(target=target) + terminated = False + + try: + monitor_process.start() + monitor_process.join(timeout=50) + finally: + if monitor_process.is_alive(): + terminated = True + terminate_process(monitor_process.ident) + + assert terminated + + +def test_execute_jobs(tmp_dir, _schain_name): + def simple_pipeline(index: int) -> None: + logging.info('Running simple pipeline %d', index) + iterations = 7 + for i in range(iterations): + logging.info('Simple pipeline beat %d', index) + time.sleep(1) + + def stuck_pipeline(index: int) -> None: + logging.info('Running stuck pipeline %d', index) + iterations = 7 + for i in range(iterations): + logging.info('Stuck pipeline %d beat', index) + time.sleep(1) + + class NormalTaskBuilder(ITaskBuilder): + def __init__(self, index: int) -> None: + self.index = index + self.task_name = 'task-a' + self._stuck_timeout = 11 + + def task_name(self) -> str: + return self._task_name + + def stuck_timeout(self) -> int: + return self._stuck_timeout + + def build_task(self) -> Callable: + return functools.partial(simple_pipeline, index=self.index) + + class StuckTaskBuilder(ITaskBuilder): + def __init__(self, index) -> None: + self._task_name = 'task-b' + self.index = index + self._stuck_timeout = 5 + + def task_name(self) -> str: + return self._task_name + + def stuck_timeout(self) -> int: + return self._stuck_timeout + + def build_task(self) -> Callable: + return functools.partial(stuck_pipeline, index=self.index) + + process_report = ProcessReport(name=_schain_name) + target = functools.partial( + execute_tasks, + task_builders=[StuckTaskBuilder(0), StuckTaskBuilder(1)], + process_report=process_report, + shutdown_interval=10, + ) monitor_process = Process(target=target) terminated = False From ccc6bde7d0d9b5129c4baef7584703fd0f44a6b7 Mon Sep 17 00:00:00 2001 From: badrogger Date: Tue, 24 Sep 2024 20:10:53 +0000 Subject: [PATCH 21/32] Wrap pipelines into Tasks classes --- core/schains/monitor/main.py | 238 +++++++++++++++++++++-------- core/schains/monitor/tasks.py | 101 +++++------- core/schains/process_manager.py | 4 +- tests/schains/monitor/main_test.py | 72 +++++---- 4 files changed, 258 insertions(+), 157 deletions(-) diff --git a/core/schains/monitor/main.py b/core/schains/monitor/main.py index f0b3ac9cb..fe1eb63d8 100644 --- a/core/schains/monitor/main.py +++ b/core/schains/monitor/main.py @@ -19,7 +19,7 @@ import functools import logging -from typing import Optional +from typing import Callable, Optional from importlib import reload from skale import Skale, SkaleIma @@ -37,6 +37,7 @@ from core.schains.monitor import get_skaled_monitor, RegularConfigMonitor, SyncConfigMonitor from core.schains.monitor.action import ConfigActionManager, SkaledActionManager from core.schains.monitor.pipeline import Pipeline, 
run_pipelines +from core.schains.monitor.tasks import execute_tasks, Future, ITask from core.schains.process import ProcessReport from core.schains.status import get_node_cli_status, get_skaled_status from core.node import get_current_nodes @@ -59,24 +60,32 @@ logger = logging.getLogger(__name__) +class NoTasksToRunError(Exception): + pass + + def run_config_pipeline( - skale: Skale, skale_ima: SkaleIma, schain: dict, node_config: NodeConfig, stream_version: str + schain_name: str, + skale: Skale, + skale_ima: SkaleIma, + node_config: NodeConfig, + stream_version: str, ) -> None: - name = schain.name - schain_record = SChainRecord.get_by_name(name) - rotation_data = skale.node_rotation.get_rotation(name) + schain = skale.schains.get_by_name(schain_name) + schain_record = SChainRecord.get_by_name(schain_name) + rotation_data = skale.node_rotation.get_rotation(schain_name) allowed_ranges = get_sync_agent_ranges(skale) - ima_linked = not SYNC_NODE and skale_ima.linker.has_schain(name) - group_index = skale.schains.name_to_group_id(name) + ima_linked = not SYNC_NODE and skale_ima.linker.has_schain(schain_name) + group_index = skale.schains.name_to_group_id(schain_name) last_dkg_successful = skale.dkg.is_last_dkg_successful(group_index) - current_nodes = get_current_nodes(skale, name) + current_nodes = get_current_nodes(skale, schain_name) estate = ExternalState( ima_linked=ima_linked, chain_id=skale_ima.web3.eth.chain_id, ranges=allowed_ranges ) - econfig = ExternalConfig(name) + econfig = ExternalConfig(schain_name) config_checks = ConfigChecks( - schain_name=name, + schain_name=schain_name, node_id=node_config.id, schain_record=schain_record, stream_version=stream_version, @@ -112,24 +121,25 @@ def run_config_pipeline( mon = RegularConfigMonitor(config_am, config_checks) statsd_client = get_statsd_client() - statsd_client.incr(f'admin.config_pipeline.{mon.__class__.__name__}.{no_hyphens(name)}') + statsd_client.incr(f'admin.config_pipeline.{mon.__class__.__name__}.{no_hyphens(schain_name)}') statsd_client.gauge( - f'admin.config_pipeline.rotation_id.{no_hyphens(name)}', rotation_data['rotation_id'] + f'admin.config_pipeline.rotation_id.{no_hyphens(schain_name)}', rotation_data['rotation_id'] ) - with statsd_client.timer(f'admin.config_pipeline.duration.{no_hyphens(name)}'): + with statsd_client.timer(f'admin.config_pipeline.duration.{no_hyphens(schain_name)}'): mon.run() def run_skaled_pipeline( - skale: Skale, schain: SchainStructure, node_config: NodeConfig, dutils: DockerUtils + schain_name: str, skale: Skale, node_config: NodeConfig, dutils: DockerUtils ) -> None: - name = schain.name - schain_record = SChainRecord.get_by_name(name) + schain = skale.schains.get_by_name(schain_name) + schain_record = SChainRecord.get_by_name(schain_name) + logger.info('Record: %s', SChainRecord.to_dict(schain_record)) dutils = dutils or DockerUtils() - rc = get_default_rule_controller(name=name) + rc = get_default_rule_controller(name=schain_name) skaled_checks = SkaledChecks( schain_name=schain.name, schain_record=schain_record, @@ -138,8 +148,8 @@ def run_skaled_pipeline( sync_node=SYNC_NODE, ) - skaled_status = get_skaled_status(name) - ncli_status = get_node_cli_status(name) + skaled_status = get_skaled_status(schain_name) + ncli_status = get_node_cli_status(schain_name) skaled_am = SkaledActionManager( schain=schain, @@ -147,13 +157,13 @@ def run_skaled_pipeline( checks=skaled_checks, node_config=node_config, ncli_status=ncli_status, - econfig=ExternalConfig(name), + 
econfig=ExternalConfig(schain_name), dutils=dutils, ) check_status = skaled_checks.get_all(log=False, expose=True) automatic_repair = get_automatic_repair_option() api_status = get_api_checks_status(status=check_status, allowed=TG_ALLOWED_CHECKS) - notify_checks(name, node_config.all(), api_status) + notify_checks(schain_name, node_config.all(), api_status) logger.info('Skaled check status: %s', check_status) @@ -165,12 +175,12 @@ def run_skaled_pipeline( schain_record=schain_record, skaled_status=skaled_status, ncli_status=ncli_status, - automatic_repair=automatic_repair + automatic_repair=automatic_repair, ) statsd_client = get_statsd_client() - statsd_client.incr(f'admin.skaled_pipeline.{mon.__name__}.{no_hyphens(name)}') - with statsd_client.timer(f'admin.skaled_pipeline.duration.{no_hyphens(name)}'): + statsd_client.incr(f'admin.skaled_pipeline.{mon.__name__}.{no_hyphens(schain_name)}') + with statsd_client.timer(f'admin.skaled_pipeline.duration.{no_hyphens(schain_name)}'): mon(skaled_am, skaled_checks).run() @@ -253,7 +263,127 @@ def start_monitor( return True -def start_jobs( +class SkaledTask(ITask): + NAME = 'skaled' + STUCK_TIMEOUT = 3600 # 1 hours + + def __init__( + self, + schain_name: str, + skale: Skale, + node_config: NodeConfig, + stream_version: str, + dutils: Optional[DockerUtils] = None, + ) -> None: + self.schain_name = schain_name + self.skale = skale + self.node_config = node_config + self.dutils = dutils + self._future = Future() + self.stream_version = stream_version + + @property + def name(self) -> str: + return self.NAME + + @property + def stuck_timeout(self) -> int: + return self.STUCK_TIMEOUT + + @property + def future(self) -> Future: + return self._future + + @future.setter + def future(self, value: Future) -> None: + self._future = value + + @property + def start_ts(self) -> int: + return self._start_ts + + @start_ts.setter + def start_ts(self, value: int) -> None: + self._start_ts = value + + @property + def needed(self) -> bool: + schain_record = upsert_schain_record(self.schain_name) + return schain_record.config_version == self.stream_version and ( + not schain_record.sync_config_run or not schain_record.first_run + ) + + def create_pipeline(self) -> Callable: + return functools.partial( + run_skaled_pipeline, + schain_name=self.schain_name, + skale=self.skale, + node_config=self.node_config, + dutils=self.dutils, + ) + + +class ConfigTask(ITask): + NAME = 'config' + STUCK_TIMEOUT = 60 * 60 * 2 + + def __init__( + self, + schain_name: str, + skale: Skale, + skale_ima: SkaleIma, + node_config: NodeConfig, + stream_version: str, + ) -> None: + self.schain_name = schain_name + self.skale = skale + self.skale_ima = skale_ima + self.node_config = node_config + self.stream_version = stream_version + self._start_ts = 0 + self._future = Future() + + @property + def name(self) -> str: + return self.NAME + + @property + def future(self) -> Future: + return self._future + + @future.setter + def future(self, value: Future) -> None: + self._future = value + + @property + def stuck_timeout(self) -> int: + dkg_timeout = self.skale.constants_holder.get_dkg_timeout() + return int(dkg_timeout * DKG_TIMEOUT_COEFFICIENT) + + @property + def start_ts(self) -> int: + return self._start_ts + + @start_ts.setter + def start_ts(self, value: int) -> None: + self._start_ts = value + + @property + def needed(self) -> bool: + return SYNC_NODE or is_node_part_of_chain(self.skale, self.schain_name, self.node_config.id) + + def create_pipeline(self) -> Callable: + return 
functools.partial( + run_config_pipeline, + schain_name=self.schain_name, + skale=self.skale, + skale_ima=self.skale_ima, + node_config=self.node_config, + stream_version=self.stream_version, + ) + + +def start_tasks( skale: Skale, schain: SchainStructure, node_config: NodeConfig, @@ -262,14 +392,12 @@ def start_jobs( dutils: Optional[DockerUtils] = None, ) -> bool: reload(web3_request) + name = schain.name stream_version = get_skale_node_version() schain_record = upsert_schain_record(name) - dkg_timeout = skale.constants_holder.get_dkg_timeout() - stuck_timeout = int(dkg_timeout * DKG_TIMEOUT_COEFFICIENT) - is_rotation_active = skale.node_rotation.is_rotation_active(name) leaving_chain = not SYNC_NODE and not is_node_part_of_chain(skale, name, node_config.id) @@ -289,44 +417,26 @@ def start_jobs( statsd_client.incr(f'admin.schain.monitor.{no_hyphens(name)}') statsd_client.gauge(f'admin.schain.monitor_last_seen.{no_hyphens(name)}', monitor_last_seen_ts) - job_builders = [] - if not leaving_chain: - logger.info('Adding config pipelines to the pool') - pipelines.append( - Pipeline( - name='config', - job=functools.partial( - run_config_pipeline, - skale=skale, - skale_ima=skale_ima, - schain=schain, - node_config=node_config, - stream_version=stream_version, - ), - ) - ) if schain_record.config_version != stream_version or ( schain_record.sync_config_run and schain_record.first_run ): + logger.info('Fetching upstream config requested. Removing the old skaled config') ConfigFileManager(name).remove_skaled_config() - else: - logger.info('Adding skaled pipeline to the pool') - pipelines.append( - Pipeline( - name='skaled', - job=functools.partial( - run_skaled_pipeline, - skale=skale, - schain=schain, - node_config=node_config, - dutils=dutils, - ), - ) - ) - if len(pipelines) == 0: - logger.warning('No pipelines to run') - return False - - run_pipelines(pipelines=pipelines, process_report=process_report, stuck_timeout=stuck_timeout) - return True + tasks = [ + ConfigTask( + schain_name=schain.name, + skale=skale, + skale_ima=skale_ima, + node_config=node_config, + stream_version=stream_version, + ), + SkaledTask( + schain_name=schain.name, + skale=skale, + node_config=node_config, + stream_version=stream_version, + dutils=dutils + ), + ] + execute_tasks(tasks=tasks, process_report=process_report) diff --git a/core/schains/monitor/tasks.py b/core/schains/monitor/tasks.py index 8d9e5bc2d..1fd86052b 100644 --- a/core/schains/monitor/tasks.py +++ b/core/schains/monitor/tasks.py @@ -11,7 +11,7 @@ STUCK_TIMEOUT = 60 * 60 * 2 -SHUTDOWN_INTERVAL = 60 * 10 +SLEEP_INTERVAL = 60 * 10 class Pipeline(NamedTuple): @@ -19,10 +19,10 @@ class Pipeline(NamedTuple): job: Callable -class ITaskBuilder(metaclass=abc.ABCMeta): +class ITask(metaclass=abc.ABCMeta): @property @abc.abstractmethod - def task_name(self) -> str: + def name(self) -> str: pass @property @@ -31,52 +31,32 @@ def stuck_timeout(self) -> int: pass @abc.abstractmethod - def build_task(self) -> Callable: + def create_pipeline(self) -> Callable: pass + @property + @abc.abstractmethod + def future(self) -> Future: + pass -def test_job1(): - for i in range(500): - if i % 3 == 1: - print('Job 1 OOO') - time.sleep(1) - - -def test_job2(): - for i in range(500): - if i % 3 == 1: - print('Job 2 YYY') - time.sleep(1) - - -class TestPipelineBuilder1: - def __init__(self) -> None: - self._task_name = 'task2' - self._stuck_timeout = 5 + @future.setter + @abc.abstractmethod + def future(self, value: Future) -> None: + pass @property - def task_name(self) -> 
str: - return self._task_name + def needed(self) -> bool: + pass @property - def stuck_timeout(self) -> str: - return self._stuck_timeout - - def build_task(self): - return [ - Pipeline(name='test0', job=test_job1()), - ] - - -class TestPipelineBuilder2: - def __init__(self) -> None: - self.task_name = 'task2' - self.stuck_timeout = 5 + @abc.abstractmethod + def start_ts(self) -> int: + pass - def build_task(self): - return [ - Pipeline(name='test1', job=test_job2()), - ] + @start_ts.setter + @abc.abstractmethod + def start_ts(self, value: int) -> None: + pass class StuckMonitorError(Exception): @@ -84,36 +64,31 @@ class StuckMonitorError(Exception): def execute_tasks( - task_builders: list[ITaskBuilder], + tasks: list[ITask], process_report: ProcessReport, - once: bool = False, - shutdown_interval: int = SHUTDOWN_INTERVAL, + sleep_interval: int = SLEEP_INTERVAL, ) -> None: - with ThreadPoolExecutor(max_workers=len(task_builders), thread_name_prefix='mon_') as executor: + with ThreadPoolExecutor(max_workers=len(tasks), thread_name_prefix='mon') as executor: stucked = [] - futures = [Future() for _ in task_builders] - start_ts = [0 for _ in task_builders] while True: - for index, builder in enumerate(task_builders): - if not futures[index].running(): - job = builder.build_task() - start_ts[index] = int(time.time()) - futures[index] = executor.submit(job) - else: - if time.time() - start_ts[index] > builder.stuck_timeout: - canceled = futures[index].cancel() + for index, task in enumerate(tasks): + if not task.future.running() and task.needed: + task.start_ts = int(time.time()) + logger.info('Starting task %s at %d', task.name, task.start_ts) + pipeline = task.create_pipeline() + task.future = executor.submit(pipeline) + if task.future.running(): + if int(time.time()) - task.start_ts > task.stuck_timeout: + logger.info('Canceling future for %s', task.name) + canceled = task.future.cancel() if not canceled: - logger.warning('Stuck detected for job {builder.name}') - stucked.append(builder.name) - - logger.info('Sleeping before stopping executor') - time.sleep(shutdown_interval) - + logger.warning('Stuck detected for job {task.name}') + stucked.append(task.name) + time.sleep(sleep_interval) if len(stucked) > 0: + logger.info('Sleeping before subverting execution') executor.shutdown(wait=False) logger.info('Subverting execution. 
Stucked %s', stucked) process_report.ts = 0 break - if once: - break process_report.ts = int(time.time()) diff --git a/core/schains/process_manager.py b/core/schains/process_manager.py index 9527ea2bf..36e02253a 100644 --- a/core/schains/process_manager.py +++ b/core/schains/process_manager.py @@ -26,7 +26,7 @@ from skale.contracts.manager.schains import SchainStructure from core.node_config import NodeConfig -from core.schains.monitor.main import start_monitor +from core.schains.monitor.main import start_tasks from core.schains.notifications import notify_if_not_enough_balance from core.schains.process import ( is_monitor_process_alive, @@ -81,7 +81,7 @@ def run_pm_schain( report.ts = init_ts process = Process( name=schain.name, - target=start_monitor, + target=start_tasks, args=(skale, schain, node_config, skale_ima, report), ) process.start() diff --git a/tests/schains/monitor/main_test.py b/tests/schains/monitor/main_test.py index 36f7584e9..6192b13e5 100644 --- a/tests/schains/monitor/main_test.py +++ b/tests/schains/monitor/main_test.py @@ -4,6 +4,7 @@ import pathlib import shutil import time +from concurrent.futures import Future from multiprocessing import Process from typing import Callable @@ -13,7 +14,7 @@ from core.schains.firewall.utils import get_sync_agent_ranges from core.schains.monitor.main import Pipeline, run_pipelines from core.schains.process import ProcessReport, terminate_process -from core.schains.monitor.tasks import execute_tasks, ITaskBuilder +from core.schains.monitor.tasks import execute_tasks, ITask from tools.configs.schains import SCHAINS_DIR_PATH from tools.helper import is_node_part_of_chain @@ -127,55 +128,70 @@ def stuck_pipeline(index: int): assert terminated -def test_execute_jobs(tmp_dir, _schain_name): - def simple_pipeline(index: int) -> None: - logging.info('Running simple pipeline %d', index) - iterations = 7 - for i in range(iterations): - logging.info('Simple pipeline beat %d', index) - time.sleep(1) - - def stuck_pipeline(index: int) -> None: +def test_execute_tasks(tmp_dir, _schain_name): + def run_stuck_pipeline(index: int) -> None: logging.info('Running stuck pipeline %d', index) iterations = 7 for i in range(iterations): logging.info('Stuck pipeline %d beat', index) time.sleep(1) - class NormalTaskBuilder(ITaskBuilder): - def __init__(self, index: int) -> None: + class StuckedTask(ITask): + def __init__(self, index) -> None: + self._name = 'stucked-task' self.index = index - self.task_name = 'task-a' - self._stuck_timeout = 11 + self._stuck_timeout = 5 + self._start_ts = 0 + self._future = Future() - def task_name(self) -> str: - return self._task_name + @property + def name(self) -> str: + return self._name - def stuck_timeout(self) -> int: - return self._stuck_timeout + @property + def future(self) -> Future: + return self._future - def build_task(self) -> Callable: - return functools.partial(simple_pipeline, index=self.index) + @future.setter + def future(self, value: Future) -> None: + self._future = value - class StuckTaskBuilder(ITaskBuilder): - def __init__(self, index) -> None: - self._task_name = 'task-b' - self.index = index - self._stuck_timeout = 5 + @property + def start_ts(self) -> int: + return self._start_ts + + @start_ts.setter + def start_ts(self, value: int) -> None: + self._start_ts = value + @property def task_name(self) -> str: return self._task_name + @property def stuck_timeout(self) -> int: return self._stuck_timeout - def build_task(self) -> Callable: - return functools.partial(stuck_pipeline, index=self.index) + 
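The stuck handling in execute_tasks leans on a guarantee of concurrent.futures: Future.cancel() only succeeds while the future is still pending, so a pipeline that is already running in a worker thread cannot be stopped from the outside. A minimal demonstration of those semantics:

    import time
    from concurrent.futures import ThreadPoolExecutor

    executor = ThreadPoolExecutor(max_workers=1)
    running = executor.submit(time.sleep, 2)  # picked up by the only worker
    queued = executor.submit(time.sleep, 2)   # still pending in the queue
    time.sleep(0.1)                           # give the first task time to start

    print(running.cancel())  # False: an already-running future cannot be cancelled
    print(queued.cancel())   # True: a pending future can
    executor.shutdown(wait=False)

That is why the loop merely records the stuck task's name and zeroes the process report timestamp: the actual kill is escalated to the process manager, which sees the ancient heartbeat and terminates the whole monitor process.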
@property + def needed(self) -> bool: + return True + + def create_pipeline(self) -> Callable: + return functools.partial(run_stuck_pipeline, index=self.index) + + class NotNeededTask(StuckedTask): + def __init__(self, index: int) -> None: + self.index = index + self._name = 'not-needed-task' + + @property + def needed(self) -> bool: + return False process_report = ProcessReport(name=_schain_name) target = functools.partial( execute_tasks, - task_builders=[StuckTaskBuilder(0), StuckTaskBuilder(1)], + tasks=[StuckedTask(0), NotNeededTask(1)], process_report=process_report, shutdown_interval=10, ) From 3a7d3d8ac6345242ce45a9a07251a6c3bb0d9245 Mon Sep 17 00:00:00 2001 From: badrogger Date: Wed, 25 Sep 2024 17:06:45 +0000 Subject: [PATCH 22/32] Fix tests --- core/schains/monitor/tasks.py | 1 + tests/schains/monitor/main_test.py | 30 ++++++++++----------------- tests/schains/process_manager_test.py | 2 +- 3 files changed, 13 insertions(+), 20 deletions(-) diff --git a/core/schains/monitor/tasks.py b/core/schains/monitor/tasks.py index 1fd86052b..c6342f7a0 100644 --- a/core/schains/monitor/tasks.py +++ b/core/schains/monitor/tasks.py @@ -83,6 +83,7 @@ def execute_tasks( canceled = task.future.cancel() if not canceled: logger.warning('Stuck detected for job {task.name}') + task.start_ts = -1 stucked.append(task.name) time.sleep(sleep_interval) if len(stucked) > 0: diff --git a/tests/schains/monitor/main_test.py b/tests/schains/monitor/main_test.py index 6192b13e5..7b6a11eec 100644 --- a/tests/schains/monitor/main_test.py +++ b/tests/schains/monitor/main_test.py @@ -140,7 +140,7 @@ class StuckedTask(ITask): def __init__(self, index) -> None: self._name = 'stucked-task' self.index = index - self._stuck_timeout = 5 + self._stuck_timeout = 3 self._start_ts = 0 self._future = Future() @@ -162,6 +162,7 @@ def start_ts(self) -> int: @start_ts.setter def start_ts(self, value: int) -> None: + print(f'Updating start_ts {self} {value}') self._start_ts = value @property @@ -181,7 +182,7 @@ def create_pipeline(self) -> Callable: class NotNeededTask(StuckedTask): def __init__(self, index: int) -> None: - self.index = index + super().__init__(index=index) self._name = 'not-needed-task' @property @@ -189,22 +190,13 @@ def needed(self) -> bool: return False process_report = ProcessReport(name=_schain_name) - target = functools.partial( - execute_tasks, - tasks=[StuckedTask(0), NotNeededTask(1)], - process_report=process_report, - shutdown_interval=10, + tasks = [StuckedTask(0), NotNeededTask(1)] + execute_tasks( + tasks=tasks, + process_report=process_report, + sleep_interval=1 ) - monitor_process = Process(target=target) - terminated = False - - try: - monitor_process.start() - monitor_process.join(timeout=50) - finally: - if monitor_process.is_alive(): - terminated = True - terminate_process(monitor_process.ident) - - assert terminated + print(tasks[0], tasks[1]) + assert tasks[0].start_ts == -1 + assert tasks[1].start_ts == 0 diff --git a/tests/schains/process_manager_test.py b/tests/schains/process_manager_test.py index 80171e749..2372fb3b2 100644 --- a/tests/schains/process_manager_test.py +++ b/tests/schains/process_manager_test.py @@ -39,7 +39,7 @@ def target_stuck_mock(*args, **kwargs): timeout = 7 - with mock.patch('core.schains.process_manager.start_monitor', target_regular_mock): + with mock.patch('core.schains.process_manager.start_tasks', target_regular_mock): run_pm_schain(skale, skale_ima, node_config, schain, timeout=timeout) pid = ProcessReport(_schain_name).pid assert 
psutil.Process(pid).is_running() From e4471d0d8a675820a68e82bed6572e449c1fcbae Mon Sep 17 00:00:00 2001 From: badrogger Date: Thu, 26 Sep 2024 12:02:15 +0000 Subject: [PATCH 23/32] Fix process_manager_tests --- core/schains/process_manager.py | 3 +-- tests/schains/process_manager_test.py | 3 ++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/core/schains/process_manager.py b/core/schains/process_manager.py index 36e02253a..3d051f375 100644 --- a/core/schains/process_manager.py +++ b/core/schains/process_manager.py @@ -76,8 +76,7 @@ def run_pm_schain( else: pid = report.pid logger.info('%s Process is running: PID = %d', log_prefix, pid) - - if not report.is_exist() or not is_monitor_process_alive(report.pid): + else: report.ts = init_ts process = Process( name=schain.name, diff --git a/tests/schains/process_manager_test.py b/tests/schains/process_manager_test.py index 2372fb3b2..442eff4ac 100644 --- a/tests/schains/process_manager_test.py +++ b/tests/schains/process_manager_test.py @@ -35,12 +35,13 @@ def target_stuck_mock(*args, **kwargs): for _ in range(iterations): time.sleep(1) - schain = {'name': _schain_name} + schain = skale.schains.get_by_name(_schain_name) timeout = 7 with mock.patch('core.schains.process_manager.start_tasks', target_regular_mock): run_pm_schain(skale, skale_ima, node_config, schain, timeout=timeout) + pid = ProcessReport(_schain_name).pid assert psutil.Process(pid).is_running() From 967ccdbaa8ea36cc003285e209498e253c628b9d Mon Sep 17 00:00:00 2001 From: badrogger Date: Thu, 26 Sep 2024 16:45:35 +0000 Subject: [PATCH 24/32] Fix process_manager_test --- core/schains/cleaner.py | 2 +- core/schains/monitor/main.py | 7 ++- core/schains/process.py | 27 ++++---- core/schains/process_manager.py | 23 +++---- tests/schains/process_manager_test.py | 90 ++++++++++++++++++--------- 5 files changed, 92 insertions(+), 57 deletions(-) diff --git a/core/schains/cleaner.py b/core/schains/cleaner.py index 4865a394f..6141883ac 100644 --- a/core/schains/cleaner.py +++ b/core/schains/cleaner.py @@ -188,7 +188,7 @@ def remove_schain( logger.warning(msg) report = ProcessReport(name=schain_name) if report.is_exist(): - terminate_process(report) + terminate_process(report.pid) delete_bls_keys(skale, schain_name) sync_agent_ranges = get_sync_agent_ranges(skale) diff --git a/core/schains/monitor/main.py b/core/schains/monitor/main.py index fe1eb63d8..1cf2c2e04 100644 --- a/core/schains/monitor/main.py +++ b/core/schains/monitor/main.py @@ -19,6 +19,8 @@ import functools import logging +import os +import time from typing import Callable, Optional from importlib import reload @@ -388,12 +390,15 @@ def start_tasks( schain: SchainStructure, node_config: NodeConfig, skale_ima: SkaleIma, - process_report: ProcessReport, dutils: Optional[DockerUtils] = None, ) -> bool: reload(web3_request) name = schain.name + init_ts, pid = int(time.time()), os.getpid() + logger.info('Initializing process report %d %d', pid, init_ts) + process_report = ProcessReport(name) + process_report.update(pid, init_ts) stream_version = get_skale_node_version() schain_record = upsert_schain_record(name) diff --git a/core/schains/process.py b/core/schains/process.py index 4b4173ad0..387c767a0 100644 --- a/core/schains/process.py +++ b/core/schains/process.py @@ -17,11 +17,12 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see .
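PATCH 24 makes the monitor process write its own report at startup (process_report.update(pid, init_ts)), so the report doubles as a PID file and a heartbeat. The update path shown earlier (_save_tmp followed by _move) goes through a temporary file so that a concurrent reader never observes a half-written JSON. A stdlib-only sketch of the same idea, with hypothetical names:

    import json
    import os
    import tempfile

    def write_report_atomically(path: str, pid: int, ts: int) -> None:
        """Write {'pid': pid, 'ts': ts} so readers never see a partial file."""
        directory = os.path.dirname(path) or '.'
        os.makedirs(directory, exist_ok=True)
        fd, tmp_path = tempfile.mkstemp(dir=directory, suffix='.tmp')
        try:
            with os.fdopen(fd, 'w') as f:
                json.dump({'pid': pid, 'ts': ts}, f)
            os.replace(tmp_path, path)  # atomic on POSIX within one filesystem
        finally:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)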
+import json import logging import os import shutil import signal -import json +from typing import Tuple import pathlib import psutil @@ -36,6 +37,19 @@ P_KILL_WAIT_TIMEOUT = 60 +def is_schain_process_report_exist(schain_name: str) -> bool: + path = pathlib.Path(SCHAINS_DIR_PATH).joinpath(schain_name, ProcessReport.REPORT_FILENAME) + return path.is_file() + + +def get_schain_process_info(schain_name: str) -> Tuple[int | None, int | None]: + report = ProcessReport(schain_name) + if not ProcessReport(schain_name).is_exist(): + return None, None + else: + return report.pid, report.ts + + class ProcessReport: REPORT_FILENAME = 'process.json' @@ -98,17 +112,6 @@ def cleanup(self) -> None: os.remove(self.path) -def shutdown_process( - process_report: ProcessReport, - kill_timeout: int = P_KILL_WAIT_TIMEOUT, - log_msg: str = '' -) -> None: - pid = process_report.pid - terminate_process(pid=pid, kill_timeout=kill_timeout, log_msg=log_msg) - logger.info(f'Removing process report for {pid}') - process_report.cleanup() - - def terminate_process( pid: int, kill_timeout: int = P_KILL_WAIT_TIMEOUT, log_msg: str = '' diff --git a/core/schains/process_manager.py b/core/schains/process_manager.py index 3d051f375..e14857a34 100644 --- a/core/schains/process_manager.py +++ b/core/schains/process_manager.py @@ -29,9 +29,9 @@ from core.schains.monitor.main import start_tasks from core.schains.notifications import notify_if_not_enough_balance from core.schains.process import ( + get_schain_process_info, is_monitor_process_alive, - shutdown_process, - ProcessReport, + terminate_process ) from tools.str_formatters import arguments_list_string @@ -67,26 +67,21 @@ def run_pm_schain( dkg_timeout = skale.constants_holder.get_dkg_timeout() allowed_diff = timeout or int(dkg_timeout * DKG_TIMEOUT_COEFFICIENT) - report = ProcessReport(schain.name) - init_ts = int(time.time()) - if report.is_exist() and is_monitor_process_alive(report.pid): - if init_ts - report.ts > allowed_diff: + pid, pts = get_schain_process_info(schain.name) + if pid is not None and is_monitor_process_alive(pid): + if int(time.time()) - pts > allowed_diff: logger.info('%s Terminating process: PID = %d', log_prefix, pid) terminate_process(pid) else: logger.info('%s Process is running: PID = %d', log_prefix, pid) else: process = Process( name=schain.name, target=start_tasks, args=(skale, schain, node_config, skale_ima) ) process.start() logger.info('Process started for %s', schain.name) diff --git a/tests/schains/process_manager_test.py b/tests/schains/process_manager_test.py index 442eff4ac..ab215cc68 100644 --- a/tests/schains/process_manager_test.py +++ b/tests/schains/process_manager_test.py @@ -1,4 +1,5 @@ import mock +import logging import os import pathlib import shutil @@ -10,6 +11,11 @@ from core.schains.process import ProcessReport, terminate_process from core.schains.process_manager import run_pm_schain from tools.configs.schains import SCHAINS_DIR_PATH +from tests.utils import get_schain_struct + +logger = logging.getLogger(__name__) + +MAX_ITERATIONS = 100 @pytest.fixture def tmp_dir(_schain_name): path = os.path.join(SCHAINS_DIR_PATH, _schain_name) pathlib.Path(path).mkdir() try: yield path finally: shutil.rmtree(path, ignore_errors=True) def
test_run_pm_schain(tmp_dir, skale, skale_ima, node_config, _schain_name): - def target_regular_mock(*args, **kwargs): - process_report = args[-1] - iterations = 5 - for _ in range(iterations): - process_report.ts = int(time.time()) - time.sleep(1) +def target_regular_mock(*args, **kwargs): + schain_name = args[1].name + process_report = ProcessReport(schain_name) + process_report.update(os.getpid(), int(time.time())) + logger.info('Starting regular test task runner') + iterations = 5 + for i in range(iterations): + process_report.ts = int(time.time()) + logger.info('Regular test task runner beat %s', i) + time.sleep(1) - def target_stuck_mock(*args, **kwargs): - iterations = 10000 - for _ in range(iterations): - time.sleep(1) - schain = skale.schains.get_by_name(_schain_name) +def target_stuck_mock(*args, **kwargs): + schain_name = ProcessReport(args[1].name) + ProcessReport(schain_name).update(os.getpid(), int(time.time())) + logger.info('Starting stucked test task runner') + iterations = 10000 + for i in range(iterations): + logger.info('Stuck test task runner beat %s', i) + time.sleep(1) + + +def wait_for_process_report(process_report): + wait_it = 0 + while wait_it < MAX_ITERATIONS and not process_report.is_exist(): + time.sleep(0.5) + wait_it += 1 + assert process_report.is_exist() + + +def test_run_pm_schain(tmp_dir, skale, skale_ima, node_config, _schain_name): + schain = get_schain_struct(schain_name=_schain_name) timeout = 7 with mock.patch('core.schains.process_manager.start_tasks', target_regular_mock): run_pm_schain(skale, skale_ima, node_config, schain, timeout=timeout) - pid = ProcessReport(_schain_name).pid - assert psutil.Process(pid).is_running() + process_report = ProcessReport(schain.name) + wait_for_process_report(process_report) - start_ts = int(time.time()) + pid = process_report.pid - while int(time.time()) - start_ts < 2 * timeout: - time.sleep(1) - assert psutil.Process(pid).status() == 'zombie' + try: + assert psutil.Process(pid).is_running() + start_ts = int(time.time()) - with mock.patch('core.schains.process_manager.start_monitor', target_stuck_mock): - run_pm_schain(skale, skale_ima, node_config, schain, timeout=timeout) + while int(time.time()) - start_ts < 2 * timeout: + time.sleep(1) + assert psutil.Process(pid).status() not in ('dead', 'stopped') + finally: + pid = ProcessReport(_schain_name).pid + terminate_process(pid) - pid = ProcessReport(_schain_name).pid + old_pid = pid + wait_it = 0 + while wait_it < MAX_ITERATIONS and process_report.pid == old_pid: + time.sleep(0.5) + wait_it += 1 - assert psutil.Process(pid).is_running() + with mock.patch('core.schains.process_manager.start_tasks', target_stuck_mock): + run_pm_schain(skale, skale_ima, node_config, schain, timeout=timeout) start_ts = int(time.time()) @@ -65,12 +98,11 @@ def target_stuck_mock(*args, **kwargs): psutil.Process(pid).is_running() except psutil.NoSuchProcess: break - with mock.patch('core.schains.process_manager.start_monitor', target_stuck_mock): - run_pm_schain(skale, skale_ima, node_config, schain, timeout=timeout) time.sleep(1) - with pytest.raises(psutil.NoSuchProcess): - psutil.Process(pid).is_running() - - pid = ProcessReport(_schain_name).pid - terminate_process(pid) + try: + with pytest.raises(psutil.NoSuchProcess): + psutil.Process(pid).is_running() + finally: + pid = ProcessReport(_schain_name).pid + terminate_process(pid) From 7a203dc21ff79e3f9dcea82f6f8c535bbbe787f9 Mon Sep 17 00:00:00 2001 From: badrogger Date: Thu, 26 Sep 2024 17:42:39 +0000 Subject: [PATCH 25/32] Remove 
unused code --- core/schains/monitor/action.py | 2 +- core/schains/monitor/main.py | 80 ---------------------------------- core/schains/monitor/tasks.py | 2 +- 3 files changed, 2 insertions(+), 82 deletions(-) diff --git a/core/schains/monitor/action.py b/core/schains/monitor/action.py index b02cbab44..5e5fb301e 100644 --- a/core/schains/monitor/action.py +++ b/core/schains/monitor/action.py @@ -212,7 +212,7 @@ def dkg(self) -> bool: def upstream_config(self) -> bool: with self.statsd_client.timer(f'admin.action.upstream_config.{no_hyphens(self.name)}'): logger.info( - 'Creating new upstream_config rotation_id: %s, stream: %s', + 'Generating new upstream_config rotation_id: %s, stream: %s', self.rotation_data.get('rotation_id'), self.stream_version ) new_config = create_new_upstream_config( diff --git a/core/schains/monitor/main.py b/core/schains/monitor/main.py index 1cf2c2e04..8042a8cd0 100644 --- a/core/schains/monitor/main.py +++ b/core/schains/monitor/main.py @@ -38,7 +38,6 @@ from core.schains.external_config import ExternalConfig, ExternalState from core.schains.monitor import get_skaled_monitor, RegularConfigMonitor, SyncConfigMonitor from core.schains.monitor.action import ConfigActionManager, SkaledActionManager -from core.schains.monitor.pipeline import Pipeline, run_pipelines from core.schains.monitor.tasks import execute_tasks, Future, ITask from core.schains.process import ProcessReport from core.schains.status import get_node_cli_status, get_skaled_status @@ -186,85 +185,6 @@ def run_skaled_pipeline( mon(skaled_am, skaled_checks).run() -def start_monitor( - skale: Skale, - schain: SchainStructure, - node_config: NodeConfig, - skale_ima: SkaleIma, - process_report: ProcessReport, - dutils: Optional[DockerUtils] = None, -) -> bool: - reload(web3_request) - name = schain.name - - stream_version = get_skale_node_version() - schain_record = upsert_schain_record(name) - - dkg_timeout = skale.constants_holder.get_dkg_timeout() - stuck_timeout = int(dkg_timeout * DKG_TIMEOUT_COEFFICIENT) - - is_rotation_active = skale.node_rotation.is_rotation_active(name) - - leaving_chain = not SYNC_NODE and not is_node_part_of_chain(skale, name, node_config.id) - if leaving_chain and not is_rotation_active: - logger.info('Not on node (%d), finishing process', node_config.id) - return True - - logger.info( - 'sync_config_run %s, config_version %s, stream_version %s', - schain_record.sync_config_run, - schain_record.config_version, - stream_version, - ) - - statsd_client = get_statsd_client() - monitor_last_seen_ts = schain_record.monitor_last_seen.timestamp() - statsd_client.incr(f'admin.schain.monitor.{no_hyphens(name)}') - statsd_client.gauge(f'admin.schain.monitor_last_seen.{no_hyphens(name)}', monitor_last_seen_ts) - - pipelines = [] - if not leaving_chain: - logger.info('Adding config pipelines to the pool') - pipelines.append( - Pipeline( - name='config', - job=functools.partial( - run_config_pipeline, - skale=skale, - skale_ima=skale_ima, - schain=schain, - node_config=node_config, - stream_version=stream_version, - ), - ) - ) - if schain_record.config_version != stream_version or ( - schain_record.sync_config_run and schain_record.first_run - ): - ConfigFileManager(name).remove_skaled_config() - else: - logger.info('Adding skaled pipeline to the pool') - pipelines.append( - Pipeline( - name='skaled', - job=functools.partial( - run_skaled_pipeline, - skale=skale, - schain=schain, - node_config=node_config, - dutils=dutils, - ), - ) - ) - - if len(pipelines) == 0: - logger.warning('No 
pipelines to run') - return False - - run_pipelines(pipelines=pipelines, process_report=process_report, stuck_timeout=stuck_timeout) - return True - - class SkaledTask(ITask): NAME = 'skaled' STUCK_TIMEOUT = 3600 # 1 hours diff --git a/core/schains/monitor/tasks.py b/core/schains/monitor/tasks.py index c6342f7a0..e591e3b48 100644 --- a/core/schains/monitor/tasks.py +++ b/core/schains/monitor/tasks.py @@ -77,7 +77,7 @@ def execute_tasks( logger.info('Starting task %s at %d', task.name, task.start_ts) pipeline = task.create_pipeline() task.future = executor.submit(pipeline) - if task.future.running(): + elif task.future.running(): if int(time.time()) - task.start_ts > task.stuck_timeout: logger.info('Canceling future for %s', task.name) canceled = task.future.cancel() From d8947ed9ef4004b74b871ac1b71beacec831a6f3 Mon Sep 17 00:00:00 2001 From: badrogger Date: Fri, 27 Sep 2024 11:50:55 +0000 Subject: [PATCH 26/32] Remove unused pipeline module --- core/schains/monitor/pipeline.py | 107 ----------------------------- tests/schains/monitor/main_test.py | 66 +----------------- 2 files changed, 1 insertion(+), 172 deletions(-) delete mode 100644 core/schains/monitor/pipeline.py diff --git a/core/schains/monitor/pipeline.py b/core/schains/monitor/pipeline.py deleted file mode 100644 index eaf5c2ccf..000000000 --- a/core/schains/monitor/pipeline.py +++ /dev/null @@ -1,107 +0,0 @@ -import logging -import queue -import random -import threading -import time - -from typing import Callable, NamedTuple -from core.schains.process import ProcessReport - -logger = logging.getLogger(__name__) - - -MIN_SCHAIN_MONITOR_SLEEP_INTERVAL = 20 -MAX_SCHAIN_MONITOR_SLEEP_INTERVAL = 40 - -SKALED_PIPELINE_SLEEP = 2 -CONFIG_PIPELINE_SLEEP = 3 -STUCK_TIMEOUT = 60 * 60 * 2 -SHUTDOWN_INTERVAL = 60 * 10 - - -class Pipeline(NamedTuple): - name: str - job: Callable - - -def run_pipelines( - pipelines: list[Pipeline], - process_report: ProcessReport, - once: bool = False, - stuck_timeout: int = STUCK_TIMEOUT, - shutdown_interval: int = SHUTDOWN_INTERVAL, -) -> None: - init_ts = time.time() - - heartbeat_queues = [queue.Queue() for _ in range(len(pipelines))] - terminating_events = [threading.Event() for _ in range(len(pipelines))] - heartbeat_ts = [init_ts for _ in range(len(pipelines))] - - threads = [ - threading.Thread( - name=pipeline.name, - target=keep_pipeline, - args=[heartbeat_queue, terminating_event, pipeline.job], - ) - for heartbeat_queue, terminating_event, pipeline in zip( - heartbeat_queues, terminating_events, pipelines - ) - ] - - for th in threads: - th.start() - - stuck = False - while not stuck: - for pindex, heartbeat_queue in enumerate(heartbeat_queues): - if not heartbeat_queue.empty(): - heartbeat_ts[pindex] = heartbeat_queue.get() - ts = int(time.time()) - if ts - heartbeat_ts[pindex] > stuck_timeout: - logger.warning( - '%s pipeline has stucked (last heartbeat %d)', - pipelines[pindex].name, - heartbeat_ts[pindex], - ) - stuck = True - break - if once and all((lambda ts: ts > init_ts, heartbeat_ts)): - logger.info('Successfully completed requested single run') - break - ts = int(time.time()) - process_report.ts = ts - - logger.info('Terminating all pipelines') - for event in terminating_events: - if not event.is_set(): - event.set() - if stuck: - logger.info('Joining threads with timeout') - for thread in threads: - thread.join(timeout=shutdown_interval) - process_report.ts = 0 - logger.warning('Stuck was detected') - - logger.info('Finishing with pipelines') - - -def keep_pipeline( - reporting_queue: 
queue.Queue, terminate: threading.Event, pipeline: Callable -) -> None: - while not terminate.is_set(): - logger.info('Running pipeline') - try: - pipeline() - except Exception: - logger.exception('Pipeline run failed') - terminate.set() - reporting_queue.put(time.time()) - sleep_for_a_while() - - -def sleep_for_a_while(): - schain_monitor_sleep = random.randint( - MIN_SCHAIN_MONITOR_SLEEP_INTERVAL, MAX_SCHAIN_MONITOR_SLEEP_INTERVAL - ) - logger.info('Monitor iteration completed, sleeping for %d', schain_monitor_sleep) - time.sleep(schain_monitor_sleep) diff --git a/tests/schains/monitor/main_test.py b/tests/schains/monitor/main_test.py index 7b6a11eec..1e5a1389e 100644 --- a/tests/schains/monitor/main_test.py +++ b/tests/schains/monitor/main_test.py @@ -5,15 +5,13 @@ import shutil import time from concurrent.futures import Future -from multiprocessing import Process from typing import Callable import pytest from core.schains.firewall.types import IpRange from core.schains.firewall.utils import get_sync_agent_ranges -from core.schains.monitor.main import Pipeline, run_pipelines -from core.schains.process import ProcessReport, terminate_process +from core.schains.process import ProcessReport from core.schains.monitor.tasks import execute_tasks, ITask from tools.configs.schains import SCHAINS_DIR_PATH from tools.helper import is_node_part_of_chain @@ -66,68 +64,6 @@ def test_is_node_part_of_chain(skale, schain_on_contracts, node_config): assert not chain_on_node -def test_run_pipelines(tmp_dir, _schain_name): - def simple_pipeline(index: int): - logging.info('Running simple pipeline %d', index) - time.sleep(1) - logging.info('Finishing simple pipeline %d', index) - - def stuck_pipeline(index: int): - logging.info('Running stuck pipeline %d', index) - while True: - logging.info('Stuck pipeline %d beat', index) - time.sleep(2) - - process_report = ProcessReport(name=_schain_name) - - target = functools.partial( - run_pipelines, - pipelines=[ - Pipeline(name='healthy0', job=functools.partial(simple_pipeline, index=0)), - Pipeline(name='healthy1', job=functools.partial(simple_pipeline, index=1)), - ], - process_report=process_report, - once=True, - stuck_timeout=5, - shutdown_interval=10, - ) - - terminated = False - monitor_process = Process(target=target) - try: - monitor_process.start() - monitor_process.join() - finally: - if monitor_process.is_alive(): - terminated = True - terminate_process(monitor_process.ident) - assert not terminated - - target = functools.partial( - run_pipelines, - pipelines=[ - Pipeline(name='healthy', job=functools.partial(simple_pipeline, index=0)), - Pipeline(name='stuck', job=functools.partial(stuck_pipeline, index=1)), - ], - process_report=process_report, - stuck_timeout=5, - shutdown_interval=10, - ) - - monitor_process = Process(target=target) - terminated = False - - try: - monitor_process.start() - monitor_process.join(timeout=50) - finally: - if monitor_process.is_alive(): - terminated = True - terminate_process(monitor_process.ident) - - assert terminated - - def test_execute_tasks(tmp_dir, _schain_name): def run_stuck_pipeline(index: int) -> None: logging.info('Running stuck pipeline %d', index) From 5665cc85474095075f82d1122e6bd9c85435c067 Mon Sep 17 00:00:00 2001 From: badrogger Date: Fri, 27 Sep 2024 15:36:12 +0000 Subject: [PATCH 27/32] Reduce task sleeping interval. Improve logs. 
---
 core/schains/monitor/action.py         | 1 +
 core/schains/monitor/config_monitor.py | 2 ++
 core/schains/monitor/main.py           | 2 +-
 core/schains/monitor/skaled_monitor.py | 2 ++
 core/schains/monitor/tasks.py          | 6 +++---
 5 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/core/schains/monitor/action.py b/core/schains/monitor/action.py
index 5e5fb301e..75dd0d5d1 100644
--- a/core/schains/monitor/action.py
+++ b/core/schains/monitor/action.py
@@ -229,6 +229,7 @@ def upstream_config(self) -> bool:
             result = False
             if not self.cfm.upstream_config_exists() or \
                     new_config != self.cfm.latest_upstream_config:
+                logger.info('Saving new config')
                 rotation_id = self.rotation_data['rotation_id']
                 logger.info(
                     'Saving new upstream config rotation_id: %d, ips: %s',
diff --git a/core/schains/monitor/config_monitor.py b/core/schains/monitor/config_monitor.py
index 47587a1bc..639689870 100644
--- a/core/schains/monitor/config_monitor.py
+++ b/core/schains/monitor/config_monitor.py
@@ -45,6 +45,8 @@ def run(self):
             self.execute()
             self.am.log_executed_blocks()
             self.am._upd_last_seen()
+        except Exception as e:
+            logger.error('Config monitor type failed %s', typename, exc_info=e)
         finally:
             logger.info('Config monitor type finished %s', typename)
diff --git a/core/schains/monitor/main.py b/core/schains/monitor/main.py
index 8042a8cd0..a363659e2 100644
--- a/core/schains/monitor/main.py
+++ b/core/schains/monitor/main.py
@@ -187,7 +187,7 @@
 class SkaledTask(ITask):
     NAME = 'skaled'
-    STUCK_TIMEOUT = 3600  # 1 hours
+    STUCK_TIMEOUT = 3600  # 1 hour

     def __init__(
         self,
diff --git a/core/schains/monitor/skaled_monitor.py b/core/schains/monitor/skaled_monitor.py
index a946ca2be..a0fe5c99b 100644
--- a/core/schains/monitor/skaled_monitor.py
+++ b/core/schains/monitor/skaled_monitor.py
@@ -55,6 +55,8 @@ def run(self):
             self.am._upd_schain_record()
             self.am.log_executed_blocks()
             self.am._upd_last_seen()
+        except Exception as e:
+            logger.error('Skaled monitor type failed %s', typename, exc_info=e)
         finally:
             logger.info('Skaled monitor type finished %s', typename)
diff --git a/core/schains/monitor/tasks.py b/core/schains/monitor/tasks.py
index e591e3b48..d7d450f4b 100644
--- a/core/schains/monitor/tasks.py
+++ b/core/schains/monitor/tasks.py
@@ -10,8 +10,7 @@
 logger = logging.getLogger(__name__)


-STUCK_TIMEOUT = 60 * 60 * 2
-SLEEP_INTERVAL = 60 * 10
+SLEEP_INTERVAL = 10


 class Pipeline(NamedTuple):
@@ -68,6 +67,7 @@ def execute_tasks(
     process_report: ProcessReport,
     sleep_interval: int = SLEEP_INTERVAL,
 ) -> None:
+    logger.info('Running tasks %s', tasks)
     with ThreadPoolExecutor(max_workers=len(tasks), thread_name_prefix='mon') as executor:
         stucked = []
         while True:
@@ -82,7 +82,7 @@
                 logger.info('Canceling future for %s', task.name)
                 canceled = task.future.cancel()
                 if not canceled:
-                    logger.warning('Stuck detected for job {task.name}')
+                    logger.warning('Stuck detected for job %s', task.name)
                     task.start_ts = -1
                     stucked.append(task.name)
             time.sleep(sleep_interval)
From f7fa83139f56ad5b910c83e1a1166c55457c47c0 Mon Sep 17 00:00:00 2001
From: badrogger
Date: Mon, 30 Sep 2024 16:21:50 +0000
Subject: [PATCH 28/32] Remove unused classes

---
 core/schains/cleaner.py       |  5 ++---
 core/schains/monitor/tasks.py | 13 ++-----------
 2 files changed, 4 insertions(+), 14 deletions(-)

diff --git a/core/schains/cleaner.py b/core/schains/cleaner.py
index 6141883ac..53c9bd437 100644
--- a/core/schains/cleaner.py
+++ b/core/schains/cleaner.py
@@ -109,12 +109,11 @@ def monitor(skale, node_config, dutils=None):
     for 
schain_name in schains_on_node: if schain_name not in schain_names_on_contracts: - logger.warning(f'sChain {schain_name} was found on node, but not on contracts: \ -{schain_names_on_contracts}, going to remove it!') + logger.warning('%s was found on node, but not on contracts: %s, trying to cleanup', schain_name, schain_names_on_contracts) try: ensure_schain_removed(skale, schain_name, node_config.id, dutils=dutils) except Exception: - logger.exception(f'sChain removal {schain_name} failed') + logger.exception('%s removal failed', schain_name) logger.info('Cleanup procedure finished') diff --git a/core/schains/monitor/tasks.py b/core/schains/monitor/tasks.py index d7d450f4b..65e82ea32 100644 --- a/core/schains/monitor/tasks.py +++ b/core/schains/monitor/tasks.py @@ -2,7 +2,7 @@ import logging import time from concurrent.futures import Future, ThreadPoolExecutor -from typing import Callable, NamedTuple +from typing import Callable from core.schains.process import ProcessReport @@ -13,11 +13,6 @@ SLEEP_INTERVAL = 10 -class Pipeline(NamedTuple): - name: str - job: Callable - - class ITask(metaclass=abc.ABCMeta): @property @abc.abstractmethod @@ -58,17 +53,13 @@ def start_ts(self, value: int) -> None: pass -class StuckMonitorError(Exception): - pass - - def execute_tasks( tasks: list[ITask], process_report: ProcessReport, sleep_interval: int = SLEEP_INTERVAL, ) -> None: logger.info('Running tasks %s', tasks) - with ThreadPoolExecutor(max_workers=len(tasks), thread_name_prefix='mon') as executor: + with ThreadPoolExecutor(max_workers=len(tasks), thread_name_prefix='T') as executor: stucked = [] while True: for index, task in enumerate(tasks): From 812a5798f39077ef066c7c549f48e2bcf3a4ea01 Mon Sep 17 00:00:00 2001 From: badrogger Date: Tue, 1 Oct 2024 15:02:11 +0000 Subject: [PATCH 29/32] Add additional monitor tasks tests --- core/schains/monitor/main.py | 1 + core/schains/monitor/tasks.py | 2 +- tests/schains/monitor/main_test.py | 60 +++++++++++++++++++++++++++--- web/models/schain.py | 4 +- 4 files changed, 59 insertions(+), 8 deletions(-) diff --git a/core/schains/monitor/main.py b/core/schains/monitor/main.py index a363659e2..9728bb346 100644 --- a/core/schains/monitor/main.py +++ b/core/schains/monitor/main.py @@ -202,6 +202,7 @@ def __init__( self.node_config = node_config self.dutils = dutils self._future = Future() + self._start_ts = 0 self.stream_version = stream_version @property diff --git a/core/schains/monitor/tasks.py b/core/schains/monitor/tasks.py index 65e82ea32..e15ce06aa 100644 --- a/core/schains/monitor/tasks.py +++ b/core/schains/monitor/tasks.py @@ -63,7 +63,7 @@ def execute_tasks( stucked = [] while True: for index, task in enumerate(tasks): - if not task.future.running() and task.needed: + if not task.future.running() and task.needed and len(stucked) == 0: task.start_ts = int(time.time()) logger.info('Starting task %s at %d', task.name, task.start_ts) pipeline = task.create_pipeline() diff --git a/tests/schains/monitor/main_test.py b/tests/schains/monitor/main_test.py index 1e5a1389e..242fbe43e 100644 --- a/tests/schains/monitor/main_test.py +++ b/tests/schains/monitor/main_test.py @@ -6,15 +6,18 @@ import time from concurrent.futures import Future from typing import Callable +from unittest import mock import pytest from core.schains.firewall.types import IpRange from core.schains.firewall.utils import get_sync_agent_ranges from core.schains.process import ProcessReport +from core.schains.monitor.main import ConfigTask, SkaledTask from core.schains.monitor.tasks import 
execute_tasks, ITask from tools.configs.schains import SCHAINS_DIR_PATH from tools.helper import is_node_part_of_chain +from web.models.schain import upsert_schain_record @pytest.fixture @@ -64,6 +67,57 @@ def test_is_node_part_of_chain(skale, schain_on_contracts, node_config): assert not chain_on_node +def test_config_task(skale, skale_ima, schain_db, schain_on_contracts, node_config): + stream_version = '2.3.0' + config_task = ConfigTask( + schain_name=schain_on_contracts, + skale=skale, + skale_ima=skale_ima, + node_config=node_config, + stream_version=stream_version, + ) + assert config_task.needed + skale_ima.linker.has_schain = mock.Mock(return_value=True) + + def get_monitor_mock(*args, **kwargs): + result = mock.MagicMock() + result.__name__ = 'TestConfigMonitor' + return result + + with mock.patch('core.schains.monitor.main.RegularConfigMonitor', get_monitor_mock): + pipeline = config_task.create_pipeline() + pipeline() + + +def test_skaled_task(skale, schain_db, schain_on_contracts, node_config, dutils): + record = upsert_schain_record(schain_on_contracts) + stream_version = '2.3.0' + skaled_task = SkaledTask( + schain_name=schain_on_contracts, + skale=skale, + node_config=node_config, + stream_version=stream_version, + dutils=dutils, + ) + assert not skaled_task.needed + assert skaled_task.name == 'skaled' + assert skaled_task.start_ts == 0 + assert skaled_task.stuck_timeout == 3600 + + record.set_config_version(stream_version) + assert skaled_task.needed + + def get_monitor_mock(*args, **kwargs): + result = mock.MagicMock() + result.__name__ = 'TestSkaledMonitor' + return result + + with mock.patch('core.schains.monitor.main.get_skaled_monitor', get_monitor_mock): + with mock.patch('core.schains.monitor.main.notify_checks'): + pipeline = skaled_task.create_pipeline() + pipeline() + + def test_execute_tasks(tmp_dir, _schain_name): def run_stuck_pipeline(index: int) -> None: logging.info('Running stuck pipeline %d', index) @@ -127,11 +181,7 @@ def needed(self) -> bool: process_report = ProcessReport(name=_schain_name) tasks = [StuckedTask(0), NotNeededTask(1)] - execute_tasks( - tasks=tasks, - process_report=process_report, - sleep_interval=1 - ) + execute_tasks(tasks=tasks, process_report=process_report, sleep_interval=1) print(tasks[0], tasks[1]) assert tasks[0].start_ts == -1 diff --git a/web/models/schain.py b/web/models/schain.py index 264904f5a..5f92e45ca 100644 --- a/web/models/schain.py +++ b/web/models/schain.py @@ -31,7 +31,7 @@ logger = logging.getLogger(__name__) DEFAULT_CONFIG_VERSION = '0.0.0' -RETRY_ATTEMPTS = 10 +RETRY_ATTEMPTS = 5 TIMEOUTS = [2 ** p for p in range(RETRY_ATTEMPTS)] @@ -43,7 +43,7 @@ def wrapper(cls, *args, **kwargs): try: result = func(cls, *args, **kwargs) except OperationalError as e: - logger.exception('DB operational error') + logger.error('DB operational error. 
Sleeping %d', timeout, exc_info=e) error = e time.sleep(timeout) else: From 0fa074c7dbd9b5e72a4868a2db0a8f6b246c7627 Mon Sep 17 00:00:00 2001 From: badrogger Date: Tue, 1 Oct 2024 15:55:53 +0000 Subject: [PATCH 30/32] Fix linter --- core/schains/cleaner.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/core/schains/cleaner.py b/core/schains/cleaner.py index 53c9bd437..f006d439a 100644 --- a/core/schains/cleaner.py +++ b/core/schains/cleaner.py @@ -109,7 +109,11 @@ def monitor(skale, node_config, dutils=None): for schain_name in schains_on_node: if schain_name not in schain_names_on_contracts: - logger.warning('%s was found on node, but not on contracts: %s, trying to cleanup', schain_name, schain_names_on_contracts) + logger.warning( + '%s was found on node, but not on contracts: %s, trying to cleanup', + schain_name, + schain_names_on_contracts, + ) try: ensure_schain_removed(skale, schain_name, node_config.id, dutils=dutils) except Exception: @@ -240,9 +244,7 @@ def cleanup_schain( ) check_status = checks.get_all() if check_status['skaled_container'] or is_exited( - schain_name, - container_type=ContainerType.schain, - dutils=dutils + schain_name, container_type=ContainerType.schain, dutils=dutils ): remove_schain_container(schain_name, dutils=dutils) if check_status['volume']: @@ -259,9 +261,7 @@ def cleanup_schain( rc.cleanup() if estate is not None and estate.ima_linked: if check_status.get('ima_container', False) or is_exited( - schain_name, - container_type=ContainerType.ima, - dutils=dutils + schain_name, container_type=ContainerType.ima, dutils=dutils ): remove_ima_container(schain_name, dutils=dutils) if check_status['config_dir']: From be712d6b1835dddad1e1f857ee104f9884d5a78b Mon Sep 17 00:00:00 2001 From: badrogger Date: Wed, 16 Oct 2024 11:56:24 +0000 Subject: [PATCH 31/32] Remove unused code --- core/schains/monitor/main.py | 12 +++------ core/schains/monitor/tasks.py | 4 +-- core/schains/process.py | 1 - core/schains/task.py | 51 ----------------------------------- tests/schains/task_test.py | 33 ----------------------- 5 files changed, 5 insertions(+), 96 deletions(-) delete mode 100644 core/schains/task.py delete mode 100644 tests/schains/task_test.py diff --git a/core/schains/monitor/main.py b/core/schains/monitor/main.py index 9728bb346..b3c589724 100644 --- a/core/schains/monitor/main.py +++ b/core/schains/monitor/main.py @@ -52,12 +52,6 @@ from web.models.schain import SChainRecord, upsert_schain_record -MIN_SCHAIN_MONITOR_SLEEP_INTERVAL = 20 -MAX_SCHAIN_MONITOR_SLEEP_INTERVAL = 40 - -STUCK_TIMEOUT = 60 * 60 * 2 -SHUTDOWN_INTERVAL = 60 * 10 - logger = logging.getLogger(__name__) @@ -187,7 +181,7 @@ def run_skaled_pipeline( class SkaledTask(ITask): NAME = 'skaled' - STUCK_TIMEOUT = 3600 # 1 hour + STUCK_TIMEOUT_SECONDS = 3600 # 1 hour def __init__( self, @@ -211,7 +205,7 @@ def name(self) -> str: @property def stuck_timeout(self) -> int: - return self.STUCK_TIMEOUT + return self.STUCK_TIMEOUT_SECONDS @property def future(self) -> Future: @@ -248,7 +242,7 @@ def create_pipeline(self) -> Callable: class ConfigTask(ITask): NAME = 'config' - STUCK_TIMEOUT = 60 * 60 * 2 + STUCK_TIMEOUT_SECONDS = 60 * 60 * 2 def __init__( self, diff --git a/core/schains/monitor/tasks.py b/core/schains/monitor/tasks.py index e15ce06aa..7fcc86f82 100644 --- a/core/schains/monitor/tasks.py +++ b/core/schains/monitor/tasks.py @@ -10,7 +10,7 @@ logger = logging.getLogger(__name__) -SLEEP_INTERVAL = 10 +SLEEP_INTERVAL_SECONDS = 10 class 
ITask(metaclass=abc.ABCMeta): @@ -56,7 +56,7 @@ def start_ts(self, value: int) -> None: def execute_tasks( tasks: list[ITask], process_report: ProcessReport, - sleep_interval: int = SLEEP_INTERVAL, + sleep_interval: int = SLEEP_INTERVAL_SECONDS, ) -> None: logger.info('Running tasks %s', tasks) with ThreadPoolExecutor(max_workers=len(tasks), thread_name_prefix='T') as executor: diff --git a/core/schains/process.py b/core/schains/process.py index 387c767a0..1da149995 100644 --- a/core/schains/process.py +++ b/core/schains/process.py @@ -33,7 +33,6 @@ logger = logging.getLogger(__name__) -TIMEOUT_COEFFICIENT = 2.2 P_KILL_WAIT_TIMEOUT = 60 diff --git a/core/schains/task.py b/core/schains/task.py deleted file mode 100644 index b95a8eb92..000000000 --- a/core/schains/task.py +++ /dev/null @@ -1,51 +0,0 @@ -import logging -import time -from concurrent.futures import Future, ThreadPoolExecutor -from typing import Callable, List, Optional - -logger = logging.getLogger(__name__) - - -class Task: - def __init__( - self, - name: str, - action: Callable, - index: int = 0, - sleep: int = 2 - ) -> None: - self.name = name - self.index = index - self.action = action - self.sleep = sleep - - def run(self) -> None: - try: - self.action() - except Exception as e: - logger.exception('Task %s failed with %s', self.name, e) - logger.info('Sleeping after task execution for %d', self.sleep) - time.sleep(self.sleep) - - -def keep_tasks_running( - executor: ThreadPoolExecutor, - tasks: List[Task], - futures: List[Optional[Future]] -) -> None: - for i, task in enumerate(tasks): - future = futures[i] - if future is not None and not future.running(): - result = future.result() - logger.info('Task %s finished with %s', task.name, result) - if future is None or not future.running(): - logger.info('Running task %s', task.name) - futures[i] = executor.submit(task.run) - - -def run_tasks(name: str, tasks: List[Task]) -> None: - with ThreadPoolExecutor(max_workers=len(tasks), thread_name_prefix='T') as executor: - futures: List[Optional[Future]] = [None for i in range(len(tasks))] - while True: - keep_tasks_running(executor, tasks, futures) - time.sleep(30) diff --git a/tests/schains/task_test.py b/tests/schains/task_test.py deleted file mode 100644 index f5c574094..000000000 --- a/tests/schains/task_test.py +++ /dev/null @@ -1,33 +0,0 @@ -import functools -import time - -import pytest - -from core.schains.task import run_tasks, Task - -ITERATIONS = 10 -SCHAINS_NUM = 10 - - -class StopActionError(Exception): - pass - - -def action(name): - for i in range(ITERATIONS): - time.sleep(2) - raise StopActionError(f'Stopping {name}') - - -@pytest.mark.skip -def test_tasks(): - tasks = [ - Task( - f'test-schain-{i}', - functools.partial(action, name=f'test-schain-{i}'), - i - ) - for i in range(SCHAINS_NUM) - ] - run_tasks(tasks=tasks) - time.sleep(3) From 0ac7970614a007627033ac01f60317b9013bb7f0 Mon Sep 17 00:00:00 2001 From: badrogger Date: Wed, 16 Oct 2024 11:56:55 +0000 Subject: [PATCH 32/32] Run tests only for push --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f5cc084e9..59296eb3d 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -1,5 +1,5 @@ name: Test -on: [push, pull_request] +on: [push] env: ETH_PRIVATE_KEY: ${{ secrets.ETH_PRIVATE_KEY }} SCHAIN_TYPE: ${{ secrets.SCHAIN_TYPE }}
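For reference, below is a minimal sketch of how a custom monitor task could plug into
the execute_tasks() loop that this series converges on. The HeartbeatTask class, its
job body, and the 'example-chain' report name are hypothetical illustrations; only
ITask, ProcessReport, and execute_tasks come from the patches above, and the sketch
assumes their final (post-PATCH 31) shapes.

import logging
import time
from concurrent.futures import Future
from typing import Callable

from core.schains.monitor.tasks import execute_tasks, ITask
from core.schains.process import ProcessReport

logger = logging.getLogger(__name__)


class HeartbeatTask(ITask):
    """Hypothetical task that logs a heartbeat; shows the ITask contract."""

    NAME = 'heartbeat'
    STUCK_TIMEOUT_SECONDS = 60

    def __init__(self) -> None:
        self._future: Future = Future()
        self._start_ts = 0

    @property
    def name(self) -> str:
        return self.NAME

    @property
    def stuck_timeout(self) -> int:
        # execute_tasks() cancels the future if a run exceeds this window
        return self.STUCK_TIMEOUT_SECONDS

    @property
    def future(self) -> Future:
        return self._future

    @future.setter
    def future(self, value: Future) -> None:
        self._future = value

    @property
    def start_ts(self) -> int:
        return self._start_ts

    @start_ts.setter
    def start_ts(self, value: int) -> None:
        self._start_ts = value

    @property
    def needed(self) -> bool:
        # Always schedulable; real tasks gate on chain/config state here
        return True

    def create_pipeline(self) -> Callable:
        def job() -> None:
            logger.info('Heartbeat at %d', int(time.time()))
        return job


if __name__ == '__main__':
    # Runs indefinitely: execute_tasks() resubmits the job whenever it is not
    # running and sleeps sleep_interval seconds between scheduling rounds.
    # ProcessReport writes process.json under SCHAINS_DIR_PATH for its name.
    execute_tasks(
        tasks=[HeartbeatTask()],
        process_report=ProcessReport('example-chain'),
        sleep_interval=1,
    )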