From 551702ec1f7f519fbf455b36f8ffadb21bcf8154 Mon Sep 17 00:00:00 2001 From: HudsonGraeme Date: Thu, 12 Dec 2024 15:59:07 +0000 Subject: [PATCH 01/20] Bump version to 6.0.0 --- neurons/__init__.py | 2 +- neurons/_validator/scoring/weights.py | 5 +++-- neurons/constants.py | 2 +- .../input.json | 1 - 4 files changed, 5 insertions(+), 5 deletions(-) delete mode 100644 neurons/deployment_layer/model_33b92394b18412622adad75733a6fc659b4e202b01ee8a5465958a6bad8ded62/input.json diff --git a/neurons/__init__.py b/neurons/__init__.py index a949afc4..8620cca3 100644 --- a/neurons/__init__.py +++ b/neurons/__init__.py @@ -2,4 +2,4 @@ This version number is used to trigger automatic updates. """ -__version__ = "5.1.4" +__version__ = "6.0.0" diff --git a/neurons/_validator/scoring/weights.py b/neurons/_validator/scoring/weights.py index 85ab011b..e6a4dd79 100644 --- a/neurons/_validator/scoring/weights.py +++ b/neurons/_validator/scoring/weights.py @@ -68,10 +68,11 @@ def update_weights(self, scores: torch.Tensor) -> bool: bt.logging.info("Updating weights") - bt.logging.info("Updating weights") weights = torch.zeros(self.metagraph.n) nonzero_indices = scores.nonzero() - print(weights, nonzero_indices, scores) + bt.logging.debug( + f"Weights: {weights}, Nonzero indices: {nonzero_indices}, Scores: {scores}" + ) if nonzero_indices.sum() > 0: weights[nonzero_indices] = scores[nonzero_indices] diff --git a/neurons/constants.py b/neurons/constants.py index 29bc090b..a93e4b27 100644 --- a/neurons/constants.py +++ b/neurons/constants.py @@ -32,7 +32,7 @@ # Shift in seconds to apply to the minimum response time for vertical asymptote adjustment MINIMUM_SCORE_SHIFT = 0.0 # Weights version hyperparameter -WEIGHTS_VERSION = 1514 +WEIGHTS_VERSION = 1600 # Rate limit for weight updates WEIGHT_RATE_LIMIT: int = 100 # Delay between requests diff --git a/neurons/deployment_layer/model_33b92394b18412622adad75733a6fc659b4e202b01ee8a5465958a6bad8ded62/input.json 
b/neurons/deployment_layer/model_33b92394b18412622adad75733a6fc659b4e202b01ee8a5465958a6bad8ded62/input.json deleted file mode 100644 index df3b0afe..00000000 --- a/neurons/deployment_layer/model_33b92394b18412622adad75733a6fc659b4e202b01ee8a5465958a6bad8ded62/input.json +++ /dev/null @@ -1 +0,0 @@ -{"input_data": [[0.7794686555862427, 0.6809455156326294, 0.3775910437107086, 0.8429582715034485, 0.8292675614356995, 0.0882205218076706, 0.4011983871459961, 0.9323041439056396, 0.7773856520652771, 0.7941298484802246, 0.03448842838406563, 0.04747951030731201, 0.5277414321899414, 0.8536348342895508, 0.17029573023319244, 0.8829934000968933, 0.6650581955909729, 0.9269793033599854, 0.17329521477222443, 0.48585787415504456, 0.7039809226989746, 0.006828826852142811, 0.34701061248779297, 0.458881139755249, 0.09565884619951248, 0.5835137367248535, 0.6649661660194397, 0.8946686387062073, 0.7309051156044006, 0.6489992737770081, 0.6777242422103882, 0.17124499380588531, 0.5306597948074341, 0.46822836995124817, 0.9834725856781006, 0.9121965765953064, 0.3174760341644287, 0.9013152122497559, 0.4911682605743408, 0.6072609424591064, 0.2578725814819336, 0.5314511656761169, 0.24931003153324127, 0.14604689180850983, 0.43746691942214966, 0.8786545991897583, 0.2085399478673935, 0.2983964681625366, 0.9710108041763306, 0.31453123688697815, 0.595678448677063, 0.5057293772697449, 0.3340931534767151, 0.08662565052509308, 0.047077156603336334, 0.3515186011791229, 0.08305668830871582, 0.7111360430717468, 0.9252983927726746, 0.8941932320594788, 0.525164008140564, 0.5602061152458191, 0.03857544809579849, 0.6424386501312256, 0.16259750723838806, 0.6345444917678833, 0.022738732397556305, 0.6852360963821411, 0.10518483817577362, 0.025503499433398247, 0.9316177368164062, 0.14264649152755737, 0.8343826532363892, 0.7791666984558105, 0.6261012554168701, 0.21122364699840546, 0.4291432201862335, 0.9473956823348999, 0.9014989137649536, 0.9135263562202454, 0.7051522731781006, 0.16453231871128082, 
0.8735365867614746, 0.5426819920539856, 0.03251492604613304, 0.25134673714637756, 0.569119930267334, 0.5371896624565125, 0.8012118339538574, 0.461309552192688, 0.40140053629875183, 0.941059947013855, 0.4510308504104614, 0.20913153886795044, 0.7081170082092285, 0.6046028733253479, 0.7483329772949219, 0.2640364170074463, 0.9350386261940002, 0.787164032459259, 0.5113464593887329, 0.43512842059135437, 0.16646911203861237, 0.6330782175064087, 0.2719026505947113, 0.6378305554389954, 0.4318429231643677, 0.916333794593811, 0.5829175114631653, 0.738396167755127, 0.3215373158454895, 0.041321028023958206, 0.17013895511627197, 0.00018949093646369874, 0.06506246328353882, 0.5927417278289795, 0.3321612477302551, 0.47252190113067627, 0.38523000478744507, 0.22107788920402527, 0.8691383004188538, 0.026992585510015488, 0.47141480445861816, 0.7345831990242004, 0.907618522644043, 0.3415154814720154, 0.5832733511924744, 0.4799695312976837, 0.1212652251124382, 0.9898389577865601, 0.007558291312307119, 0.5798917412757874, 0.4170285761356354, 0.9984191060066223, 0.16392384469509125, 0.7193905711174011, 0.24285921454429626, 0.3867260813713074, 0.4834059774875641, 0.8288545608520508, 0.7040031552314758, 0.13600602746009827, 0.8382794857025146, 0.37359189987182617, 0.5664068460464478, 0.20983566343784332, 0.4544571340084076, 0.29028794169425964, 0.0865643098950386, 0.07884003967046738, 0.5577963590621948, 0.6860281825065613, 0.42827513813972473, 0.002777600660920143, 0.7569222450256348, 0.8636320233345032, 0.9366362690925598, 0.7771576642990112, 0.10189391672611237, 0.4611443877220154, 0.3100345730781555, 0.7940906286239624, 0.07580625265836716, 0.02719643898308277, 0.21641875803470612, 0.9446501731872559, 0.1855611652135849, 0.37111586332321167, 0.20239779353141785, 0.0009318959200754762, 0.5512221455574036, 0.2911645770072937, 0.19933727383613586, 0.7942936420440674, 0.9931385517120361, 0.32307130098342896, 0.9285454750061035, 0.7810187935829163, 0.7238433361053467, 0.8329330086708069, 
0.8985331654548645, 0.7131734490394592, 0.6271097660064697, 0.26631054282188416, 0.4925815463066101, 0.8601160049438477, 0.7875863313674927, 0.5615888833999634, 0.5976184010505676, 0.13564826548099518, 0.04655137658119202, 0.3126538395881653, 0.7295407056808472, 0.38343703746795654, 0.11804111301898956, 0.488350510597229, 0.21235623955726624, 0.7206555008888245, 0.4632780849933624, 0.01701243594288826, 0.44720444083213806, 0.3848167955875397, 0.023824729025363922, 0.24855785071849823, 0.04190967231988907, 0.8140885829925537, 0.7750052809715271, 0.7265616059303284, 0.7512474060058594, 0.3908788561820984, 0.5953333973884583, 0.07212728261947632, 0.901211142539978, 0.4891035854816437, 0.396647572517395, 0.32557809352874756, 0.07493803650140762, 0.9222293496131897, 0.6560570597648621, 0.3777931332588196, 0.5689038038253784, 0.15723179280757904, 0.7180094122886658, 0.28270334005355835, 0.022937191650271416, 0.11138973385095596, 0.7963477373123169, 0.6905400156974792, 0.8882362842559814, 0.7238687872886658, 0.5844522714614868, 0.06532762199640274, 0.19095835089683533, 0.08516211807727814, 0.054629597812891006, 0.3365156948566437, 0.5767698884010315, 0.07983719557523727, 0.1467011421918869, 0.8402654528617859, 0.16204577684402466, 0.6307146549224854, 0.6145657300949097, 0.1417083889245987, 0.5266880393028259, 0.9919960498809814, 0.9775968790054321, 0.25797325372695923, 0.20875971019268036, 0.355589896440506, 0.43516939878463745, 0.1908176690340042, 0.2842778265476227, 0.48906922340393066, 0.6523699164390564, 0.628368616104126], [0.10000000149011612], [0.4000000059604645], [0.699999988079071], [0.20000000298023224], [0.10000000149011612], [1584917389]]} \ No newline at end of file From 39278953fdfe243e019c4d6227f75bac55df1c3f Mon Sep 17 00:00:00 2001 From: Bondarenko Oleg Date: Wed, 18 Dec 2024 00:36:09 +0200 Subject: [PATCH 02/20] Blacklist two Jolt model IDs (#39) --- neurons/constants.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/neurons/constants.py 
b/neurons/constants.py index a93e4b27..43834a26 100644 --- a/neurons/constants.py +++ b/neurons/constants.py @@ -17,6 +17,8 @@ "55de10a6bcf638af4bc79901d63204a9e5b1c6534670aa03010bae6045e3d0e8", "9998a12b8194d3e57d332b484ede57c3d871d42a176456c4e10da2995791d181", "ed8ba401d709ee31f6b9272163c71451da171c7d71800313fe5db58d0f6c483a", + "1d60d545b7c5123fd60524dcbaf57081ca7dc4a9ec36c892927a3153328d17c0", + "37320fc74fec80805eedc8e92baf3c58842a2cb2a4ae127ad6e930f0c8441c7a", ] # The maximum timespan allowed for miners to respond to a query From cf0fc98066521d909a8faa94a86c1c7e4f704c91 Mon Sep 17 00:00:00 2001 From: HudsonGraeme Date: Wed, 18 Dec 2024 06:03:19 +0000 Subject: [PATCH 03/20] Upgrade bittensor to 8.5.1 - Improve weight setting logic --- neurons/_validator/core/api.py | 2 +- neurons/_validator/scoring/weights.py | 91 ++++++++++++++++----------- neurons/miner.py | 1 + neurons/utils/system.py | 77 ++++++++++++++++------- neurons/validator.py | 1 + requirements.txt | 8 +-- 6 files changed, 114 insertions(+), 66 deletions(-) diff --git a/neurons/_validator/core/api.py b/neurons/_validator/core/api.py index f056e9c9..23d4a9f0 100644 --- a/neurons/_validator/core/api.py +++ b/neurons/_validator/core/api.py @@ -1,5 +1,5 @@ import uvicorn -from bittensor.axon import FastAPIThreadedServer +from bittensor.core.axon import FastAPIThreadedServer import traceback from fastapi import APIRouter, FastAPI import bittensor as bt diff --git a/neurons/_validator/scoring/weights.py b/neurons/_validator/scoring/weights.py index e6a4dd79..7fcdc4a0 100644 --- a/neurons/_validator/scoring/weights.py +++ b/neurons/_validator/scoring/weights.py @@ -5,8 +5,7 @@ from constants import WEIGHT_RATE_LIMIT, WEIGHTS_VERSION from _validator.utils.logging import log_weights from _validator.utils.proof_of_weights import ProofOfWeightsItem - -from utils.system import timeout_with_multiprocess +from utils.system import timeout_with_multiprocess_retry @dataclass @@ -31,13 +30,6 @@ class WeightsManager: 
last_update_weights_block: int = 0 proof_of_weights_queue: list[ProofOfWeightsItem] = field(default_factory=list) - def should_update_weights(self) -> bool: - current_block = self.subtensor.get_current_block() - bt.logging.trace(f"Current block: {current_block}") - bt.logging.trace(f"Last update weights block: {self.last_update_weights_block}") - return current_block - self.last_update_weights_block >= WEIGHT_RATE_LIMIT - - @timeout_with_multiprocess(seconds=60) def set_weights(self, netuid, wallet, uids, weights, version_key): return self.subtensor.set_weights( netuid=netuid, @@ -48,48 +40,73 @@ def set_weights(self, netuid, wallet, uids, weights, version_key): version_key=version_key, ) - def update_weights(self, scores: torch.Tensor) -> bool: - """ - Updates the weights based on the given scores and sets them on the chain. - - Args: - scores (torch.Tensor): The scores tensor used to calculate new weights. - """ - if not self.should_update_weights(): - current_block = self.subtensor.get_current_block() - blocks_until_update = WEIGHT_RATE_LIMIT - ( - current_block - self.last_update_weights_block - ) + def should_update_weights(self) -> tuple[bool, str]: + """Check if weights should be updated based on rate limiting.""" + blocks_since_last_update = self.subtensor.blocks_since_last_update( + self.metagraph.netuid, self.user_uid + ) + if blocks_since_last_update < WEIGHT_RATE_LIMIT: + blocks_until_update = WEIGHT_RATE_LIMIT - blocks_since_last_update minutes_until_update = round((blocks_until_update * 12) / 60, 1) - bt.logging.info( - f"Next weight update in {blocks_until_update} blocks (approximately {minutes_until_update:.1f} minutes)" + return ( + False, + f"Next weight update in {blocks_until_update} blocks " + f"(approximately {minutes_until_update:.1f} minutes)", ) - return False - - bt.logging.info("Updating weights") - - weights = torch.zeros(self.metagraph.n) - nonzero_indices = scores.nonzero() - bt.logging.debug( - f"Weights: {weights}, Nonzero indices: 
{nonzero_indices}, Scores: {scores}" - ) - if nonzero_indices.sum() > 0: - weights[nonzero_indices] = scores[nonzero_indices] + return True, "" + @timeout_with_multiprocess_retry(seconds=60, retries=3) + def _set_weights_with_retry(self, weights: torch.Tensor) -> bool: + """Internal method to set weights with retry logic.""" try: - success = self.set_weights( + success, message = self.set_weights( netuid=self.metagraph.netuid, wallet=self.wallet, uids=self.metagraph.uids.tolist(), weights=weights.tolist(), version_key=WEIGHTS_VERSION, ) + + if message: + bt.logging.info(f"Set weights message: {message}") + if success: + bt.logging.success("Weights were set successfully") log_weights(weights) self.last_update_weights_block = int(self.metagraph.block.item()) return True - bt.logging.error("Failed to set weights") + + blocks_since_last_update = self.subtensor.blocks_since_last_update( + self.metagraph.netuid, self.user_uid + ) + if blocks_since_last_update < WEIGHT_RATE_LIMIT: + bt.logging.success( + f"Blocks since last update is now {blocks_since_last_update}, " + f"which is less than {WEIGHT_RATE_LIMIT}. Weights were set." 
+ ) + self.last_update_weights_block = int(self.metagraph.block.item()) + return True + + bt.logging.warning("Failed to set weights") return False except Exception as e: - bt.logging.error(f"Failed to set weights on chain with exception: {e}") + bt.logging.warning(f"Failed to set weights on chain with exception: {e}") return False + + def update_weights(self, scores: torch.Tensor) -> bool: + """Updates the weights based on the given scores and sets them on the chain.""" + should_update, message = self.should_update_weights() + if not should_update: + bt.logging.info(message) + return True + + bt.logging.info("Updating weights") + weights = torch.zeros(self.metagraph.n) + nonzero_indices = scores.nonzero() + bt.logging.debug( + f"Weights: {weights}, Nonzero indices: {nonzero_indices}, Scores: {scores}" + ) + if nonzero_indices.sum() > 0: + weights[nonzero_indices] = scores[nonzero_indices] + + return self._set_weights_with_retry(weights) diff --git a/neurons/miner.py b/neurons/miner.py index 3157be3e..2220ec7f 100644 --- a/neurons/miner.py +++ b/neurons/miner.py @@ -97,6 +97,7 @@ def get_config_from_args(): ) bt.logging(config=config, logging_dir=config.full_path) + bt.logging.enable_info() if not os.path.exists(config.full_path): os.makedirs(config.full_path, exist_ok=True) diff --git a/neurons/utils/system.py b/neurons/utils/system.py index fd92b1ab..7ef305e3 100644 --- a/neurons/utils/system.py +++ b/neurons/utils/system.py @@ -31,36 +31,65 @@ def clean_temp_files(): shutil.rmtree(folder_path) -def timeout_with_multiprocess(seconds): +def timeout_with_multiprocess_retry(seconds, retries=3): + """Executes a function with timeout and automatic retries using multiprocessing. + + Args: + seconds (int): Maximum execution time in seconds before timeout + retries (int, optional): Number of retry attempts. Defaults to 3. 
+ + Returns: + Decorator that wraps function with timeout and retry logic + """ + def decorator(func): @functools.wraps(func) def wrapper(*args, **kwargs): - def target_func(result_dict, *args, **kwargs): - try: - result_dict["result"] = func(*args, **kwargs) - except Exception as e: - result_dict["exception"] = e - - manager = multiprocessing.Manager() - result_dict = manager.dict() - process = multiprocessing.Process( - target=target_func, args=(result_dict, *args), kwargs=kwargs - ) - process.start() - process.join(seconds) - - if process.is_alive(): - process.terminate() - process.join() - logging.warning( - f"Function '{func.__name__}' timed out after {seconds} seconds" + for attempt in range(retries): + logging.info(f"Attempt {attempt + 1} of {retries}") + + manager = multiprocessing.Manager() + result_dict = manager.dict() + process = multiprocessing.Process( + target=lambda d: d.update({"result": func(*args, **kwargs)}), + args=(result_dict,), ) - return None - if "exception" in result_dict: - raise result_dict["exception"] + try: + process.start() + process.join(seconds) + + if process.is_alive(): + process.terminate() + process.join() + logging.warning( + f"Function '{func.__name__}' timed out after {seconds} seconds" + ) + if attempt < retries - 1: + continue + return None + + result = result_dict.get("result") + if result: + return result + + if attempt < retries - 1: + continue + + error_msg = ( + "Another attempt will be made after the next request cycle." + if func.__name__ == "update_weights" + else f"Function returned {result}" + ) + logging.error(f"Failed after {retries} attempts. 
{error_msg}") + return None + + finally: + if process.is_alive(): + process.terminate() + manager.shutdown() - return result_dict.get("result", None) + return None return wrapper diff --git a/neurons/validator.py b/neurons/validator.py index f9d42cae..66f0decf 100644 --- a/neurons/validator.py +++ b/neurons/validator.py @@ -160,6 +160,7 @@ def get_config_from_args(): os.makedirs(config.full_path, exist_ok=True) bt.logging(config=config, logging_dir=config.full_path) + bt.logging.enable_info() if config.wandb_key: wandb_logger.safe_login(api_key=config.wandb_key) diff --git a/requirements.txt b/requirements.txt index ac164dfc..aa49c237 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,12 @@ attrs==24.2.0 -bittensor==6.9.4 +bittensor==8.5.1 black==23.7.0 ezkl==16.2.0 -fastapi==0.99.1 -numpy==1.26.4 +fastapi==0.110.1 +numpy==2.0.1 packaging==24.1 psutil==6.0.0 -pydantic==1.10.18 +pydantic==2.10.3 requests==2.32.3 rich==13.8.1 torch==2.4.1 From ff06addf08583cdfc4fade7683355fbac101ddad Mon Sep 17 00:00:00 2001 From: HudsonGraeme Date: Wed, 18 Dec 2024 16:37:40 +0000 Subject: [PATCH 04/20] Bump version to 6.2.1 --- neurons/__init__.py | 2 +- neurons/constants.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/neurons/__init__.py b/neurons/__init__.py index 8620cca3..98743715 100644 --- a/neurons/__init__.py +++ b/neurons/__init__.py @@ -2,4 +2,4 @@ This version number is used to trigger automatic updates. 
""" -__version__ = "6.0.0" +__version__ = "6.2.1" diff --git a/neurons/constants.py b/neurons/constants.py index 43834a26..f64ff27d 100644 --- a/neurons/constants.py +++ b/neurons/constants.py @@ -34,7 +34,7 @@ # Shift in seconds to apply to the minimum response time for vertical asymptote adjustment MINIMUM_SCORE_SHIFT = 0.0 # Weights version hyperparameter -WEIGHTS_VERSION = 1600 +WEIGHTS_VERSION = 1621 # Rate limit for weight updates WEIGHT_RATE_LIMIT: int = 100 # Delay between requests From 29ae18c009b282c207c1750cde0de173a86c0539 Mon Sep 17 00:00:00 2001 From: HudsonGraeme Date: Wed, 18 Dec 2024 21:05:58 +0000 Subject: [PATCH 05/20] Remove JOLT compile step in Dockerfile --- Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 5cd2ef10..ea7bcc9a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -27,8 +27,8 @@ RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y && \ ENV PATH="/root/.cargo/bin:${PATH}" # Install Jolt -ENV JOLT_VERSION=dd9e5c4bcf36ffeb75a576351807f8d86c33ec66 -RUN cargo +${RUST_TOOLCHAIN} install --git https://github.com/a16z/jolt --rev ${JOLT_VERSION} --force --bins jolt +#ENV JOLT_VERSION=dd9e5c4bcf36ffeb75a576351807f8d86c33ec66 +#RUN cargo +${RUST_TOOLCHAIN} install --git https://github.com/a16z/jolt --rev ${JOLT_VERSION} --force --bins jolt # Install node et al. 
RUN curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.40.0/install.sh | bash && \ From ac4412c626862c95954b04a5adefe0d1fc00c900 Mon Sep 17 00:00:00 2001 From: HudsonGraeme Date: Wed, 18 Dec 2024 21:20:02 +0000 Subject: [PATCH 06/20] Skip JOLT pre-flight checks entirely --- neurons/utils/pre_flight.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/neurons/utils/pre_flight.py b/neurons/utils/pre_flight.py index 984c2d2e..e8db2d39 100644 --- a/neurons/utils/pre_flight.py +++ b/neurons/utils/pre_flight.py @@ -49,10 +49,6 @@ def run_shared_preflight_checks(external_model_dir: str): ("Syncing model files", sync_model_files), ("Ensuring Node.js version", ensure_nodejs_version), ("Checking SnarkJS installation", ensure_snarkjs_installed), - ("Checking Rust and Cargo installation", ensure_rust_cargo_installed), - ("Checking Rust nightly toolchain", ensure_rust_nightly_installed), - ("Checking Jolt installation", ensure_jolt_installed), - ("Compiling Jolt circuits", compile_jolt_circuits), ("Checking EZKL installation", ensure_ezkl_installed), ] From 9561e2ec1cc46b599fc01c968f1f7e70cf22fa85 Mon Sep 17 00:00:00 2001 From: HudsonGraeme Date: Wed, 18 Dec 2024 21:30:20 +0000 Subject: [PATCH 07/20] Bump version to 6.2.2 --- neurons/__init__.py | 2 +- neurons/constants.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/neurons/__init__.py b/neurons/__init__.py index 98743715..48d22338 100644 --- a/neurons/__init__.py +++ b/neurons/__init__.py @@ -2,4 +2,4 @@ This version number is used to trigger automatic updates. 
""" -__version__ = "6.2.1" +__version__ = "6.2.2" diff --git a/neurons/constants.py b/neurons/constants.py index f64ff27d..04c45ffc 100644 --- a/neurons/constants.py +++ b/neurons/constants.py @@ -34,7 +34,7 @@ # Shift in seconds to apply to the minimum response time for vertical asymptote adjustment MINIMUM_SCORE_SHIFT = 0.0 # Weights version hyperparameter -WEIGHTS_VERSION = 1621 +WEIGHTS_VERSION = 1622 # Rate limit for weight updates WEIGHT_RATE_LIMIT: int = 100 # Delay between requests From ac93666db14ae973b3fb63552c3e8268509027a1 Mon Sep 17 00:00:00 2001 From: HudsonGraeme Date: Thu, 19 Dec 2024 21:20:26 +0000 Subject: [PATCH 08/20] Bump version to 6.2.3 --- neurons/__init__.py | 2 +- neurons/_validator/scoring/weights.py | 54 +++++++++------------------ neurons/constants.py | 2 +- 3 files changed, 20 insertions(+), 38 deletions(-) diff --git a/neurons/__init__.py b/neurons/__init__.py index 48d22338..3ec7c55c 100644 --- a/neurons/__init__.py +++ b/neurons/__init__.py @@ -2,4 +2,4 @@ This version number is used to trigger automatic updates. 
""" -__version__ = "6.2.2" +__version__ = "6.2.3" diff --git a/neurons/_validator/scoring/weights.py b/neurons/_validator/scoring/weights.py index 7fcdc4a0..c895c1f0 100644 --- a/neurons/_validator/scoring/weights.py +++ b/neurons/_validator/scoring/weights.py @@ -5,7 +5,6 @@ from constants import WEIGHT_RATE_LIMIT, WEIGHTS_VERSION from _validator.utils.logging import log_weights from _validator.utils.proof_of_weights import ProofOfWeightsItem -from utils.system import timeout_with_multiprocess_retry @dataclass @@ -55,9 +54,22 @@ def should_update_weights(self) -> tuple[bool, str]: ) return True, "" - @timeout_with_multiprocess_retry(seconds=60, retries=3) - def _set_weights_with_retry(self, weights: torch.Tensor) -> bool: - """Internal method to set weights with retry logic.""" + def update_weights(self, scores: torch.Tensor) -> bool: + """Updates the weights based on the given scores and sets them on the chain.""" + should_update, message = self.should_update_weights() + if not should_update: + bt.logging.info(message) + return True + + bt.logging.info("Updating weights") + weights = torch.zeros(self.metagraph.n) + nonzero_indices = scores.nonzero() + bt.logging.debug( + f"Weights: {weights}, Nonzero indices: {nonzero_indices}, Scores: {scores}" + ) + if nonzero_indices.sum() > 0: + weights[nonzero_indices] = scores[nonzero_indices] + try: success, message = self.set_weights( netuid=self.metagraph.netuid, @@ -75,38 +87,8 @@ def _set_weights_with_retry(self, weights: torch.Tensor) -> bool: log_weights(weights) self.last_update_weights_block = int(self.metagraph.block.item()) return True - - blocks_since_last_update = self.subtensor.blocks_since_last_update( - self.metagraph.netuid, self.user_uid - ) - if blocks_since_last_update < WEIGHT_RATE_LIMIT: - bt.logging.success( - f"Blocks since last update is now {blocks_since_last_update}, " - f"which is less than {WEIGHT_RATE_LIMIT}. Weights were set." 
- ) - self.last_update_weights_block = int(self.metagraph.block.item()) - return True - - bt.logging.warning("Failed to set weights") return False + except Exception as e: - bt.logging.warning(f"Failed to set weights on chain with exception: {e}") + bt.logging.error(f"Failed to set weights on chain with exception: {e}") return False - - def update_weights(self, scores: torch.Tensor) -> bool: - """Updates the weights based on the given scores and sets them on the chain.""" - should_update, message = self.should_update_weights() - if not should_update: - bt.logging.info(message) - return True - - bt.logging.info("Updating weights") - weights = torch.zeros(self.metagraph.n) - nonzero_indices = scores.nonzero() - bt.logging.debug( - f"Weights: {weights}, Nonzero indices: {nonzero_indices}, Scores: {scores}" - ) - if nonzero_indices.sum() > 0: - weights[nonzero_indices] = scores[nonzero_indices] - - return self._set_weights_with_retry(weights) diff --git a/neurons/constants.py b/neurons/constants.py index 04c45ffc..69b91add 100644 --- a/neurons/constants.py +++ b/neurons/constants.py @@ -34,7 +34,7 @@ # Shift in seconds to apply to the minimum response time for vertical asymptote adjustment MINIMUM_SCORE_SHIFT = 0.0 # Weights version hyperparameter -WEIGHTS_VERSION = 1622 +WEIGHTS_VERSION = 1623 # Rate limit for weight updates WEIGHT_RATE_LIMIT: int = 100 # Delay between requests From ca52626cd0b35aacf99093846658d2693b4b8a64 Mon Sep 17 00:00:00 2001 From: HudsonGraeme Date: Fri, 20 Dec 2024 16:18:03 +0000 Subject: [PATCH 09/20] Reduce range of inputs to LSTM --- .../input.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neurons/deployment_layer/model_8dcff627a782525ea86196941a694ffbead179905f0cd4550ddc3df9e2b90924/input.py b/neurons/deployment_layer/model_8dcff627a782525ea86196941a694ffbead179905f0cd4550ddc3df9e2b90924/input.py index 5c5f2a68..445bbf32 100644 --- 
a/neurons/deployment_layer/model_8dcff627a782525ea86196941a694ffbead179905f0cd4550ddc3df9e2b90924/input.py +++ b/neurons/deployment_layer/model_8dcff627a782525ea86196941a694ffbead179905f0cd4550ddc3df9e2b90924/input.py @@ -24,7 +24,7 @@ def __init__( @staticmethod def generate() -> dict[str, object]: return { - "list_items": [random.random() for _ in range(LIST_SIZE)], + "list_items": [min(0.85, random.random()) for _ in range(LIST_SIZE)], } @staticmethod From e17f5341211f4c13ad2714f386e774717367444b Mon Sep 17 00:00:00 2001 From: Hudson Graeme Date: Fri, 20 Dec 2024 16:18:27 +0000 Subject: [PATCH 10/20] Add code of conduct (#37) --- CODE_OF_CONDUCT.md | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 CODE_OF_CONDUCT.md diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 00000000..2e24e99f --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,30 @@ +# Code of Conduct + +For Omron miners, validators, and operators of the subnet. + +## General Principles + +- All parties shall conduct activities with transparency and accountability. +- All parties shall not engage in any activity that could be construed as a conflict of interest. +- All parties shall report any potential issues or exploits to the owners and team members through the official bug bounty program. + +> The official bug bounty program can be found at https://immunefi.com/bug-bounty/omron/ + +## Subnet Operators + +- Shall conduct all development within the public `omron-subnet` repository. +- Shall not disclose any sensitive information about future updates to third parties. +- Shall make every reasonable effort to ensure that the subnet is secure and stable at all times. +- Shall provide support to the community as needed. +- Shall not give hints, tips, or tricks to any miners nor withhold information which may advantage any party. 
+- Shall put all pending or scheduled code updates in the official publicly available code repository before and leading up to their release. +- Shall provide reasonable timing between the release of pending updates and their official release, except for critical security updates. + +## Miners + +- Shall not collude with other miners or validators to manipulate the subnet. +- Shall not participate in activities that are directly adversarial to other miners or validators, such as DDoS or malicious requests. + +## Validators + +- Shall update their subnet codebases per the instructions provided by the owners and team members in a timely manner. From 9cb34d999cc9033b7c3db68858fde0722eddf9e5 Mon Sep 17 00:00:00 2001 From: HudsonGraeme Date: Fri, 20 Dec 2024 16:23:09 +0000 Subject: [PATCH 11/20] Bump version to 6.2.4 --- neurons/__init__.py | 2 +- neurons/constants.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/neurons/__init__.py b/neurons/__init__.py index 3ec7c55c..a668dbb5 100644 --- a/neurons/__init__.py +++ b/neurons/__init__.py @@ -2,4 +2,4 @@ This version number is used to trigger automatic updates. 
""" -__version__ = "6.2.3" +__version__ = "6.2.4" diff --git a/neurons/constants.py b/neurons/constants.py index 69b91add..feb2df02 100644 --- a/neurons/constants.py +++ b/neurons/constants.py @@ -34,7 +34,7 @@ # Shift in seconds to apply to the minimum response time for vertical asymptote adjustment MINIMUM_SCORE_SHIFT = 0.0 # Weights version hyperparameter -WEIGHTS_VERSION = 1623 +WEIGHTS_VERSION = 1624 # Rate limit for weight updates WEIGHT_RATE_LIMIT: int = 100 # Delay between requests From 92f09a29d97244a733fb03769cf35e720067e245 Mon Sep 17 00:00:00 2001 From: Bondarenko Oleg Date: Fri, 20 Dec 2024 20:13:04 +0200 Subject: [PATCH 12/20] Prometheus integration (#33) Co-authored-by: HudsonGraeme --- Dockerfile | 2 + README.md | 2 + cspell.json | 4 + neurons/_validator/core/prometheus.py | 83 +++++++++++++++++++ neurons/_validator/core/response_processor.py | 23 +++-- neurons/_validator/core/validator_loop.py | 14 +++- neurons/_validator/utils/logging.py | 7 +- neurons/validator.py | 14 ++++ requirements.txt | 1 + 9 files changed, 139 insertions(+), 11 deletions(-) create mode 100644 neurons/_validator/core/prometheus.py diff --git a/Dockerfile b/Dockerfile index ea7bcc9a..6460f1d9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -73,3 +73,5 @@ CMD ["-c", "import subprocess; \ ] EXPOSE 4091/tcp EXPOSE 8000/tcp +# for prometheus monitoring server: +EXPOSE 9090/tcp diff --git a/README.md b/README.md index 9b2bdeec..1952a40f 100644 --- a/README.md +++ b/README.md @@ -131,6 +131,7 @@ services: restart: unless-stopped ports: - 8000:8000 + - 9090:9090 # In case you use prometheus monitoring volumes: # Update this path to your .bittensor directory - {path_to_your_.bittensor_directory}:/root/.bittensor labels: @@ -152,6 +153,7 @@ services: docker run -d \ --name omron-validator \ -p 8000:8000 \ + -p 9090:9090 \ -v {path_to_your_.bittensor_directory}:/root/.bittensor \ --restart unless-stopped \ ghcr.io/inference-labs-inc/omron:latest \ diff --git a/cspell.json b/cspell.json 
index 775071b6..23b93398 100644 --- a/cspell.json +++ b/cspell.json @@ -15,6 +15,7 @@ "Gbps", "incentivizes", "Keypair", + "libudev", "localnet", "Mbps", "metagraph", @@ -23,11 +24,14 @@ "onnxruntime", "Opentensor", "pydantic", + "rustup", "setdefaulttimeout", + "snarkjs", "starlette", "substrateinterface", "subtensor", "timespan", + "tlsv", "uids", "uvicorn", "venv", diff --git a/neurons/_validator/core/prometheus.py b/neurons/_validator/core/prometheus.py new file mode 100644 index 00000000..c5b22cf2 --- /dev/null +++ b/neurons/_validator/core/prometheus.py @@ -0,0 +1,83 @@ +import statistics +import threading +from typing import Optional +from wsgiref.simple_server import WSGIServer + +from prometheus_client import Summary, start_http_server + + +_server: Optional[WSGIServer] = None +_thread: Optional[threading.Thread] = None + +_validation_times: Optional[Summary] = None +_response_times: Optional[Summary] = None +_proof_sizes: Optional[Summary] = None +_verification_ratio: Optional[Summary] = None + + +def start_prometheus_logging(port: int) -> None: + global _server, _thread + _server, _thread = start_http_server(port) + + global _validation_times, _response_times, _proof_sizes, _verification_ratio + _validation_times = Summary("validating_seconds", "Time spent validating responses") + _response_times = Summary( + "requests_seconds", + "Time spent processing requests", + ["aggregation_type", "model"], + ) + _proof_sizes = Summary( + "proof_sizes", "Size of proofs", ["aggregation_type", "model"] + ) + _verification_ratio = Summary( + "verified_proofs_ratio", "Verified proofs ratio", ["model"] + ) + + +def stop_prometheus_logging() -> None: + global _server, _thread + global _validation_times, _response_times, _proof_sizes, _verification_ratio + if _server: + _server.shutdown() + _server = None + _thread = None + _validation_times = None + _response_times = None + _proof_sizes = None + _verification_ratio = None + + +def log_validation_time(time: float) -> 
None: + global _validation_times + if _validation_times: + _validation_times.observe(time) + + +def log_response_times(response_times: list[float], model_name: str) -> None: + global _response_times + if _response_times and response_times: + _response_times.labels("max", model_name).observe(max(response_times)) + _response_times.labels("min", model_name).observe(min(response_times)) + _response_times.labels("mean", model_name).observe( + statistics.mean(response_times) + ) + _response_times.labels("median", model_name).observe( + statistics.median(response_times) + ) + + +def log_proof_sizes(proof_sizes: list[int], model_name: str) -> None: + global _proof_sizes + if _proof_sizes and proof_sizes: + _proof_sizes.labels("max", model_name).observe(max(proof_sizes)) + _proof_sizes.labels("min", model_name).observe(min(proof_sizes)) + _proof_sizes.labels("mean", model_name).observe(statistics.mean(proof_sizes)) + _proof_sizes.labels("median", model_name).observe( + statistics.median(proof_sizes) + ) + + +def log_verification_ratio(value: float, model_name: str) -> None: + global _verification_ratio + if _verification_ratio: + _verification_ratio.labels(model_name).observe(value) diff --git a/neurons/_validator/core/response_processor.py b/neurons/_validator/core/response_processor.py index e9ee4225..687c7788 100644 --- a/neurons/_validator/core/response_processor.py +++ b/neurons/_validator/core/response_processor.py @@ -4,18 +4,18 @@ import traceback from bittensor import logging -from execution_layer.verified_model_session import VerifiedModelSession +from _validator.core import prometheus +from _validator.core.request import Request from _validator.models.completed_proof_of_weights import CompletedProofOfWeightsItem from _validator.models.miner_response import MinerResponse +from _validator.models.request_type import RequestType from _validator.scoring.score_manager import ScoreManager -from _validator.core.request import Request from _validator.utils.logging import 
log_responses, log_system_metrics from _validator.utils.proof_of_weights import save_proof_of_weights -from _validator.models.request_type import RequestType from constants import BATCHED_PROOF_OF_WEIGHTS_MODEL_ID from execution_layer.generic_input import GenericInput - +from execution_layer.verified_model_session import VerifiedModelSession from utils import wandb_logger @@ -31,17 +31,28 @@ def process_responses(self, responses: list[Request]) -> list[MinerResponse]: if len(responses) == 0: logging.error("No responses received") return [] + processed_responses = [self.process_single_response(r) for r in responses] log_responses(processed_responses) + response_times = [ r.response_time for r in processed_responses if r.response_time is not None and r.verification_result ] verified_count = sum(1 for r in processed_responses if r.verification_result) - log_system_metrics( - response_times, verified_count, processed_responses[0].circuit + circuit = processed_responses[0].circuit + log_system_metrics(response_times, verified_count, circuit) + + # Log response times, proof sizes and verification ratio to Prometheus + prometheus.log_verification_ratio( + verified_count / len(response_times) if response_times else 0, circuit + ) + prometheus.log_proof_sizes( + [r.proof_size for r in processed_responses if r.proof_size is not None], + circuit, ) + prometheus.log_response_times(response_times, circuit) if not processed_responses[0].circuit.id == BATCHED_PROOF_OF_WEIGHTS_MODEL_ID: return processed_responses diff --git a/neurons/_validator/core/validator_loop.py b/neurons/_validator/core/validator_loop.py index d15b7307..4eacfbf8 100644 --- a/neurons/_validator/core/validator_loop.py +++ b/neurons/_validator/core/validator_loop.py @@ -11,6 +11,12 @@ from _validator.config import ValidatorConfig from _validator.core.api import ValidatorAPI +from _validator.core.prometheus import ( + log_validation_time, + start_prometheus_logging, + stop_prometheus_logging, +) +from 
_validator.core.request import Request from _validator.core.request_pipeline import RequestPipeline from _validator.core.response_processor import ResponseProcessor from _validator.models.miner_response import MinerResponse @@ -20,11 +26,10 @@ from _validator.utils.axon import query_axons from _validator.utils.proof_of_weights import save_proof_of_weights from _validator.utils.uid import get_queryable_uids -from _validator.core.request import Request -from execution_layer.circuit import Circuit, CircuitType from constants import ( REQUEST_DELAY_SECONDS, ) +from execution_layer.circuit import Circuit, CircuitType from utils import AutoUpdate, clean_temp_files, wandb_logger from utils.gc_logging import log_responses as log_responses_gc @@ -66,6 +71,9 @@ def __init__(self, config: ValidatorConfig): self.config, self.score_manager, self.api ) + if self.config.bt_config.prometheus_monitoring: + start_prometheus_logging(self.config.bt_config.prometheus_port) + def run(self) -> NoReturn: """ Run the main validator loop indefinitely. @@ -190,11 +198,13 @@ def _log_overhead_time(self, start_time) -> float: "overhead_time": overhead_time, } ) + log_validation_time(overhead_time) return overhead_time def _handle_keyboard_interrupt(self): """Handle keyboard interrupt by cleaning up and exiting.""" bt.logging.success("Keyboard interrupt detected. 
Exiting validator.") self.api.stop() + stop_prometheus_logging() clean_temp_files() sys.exit(0) diff --git a/neurons/_validator/utils/logging.py b/neurons/_validator/utils/logging.py index b9eaf5af..84499631 100644 --- a/neurons/_validator/utils/logging.py +++ b/neurons/_validator/utils/logging.py @@ -1,8 +1,10 @@ from __future__ import annotations + +import torch from rich.console import Console, JustifyMethod from rich.table import Table + import utils.wandb_logger as wandb_logger -import torch from _validator.models.miner_response import MinerResponse @@ -138,8 +140,7 @@ def log_responses(responses: list[MinerResponse]): wandb_logger.safe_log(wandb_log) -def log_system_metrics(response_times, verified_count, circuit): - +def log_system_metrics(response_times: list[float], verified_count: int, circuit: str): if response_times: max_response_time = max(response_times) min_response_time = min(response_times) diff --git a/neurons/validator.py b/neurons/validator.py index 66f0decf..82b05076 100644 --- a/neurons/validator.py +++ b/neurons/validator.py @@ -116,6 +116,20 @@ def get_config_from_args(): help="Whether to run the validator in localnet mode.", ) + parser.add_argument( + "--prometheus-monitoring", + action="store_true", + default=False, + help="Whether to enable prometheus monitoring.", + ) + + parser.add_argument( + "--prometheus-port", + type=int, + default=9090, + help="The port for the prometheus monitoring.", + ) + bt.subtensor.add_args(parser) bt.logging.add_args(parser) bt.wallet.add_args(parser) diff --git a/requirements.txt b/requirements.txt index aa49c237..d04143a5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,3 +12,4 @@ rich==13.8.1 torch==2.4.1 uvicorn==0.22.0 wandb==0.18.1 +prometheus_client==0.21.1 From 4089da9e189a4747c3e7733202e3aad915853233 Mon Sep 17 00:00:00 2001 From: Hudson Graeme Date: Tue, 7 Jan 2025 07:02:05 +0000 Subject: [PATCH 13/20] Upgrade validator API (#34) - Implement SSL certificate management and pinning - 
Implement WebSockets for external communication - Refactor real-world-requests and request management throughout --- Dockerfile | 8 +- README.md | 4 +- cspell.json | 1 + docs/command_line_arguments.md | 2 +- makefile | 6 +- neurons/_validator/api/__init__.py | 266 ++++++++++++++++++ neurons/_validator/api/cache.py | 47 ++++ neurons/_validator/api/certificate_manager.py | 38 +++ neurons/_validator/api/websocket_manager.py | 19 ++ neurons/_validator/config/__init__.py | 21 +- neurons/_validator/config/api.py | 24 ++ neurons/_validator/core/request.py | 5 + neurons/_validator/core/request_pipeline.py | 115 +++++--- neurons/_validator/core/validator_loop.py | 21 +- neurons/_validator/models/api.py | 16 -- neurons/_validator/models/base_rpc_request.py | 14 + neurons/_validator/models/poc_rpc_request.py | 29 ++ neurons/_validator/models/pow_rpc_request.py | 46 +++ .../pow/proof_of_weights_handler.py | 5 + neurons/_validator/utils/api.py | 6 +- neurons/constants.py | 56 ++++ neurons/deployment_layer/circuit_store.py | 40 +++ .../input.py | 2 +- .../metadata.json | 1 + .../input.py | 8 +- .../metadata.json | 1 + .../metadata.json | 1 + neurons/execution_layer/circuit.py | 3 +- neurons/validator.py | 33 ++- requirements.txt | 3 + 30 files changed, 737 insertions(+), 104 deletions(-) create mode 100644 neurons/_validator/api/__init__.py create mode 100644 neurons/_validator/api/cache.py create mode 100644 neurons/_validator/api/certificate_manager.py create mode 100644 neurons/_validator/api/websocket_manager.py create mode 100644 neurons/_validator/config/api.py delete mode 100644 neurons/_validator/models/api.py create mode 100644 neurons/_validator/models/base_rpc_request.py create mode 100644 neurons/_validator/models/poc_rpc_request.py create mode 100644 neurons/_validator/models/pow_rpc_request.py diff --git a/Dockerfile b/Dockerfile index 6460f1d9..67a244eb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -71,7 +71,9 @@ CMD ["-c", "import subprocess; \ 
subprocess.run(['/opt/venv/bin/python3', '/opt/omron/neurons/miner.py', '--help']); \ subprocess.run(['/opt/venv/bin/python3', '/opt/omron/neurons/validator.py', '--help']);" \ ] -EXPOSE 4091/tcp -EXPOSE 8000/tcp -# for prometheus monitoring server: +# Axon server +EXPOSE 8091/tcp +# API server +EXPOSE 8443/tcp +# Prometheus server EXPOSE 9090/tcp diff --git a/README.md b/README.md index 1952a40f..64d21a27 100644 --- a/README.md +++ b/README.md @@ -130,7 +130,7 @@ services: image: ghcr.io/inference-labs-inc/omron:latest restart: unless-stopped ports: - - 8000:8000 + - 8443:8443 - 9090:9090 # In case you use prometheus monitoring volumes: # Update this path to your .bittensor directory - {path_to_your_.bittensor_directory}:/root/.bittensor @@ -152,7 +152,7 @@ services: ```console docker run -d \ --name omron-validator \ - -p 8000:8000 \ + -p 8443:8443 \ -p 9090:9090 \ -v {path_to_your_.bittensor_directory}:/root/.bittensor \ --restart unless-stopped \ diff --git a/cspell.json b/cspell.json index 23b93398..75ae0a14 100644 --- a/cspell.json +++ b/cspell.json @@ -4,6 +4,7 @@ "dictionaryDefinitions": [], "dictionaries": [], "words": [ + "alist", "bittensor", "blocktime", "btcli", diff --git a/docs/command_line_arguments.md b/docs/command_line_arguments.md index de884564..43c18ad4 100644 --- a/docs/command_line_arguments.md +++ b/docs/command_line_arguments.md @@ -33,7 +33,7 @@ The below arguments are specific to validator software and have no effect on min | `--enable-pow` | No | `False` | `True`, `False` | Whether on-chain proof of weights is enabled | | `--pow-target-interval` | No | `1000` | Integer | The target block interval for committing proof of weights to the chain | | `--ignore-external-requests` | No | `True` | `True`, `False` | Whether the validator should ignore external requests through it's API. | -| `--external-api-port` | No | `8000` | Integer | The port for the validator's external API. 
| +| `--external-api-port` | No | `8443` | Integer | The port for the validator's external API. | | `--external-api-workers` | No | `1` | Integer | The number of workers for the validator's external API. | | `--external-api-host` | No | `0.0.0.0` | String | The host for the validator's external API. | | `--do-not-verify-external-signatures` | No | `False` | `True`, `False` | External PoW requests are signed by validator's (sender's) wallet. By default, these are checked to ensure legitimacy. This should only be disabled in controlled development environments. | diff --git a/makefile b/makefile index 578f5b50..4f86fdfc 100644 --- a/makefile +++ b/makefile @@ -3,7 +3,7 @@ WALLET_NAME ?= default WALLET_HOTKEY ?= default WALLET_PATH ?= $(HOME)/.bittensor MINER_PORT ?= 8091 -VALIDATOR_PORT ?= 8000 +VALIDATOR_PORT ?= 8443 build: docker build -t omron -f Dockerfile . @@ -47,7 +47,7 @@ validator: docker run \ --detach \ --name omron-validator \ - -p $(VALIDATOR_PORT):8000 \ + -p $(VALIDATOR_PORT):8443 \ -v $(WALLET_PATH):/root/.bittensor \ omron validator.py \ --wallet.name $(WALLET_NAME) \ @@ -77,7 +77,7 @@ test-validator: docker run \ --detach \ --name omron-validator \ - -p $(VALIDATOR_PORT):8000 \ + -p $(VALIDATOR_PORT):8443 \ -v $(WALLET_PATH):/root/.bittensor \ omron validator.py \ --wallet.name $(WALLET_NAME) \ diff --git a/neurons/_validator/api/__init__.py b/neurons/_validator/api/__init__.py new file mode 100644 index 00000000..eec74307 --- /dev/null +++ b/neurons/_validator/api/__init__.py @@ -0,0 +1,266 @@ +from __future__ import annotations +import os +import traceback +from fastapi import ( + FastAPI, + WebSocket, + WebSocketDisconnect, + WebSocketException, +) +from jsonrpcserver import ( + method, + async_dispatch, + Success, + Error, + InvalidParams, +) + +import bittensor as bt +from _validator.models.poc_rpc_request import ProofOfComputationRPCRequest +from _validator.models.pow_rpc_request import ProofOfWeightsRPCRequest +import hashlib +from constants 
import MAX_SIGNATURE_LIFESPAN, MAINNET_TESTNET_UIDS +from _validator.config import ValidatorConfig +import base64 +import substrateinterface +import time +from _validator.api.cache import ValidatorKeysCache +import threading +import uvicorn +from _validator.api.certificate_manager import CertificateManager +from _validator.api.websocket_manager import WebSocketManager +import asyncio +from OpenSSL import crypto + + +class ValidatorAPI: + def __init__(self, config: ValidatorConfig): + self.config = config + self.app = FastAPI() + self.external_requests_queue: list[ + ProofOfWeightsRPCRequest | ProofOfComputationRPCRequest + ] = [] + self.ws_manager = WebSocketManager() + self.validator_keys_cache = ValidatorKeysCache(config) + self.server_thread: threading.Thread | None = None + self.pending_requests: dict[str, asyncio.Event] = {} + self.request_results: dict[str, dict[str, any]] = {} + self.is_testnet = config.bt_config.subtensor.network == "test" + self._setup_api() + + def _setup_api(self) -> None: + if not self.config.api.enabled: + bt.logging.info("API Disabled: --ignore-external-requests flag present") + return + + bt.logging.debug("Starting WebSocket API server...") + if self.config.api.certificate_path: + cert_manager = CertificateManager(self.config.api.certificate_path) + cert_manager.ensure_valid_certificate( + bt.axon(self.config.wallet).external_ip + ) + self.commit_cert_hash() + + self.setup_rpc_methods() + self.start_server() + bt.logging.success("WebSocket API server started") + + def setup_rpc_methods(self) -> None: + @self.app.websocket("/rpc") + async def websocket_endpoint(websocket: WebSocket): + if ( + self.config.api.verify_external_signatures + and not await self.validate_connection(websocket.headers) + ): + raise WebSocketException( + code=3000, reason="Connection validation failed" + ) + + try: + await self.ws_manager.connect(websocket) + async for data in websocket.iter_text(): + response = await async_dispatch(data, context=websocket) + 
await websocket.send_text(str(response)) + except WebSocketDisconnect: + bt.logging.debug("Client disconnected normally") + except Exception as e: + bt.logging.error(f"WebSocket error: {str(e)}") + finally: + await self.ws_manager.disconnect(websocket) + + @method(name="omron.proof_of_weights") + async def omron_proof_of_weights( + websocket: WebSocket, **params: dict[str, object] + ) -> dict[str, object]: + evaluation_data = params.get("evaluation_data") + weights_version = params.get("weights_version") + + if not evaluation_data: + return InvalidParams("Missing evaluation data") + + try: + netuid = websocket.headers.get("x-netuid") + if netuid is None: + return InvalidParams("Missing x-netuid header") + + if self.is_testnet: + testnet_uids = [ + uid[0] for uid in MAINNET_TESTNET_UIDS if uid[1] == int(netuid) + ] + if not testnet_uids: + return InvalidParams( + f"No testnet UID mapping found for mainnet UID {netuid}" + ) + netuid = testnet_uids[0] + + netuid = int(netuid) + try: + external_request = ProofOfWeightsRPCRequest( + evaluation_data=evaluation_data, + netuid=netuid, + weights_version=weights_version, + ) + except ValueError as e: + return InvalidParams(str(e)) + + self.pending_requests[external_request.hash] = asyncio.Event() + self.external_requests_queue.insert(0, external_request) + bt.logging.success( + f"External request with hash {external_request.hash} added to queue" + ) + try: + await asyncio.wait_for( + self.pending_requests[external_request.hash].wait(), + timeout=900, + ) + result = self.request_results.pop(external_request.hash, None) + + if result: + bt.logging.success( + f"External request with hash {external_request.hash} processed successfully" + ) + return Success(result) + bt.logging.error( + f"External request with hash {external_request.hash} failed to process" + ) + return Error(9, "Request processing failed") + except asyncio.TimeoutError: + bt.logging.error( + f"External request with hash {external_request.hash} timed out" + ) + 
return Error(9, "Request processing failed", "Request timed out") + finally: + self.pending_requests.pop(external_request.hash, None) + + except Exception as e: + bt.logging.error(f"Error processing request: {str(e)}") + traceback.print_exc() + return Error(9, "Request processing failed", str(e)) + + def start_server(self): + """Start the uvicorn server in a separate thread""" + self.server_thread = threading.Thread( + target=uvicorn.run, + args=(self.app,), + kwargs={ + "host": "0.0.0.0", + "port": self.config.api.port, + "ssl_keyfile": os.path.join( + self.config.api.certificate_path, "key.pem" + ), + "ssl_certfile": os.path.join( + self.config.api.certificate_path, "cert.pem" + ), + }, + daemon=True, + ) + self.server_thread.start() + try: + bt.logging.info(f"Serving axon on port {self.config.api.port}") + axon = bt.axon( + wallet=self.config.wallet, external_port=self.config.api.port + ) + axon.serve(self.config.bt_config.netuid, self.config.subtensor) + bt.logging.success("Axon served") + except Exception as e: + bt.logging.error(f"Error serving axon: {e}") + + async def stop(self): + """Gracefully shutdown the WebSocket server""" + for connection in self.ws_manager.active_connections: + await connection.close() + self.ws_manager.active_connections.clear() + + async def validate_connection(self, headers) -> bool: + """Validate WebSocket connection request headers""" + required_headers = ["x-timestamp", "x-origin-ss58", "x-signature", "x-netuid"] + + if not all(header in headers for header in required_headers): + return False + + try: + timestamp = int(headers["x-timestamp"]) + current_time = time.time() + if current_time - timestamp > MAX_SIGNATURE_LIFESPAN: + return False + + ss58_address = headers["x-origin-ss58"] + signature = base64.b64decode(headers["x-signature"]) + netuid = int(headers["x-netuid"]) + + public_key = substrateinterface.Keypair(ss58_address=ss58_address) + if not public_key.verify(str(timestamp).encode(), signature): + return False + + 
return await self.validator_keys_cache.check_validator_key( + ss58_address, netuid + ) + + except Exception as e: + bt.logging.error(f"Validation error: {str(e)}") + traceback.print_exc() + return False + + def commit_cert_hash(self): + """Commit the cert hash to the chain. Clients will use this for certificate pinning.""" + + existing_commitment = None + try: + existing_commitment = self.config.subtensor.get_commitment( + self.config.subnet_uid, self.config.user_uid + ) + except Exception: + bt.logging.warning( + "Error getting existing commitment. Assuming no commitment exists." + ) + traceback.print_exc() + + if not self.config.api.certificate_path: + return + + cert_path = os.path.join(self.config.api.certificate_path, "cert.pem") + if not os.path.exists(cert_path): + return + + with open(cert_path, "rb") as f: + cert_data = f.read() + cert = crypto.load_certificate(crypto.FILETYPE_PEM, cert_data) + cert_der = crypto.dump_certificate(crypto.FILETYPE_ASN1, cert) + cert_hash = hashlib.sha256(cert_der).hexdigest() + if cert_hash != existing_commitment: + try: + self.config.subtensor.commit( + self.config.wallet, self.config.subnet_uid, cert_hash + ) + bt.logging.success("Certificate hash committed to chain.") + except Exception as e: + bt.logging.error(f"Error committing certificate hash: {str(e)}") + traceback.print_exc() + else: + bt.logging.info("Certificate hash already committed to chain.") + + def set_request_result(self, request_hash: str, result: dict[str, any]): + """Set the result for a pending request and signal its completion.""" + if request_hash in self.pending_requests: + self.request_results[request_hash] = result + self.pending_requests[request_hash].set() diff --git a/neurons/_validator/api/cache.py b/neurons/_validator/api/cache.py new file mode 100644 index 00000000..7cefb003 --- /dev/null +++ b/neurons/_validator/api/cache.py @@ -0,0 +1,47 @@ +import datetime +import asyncio +from _validator.config import ValidatorConfig +import bittensor as 
bt + + +class ValidatorKeysCache: + """ + A thread-safe cache for validator keys to reduce the number of requests to the metagraph. + """ + + def __init__(self, config: ValidatorConfig) -> None: + self.cached_keys: dict[int, list[str]] = {} + self.cached_timestamps: dict[int, datetime.datetime] = {} + self.config: ValidatorConfig = config + self._lock = asyncio.Lock() + + async def fetch_validator_keys(self, netuid: int) -> None: + """ + Fetch the validator keys for a given netuid and cache them. + Thread-safe implementation using a lock. + """ + subtensor = bt.subtensor(config=self.config.bt_config) + self.cached_keys[netuid] = [ + neuron.hotkey + for neuron in subtensor.neurons_lite(netuid) + if neuron.validator_permit + ] + self.cached_timestamps[netuid] = datetime.datetime.now() + datetime.timedelta( + hours=12 + ) + + async def check_validator_key(self, ss58_address: str, netuid: int) -> bool: + """ + Thread-safe check if a given key is a validator key for a given netuid. + """ + if ( + self.config.api.whitelisted_public_keys + and ss58_address in self.config.api.whitelisted_public_keys + ): + # If the sender is whitelisted, we don't need to check the key + return True + + cache_timestamp = self.cached_timestamps.get(netuid, None) + if cache_timestamp is None or cache_timestamp < datetime.datetime.now(): + await self.fetch_validator_keys(netuid) + return ss58_address in self.cached_keys.get(netuid, []) diff --git a/neurons/_validator/api/certificate_manager.py b/neurons/_validator/api/certificate_manager.py new file mode 100644 index 00000000..48925c64 --- /dev/null +++ b/neurons/_validator/api/certificate_manager.py @@ -0,0 +1,38 @@ +import os +import time +from OpenSSL import crypto +import bittensor as bt + + +class CertificateManager: + def __init__(self, cert_path: str): + self.cert_path = cert_path + self.key_path = os.path.join(cert_path, "key.pem") + self.cert_file = os.path.join(cert_path, "cert.pem") + + def ensure_valid_certificate(self, 
external_ip: str) -> None: + if not os.path.exists(self.cert_file): + bt.logging.warning( + "Certificate not found. Generating new self-signed certificate." + ) + os.makedirs(self.cert_path, exist_ok=True) + self._generate_certificate(external_ip) + + def _generate_certificate(self, cn: str) -> None: + key = crypto.PKey() + key.generate_key(crypto.TYPE_RSA, 4096) + + cert = crypto.X509() + cert.get_subject().CN = cn + cert.set_serial_number(int(time.time())) + cert.gmtime_adj_notBefore(0) + cert.gmtime_adj_notAfter(2 * 365 * 24 * 60 * 60) + cert.set_issuer(cert.get_subject()) + cert.set_pubkey(key) + cert.sign(key, "sha256") + + with open(self.cert_file, "wb") as f: + f.write(crypto.dump_certificate(crypto.FILETYPE_PEM, cert)) + + with open(self.key_path, "wb") as f: + f.write(crypto.dump_privatekey(crypto.FILETYPE_PEM, key)) diff --git a/neurons/_validator/api/websocket_manager.py b/neurons/_validator/api/websocket_manager.py new file mode 100644 index 00000000..5534a107 --- /dev/null +++ b/neurons/_validator/api/websocket_manager.py @@ -0,0 +1,19 @@ +from __future__ import annotations +from fastapi import WebSocket + + +class WebSocketManager: + def __init__(self): + self.active_connections: set[WebSocket] = set() + + async def connect(self, websocket: WebSocket) -> None: + await websocket.accept() + self.active_connections.add(websocket) + + async def disconnect(self, websocket: WebSocket) -> None: + self.active_connections.remove(websocket) + await websocket.close() + + async def close_all(self) -> None: + for connection in self.active_connections.copy(): + await self.disconnect(connection) diff --git a/neurons/_validator/config/__init__.py b/neurons/_validator/config/__init__.py index 4bde7b00..0710c718 100644 --- a/neurons/_validator/config/__init__.py +++ b/neurons/_validator/config/__init__.py @@ -3,25 +3,7 @@ from constants import DEFAULT_NETUID from utils import wandb_logger - - -class ApiConfig: - """ - Configuration class for the API. 
- - Attributes: - enabled (bool): Whether the API is enabled. - host (str): The host for the API. - port (int): The port for the API. - workers (int): The number of workers for the API. - """ - - def __init__(self, config: bt.config): - self.enabled = not config.ignore_external_requests - self.host = config.external_api_host - self.port = config.external_api_port - self.workers = config.external_api_workers - self.verify_external_signatures = not config.do_not_verify_external_signatures +from _validator.config.api import ApiConfig class ValidatorConfig: @@ -62,6 +44,7 @@ def __init__(self, config: bt.config): self.user_uid = int( self.metagraph.hotkeys.index(self.wallet.hotkey.ss58_address) ) + self.localnet = self.bt_config.localnet self.api = ApiConfig(self.bt_config) # Initialize wandb logger diff --git a/neurons/_validator/config/api.py b/neurons/_validator/config/api.py new file mode 100644 index 00000000..f49009f9 --- /dev/null +++ b/neurons/_validator/config/api.py @@ -0,0 +1,24 @@ +import bittensor as bt + + +class ApiConfig: + """ + Configuration class for the API. + + Attributes: + enabled (bool): Whether the API is enabled. + host (str): The host for the API. + port (int): The port for the API. + workers (int): The number of workers for the API. + verify_external_signatures (bool): Whether to verify external signatures. + certificate_path (str): The path to the certificate directory. 
+ """ + + def __init__(self, config: bt.config): + self.enabled = not config.ignore_external_requests + self.host = config.external_api_host + self.port = config.external_api_port + self.workers = config.external_api_workers + self.verify_external_signatures = not config.do_not_verify_external_signatures + self.certificate_path = config.certificate_path + self.whitelisted_public_keys = config.whitelisted_public_keys diff --git a/neurons/_validator/core/request.py b/neurons/_validator/core/request.py index 2b0de402..eac87532 100644 --- a/neurons/_validator/core/request.py +++ b/neurons/_validator/core/request.py @@ -9,12 +9,17 @@ @dataclass class Request: + """ + A request to be sent to a miner. + """ + uid: int axon: bt.axon synapse: QueryZkProof | ProofOfWeightsSynapse circuit: Circuit request_type: RequestType inputs: GenericInput | None = None + request_hash: str | None = None response_time: float | None = None deserialized: dict[str, object] | None = None result: bt.Synapse | None = None diff --git a/neurons/_validator/core/request_pipeline.py b/neurons/_validator/core/request_pipeline.py index 7be84052..d7dff276 100644 --- a/neurons/_validator/core/request_pipeline.py +++ b/neurons/_validator/core/request_pipeline.py @@ -1,14 +1,14 @@ +from __future__ import annotations from _validator.pow.proof_of_weights_handler import ProofOfWeightsHandler import bittensor as bt import random from protocol import ProofOfWeightsSynapse, QueryZkProof from _validator.scoring.score_manager import ScoreManager -from _validator.core.api import ValidatorAPI +from _validator.api import ValidatorAPI from _validator.config import ValidatorConfig from constants import ( BATCHED_PROOF_OF_WEIGHTS_MODEL_ID, - DEFAULT_NETUID, CIRCUIT_WEIGHTS, SINGLE_PROOF_OF_WEIGHTS_MODEL_ID, ) @@ -19,6 +19,7 @@ from _validator.core.request import Request from utils.wandb_logger import safe_log from _validator.models.request_type import RequestType +import copy class RequestPipeline: @@ -32,55 +33,74 @@ 
def __init__( def prepare_requests(self, filtered_uids) -> list[Request]: """ - Prepare a batch of requests for the provided UIDs. + Prepare requests for the current validation step. + This includes both regular benchmark requests and any external requests. Args: - filtered_uids (list): List of filtered UIDs to query. + filtered_uids (list): List of UIDs to send requests to. Returns: - list: List of prepared requests. + list[Request]: List of prepared requests. """ + if len(filtered_uids) == 0: + bt.logging.error("No UIDs to query") + return [] - request_type = ( - RequestType.BENCHMARK - if not self.api.external_requests_queue - else RequestType.RWR - ) + if self.api.external_requests_queue: + return self._prepare_real_world_requests(filtered_uids) + return self._prepare_benchmark_requests(filtered_uids) - netuid = self.config.subnet_uid - circuit = self.select_circuit_for_benchmark() - request = None - if request_type == RequestType.RWR: - netuid, request = self.api.external_requests_queue.pop() - bt.logging.debug(f"Processing external request for netuid {netuid}") + def _prepare_real_world_requests(self, filtered_uids: list[int]) -> list[Request]: + external_request = self.api.external_requests_queue.pop() + requests = [] - target_netuid = ( - DEFAULT_NETUID if netuid == self.config.subnet_uid else netuid + for uid in filtered_uids: + synapse = self.get_synapse_request( + RequestType.RWR, external_request.circuit, external_request ) - circuit = circuit_store.get_latest_circuit_for_netuid(target_netuid) - bt.logging.info( - f"The next round of requests will be for {circuit} in {request_type} mode" - ) + if isinstance(synapse, ProofOfWeightsSynapse): + input_data = synapse.inputs + else: + input_data = synapse.query_input["public_inputs"] - requests = [ - Request( + try: + self.hash_guard.check_hash(input_data) + except Exception as e: + bt.logging.error(f"Hash already exists: {e}") + safe_log({"hash_guard_error": 1}) + continue + + request = Request( uid=uid, 
axon=self.config.metagraph.axons[uid], - synapse=self.get_synapse_request(uid, request_type, circuit, request), - circuit=circuit, - request_type=request_type, + synapse=synapse, + circuit=external_request.circuit, + request_type=RequestType.RWR, + inputs=GenericInput(RequestType.RWR, input_data), + request_hash=external_request.hash, ) - for uid in filtered_uids - ] - - for request in requests: - input_data = ( - request.synapse.inputs - if request.circuit.metadata.type == CircuitType.PROOF_OF_WEIGHTS - else request.synapse.query_input["public_inputs"] - ) - request.inputs = GenericInput(RequestType.RWR, input_data) + requests.append(request) + return requests + + def _prepare_benchmark_requests(self, filtered_uids: list[int]) -> list[Request]: + circuit = self.select_circuit_for_benchmark() + if circuit is None: + bt.logging.error("No circuit selected") + return [] + + if circuit.id == BATCHED_PROOF_OF_WEIGHTS_MODEL_ID: + self.score_manager.clear_proof_of_weights_queue() + + requests = [] + for uid in filtered_uids: + synapse = self.get_synapse_request(RequestType.BENCHMARK, circuit) + + if isinstance(synapse, ProofOfWeightsSynapse): + input_data = synapse.inputs + else: + input_data = synapse.query_input["public_inputs"] + try: self.hash_guard.check_hash(input_data) except Exception as e: @@ -88,8 +108,15 @@ def prepare_requests(self, filtered_uids) -> list[Request]: safe_log({"hash_guard_error": 1}) continue - if circuit.id == BATCHED_PROOF_OF_WEIGHTS_MODEL_ID: - self.score_manager.clear_proof_of_weights_queue() + request = Request( + uid=uid, + axon=self.config.metagraph.axons[uid], + synapse=synapse, + circuit=circuit, + request_type=RequestType.BENCHMARK, + inputs=GenericInput(RequestType.RWR, input_data), + ) + requests.append(request) return requests @@ -111,15 +138,17 @@ def format_for_query( def get_synapse_request( self, - uid: int, request_type: RequestType, circuit: Circuit, - request: dict[str, object] | None = None, + request: any | None = None, ) 
-> ProofOfWeightsSynapse | QueryZkProof: inputs = ( circuit.input_handler(request_type) if request_type == RequestType.BENCHMARK - else circuit.input_handler(RequestType.RWR, request["inputs"]) + else circuit.input_handler( + RequestType.RWR, + copy.deepcopy(request.inputs), + ) ) if request_type == RequestType.RWR: @@ -142,12 +171,10 @@ def get_synapse_request( SINGLE_PROOF_OF_WEIGHTS_MODEL_ID, BATCHED_PROOF_OF_WEIGHTS_MODEL_ID, ]: - # We'll forward the responsibility of handling these to the internal proof of weights handler return ProofOfWeightsHandler.prepare_pow_request( circuit, self.score_manager.proof_of_weights_queue ) - # Otherwise, we'll prepare a regular benchmark request depending on the circuit type if circuit.metadata.type == CircuitType.PROOF_OF_COMPUTATION: return QueryZkProof( model_id=circuit.id, diff --git a/neurons/_validator/core/validator_loop.py b/neurons/_validator/core/validator_loop.py index 4eacfbf8..fdff6568 100644 --- a/neurons/_validator/core/validator_loop.py +++ b/neurons/_validator/core/validator_loop.py @@ -10,7 +10,7 @@ import bittensor as bt from _validator.config import ValidatorConfig -from _validator.core.api import ValidatorAPI +from _validator.api import ValidatorAPI from _validator.core.prometheus import ( log_validation_time, start_prometheus_logging, @@ -24,6 +24,7 @@ from _validator.scoring.weights import WeightsManager from _validator.utils.api import hash_inputs from _validator.utils.axon import query_axons +from _validator.models.request_type import RequestType from _validator.utils.proof_of_weights import save_proof_of_weights from _validator.utils.uid import get_queryable_uids from constants import ( @@ -171,12 +172,25 @@ def _process_requests(self, requests: list[Request]) -> list[MinerResponse]: ] if verified_responses: random_verified_response = random.choice(verified_responses) + request_hash = requests[0].request_hash or hash_inputs( + requests[0].inputs + ) save_proof_of_weights( 
public_signals=[random_verified_response.public_json], proof=[random_verified_response.proof_content], - proof_filename=hash_inputs(requests[0].inputs), + proof_filename=request_hash, ) + if requests[0].request_type == RequestType.RWR: + self.api.set_request_result( + request_hash, + { + "hash": request_hash, + "public_signals": random_verified_response.public_json, + "proof": random_verified_response.proof_content, + }, + ) + self.score_manager.update_scores(processed_responses) self.weights_manager.update_weights(self.score_manager.scores) @@ -204,7 +218,8 @@ def _log_overhead_time(self, start_time) -> float: def _handle_keyboard_interrupt(self): """Handle keyboard interrupt by cleaning up and exiting.""" bt.logging.success("Keyboard interrupt detected. Exiting validator.") - self.api.stop() + loop = asyncio.get_event_loop() + loop.run_until_complete(self.api.stop()) stop_prometheus_logging() clean_temp_files() sys.exit(0) diff --git a/neurons/_validator/models/api.py b/neurons/_validator/models/api.py deleted file mode 100644 index 8e5f5ae6..00000000 --- a/neurons/_validator/models/api.py +++ /dev/null @@ -1,16 +0,0 @@ -from pydantic import BaseModel - - -class PowInputModel(BaseModel): - """ - Pydantic model for incoming requests. - - inputs: JSON string containing the actual inputs for the proof of weights request. - - signature: Signature of the inputs. Signed by the validator's hotkey. - - sender: Sender's wallet address. - - netuid: The originating subnet the request comes from. 
- """ - - inputs: str - signature: str - sender: str - netuid: int diff --git a/neurons/_validator/models/base_rpc_request.py b/neurons/_validator/models/base_rpc_request.py new file mode 100644 index 00000000..4ecd9c39 --- /dev/null +++ b/neurons/_validator/models/base_rpc_request.py @@ -0,0 +1,14 @@ +from pydantic import BaseModel +from execution_layer.circuit import Circuit +from _validator.utils.api import hash_inputs + + +class RealWorldRequest(BaseModel): + circuit: Circuit + inputs: dict + + model_config = {"arbitrary_types_allowed": True} + + @property + def hash(self) -> str: + return hash_inputs(self.inputs) diff --git a/neurons/_validator/models/poc_rpc_request.py b/neurons/_validator/models/poc_rpc_request.py new file mode 100644 index 00000000..13cbe204 --- /dev/null +++ b/neurons/_validator/models/poc_rpc_request.py @@ -0,0 +1,29 @@ +from _validator.models.base_rpc_request import RealWorldRequest +from pydantic import Field +from deployment_layer.circuit_store import circuit_store + + +class ProofOfComputationRPCRequest(RealWorldRequest): + """ + Request for the Proof of Computation RPC method. 
+ """ + + circuit_name: str = Field(..., description="The name of the circuit to use") + circuit_version: int | None = Field( + ..., description="The version of the circuit to use" + ) + + def __init__( + self, circuit_name: str, circuit_version: int | None, inputs: dict[str, any] + ): + if circuit_version is None: + circuit = circuit_store.get_latest_circuit_by_name(circuit_name) + else: + circuit = circuit_store.get_circuit_by_name_and_version( + circuit_name=circuit_name, circuit_version=circuit_version + ) + if circuit is None: + raise ValueError( + f"No circuit found for name {circuit_name} and version {circuit_version}" + ) + super().__init__(circuit=circuit, inputs=inputs) diff --git a/neurons/_validator/models/pow_rpc_request.py b/neurons/_validator/models/pow_rpc_request.py new file mode 100644 index 00000000..14a65995 --- /dev/null +++ b/neurons/_validator/models/pow_rpc_request.py @@ -0,0 +1,46 @@ +from __future__ import annotations +from _validator.models.base_rpc_request import RealWorldRequest +from pydantic import Field +from deployment_layer.circuit_store import circuit_store + + +class ProofOfWeightsRPCRequest(RealWorldRequest): + """ + Request for the Proof of Weights RPC method. 
+ """ + + weights_version: int | None = Field( + None, description="The version of weights in use by the origin subnet" + ) + netuid: int = Field(..., description="The origin subnet UID") + evaluation_data: dict = Field(default_factory=dict) + + class Config: + arbitrary_types_allowed = True + extra = "allow" + + def __init__(self, **data): + netuid = data.get("netuid") + weights_version = data.get("weights_version") + evaluation_data = data.get("evaluation_data") + + circuit = None + if weights_version is None: + circuit = circuit_store.get_latest_circuit_for_netuid(netuid) + weights_version = circuit.metadata.weights_version + else: + circuit = circuit_store.get_circuit_for_netuid_and_version( + netuid=netuid, version=weights_version + ) + if circuit is None: + raise ValueError( + f"No circuit found for netuid {netuid} and weights version {weights_version}" + ) + + super().__init__( + circuit=circuit, + inputs=evaluation_data, + evaluation_data=evaluation_data, + netuid=netuid, + weights_version=weights_version, + ) diff --git a/neurons/_validator/pow/proof_of_weights_handler.py b/neurons/_validator/pow/proof_of_weights_handler.py index 07771522..6d73c5a7 100644 --- a/neurons/_validator/pow/proof_of_weights_handler.py +++ b/neurons/_validator/pow/proof_of_weights_handler.py @@ -9,6 +9,11 @@ class ProofOfWeightsHandler: + """ + Handles internal proof of weights + This covers the case where the origin validator is a validator on Omron; + no external requests are needed as this internal mechanism is used to generate the proof of weights. 
+ """ @staticmethod def prepare_pow_request(circuit: Circuit, proof_of_weights_queue): diff --git a/neurons/_validator/utils/api.py b/neurons/_validator/utils/api.py index fa435543..79bacac9 100644 --- a/neurons/_validator/utils/api.py +++ b/neurons/_validator/utils/api.py @@ -2,7 +2,7 @@ from execution_layer.generic_input import GenericInput -def hash_inputs(inputs: GenericInput) -> str: +def hash_inputs(inputs: GenericInput | dict) -> str: """ Hashes inputs to proof of weights, excluding dynamic fields. @@ -12,9 +12,11 @@ def hash_inputs(inputs: GenericInput) -> str: Returns: str: The hashed inputs. """ + if isinstance(inputs, GenericInput): + inputs = inputs.to_json() filtered_inputs = { k: v - for k, v in inputs.to_json().items() + for k, v in inputs.items() if k not in ["validator_uid", "nonce", "uid_responsible_for_proof"] } return hashlib.sha256(str(filtered_inputs).encode()).hexdigest() diff --git a/neurons/constants.py b/neurons/constants.py index feb2df02..add871c4 100644 --- a/neurons/constants.py +++ b/neurons/constants.py @@ -69,3 +69,59 @@ "e84b2e5f223621fa20078eb9f920d8d4d3a4ff95fa6e2357646fdbb43a2557c9": 0.20, "8dcff627a782525ea86196941a694ffbead179905f0cd4550ddc3df9e2b90924": 0.20, } +# Maximum signature lifespan for WebSocket requests +MAX_SIGNATURE_LIFESPAN = 300 +# Whitelisted public keys (ss58 addresses) we accept external requests from by default +# (even if an address is not in the metagraph) +WHITELISTED_PUBLIC_KEYS = [] +# Mainnet <> Testnet UID mapping +MAINNET_TESTNET_UIDS = [ + (1, 61), # apex + (2, 118), # omron + (3, 223), # templar + (4, 40), # targon + (5, 88), # kaito + (6, 155), # infinite + (7, 92), # subvortex + (8, 3), # ptn + (8, 116), # ptn (PTN) + (10, 104), # sturdy + (11, 135), # dippy + (12, 174), # horde + (13, 254), # dataverse + (14, 203), # palaidn + (15, 202), # deval + (16, 120), # bitads + (17, 89), # 3gen + (18, 24), # cortex + (19, 176), # inference + (20, 76), # bitagent + (21, 157), # any-any + (23, 119), # 
social + (24, 96), # omega + (25, 141), # protein + (26, 25), # alchemy + (27, 15), # compute + (28, 93), # oracle + (31, 123), # naschain + (32, 87), # itsai + (33, 138), # ready + (34, 168), # mind + (35, 78), # logic + (39, 159), # edge + (40, 166), # chunk + (41, 172), # sportstensor + (42, 165), # masa + (43, 65), # graphite + (44, 180), # score + (45, 171), # gen42 + (46, 182), # neural + (48, 208), # nextplace + (49, 100), # automl + (50, 31), # audio + (52, 98), # dojo + (53, 232), # efficient-frontier + (54, 236), # docs-insights + (57, 237), # gaia + (59, 249), # agent-arena +] diff --git a/neurons/deployment_layer/circuit_store.py b/neurons/deployment_layer/circuit_store.py index 2ac7b46f..d1a13143 100644 --- a/neurons/deployment_layer/circuit_store.py +++ b/neurons/deployment_layer/circuit_store.py @@ -105,6 +105,46 @@ def get_latest_circuit_for_netuid(self, netuid: int): return max(matching_circuits, key=lambda c: version.parse(c.metadata.version)) + def get_circuit_for_netuid_and_version( + self, netuid: int, version: int + ) -> Circuit | None: + """ + Get the circuit for a given netuid and version. + """ + matching_circuits = [ + c + for c in self.circuits.values() + if c.metadata.netuid == netuid and c.metadata.weights_version == version + ] + if not matching_circuits: + bt.logging.warning( + f"No circuit found for netuid {netuid} and weights version {version}" + ) + return None + return matching_circuits[0] + + def get_latest_circuit_by_name(self, circuit_name: str) -> Circuit | None: + """ + Get the latest circuit by name. + """ + matching_circuits = [ + c for c in self.circuits.values() if c.metadata.name == circuit_name + ] + return max(matching_circuits, key=lambda c: version.parse(c.metadata.version)) + + def get_circuit_by_name_and_version( + self, circuit_name: str, version: int + ) -> Circuit | None: + """ + Get the circuit by name and version. 
+ """ + matching_circuits = [ + c + for c in self.circuits.values() + if c.metadata.name == circuit_name and c.metadata.version == version + ] + return matching_circuits[0] if matching_circuits else None + def list_circuits(self) -> list[str]: """ Get a list of all circuit IDs. diff --git a/neurons/deployment_layer/model_33b92394b18412622adad75733a6fc659b4e202b01ee8a5465958a6bad8ded62/input.py b/neurons/deployment_layer/model_33b92394b18412622adad75733a6fc659b4e202b01ee8a5465958a6bad8ded62/input.py index 013f88f7..5562ca0c 100644 --- a/neurons/deployment_layer/model_33b92394b18412622adad75733a6fc659b4e202b01ee8a5465958a6bad8ded62/input.py +++ b/neurons/deployment_layer/model_33b92394b18412622adad75733a6fc659b4e202b01ee8a5465958a6bad8ded62/input.py @@ -54,5 +54,5 @@ def process(data: dict[str, object]) -> dict[str, object]: """ Add a random nonce to ensure that the request is not reused. """ - data["nonce"] = secrets.randbits(32) + data["nonce"] = [secrets.randbits(32)] return data diff --git a/neurons/deployment_layer/model_33b92394b18412622adad75733a6fc659b4e202b01ee8a5465958a6bad8ded62/metadata.json b/neurons/deployment_layer/model_33b92394b18412622adad75733a6fc659b4e202b01ee8a5465958a6bad8ded62/metadata.json index fd405fba..c372a61f 100644 --- a/neurons/deployment_layer/model_33b92394b18412622adad75733a6fc659b4e202b01ee8a5465958a6bad8ded62/metadata.json +++ b/neurons/deployment_layer/model_33b92394b18412622adad75733a6fc659b4e202b01ee8a5465958a6bad8ded62/metadata.json @@ -5,6 +5,7 @@ "version": "0.0.1", "proof_system": "EZKL", "netuid": 48, + "weights_version": 0, "type": "proof_of_weights", "external_files": { "pk.key": "https://storage.omron.ai/33b92394b18412622adad75733a6fc659b4e202b01ee8a5465958a6bad8ded62/pk.key" diff --git a/neurons/deployment_layer/model_a4bcecaf699fd9212600a1f2fcaa40c444e1aeaab409ea240a38c33ed356f4e2/input.py b/neurons/deployment_layer/model_a4bcecaf699fd9212600a1f2fcaa40c444e1aeaab409ea240a38c33ed356f4e2/input.py index 75e64aac..94fec1e9 
100644 --- a/neurons/deployment_layer/model_a4bcecaf699fd9212600a1f2fcaa40c444e1aeaab409ea240a38c33ed356f4e2/input.py +++ b/neurons/deployment_layer/model_a4bcecaf699fd9212600a1f2fcaa40c444e1aeaab409ea240a38c33ed356f4e2/input.py @@ -25,9 +25,9 @@ class CircuitInputSchema(BaseModel): last_20_difficulty_avg: list[float] has_docker: list[bool] uid: list[int] - allocated_uids: list[list[int]] - penalized_uids: list[list[int]] - validator_uids: list[list[int]] + allocated_uids: list[int] + penalized_uids: list[int] + validator_uids: list[int] success_weight: list[float] difficulty_weight: list[float] time_elapsed_weight: list[float] @@ -89,5 +89,5 @@ def process(data: dict[str, object]) -> dict[str, object]: """ Add a random nonce to ensure that the request is not reused. """ - data["nonce"] = secrets.randbits(32) + data["nonce"] = [secrets.randbits(32)] return data diff --git a/neurons/deployment_layer/model_a4bcecaf699fd9212600a1f2fcaa40c444e1aeaab409ea240a38c33ed356f4e2/metadata.json b/neurons/deployment_layer/model_a4bcecaf699fd9212600a1f2fcaa40c444e1aeaab409ea240a38c33ed356f4e2/metadata.json index 3a646ee3..86e162ad 100644 --- a/neurons/deployment_layer/model_a4bcecaf699fd9212600a1f2fcaa40c444e1aeaab409ea240a38c33ed356f4e2/metadata.json +++ b/neurons/deployment_layer/model_a4bcecaf699fd9212600a1f2fcaa40c444e1aeaab409ea240a38c33ed356f4e2/metadata.json @@ -5,6 +5,7 @@ "version": "0.0.6", "proof_system": "EZKL", "netuid": 27, + "weights_version": 160, "type": "proof_of_weights", "external_files": { "pk.key": "https://storage.omron.ai/a4bcecaf699fd9212600a1f2fcaa40c444e1aeaab409ea240a38c33ed356f4e2/pk.key" diff --git a/neurons/deployment_layer/model_e84b2e5f223621fa20078eb9f920d8d4d3a4ff95fa6e2357646fdbb43a2557c9/metadata.json b/neurons/deployment_layer/model_e84b2e5f223621fa20078eb9f920d8d4d3a4ff95fa6e2357646fdbb43a2557c9/metadata.json index 61314db5..e5483699 100644 --- 
a/neurons/deployment_layer/model_e84b2e5f223621fa20078eb9f920d8d4d3a4ff95fa6e2357646fdbb43a2557c9/metadata.json +++ b/neurons/deployment_layer/model_e84b2e5f223621fa20078eb9f920d8d4d3a4ff95fa6e2357646fdbb43a2557c9/metadata.json @@ -4,6 +4,7 @@ "author": "Inference Labs", "version": "0.0.6", "netuid": 2, + "weights_version": 1623, "proof_system": "CIRCOM", "type": "proof_of_weights", "external_files": { diff --git a/neurons/execution_layer/circuit.py b/neurons/execution_layer/circuit.py index b56744a7..7d3f829c 100644 --- a/neurons/execution_layer/circuit.py +++ b/neurons/execution_layer/circuit.py @@ -119,7 +119,8 @@ class CircuitMetadata: proof_system: str type: CircuitType external_files: dict[str, str] - netuid: int = -1 + netuid: int | None = None + weights_version: int | None = None @classmethod def from_file(cls, metadata_path: str) -> CircuitMetadata: diff --git a/neurons/validator.py b/neurons/validator.py index 82b05076..bd9aad59 100644 --- a/neurons/validator.py +++ b/neurons/validator.py @@ -4,7 +4,11 @@ import traceback import bittensor as bt -from constants import ONCHAIN_PROOF_OF_WEIGHTS_ENABLED, PROOF_OF_WEIGHTS_INTERVAL +from constants import ( + ONCHAIN_PROOF_OF_WEIGHTS_ENABLED, + PROOF_OF_WEIGHTS_INTERVAL, + WHITELISTED_PUBLIC_KEYS, +) from utils import wandb_logger from _validator.validator_session import ValidatorSession @@ -77,6 +81,15 @@ def get_config_from_args(): help="Whether to ignore external requests.", ) + parser.add_argument( + "--whitelisted-public-keys", + type=str, + nargs="*", + dest="alist", + default=WHITELISTED_PUBLIC_KEYS, + help="Comma separated list of public keys to whitelist for external requests.", + ) + parser.add_argument( "--external-api-host", type=str, @@ -87,7 +100,7 @@ def get_config_from_args(): parser.add_argument( "--external-api-port", type=int, - default=8000, + default=8443, help="The port for the external API.", ) @@ -116,13 +129,21 @@ def get_config_from_args(): help="Whether to run the validator in localnet 
mode.", ) + parser.add_argument( + "--certificate-path", + type=str, + default=None, + help="A custom path to a directory containing a public and private SSL certificate. " + "(cert.pem and key.pem) " + "Please note that this should not be used unless you have issued your own certificate. " + "Omron will issue a certificate for you by default.", + ) parser.add_argument( "--prometheus-monitoring", action="store_true", default=False, help="Whether to enable prometheus monitoring.", ) - parser.add_argument( "--prometheus-port", type=int, @@ -157,8 +178,7 @@ def get_config_from_args(): config.disable_blacklist if config.disable_blacklist is None else True ) config.external_api_workers = config.external_api_workers or 1 - config.external_api_port = config.external_api_port or 8000 - config.do_not_verify_external_signatures = True + config.external_api_port = config.external_api_port or 8443 config.full_path = os.path.expanduser( "{}/{}/{}/netuid{}/{}".format( @@ -170,6 +190,9 @@ def get_config_from_args(): ) ) + if not config.certificate_path: + config.certificate_path = os.path.join(config.full_path, "cert") + if not os.path.exists(config.full_path): os.makedirs(config.full_path, exist_ok=True) diff --git a/requirements.txt b/requirements.txt index d04143a5..c62b2606 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,11 +2,14 @@ attrs==24.2.0 bittensor==8.5.1 black==23.7.0 ezkl==16.2.0 +fastapi==0.99.1 +jsonrpcserver==5.0.9 fastapi==0.110.1 numpy==2.0.1 packaging==24.1 psutil==6.0.0 pydantic==2.10.3 +pyoopenssl==24.3.0 requests==2.32.3 rich==13.8.1 torch==2.4.1 From 81a5e69029168cf2724d9240f44baaca5fe5e656 Mon Sep 17 00:00:00 2001 From: HudsonGraeme Date: Tue, 7 Jan 2025 07:24:24 +0000 Subject: [PATCH 14/20] Update dependencies - `fastapi` - `pyOpenSSL` --- requirements.txt | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index c62b2606..c92bf766 100644 --- a/requirements.txt +++ b/requirements.txt @@ 
-2,14 +2,13 @@ attrs==24.2.0 bittensor==8.5.1 black==23.7.0 ezkl==16.2.0 -fastapi==0.99.1 +fastapi~=0.110.1 jsonrpcserver==5.0.9 -fastapi==0.110.1 numpy==2.0.1 packaging==24.1 psutil==6.0.0 pydantic==2.10.3 -pyoopenssl==24.3.0 +pyOpenSSL==24.3.0 requests==2.32.3 rich==13.8.1 torch==2.4.1 From f89a076334b7df3b8f410937696e28ac51b54784 Mon Sep 17 00:00:00 2001 From: HudsonGraeme Date: Tue, 7 Jan 2025 15:52:28 +0000 Subject: [PATCH 15/20] Bump version to 6.4.4 --- neurons/__init__.py | 2 +- neurons/constants.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/neurons/__init__.py b/neurons/__init__.py index a668dbb5..f7eb22b0 100644 --- a/neurons/__init__.py +++ b/neurons/__init__.py @@ -2,4 +2,4 @@ This version number is used to trigger automatic updates. """ -__version__ = "6.2.4" +__version__ = "6.4.4" diff --git a/neurons/constants.py b/neurons/constants.py index add871c4..31440322 100644 --- a/neurons/constants.py +++ b/neurons/constants.py @@ -34,7 +34,7 @@ # Shift in seconds to apply to the minimum response time for vertical asymptote adjustment MINIMUM_SCORE_SHIFT = 0.0 # Weights version hyperparameter -WEIGHTS_VERSION = 1624 +WEIGHTS_VERSION = 1644 # Rate limit for weight updates WEIGHT_RATE_LIMIT: int = 100 # Delay between requests From be4889fe02937a3ed40e45eee14c1ae21ebcf705 Mon Sep 17 00:00:00 2001 From: Bondarenko Oleg Date: Wed, 8 Jan 2025 23:30:15 +0200 Subject: [PATCH 16/20] Use aria2 instead of curl in model-sync script. Do not sync models during setup stage (#43) --- setup.sh | 22 ++++++++++++++++++---- sync_model_files.sh | 2 +- 2 files changed, 19 insertions(+), 5 deletions(-) mode change 100755 => 100644 setup.sh diff --git a/setup.sh b/setup.sh old mode 100755 new mode 100644 index 8c2676f3..d9297b89 --- a/setup.sh +++ b/setup.sh @@ -179,6 +179,24 @@ if ! command -v jq >/dev/null 2>&1; then esac fi +# Check if aria2 is installed, if not then install it +if ! [ -x "$(command -v aria2c)" ]; then + echo "aria2 not found. 
Installing aria2..." + case ${OS} in + 'Linux') + sudo apt update + sudo apt install -y aria2 + ;; + 'Darwin') + brew install aria2 + ;; + *) + echo "Unsupported OS for aria2 installation" + exit 1 + ;; + esac +fi + # Check if PM2 is installed, if not then install it if ! command -v pm2 >/dev/null 2>&1; then echo "pm2 not found. Installing pm2..." @@ -245,10 +263,6 @@ fi # Set working directory to install dir cd "${INSTALL_PATH}" || exit -# Sync remote files for all models -echo "Syncing model files..." -bash "./sync_model_files.sh" - # Display next steps echo -e "\033[32mOmron has been installed to ${INSTALL_PATH}. Please run \`cd ${INSTALL_PATH}\` to navigate to the directory.\033[0m" echo -e "\033[32mPlease see ${INSTALL_PATH}/docs/shared_setup_steps.md for the next steps.\033[0m" diff --git a/sync_model_files.sh b/sync_model_files.sh index cb2bace4..7fcf9a57 100755 --- a/sync_model_files.sh +++ b/sync_model_files.sh @@ -26,7 +26,7 @@ for MODEL_FOLDER in $(find "$MODEL_DIR" -maxdepth 1 -type d -name 'model_*'); do fi # If the file doesn't exist we'll pull from the URL specified echo "Downloading ${url} to ${MODEL_FOLDER}/${key}..." - curl -o "${MODEL_FOLDER}/${key}" "${url}" + aria2c ${url} -d "${MODEL_FOLDER}" -o "${key}" # If the file doesn't download then we'll skip this file and echo the error if [ $? 
-ne 0 ]; then echo "Error: Failed to download ${url} to ${MODEL_FOLDER}/${key}" From afe66af40bc0efdb6370cdf1fc4abddd9b4e4ad4 Mon Sep 17 00:00:00 2001 From: Bondarenko Oleg Date: Thu, 9 Jan 2025 00:52:36 +0200 Subject: [PATCH 17/20] Prometheus docs (#41) --- docs/README.md | 8 ++ docs/command_line_arguments.md | 2 + docs/prometheus.md | 178 ++++++++++++++++++++++++++ neurons/_validator/core/prometheus.py | 20 +-- 4 files changed, 199 insertions(+), 9 deletions(-) create mode 100644 docs/prometheus.md diff --git a/docs/README.md b/docs/README.md index 815ebe45..67bb0ec5 100644 --- a/docs/README.md +++ b/docs/README.md @@ -42,6 +42,12 @@ Instructions for configuring a PM2 ecosystem file which allows for streamlined s [View PM2 Configuration →] +## [Prometheus and Grafana Configuration] + +Instructions for configuring Prometheus and Grafana to monitor your validator. + +[View Prometheus and Grafana Configuration →] + ## [Versioning] Information about Omron's versioning system and how to stay up-to-date with the latest releases. @@ -60,6 +66,8 @@ Information about Omron's versioning system and how to stay up-to-date with the [View mainnet instructions →]: ./running_on_mainnet.md [View testnet instructions →]: ./running_on_testnet.md [View local instructions →]: ./running_on_staging.md +[Prometheus and Grafana Configuration]: ./prometheus.md +[View Prometheus and Grafana Configuration →]: ./prometheus.md [Versioning]: ./versioning.md [View versioning →]: ./versioning.md [← Back to home]: ../ diff --git a/docs/command_line_arguments.md b/docs/command_line_arguments.md index 43c18ad4..7bcba01d 100644 --- a/docs/command_line_arguments.md +++ b/docs/command_line_arguments.md @@ -37,6 +37,8 @@ The below arguments are specific to validator software and have no effect on min | `--external-api-workers` | No | `1` | Integer | The number of workers for the validator's external API. | | `--external-api-host` | No | `0.0.0.0` | String | The host for the validator's external API. 
| | `--do-not-verify-external-signatures` | No | `False` | `True`, `False` | External PoW requests are signed by validator's (sender's) wallet. By default, these are checked to ensure legitimacy. This should only be disabled in controlled development environments. | +| `--prometheus-monitoring` | No | `False` | `True`, `False` | Whether to enable sering of metrics for Prometheus monitoring. | +| `--prometheus-port` | No | `9090` | Integer | The port for the Prometheus data source. | ## Built-in Arguments diff --git a/docs/prometheus.md b/docs/prometheus.md new file mode 100644 index 00000000..6c7c8428 --- /dev/null +++ b/docs/prometheus.md @@ -0,0 +1,178 @@ +# Setting Up Prometheus and Grafana for Metrics Analysis + +## Important Port Configuration + +> [!WARNING] +> **Port Conflicts** +> By default, both prometheus server and the validator prometheus exporter use port 9090. If installing on the same machine, you must either: +> +> - Change the validator exporter port: `--prometheus-port ` +> - Change the prometheus server port: `--web.listen-address=:` + +## Installation Options + +Choose one of two approaches: + +1. [Manual Installation](#manual-installation) - Step-by-step setup of individual components +2. [Docker Installation](#docker-installation) - Quick setup using Docker Compose + +## Manual Installation + +Our application provides the ability for validators to analyze some metrics with Prometheus. Validation metrics such as validation time, request times, proof sizes, ratio of verified results, and response times are served by default on port `9090`. Follow these step-by-step instructions to set up a basic Grafana UI for Prometheus metrics exposed by the validator instance. + +For enabling metrics serving add `--prometheus-monitoring` flag to the validator command line. +For changing the port of the metrics server add `--prometheus-port {port_number}` flag to the validator command line. 
+ +Take a note by default Prometheus and validator data source use the same port. So in case you want to install Prometheus to the same machine as the validator, you need to change the port of the validator metrics server (with `--prometheus-port {port_number}` flag) or for Prometheus itself (with `--web.listen-address=:{port_number}` flag). + +### Step 1: Install Prometheus + +1. Download the latest Prometheus release from the [official website](https://prometheus.io/download/). +2. Extract the downloaded archive. +3. Navigate to the extracted directory. + +### Step 2: Configure Prometheus + +1. Open the `prometheus.yml` configuration file in a text editor. +2. Add a new job under the `omron-validator-metrics` section to scrape metrics from the validator instance: + +```yaml +scrape_configs: + - job_name: "omron-validator-metrics" + static_configs: + - targets: ["localhost:9090"] # Replace with validator IP if needed +``` + +3. Save the `prometheus.yml` file. + +### Step 3: Start Prometheus + +1. In the terminal, navigate to the Prometheus directory. +2. Start Prometheus by running the following command: + +```sh +./prometheus --config.file=prometheus.yml +``` + +3. Prometheus will start and begin scraping metrics from the validator instance on port 9090. + +Your Prometheus instance is now set up to fetch metrics exposed by the validator instance. You can verify the setup by opening `http://localhost:9090/targets` in a web browser and checking that the `omron-validator-metrics` job is listed and the endpoint is up. + +Take a look at the [Prometheus documentation](https://prometheus.io/docs/introduction/overview/) for more information on how to use Prometheus. + +### Step 4: Install Grafana + +1. Download the latest Grafana release from the [official website](https://grafana.com/grafana/download). +2. Extract the downloaded archive. +3. Navigate to the extracted directory. +4. Start Grafana by running `./bin/grafana-server`. +5. 
Open your web browser and go to `http://localhost:3000` to access the Grafana UI. + +### Step 5: Add Prometheus Data Source in Grafana + +1. Open your Grafana instance in a web browser. +2. Log in with your credentials. (The default username and password are `admin`.) +3. Click on **Connections/Data Sources** in the left sidebar. +4. Click on the **Add data source** button. +5. Select **Prometheus** from the list of available data sources. +6. In the **HTTP** section, set the URL to `http://localhost:9090`. +7. Click on the **Save & Test** button to verify the connection. + +### Step 6: Create a Dashboard + +1. Click on **Dashboard** in the left sidebar. +2. Click on **Create Dashboard** and **Add Visualization**. +3. Select **Prometheus** as the data source. +4. Select the desired metric in the `Queries` tab (e.g., `proof_sizes_bytes`) and optionally a label for it. +5. (Optionally) Add one more query, play with visualization options, and customize the panel as needed. +6. Click on **Save Dashboard** to save the panel. + +### Step 7: Add Panels for Other Metrics + +1. Repeat the process of adding new panels for each of the following metrics: + +- Validation time (`validation_time_seconds`) +- Request times (`request_time_seconds`) +- Proof sizes (`proof_sizes_bytes`) +- Ratio of verified results (`verified_results_ratio`) +- Response times (`response_time_seconds`) + +Your Grafana dashboard is now set up to display Prometheus metrics exposed by the validator instance. You can further customize the dashboard by adding alerts, annotations, and more. + +## Docker Installation + +Create a project directory with: + +``` +project/ +├── docker-compose.yml +├── prometheus.yml +├── grafana_data/ # Will be created automatically +└── certs/ # Will be created automatically +``` + +## Docker Compose Configuration + +Instead of installing Prometheus and Grafana manually, you can use Docker Compose to set up both services easily. 
Create a `docker-compose.yml` file with the following configuration: + +```yaml +services: + prometheus: + image: prom/prometheus + ports: + - 9095:9090 # publish prometheus to host machine port 9095 + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml + networks: + - monitoring + grafana: + image: grafana/grafana + volumes: + # optional section to persist grafana data + - ./grafana_data:/var/lib/grafana + - ./certs:/certs + ports: + - 3000:3000 + environment: + - GF_SECURITY_ADMIN_PASSWORD=admin # set admin password for grafana here + networks: + - monitoring +networks: + monitoring: + driver: bridge +``` + +Create a `prometheus.yml` file with the configuration as shown in the previous steps. Create a `grafana_data` and `certs` directory in the same directory as the `docker-compose.yml` file. Run the following command to start Prometheus and Grafana: + +```sh +docker-compose up +``` + +Prometheus will be available at `http://localhost:9095` and Grafana at `http://localhost:3000`. You can access both services in your web browser and set up the data source and dashboard as described in the previous steps. + +For more information on using Docker Compose with Prometheus and Grafana, refer to the [official Prometheus Docker documentation](https://prometheus.io/docs/prometheus/latest/installation/). + +## Security Considerations + +> [!WARNING] +> **Production Deployment Warning** +> The setup described above is suitable for local development. For production deployments: +> +> - Use strong passwords for Grafana +> - Configure TLS/SSL for both Prometheus and Grafana +> - Implement proper authentication mechanisms +> - Consider network isolation using Docker networks or firewalls + +## Troubleshooting + +1. **Verify Metrics Collection** + + - Check Prometheus targets: `http://localhost:9090/targets` + - Confirm metrics endpoint: `curl http://localhost:9090/metrics` + +2. 
**Common Issues** + - Port conflicts: Check if ports 9090/3000 are already in use + - Connection refused: Ensure validator metrics are enabled + - No data in Grafana: Verify Prometheus data source configuration + +For additional help, consult the [Prometheus troubleshooting guide](https://prometheus.io/docs/prometheus/latest/troubleshooting/). diff --git a/neurons/_validator/core/prometheus.py b/neurons/_validator/core/prometheus.py index c5b22cf2..88f5c8b7 100644 --- a/neurons/_validator/core/prometheus.py +++ b/neurons/_validator/core/prometheus.py @@ -3,16 +3,16 @@ from typing import Optional from wsgiref.simple_server import WSGIServer -from prometheus_client import Summary, start_http_server +from prometheus_client import Summary, Histogram, start_http_server _server: Optional[WSGIServer] = None _thread: Optional[threading.Thread] = None -_validation_times: Optional[Summary] = None -_response_times: Optional[Summary] = None -_proof_sizes: Optional[Summary] = None -_verification_ratio: Optional[Summary] = None +_validation_times: Optional[Histogram] = None +_response_times: Optional[Histogram] = None +_proof_sizes: Optional[Histogram] = None +_verification_ratio: Optional[Histogram] = None def start_prometheus_logging(port: int) -> None: @@ -20,16 +20,18 @@ def start_prometheus_logging(port: int) -> None: _server, _thread = start_http_server(port) global _validation_times, _response_times, _proof_sizes, _verification_ratio - _validation_times = Summary("validating_seconds", "Time spent validating responses") - _response_times = Summary( + _validation_times = Histogram( + "validating_seconds", "Time spent validating responses" + ) + _response_times = Histogram( "requests_seconds", "Time spent processing requests", ["aggregation_type", "model"], ) - _proof_sizes = Summary( + _proof_sizes = Histogram( "proof_sizes", "Size of proofs", ["aggregation_type", "model"] ) - _verification_ratio = Summary( + _verification_ratio = Histogram( "verified_proofs_ratio", 
"Verified proofs ratio", ["model"] ) From 2d3dc9c6d9a7e6ad1a10dd3ff97bfed28dacb04d Mon Sep 17 00:00:00 2001 From: Eric Date: Thu, 9 Jan 2025 15:38:26 -0500 Subject: [PATCH 18/20] Cleanup devcontainer --- devcontainer.Dockerfile => .devcontainer/Dockerfile | 0 .devcontainer/devcontainer.json | 3 ++- 2 files changed, 2 insertions(+), 1 deletion(-) rename devcontainer.Dockerfile => .devcontainer/Dockerfile (100%) diff --git a/devcontainer.Dockerfile b/.devcontainer/Dockerfile similarity index 100% rename from devcontainer.Dockerfile rename to .devcontainer/Dockerfile diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 9d5228db..d2fc74e1 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -6,7 +6,8 @@ // "image": "mcr.microsoft.com/devcontainers/base:jammy", "build": { // Path is relative to the devcontainer.json file. - "dockerfile": "../devcontainer.Dockerfile" + "context": "..", + "dockerfile": "Dockerfile" }, "remoteUser": "root", "customizations": { From 1fe9293295512551a479bf06554d1bcc69563509 Mon Sep 17 00:00:00 2001 From: Eric Date: Fri, 10 Jan 2025 17:06:26 -0500 Subject: [PATCH 19/20] Migrate devcontainer base to https://github.com/inference-labs-inc/tensor-tools --- .devcontainer/Dockerfile | 111 +------------------------------- .devcontainer/devcontainer.json | 33 +++++----- 2 files changed, 16 insertions(+), 128 deletions(-) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 59827602..9841d998 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -1,113 +1,4 @@ -FROM --platform=linux/amd64 mcr.microsoft.com/devcontainers/base:noble - -# Install dependencies and some nice to have tools -RUN apt update && \ - apt install -y \ - python3-dev \ - python3-venv \ - build-essential \ - jq \ - git \ - curl \ - make \ - clang \ - pkg-config \ - libssl-dev \ - llvm \ - libudev-dev \ - protobuf-compiler \ - byobu \ - fish \ - wget \ - && rm -rf /var/lib/apt/lists/* 
- -# Install Rust -RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y -ENV PATH="/root/.cargo/bin:${PATH}" - -# Install subtensor -RUN git clone https://github.com/opentensor/subtensor.git && \ - cd subtensor && \ - git checkout v1.1.6 && \ - cargo build --workspace --profile=release --features pow-faucet --manifest-path "Cargo.toml" - -# Install Jolt -ENV RUST_TOOLCHAIN=nightly-2024-09-30 -RUN rustup toolchain install ${RUST_TOOLCHAIN} && \ - cargo +${RUST_TOOLCHAIN} install --git https://github.com/a16z/jolt --force --bins jolt - -# Install node et al. -RUN curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.40.0/install.sh | bash && \ - export NVM_DIR="/root/.nvm" && \ - [ -s "$NVM_DIR/nvm.sh" ] && \. "$NVM_DIR/nvm.sh" && \ - [ -s "$NVM_DIR/bash_completion" ] && \. "$NVM_DIR/bash_completion" && \ - nvm install 20 && \ - npm install --prefix /root/.snarkjs snarkjs@0.7.4 - -# Use a venv because of https://peps.python.org/pep-0668/ -RUN python3 -m venv /opt/venv && \ - /opt/venv/bin/python3 -m pip install --upgrade pip -ENV PATH="/opt/venv/bin:${PATH}" - -# Install Python dependencies, and some more nice to have tools -COPY requirements.txt /opt/omron/requirements.txt -RUN TORCH_VERSION=$(grep "^torch" /opt/omron/requirements.txt) && \ - pip3 install $TORCH_VERSION --index-url https://download.pytorch.org/whl/cpu && \ - pip3 install -r /opt/omron/requirements.txt -RUN pip3 install bittensor-cli==8.3.1 bpython ptpython pipdeptree pysnooper coverage - -# Set flags for local subtensor -ENV WPATH="--wallet.path /root/.bittensor/wallets/" -ENV LOCALNET="--subtensor.chain_endpoint ws://127.0.0.1:9946" - -# Create wallets -RUN btcli wallet new_coldkey --no-use-password --n-words 15 $WPATH --wallet.name owner && \ - btcli wallet new_hotkey --no-use-password --n-words 15 $WPATH --wallet.name owner --wallet.hotkey default && \ - btcli wallet new_coldkey --no-use-password --n-words 15 $WPATH --wallet.name miner && \ - btcli wallet new_hotkey 
--no-use-password --n-words 15 $WPATH --wallet.name miner --wallet.hotkey default && \ - btcli wallet new_coldkey --no-use-password --n-words 15 $WPATH --wallet.name validator && \ - btcli wallet new_hotkey --no-use-password --n-words 15 $WPATH --wallet.name validator --wallet.hotkey default - -# Mint some tokens -RUN cd subtensor && \ - BUILD_BINARY=0 ./scripts/localnet.sh > /dev/null 2>&1 & \ - sleep 5 && \ - yes | btcli wallet faucet --wallet.name owner $WPATH $LOCALNET & \ - sleep 60 && \ - pkill -2 localnet ; \ - pkill -2 subtensor ; \ - sleep 1 - -RUN cd subtensor && \ - BUILD_BINARY=0 ./scripts/localnet.sh --no-purge > /dev/null 2>&1 & \ - sleep 5 && \ - yes | btcli wallet faucet --wallet.name miner $WPATH $LOCALNET & \ - sleep 60 && \ - pkill -2 localnet ; \ - pkill -2 subtensor ; \ - sleep 1 - -RUN cd subtensor && \ - BUILD_BINARY=0 ./scripts/localnet.sh --no-purge > /dev/null 2>&1 & \ - sleep 5 && \ - yes | btcli wallet faucet --wallet.name validator $WPATH $LOCALNET & \ - sleep 60 && \ - pkill -2 localnet ; \ - pkill -2 subtensor ; \ - sleep 1 - -# Register the subnet -RUN cd subtensor && \ - BUILD_BINARY=0 ./scripts/localnet.sh --no-purge > /dev/null 2>&1 & \ - sleep 5 && \ - btcli subnet create --wallet.name owner --no-prompt $WPATH $LOCALNET && \ - btcli subnet register --wallet.name miner --no-prompt --netuid 1 --wallet.hotkey default $WPATH $LOCALNET && \ - btcli subnet register --wallet.name validator --no-prompt --netuid 1 --wallet.hotkey default $WPATH $LOCALNET && \ - btcli root nominate --wallet.name validator --no-prompt --wallet.hotkey default $WPATH $LOCALNET && \ - btcli stake add --wallet.name validator --no-prompt --amount 100 --wallet.hotkey default $WPATH $LOCALNET && \ - pkill -2 localnet ; \ - pkill -2 subtensor ; \ - sleep 1 +FROM ghcr.io/inference-labs-inc/bittensor-devcontainer:latest # Add scripts to start and stop the localnet, and aliases for common btcli commands RUN echo "#!/usr/bin/env bash" > /start_localnet.sh && \ diff 
--git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index d2fc74e1..170c5f6a 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -9,24 +9,15 @@ "context": "..", "dockerfile": "Dockerfile" }, - "remoteUser": "root", - "customizations": { - "vscode": { - "settings": { - "python.defaultInterpreterPath": "/opt/venv/bin/python" - } - } - }, - "runArgs": ["--platform=linux/amd64"], - "features": { - "ghcr.io/devcontainers/features/python:1": { - "installTools": true, - "version": "os-provided" - } - } + // "runArgs": ["--platform=linux/amd64"], // Features to add to the dev container. More info: https://containers.dev/features. - // "features": {}, + // "features": { + // "ghcr.io/devcontainers/features/python:1": { + // "installTools": true, + // "version": "os-provided" + // } + // }, // Use 'forwardPorts' to make a list of ports inside the container available locally. // "forwardPorts": [], @@ -35,8 +26,14 @@ // "postCreateCommand": "uname -a", // Configure tool-specific properties. - // "customizations": {}, + "customizations": { + "vscode": { + "settings": { + "python.defaultInterpreterPath": "/opt/venv/bin/python" + } + } + }, // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. 
- // "remoteUser": "root" + "remoteUser": "root" } From 7b9a4737ef483ca3b45511297c8f02eaaf5f6f04 Mon Sep 17 00:00:00 2001 From: Eric Date: Wed, 15 Jan 2025 14:18:13 -0500 Subject: [PATCH 20/20] Move remaining bits of devcontainer to https://github.com/inference-labs-inc/tensor-tools --- .devcontainer/Dockerfile | 17 ----------------- .devcontainer/devcontainer.json | 15 ++++++++------- 2 files changed, 8 insertions(+), 24 deletions(-) delete mode 100644 .devcontainer/Dockerfile diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile deleted file mode 100644 index 9841d998..00000000 --- a/.devcontainer/Dockerfile +++ /dev/null @@ -1,17 +0,0 @@ -FROM ghcr.io/inference-labs-inc/bittensor-devcontainer:latest - -# Add scripts to start and stop the localnet, and aliases for common btcli commands -RUN echo "#!/usr/bin/env bash" > /start_localnet.sh && \ - echo "cd /subtensor" >> /start_localnet.sh && \ - echo "BUILD_BINARY=0 ./scripts/localnet.sh --no-purge > /dev/null 2>&1 &" >> /start_localnet.sh && \ - chmod +x /start_localnet.sh && \ - echo "#!/usr/bin/env bash" > /stop_localnet.sh && \ - echo "pkill -2 localnet" >> /stop_localnet.sh && \ - echo "pkill -2 subtensor" >> /stop_localnet.sh && \ - chmod +x /stop_localnet.sh && \ - echo 'alias btcliwo="btcli wallet overview '"$WPATH"' '"$LOCALNET"'"' >> ~/.bashrc && \ - fish -c 'alias --save btcliwo="btcli wallet overview '"$WPATH"' '"$LOCALNET"'"' && \ - echo 'alias btclisl="btcli subnet list '"$LOCALNET"'"' >> ~/.bashrc && \ - fish -c 'alias --save btclisl="btcli subnet list '"$LOCALNET"'"' && \ - echo "source /opt/venv/bin/activate" >> ~/.bashrc && \ - echo "source /opt/venv/bin/activate.fish" >> ~/.config/fish/config.fish diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 170c5f6a..c1edfc9e 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -3,12 +3,13 @@ { "name": "Ubuntu", // Or use a Dockerfile or Docker Compose file. 
More info: https://containers.dev/guide/dockerfile - // "image": "mcr.microsoft.com/devcontainers/base:jammy", - "build": { - // Path is relative to the devcontainer.json file. - "context": "..", - "dockerfile": "Dockerfile" - }, + // Built from https://github.com/inference-labs-inc/tensor-tools/blob/main/bittensor-devcontainer/Dockerfile + "image": "ghcr.io/inference-labs-inc/bittensor-devcontainer:latest", + // "build": { + // // Path is relative to the devcontainer.json file. + // "context": "..", + // "dockerfile": "Dockerfile" + // }, // "runArgs": ["--platform=linux/amd64"], // Features to add to the dev container. More info: https://containers.dev/features. @@ -29,7 +30,7 @@ "customizations": { "vscode": { "settings": { - "python.defaultInterpreterPath": "/opt/venv/bin/python" + "python.defaultInterpreterPath": "/home/vscode/.venv/bin/python" } } },