Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add base node responsiveness monitoring #6715

Merged
merged 4 commits into from
Dec 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions applications/minotari_node/src/commands/command/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ mod rewind_blockchain;
mod search_kernel;
mod search_utxo;
mod status;
mod test_peer_liveness;
mod unban_all_peers;
mod version;
mod watch_command;
Expand Down Expand Up @@ -118,6 +119,7 @@ pub enum Command {
ResetOfflinePeers(reset_offline_peers::Args),
RewindBlockchain(rewind_blockchain::Args),
AddPeer(add_peer::ArgsAddPeer),
TestPeerLiveness(test_peer_liveness::ArgsTestPeerLiveness),
BanPeer(ban_peer::ArgsBan),
UnbanPeer(ban_peer::ArgsUnban),
UnbanAllPeers(unban_all_peers::Args),
Expand Down Expand Up @@ -239,6 +241,8 @@ impl CommandContext {
Command::CreateTlsCerts(_) |
Command::Quit(_) |
Command::Exit(_) => 30,
// This test can potentially take a longer time and should be allowed to run longer
Command::TestPeerLiveness(_) => 240,
// These commands involve intense blockchain db operations and need a lot of time to complete
Command::CheckDb(_) | Command::PeriodStats(_) | Command::RewindBlockchain(_) => 600,
};
Expand Down Expand Up @@ -272,6 +276,7 @@ impl HandleCommand<Command> for CommandContext {
Command::GetChainMetadata(args) => self.handle_command(args).await,
Command::GetDbStats(args) => self.handle_command(args).await,
Command::GetPeer(args) => self.handle_command(args).await,
Command::TestPeerLiveness(args) => self.handle_command(args).await,
Command::GetStateInfo(args) => self.handle_command(args).await,
Command::GetNetworkStats(args) => self.handle_command(args).await,
Command::ListPeers(args) => self.handle_command(args).await,
Expand Down
246 changes: 246 additions & 0 deletions applications/minotari_node/src/commands/command/test_peer_liveness.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,246 @@
// Copyright 2022, The Tari Project
//
// Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
// following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following
// disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the
// following disclaimer in the documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote
// products derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
// INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
// USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

use std::{
fs,
fs::OpenOptions,
io::Write,
path::PathBuf,
process,
time::{Duration, Instant},
};

use anyhow::Error;
use async_trait::async_trait;
use chrono::Local;
use clap::Parser;
use minotari_app_utilities::utilities::UniPublicKey;
use tari_common_types::types::PublicKey;
use tari_comms::{
multiaddr::Multiaddr,
net_address::{MultiaddressesWithStats, PeerAddressSource},
peer_manager::{NodeId, Peer, PeerFeatures, PeerFlags},
};
use tari_p2p::services::liveness::LivenessEvent;
use tokio::{sync::watch, task};

use super::{CommandContext, HandleCommand};

/// Tests the liveness of a peer by dialing and pinging it
#[derive(Debug, Parser)]
pub struct ArgsTestPeerLiveness {
    /// The public key of the peer to be tested
    public_key: UniPublicKey,
    /// The address of the peer to be tested
    address: Multiaddr,
    /// Auto exit the base node after test
    exit: Option<bool>,
    /// Write the responsiveness result to file - results will be appended to
    /// 'peer_liveness_test.csv'
    output_to_file: Option<bool>,
    /// Start with a new result file
    refresh_file: Option<bool>,
    /// Optional output directory (otherwise current directory will be used)
    output_directory: Option<PathBuf>,
}

/// Outcome of a single peer liveness test, published from the ping task to the command handler
/// through a watch channel.
#[derive(Debug, Eq, PartialEq, Clone, Copy)]
enum PingResult {
/// Value the watch channel is created with; no outcome has been published yet
Initial,
/// A pong matching both the pinged node id and the ping nonce was received
Success,
/// The ping task stopped inspecting liveness events without seeing a matching pong
Fail,
}

#[async_trait]
impl HandleCommand<ArgsTestPeerLiveness> for CommandContext {
async fn handle_command(&mut self, args: ArgsTestPeerLiveness) -> Result<(), Error> {
println!("\nTesting peer liveness...\n");
let peer_manager = self.comms.peer_manager();

let public_key = args.public_key.into();
if *self.comms.node_identity().public_key() == public_key {
return Err(Error::msg("Self liveness test not supported"));
}
let node_id = NodeId::from_public_key(&public_key);
let node_id_clone = node_id.clone();
let public_key_clone = public_key.clone();
let address_clone = args.address.clone();

// Remove the peer from the peer manager (not the peer db)
let _res = peer_manager.delete_peer(&node_id).await;

// Create a new peer with the given address, if the peer exists, this will merge the given address
let peer = Peer::new(
public_key.clone(),
node_id.clone(),
MultiaddressesWithStats::from_addresses_with_source(vec![args.address], &PeerAddressSource::Config),
PeerFlags::empty(),
PeerFeatures::COMMUNICATION_NODE,
vec![],
String::new(),
);
peer_manager.add_peer(peer).await?;

let (tx, mut rx) = watch::channel(PingResult::Initial);

// Attempt to dial and ping the peer
let start = Instant::now();
for _ in 0..5 {
if self.dial_peer(node_id.clone()).await.is_ok() {
println!("🏓 Peer ({}, {}) dialed successfully", node_id, public_key);
let mut liveness_events = self.liveness.get_event_stream();
let mut liveness = self.liveness.clone();
task::spawn(async move {
if let Ok(nonce) = liveness.send_ping(node_id.clone()).await {
println!("🏓 Pinging peer ({}, {}) with nonce {} ...", node_id, public_key, nonce);
for _ in 0..5 {
match liveness_events.recv().await {
Ok(event) => {
if let LivenessEvent::ReceivedPong(pong) = &*event {
if pong.node_id == node_id && pong.nonce == nonce {
println!(
"🏓️ Pong: peer ({}, {}) responded with nonce {}, round-trip-time is \
{:.2?}!",
pong.node_id,
public_key,
pong.nonce,
pong.latency.unwrap_or_default()
);
let _ = tx.send(PingResult::Success);
return;
}
}
},
Err(e) => {
println!("🏓 Ping peer ({}, {}) gave error: {}", node_id, public_key, e);
},
}
}
let _ = tx.send(PingResult::Fail);
}
});
// Break if the dial was successful
break;
} else {
tokio::time::sleep(std::time::Duration::from_secs(5)).await;
}
}

// Wait for the liveness test to complete
loop {
tokio::select! {
_ = rx.changed() => {
let test_duration = start.elapsed();
let responsive = *rx.borrow();
println!("\nWhen rx.changed(): {:?}\n", responsive);
if responsive == PingResult::Success {
println!("✅ Peer ({}, {}) is responsive", node_id_clone, public_key_clone);
} else {
println!("❌ Peer ({}, {}) is unresponsive", node_id_clone, public_key_clone);
}

if let Some(true) = args.output_to_file {
print_to_file(
responsive,
args.output_directory,
args.refresh_file,
public_key_clone,
address_clone,
test_duration
).await;
}
println!();

if let Some(true) = args.exit {
println!("The liveness test is complete and base node will now exit\n");
self.shutdown.trigger();
tokio::time::sleep(std::time::Duration::from_secs(1)).await;
match responsive {
PingResult::Success => process::exit(0),
_ => process::exit(1),
}
}

break;
},

_ = tokio::time::sleep(std::time::Duration::from_secs(1)) => {},
}
}

Ok(())
}
}

/// Appends one liveness test result as a CSV row to `peer_liveness_test.csv`.
///
/// The file is placed in `output_directory` (created if missing) or, when no directory is given or it cannot
/// be created, in the current directory. If `refresh_file` is `Some(true)` an existing file is removed first.
/// A header row is written whenever the file does not exist yet. Each row holds the local date and time, the
/// peer's public key and address, `PASS`/`FAIL` and the test duration.
///
/// All failures are reported on stdout only; this function never returns an error to the caller.
async fn print_to_file(
    responsive: PingResult,
    output_directory: Option<PathBuf>,
    refresh_file: Option<bool>,
    public_key: PublicKey,
    address: Multiaddr,
    test_duration: Duration,
) {
    let test_result = if responsive == PingResult::Success {
        "PASS"
    } else {
        "FAIL"
    };
    let date_time = Local::now().format("%Y-%m-%d %H:%M:%S").to_string();

    let file_name = "peer_liveness_test.csv";
    let file_path = match output_directory {
        // `create_dir_all` succeeds when the directory already exists, so no separate existence check is needed
        Some(dir) => match fs::create_dir_all(&dir) {
            Ok(()) => dir.join(file_name),
            Err(e) => {
                println!(
                    "❌ Error creating output directory '{}': {}, using the current directory instead",
                    dir.display(),
                    e
                );
                PathBuf::from(file_name)
            },
        },
        None => PathBuf::from(file_name),
    };

    if refresh_file == Some(true) {
        // Best effort: the file may legitimately not exist yet
        let _unused = fs::remove_file(&file_path);
        tokio::time::sleep(Duration::from_secs(1)).await;
    }

    // Must be determined before the file is opened, as opening with `create(true)` creates it
    let write_header = !file_path.exists();
    let mut file = match OpenOptions::new().append(true).create(true).open(&file_path) {
        Ok(file) => file,
        Err(e) => {
            println!(
                "❌ Error opening test result file '{}': {}",
                file_path.display(),
                e
            );
            return;
        },
    };

    let mut file_content = String::new();
    if write_header {
        file_content.push_str("Date Time,Public Key,Address,Result,Test Duration\n");
    }
    file_content.push_str(&format!(
        "{},{},{},{},{:.2?}",
        date_time, public_key, address, test_result, test_duration
    ));
    match writeln!(file, "{}", file_content) {
        Ok(_) => {
            println!("📝 Test result written to file: {}", file_path.display());
        },
        Err(e) => {
            println!("❌ Error writing test result to file: {}", e);
        },
    }
}
8 changes: 6 additions & 2 deletions base_layer/core/src/transactions/aggregated_body.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,20 @@
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
// USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#[cfg(feature = "base_node")]
use std::convert::TryFrom;
use std::{
cmp::max,
convert::TryFrom,
fmt::{Display, Error, Formatter},
};

use borsh::{BorshDeserialize, BorshSerialize};
use log::*;
use serde::{Deserialize, Serialize};
use tari_common_types::types::{ComAndPubSignature, Commitment, FixedHash, PrivateKey};
#[cfg(feature = "base_node")]
use tari_common_types::types::FixedHash;
use tari_common_types::types::{ComAndPubSignature, Commitment, PrivateKey};
use tari_crypto::commitment::HomomorphicCommitmentFactory;
#[cfg(feature = "base_node")]
use tari_mmr::pruned_hashset::PrunedHashSet;
Expand Down
Loading