From ba15d6ffff353c16cbc0b70508482bdf28aabebb Mon Sep 17 00:00:00 2001
From: Zhang Jun
Date: Fri, 15 May 2020 18:19:54 +0800
Subject: [PATCH] Auto-discover datanode and nodemanager JMX endpoints.

---
 README.md               | 16 +++++-----------
 common.py               |  3 +--
 hadoop_jmx_exporter.py  | 17 +++++++++--------
 hdfs_datanode.py        | 11 ++++++-----
 hdfs_journalnode.py     |  3 ++-
 hdfs_namenode.py        |  7 ++++++-
 utils.py                |  3 ---
 yarn_nodemanager.py     |  9 ++++-----
 yarn_resourcemanager.py |  6 +++++-
 9 files changed, 38 insertions(+), 37 deletions(-)

diff --git a/README.md b/README.md
index f482dd1..8d54a93 100644
--- a/README.md
+++ b/README.md
@@ -15,13 +15,11 @@ Tested on CDH 5.14.2.

 ``` bash
 ➜ hadoop_jmx_exporter git:(master) ✗ pip2 install -r requirements.txt
-➜ hadoop_exporter git:(master) ✗ python2 hadoop_jmx_exporter.py --help
+➜ hadoop_exporter git:(master) ✗ python2 hadoop_jmx_exporter.py -h
 usage: hadoop_jmx_exporter.py [-h] -cluster cluster_name
                               [-queue yarn_queue_regexp]
                               [-nns [namenode_jmx_url [namenode_jmx_url ...]]]
-                              [-dns [datanode_jmx_url [datanode_jmx_url ...]]]
                               [-rms [resourcemanager_jmx_url [resourcemanager_jmx_url ...]]]
-                              [-nms [nodemanager_jmx_url [nodemanager_jmx_url ...]]]
                               [-jns [journalnode_jmx_url [journalnode_jmx_url ...]]]
                               [-host host] [-port port]
@@ -34,20 +32,16 @@ optional arguments:
   -queue yarn_queue_regexp
                         Regular expression of queue name. default: root.*
   -nns [namenode_jmx_url [namenode_jmx_url ...]]
-                        Hadoop hdfs namenode jmx metrics: URL.
-  -dns [datanode_jmx_url [datanode_jmx_url ...]]
-                        Hadoop datanode jmx metrics: URL.
+                        Hadoop hdfs namenode jmx metrics URL.
   -rms [resourcemanager_jmx_url [resourcemanager_jmx_url ...]]
-                        Hadoop resourcemanager metrics: jmx URL.
-  -nms [nodemanager_jmx_url [nodemanager_jmx_url ...]]
-                        Hadoop nodemanager jmx metrics: URL.
+                        Hadoop resourcemanager metrics jmx URL.
   -jns [journalnode_jmx_url [journalnode_jmx_url ...]]
-                        Hadoop journalnode jmx metrics: URL.
+                        Hadoop journalnode jmx metrics URL.
   -host host            Listen on this address. default: 0.0.0.0
   -port port            Listen to this port.
                         default: 6688
 ➜ hadoop_exporter git:(master) ✗
-➜ hadoop_exporter git:(master) ✗ python2 hadoop_jmx_exporter.py -cluster yh-cdh -nns http://10.193.40.10:50070/jmx http://10.193.40.3:50070/jmx -dns http://10.193.40.9:50075/jmx http://10.193.40.3:50075/jmx http://10.193.40.10:50075/jmx -rms http://yh-shhd-cdh04:8088/jmx http://yh-shhd-cdh01:8088/jmx -nms http://yh-shhd-cdh04:8042/jmx http://yh-shhd-cdh05:8042/jmx
+➜ hadoop_exporter git:(master) ✗ python2 hadoop_jmx_exporter.py -cluster yh-cdh -nns http://10.193.40.10:50070/jmx http://10.193.40.3:50070/jmx -rms http://yh-shhd-cdh04:8088/jmx http://yh-shhd-cdh01:8088/jmx
 Listen at 0.0.0.0:6688
 ```
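The README change above removes `-dns` and `-nms` entirely: datanode and nodemanager endpoints are no longer supplied on the command line but discovered from the namenode and resourcemanager beans, as the diffs below show. For orientation, here is a minimal sketch of the exporter's startup flow under that model. `utils.parse_args()` and `register_prometheus()` are this repo's own functions; the keep-alive loop and exact call order are illustrative, not necessarily the repo's real `main()`.

```python
# Minimal sketch (illustrative, not the repo's exact main()):
# parse flags, register collectors, serve scrapes on -host/-port.
import time

from prometheus_client import start_http_server

import utils                                    # this repo's utils.py
from hadoop_jmx_exporter import register_prometheus


def main():
    args = utils.parse_args()                   # -cluster, -nns, -rms, -jns ...
    register_prometheus(args.cluster, args)     # no -dns / -nms needed now
    start_http_server(args.port, args.host)     # e.g. "Listen at 0.0.0.0:6688"
    while True:
        time.sleep(300)                         # metrics are pulled per scrape


if __name__ == '__main__':
    main()
```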
diff --git a/common.py b/common.py
index 6d71c11..9941896 100644
--- a/common.py
+++ b/common.py
@@ -11,9 +11,8 @@

 class MetricCollector(object):
-    def __init__(self, cluster, urls, component, service):
+    def __init__(self, cluster, component, service):
         self.cluster = cluster
-        self.urls = urls
         self.component = component
         self.prefix = 'hadoop_{0}_{1}'.format(component, service)
diff --git a/hadoop_jmx_exporter.py b/hadoop_jmx_exporter.py
index f112828..f85d5fc 100755
--- a/hadoop_jmx_exporter.py
+++ b/hadoop_jmx_exporter.py
@@ -18,16 +18,17 @@

 def register_prometheus(cluster, args):
     if args.nns is not None and len(args.nns) > 0:
-        REGISTRY.register(NameNodeMetricCollector(cluster, args.nns))
-    if args.dns is not None and len(args.dns) > 0:
-        REGISTRY.register(DataNodeMetricCollector(cluster, args.dns))
+        nnc = NameNodeMetricCollector(cluster, args.nns)
+        nnc.collect()
+        REGISTRY.register(nnc)
+        REGISTRY.register(DataNodeMetricCollector(cluster, nnc))
+    if args.rms is not None and len(args.rms) > 0:
+        rmc = ResourceManagerMetricCollector(cluster, args.rms, args.queue)
+        rmc.collect()
+        REGISTRY.register(rmc)
+        REGISTRY.register(NodeManagerMetricCollector(cluster, rmc))
     if args.jns is not None and len(args.jns) > 0:
         REGISTRY.register(JournalNodeMetricCollector(cluster, args.jns))
-    if args.rms is not None and len(args.rms) > 0:
-        REGISTRY.register(ResourceManagerMetricCollector(cluster, args.rms, args.queue))
-    if args.nms is not None and len(args.nms) > 0:
-        REGISTRY.register(NodeManagerMetricCollector(cluster, args.nms))
-

 def main():
     args = utils.parse_args()
     host = args.host
diff --git a/hdfs_datanode.py b/hdfs_datanode.py
index 2f269f0..18b91a3 100644
--- a/hdfs_datanode.py
+++ b/hdfs_datanode.py
@@ -13,9 +13,10 @@

 class DataNodeMetricCollector(MetricCollector):
-    def __init__(self, cluster, urls):
-        MetricCollector.__init__(self, cluster, urls, "hdfs", "datanode")
+    def __init__(self, cluster, nnc, urls=""):
+        MetricCollector.__init__(self, cluster, "hdfs", "datanode")
         self.target = "-"
+        self.nnc = nnc

         self.hadoop_datanode_metrics = {}
         for i in range(len(self.file_list)):
@@ -23,11 +24,11 @@ def __init__(self, cluster, urls):

         self.common_metric_collector = CommonMetricCollector(cluster, "hdfs", "datanode")

-        self.scrape_metrics = ScrapeMetrics(urls)
-
     def collect(self):
         isSetup = False
-        beans_list = self.scrape_metrics.scrape()
+        if not self.nnc.dns:
+            return
+        beans_list = ScrapeMetrics(self.nnc.dns).scrape()
         for beans in beans_list:
             if not isSetup:
                 self.common_metric_collector.setup_labels(beans)
diff --git a/hdfs_journalnode.py b/hdfs_journalnode.py
index 96877b2..0cffd05 100644
--- a/hdfs_journalnode.py
+++ b/hdfs_journalnode.py
@@ -13,8 +13,9 @@

 class JournalNodeMetricCollector(MetricCollector):
     def __init__(self, cluster, urls):
-        MetricCollector.__init__(self, cluster, urls, "hdfs", "journalnode")
+        MetricCollector.__init__(self, cluster, "hdfs", "journalnode")
         self.target = "-"
+        self.urls = urls

         self.hadoop_journalnode_metrics = {}
         for i in range(len(self.file_list)):
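The wiring above is the core of the change: `DataNodeMetricCollector` no longer takes a URL list; it keeps a reference to the namenode collector and, on every scrape, reads the `dns` set that `NameNodeMetricCollector.collect()` fills in. The eager `nnc.collect()` / `rmc.collect()` calls seed those sets before the dependent collectors are registered. A self-contained sketch of that hand-off pattern follows; the stub classes and the `hadoop_datanode_target_up` metric are illustrative stand-ins, not the repo's real collectors.

```python
from prometheus_client import CollectorRegistry, generate_latest
from prometheus_client.core import GaugeMetricFamily


class NameNodeCollectorStub(object):
    """Stands in for NameNodeMetricCollector: discovers datanode endpoints."""

    def __init__(self):
        self.dns = set()  # datanode /jmx URLs, filled in by collect()

    def collect(self):
        # The real collector parses the NameNode's LiveNodes bean here.
        self.dns = {"http://10.193.40.9:50075/jmx"}
        return []  # the stub exports no metrics of its own


class DataNodeCollectorStub(object):
    """Stands in for DataNodeMetricCollector: scrapes whatever was found."""

    def __init__(self, nnc):
        self.nnc = nnc  # hold the discovery source, not a static URL list

    def collect(self):
        up = GaugeMetricFamily("hadoop_datanode_target_up",
                               "discovered datanode target", labels=["url"])
        for url in sorted(self.nnc.dns):
            up.add_metric([url], 1.0)
        yield up


registry = CollectorRegistry()  # the repo registers on the global REGISTRY
nnc = NameNodeCollectorStub()
nnc.collect()  # seed nnc.dns so the dependent collector has targets at once
registry.register(nnc)
registry.register(DataNodeCollectorStub(nnc))
print(generate_latest(registry).decode())
```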
cluster, "hdfs", "journalnode") self.target = "-" + self.urls = urls self.hadoop_journalnode_metrics = {} for i in range(len(self.file_list)): diff --git a/hdfs_namenode.py b/hdfs_namenode.py index cc67977..da06d7f 100644 --- a/hdfs_namenode.py +++ b/hdfs_namenode.py @@ -15,8 +15,10 @@ class NameNodeMetricCollector(MetricCollector): def __init__(self, cluster, urls): - MetricCollector.__init__(self, cluster, urls, "hdfs", "namenode") + MetricCollector.__init__(self, cluster, "hdfs", "namenode") self.target = "-" + self.urls = urls + self.dns = set() self.hadoop_namenode_metrics = {} for i in range(len(self.file_list)): @@ -388,10 +390,12 @@ def get_nninfo_metrics(self, bean): if "LiveNodes" in metric and "LiveNodes" in bean: live_node_dict = yaml.safe_load(bean["LiveNodes"]) self.hadoop_namenode_metrics["NameNodeInfo"]["LiveNodeCount"].add_metric([self.cluster, self.target], len(live_node_dict)) + dns = set() for node, info in live_node_dict.items(): label = [self.cluster, node, info["infoAddr"], info["infoSecureAddr"], info["xferaddr"], info["version"], self.target] items = ["lastContact", "usedSpace", "adminState", "nonDfsUsedSpace", "capacity", "numBlocks", "used", "remaining", "blockScheduled", "blockPoolUsed", "blockPoolUsedPercent", "volfails"] + dns.add("http://"+info["infoAddr"]+"/jmx") for item in items: value = info[item] if item in info else 0 if item == "adminState": @@ -404,6 +408,7 @@ def get_nninfo_metrics(self, bean): item = re.sub('([a-z0-9])([A-Z])', r'\1_\2', item).lower() key = "LiveNodes-" + item self.hadoop_namenode_metrics["NameNodeInfo"][key].add_metric(label, value) + self.dns = dns elif "DeadNodes" in metric and "DeadNodes" in bean: dead_node_dict = yaml.safe_load(bean["DeadNodes"]) self.hadoop_namenode_metrics["NameNodeInfo"]["DeadNodeCount"].add_metric([self.cluster, self.target], len(dead_node_dict)) diff --git a/utils.py b/utils.py index b2c236d..7f3989e 100644 --- a/utils.py +++ b/utils.py @@ -30,7 +30,6 @@ def get_module_logger(mod_name): logger = get_module_logger(__name__) - def read_json_file(path_name, file_name): path = os.path.dirname(os.path.realpath(__file__)) metric_path = os.path.join(path, "metrics", path_name) @@ -64,9 +63,7 @@ def parse_args(): parser.add_argument('-cluster', required=True, metavar='cluster_name', help='Hadoop cluster name (maybe HA name)') parser.add_argument('-queue', required=False, metavar='yarn_queue_regexp', help='Regular expression of queue name. default: root.*', default='root.*') parser.add_argument('-nns', required=False, metavar='namenode_jmx_url', help='Hadoop hdfs namenode jmx metrics URL.', nargs="*") - parser.add_argument('-dns', required=False, metavar='datanode_jmx_url', help='Hadoop datanode jmx metrics URL.', nargs="*") parser.add_argument('-rms', required=False, metavar='resourcemanager_jmx_url', help='Hadoop resourcemanager metrics jmx URL.', nargs="*") - parser.add_argument('-nms', required=False, metavar='nodemanager_jmx_url', help='Hadoop nodemanager jmx metrics URL.', nargs="*") parser.add_argument('-jns', required=False, metavar='journalnode_jmx_url', help='Hadoop journalnode jmx metrics URL.', nargs="*") parser.add_argument('-host', required=False, metavar='host', help='Listen on this address. default: 0.0.0.0', default='0.0.0.0') parser.add_argument('-port', required=False, metavar='port', type=int, help='Listen to this port. 
diff --git a/yarn_nodemanager.py b/yarn_nodemanager.py
index 6529455..87abf18 100644
--- a/yarn_nodemanager.py
+++ b/yarn_nodemanager.py
@@ -13,9 +13,10 @@

 class NodeManagerMetricCollector(MetricCollector):
-    def __init__(self, cluster, urls):
-        MetricCollector.__init__(self, cluster, urls, "yarn", "nodemanager")
+    def __init__(self, cluster, rmc):
+        MetricCollector.__init__(self, cluster, "yarn", "nodemanager")
         self.target = "-"
+        self.rmc = rmc

         self.hadoop_nodemanager_metrics = {}
         for i in range(len(self.file_list)):
@@ -23,11 +24,9 @@ def __init__(self, cluster, urls):

         self.common_metric_collector = CommonMetricCollector(cluster, "yarn", "nodemanager")

-        self.scrape_metrics = ScrapeMetrics(urls)
-
     def collect(self):
         isSetup = False
-        beans_list = self.scrape_metrics.scrape()
+        beans_list = ScrapeMetrics(self.rmc.nms).scrape()
         for beans in beans_list:
             if not isSetup:
                 self.common_metric_collector.setup_labels(beans)
diff --git a/yarn_resourcemanager.py b/yarn_resourcemanager.py
index d9afa22..6b87b3d 100644
--- a/yarn_resourcemanager.py
+++ b/yarn_resourcemanager.py
@@ -24,9 +24,10 @@ class ResourceManagerMetricCollector(MetricCollector):
     }

     def __init__(self, cluster, urls, queue_regexp):
-        MetricCollector.__init__(self, cluster, urls, "yarn", "resourcemanager")
+        MetricCollector.__init__(self, cluster, "yarn", "resourcemanager")
         self.target = "-"
         self.queue_regexp = queue_regexp
+        self.nms = set()

         self.hadoop_resourcemanager_metrics = {}
         for i in range(len(self.file_list)):
@@ -178,8 +179,10 @@ def setup_metrics_labels(self, beans):

     def get_rmnminfo_metrics(self, bean):
         for metric in self.metrics['RMNMInfo']:
+            nms = set()
             live_nm_list = yaml.safe_load(bean['LiveNodeManagers'])
             for j in range(len(live_nm_list)):
+                nms.add("http://"+live_nm_list[j]["NodeHTTPAddress"]+"/jmx")
                 host = live_nm_list[j]['HostName']
                 version = live_nm_list[j]['NodeManagerVersion']
                 rack = live_nm_list[j]['Rack']
@@ -189,6 +192,7 @@
                 else:
                     value = live_nm_list[j][metric] if metric in live_nm_list[j] else 0.0
                 self.hadoop_resourcemanager_metrics['RMNMInfo'][metric].add_metric(label, value)
+            self.nms = nms

     def get_queue_metrics(self, bean):
         for metric in self.metrics['QueueMetrics']:
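The YARN side mirrors the HDFS side: the resourcemanager's `RMNMInfo` bean exposes `LiveNodeManagers`, a JSON array whose `NodeHTTPAddress` field is already the host:port of each nodemanager web UI. A sketch under the same assumptions as the datanode example, with illustrative sample data borrowed from the README:

```python
import json


def discover_nodemanagers(bean):
    """Derive nodemanager JMX URLs from an RMNMInfo bean's LiveNodeManagers blob."""
    live_nm_list = json.loads(bean["LiveNodeManagers"])
    return {"http://" + nm["NodeHTTPAddress"] + "/jmx" for nm in live_nm_list}


# Illustrative bean; a real one comes from http://<resourcemanager>:8088/jmx.
bean = {"LiveNodeManagers": json.dumps([
    {"HostName": "yh-shhd-cdh04", "NodeHTTPAddress": "yh-shhd-cdh04:8042"},
    {"HostName": "yh-shhd-cdh05", "NodeHTTPAddress": "yh-shhd-cdh05:8042"},
])}
print(sorted(discover_nodemanagers(bean)))
```

Because `get_rmnminfo_metrics` rebuilds `nms` (and `get_nninfo_metrics` rebuilds `dns`) on every scrape of its parent collector, nodes added to or removed from the cluster are picked up automatically on the next scrape cycle, which is the point of the patch.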