Auto-discover datanode and nodemanager JMX endpoints.
Zhang Jun committed May 15, 2020
1 parent 3ce99f1 commit ba15d6f
Showing 9 changed files with 38 additions and 37 deletions.
16 changes: 5 additions & 11 deletions README.md
````diff
@@ -15,13 +15,11 @@ Tested on CDH 5.14.2.
 ``` bash
 ➜ hadoop_jmx_exporter git:(master) ✗ pip2 install -r requirements.txt
 
-➜ hadoop_exporter git:(master) ✗ python2 hadoop_jmx_exporter.py --help
+➜ hadoop_exporter git:(master) ✗ python2 hadoop_jmx_exporter.py -h
 usage: hadoop_jmx_exporter.py [-h] -cluster cluster_name
                               [-queue yarn_queue_regexp]
                               [-nns [namenode_jmx_url [namenode_jmx_url ...]]]
-                              [-dns [datanode_jmx_url [datanode_jmx_url ...]]]
                               [-rms [resourcemanager_jmx_url [resourcemanager_jmx_url ...]]]
-                              [-nms [nodemanager_jmx_url [nodemanager_jmx_url ...]]]
                               [-jns [journalnode_jmx_url [journalnode_jmx_url ...]]]
                               [-host host] [-port port]
 
@@ -34,20 +32,16 @@ optional arguments:
   -queue yarn_queue_regexp
                         Regular expression of queue name. default: root.*
   -nns [namenode_jmx_url [namenode_jmx_url ...]]
-                        Hadoop hdfs namenode jmx metrics: URL.
-  -dns [datanode_jmx_url [datanode_jmx_url ...]]
-                        Hadoop datanode jmx metrics: URL.
+                        Hadoop hdfs namenode jmx metrics URL.
   -rms [resourcemanager_jmx_url [resourcemanager_jmx_url ...]]
-                        Hadoop resourcemanager metrics: jmx URL.
-  -nms [nodemanager_jmx_url [nodemanager_jmx_url ...]]
-                        Hadoop nodemanager jmx metrics: URL.
+                        Hadoop resourcemanager metrics jmx URL.
   -jns [journalnode_jmx_url [journalnode_jmx_url ...]]
-                        Hadoop journalnode jmx metrics: URL.
+                        Hadoop journalnode jmx metrics URL.
   -host host            Listen on this address. default: 0.0.0.0
   -port port            Listen to this port. default: 6688
 ➜ hadoop_exporter git:(master) ✗
 
-➜ hadoop_exporter git:(master) ✗ python2 hadoop_jmx_exporter.py -cluster yh-cdh -nns http://10.193.40.10:50070/jmx http://10.193.40.3:50070/jmx -dns http://10.193.40.9:50075/jmx http://10.193.40.3:50075/jmx http://10.193.40.10:50075/jmx -rms http://yh-shhd-cdh04:8088/jmx http://yh-shhd-cdh01:8088/jmx -nms http://yh-shhd-cdh04:8042/jmx http://yh-shhd-cdh05:8042/jmx
+➜ hadoop_exporter git:(master) ✗ python2 hadoop_jmx_exporter.py -cluster yh-cdh -nns http://10.193.40.10:50070/jmx http://10.193.40.3:50070/jmx -rms http://yh-shhd-cdh04:8088/jmx http://yh-shhd-cdh01:8088/jmx
 Listen at 0.0.0.0:6688
 ```
````
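With this change only the namenode (`-nns`) and resourcemanager (`-rms`) JMX URLs have to be supplied. Datanode endpoints are discovered from the namenode's `LiveNodes` bean and nodemanager endpoints from the resourcemanager's `LiveNodeManagers` bean, which is why the `-dns` and `-nms` flags disappear from the usage text and from the example invocation.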
3 changes: 1 addition & 2 deletions common.py
```diff
@@ -11,9 +11,8 @@
 
 
 class MetricCollector(object):
-    def __init__(self, cluster, urls, component, service):
+    def __init__(self, cluster, component, service):
         self.cluster = cluster
-        self.urls = urls
         self.component = component
         self.prefix = 'hadoop_{0}_{1}'.format(component, service)
```
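The base class drops `self.urls` because auto-discovering collectors no longer receive a static URL list; collectors that still take URLs from the command line keep them as their own attribute instead (e.g. `self.urls = urls` in the journalnode and namenode collectors below).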
17 changes: 9 additions & 8 deletions hadoop_jmx_exporter.py
```diff
@@ -18,16 +18,17 @@
 
 def register_prometheus(cluster, args):
     if args.nns is not None and len(args.nns) > 0:
-        REGISTRY.register(NameNodeMetricCollector(cluster, args.nns))
-    if args.dns is not None and len(args.dns) > 0:
-        REGISTRY.register(DataNodeMetricCollector(cluster, args.dns))
+        nnc = NameNodeMetricCollector(cluster, args.nns)
+        nnc.collect()
+        REGISTRY.register(nnc)
+        REGISTRY.register(DataNodeMetricCollector(cluster, nnc))
+    if args.rms is not None and len(args.rms) > 0:
+        rmc = ResourceManagerMetricCollector(cluster, args.rms, args.queue)
+        rmc.collect()
+        REGISTRY.register(rmc)
+        REGISTRY.register(NodeManagerMetricCollector(cluster, rmc))
     if args.jns is not None and len(args.jns) > 0:
         REGISTRY.register(JournalNodeMetricCollector(cluster, args.jns))
-    if args.rms is not None and len(args.rms) > 0:
-        REGISTRY.register(ResourceManagerMetricCollector(cluster, args.rms, args.queue))
-    if args.nms is not None and len(args.nms) > 0:
-        REGISTRY.register(NodeManagerMetricCollector(cluster, args.nms))
 
 def main():
     args = utils.parse_args()
     host = args.host
```
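The ordering matters here: `nnc.collect()` and `rmc.collect()` run once before registration so that `nnc.dns` and `rmc.nms` are populated by the time the dependent datanode and nodemanager collectors are first asked to collect (in `prometheus_client`, `REGISTRY.register()` itself calls `collect()` once to enumerate metric names).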
11 changes: 6 additions & 5 deletions hdfs_datanode.py
```diff
@@ -13,21 +13,22 @@
 
 
 class DataNodeMetricCollector(MetricCollector):
-    def __init__(self, cluster, urls):
-        MetricCollector.__init__(self, cluster, urls, "hdfs", "datanode")
+    def __init__(self, cluster, nnc, urls=""):
+        MetricCollector.__init__(self, cluster, "hdfs", "datanode")
         self.target = "-"
+        self.nnc = nnc
 
         self.hadoop_datanode_metrics = {}
         for i in range(len(self.file_list)):
             self.hadoop_datanode_metrics.setdefault(self.file_list[i], {})
 
         self.common_metric_collector = CommonMetricCollector(cluster, "hdfs", "datanode")
 
-        self.scrape_metrics = ScrapeMetrics(urls)
-
     def collect(self):
         isSetup = False
-        beans_list = self.scrape_metrics.scrape()
+        if self.nnc.dns == "":
+            return
+        beans_list = ScrapeMetrics(self.nnc.dns).scrape()
         for beans in beans_list:
             if not isSetup:
                 self.common_metric_collector.setup_labels(beans)
```
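The datanode collector now receives the namenode collector instead of a URL list and re-reads `nnc.dns` on every scrape, so datanodes joining or leaving the cluster are picked up without restarting the exporter. A minimal sketch of the pattern, with hypothetical names (`DependentCollector`, `upstream`) standing in for the real classes:

```python
import json
import urllib2  # the exporter targets Python 2


class DependentCollector(object):
    """Sketch: scrape whatever JMX targets an upstream collector discovered."""

    def __init__(self, upstream):
        # upstream exposes .dns, a set of JMX URLs refreshed on each scrape
        self.upstream = upstream

    def collect(self):
        # The set is empty until the upstream collector has run at least
        # once, so skip this cycle rather than scraping nothing.
        if not self.upstream.dns:
            return
        for url in self.upstream.dns:
            beans = json.load(urllib2.urlopen(url)).get("beans", [])
            # ... turn beans into Prometheus metric families here ...
```

Note that `nnc.dns` starts out as an empty `set()`, so the plain truthiness test used in the sketch is the safer guard; the diff's `self.nnc.dns == ""` comparison only matches an empty string.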
3 changes: 2 additions & 1 deletion hdfs_journalnode.py
```diff
@@ -13,8 +13,9 @@
 
 class JournalNodeMetricCollector(MetricCollector):
     def __init__(self, cluster, urls):
-        MetricCollector.__init__(self, cluster, urls, "hdfs", "journalnode")
+        MetricCollector.__init__(self, cluster, "hdfs", "journalnode")
         self.target = "-"
+        self.urls = urls
 
         self.hadoop_journalnode_metrics = {}
         for i in range(len(self.file_list)):
```
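Journalnode URLs remain static command-line arguments, since there is no equivalent live-node bean to discover them from; the collector simply keeps `urls` on itself now that the base class no longer stores it.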
7 changes: 6 additions & 1 deletion hdfs_namenode.py
```diff
@@ -15,8 +15,10 @@
 
 class NameNodeMetricCollector(MetricCollector):
     def __init__(self, cluster, urls):
-        MetricCollector.__init__(self, cluster, urls, "hdfs", "namenode")
+        MetricCollector.__init__(self, cluster, "hdfs", "namenode")
         self.target = "-"
+        self.urls = urls
+        self.dns = set()
 
         self.hadoop_namenode_metrics = {}
         for i in range(len(self.file_list)):
@@ -388,10 +390,12 @@ def get_nninfo_metrics(self, bean):
         if "LiveNodes" in metric and "LiveNodes" in bean:
             live_node_dict = yaml.safe_load(bean["LiveNodes"])
             self.hadoop_namenode_metrics["NameNodeInfo"]["LiveNodeCount"].add_metric([self.cluster, self.target], len(live_node_dict))
+            dns = set()
             for node, info in live_node_dict.items():
                 label = [self.cluster, node, info["infoAddr"], info["infoSecureAddr"], info["xferaddr"], info["version"], self.target]
                 items = ["lastContact", "usedSpace", "adminState", "nonDfsUsedSpace", "capacity", "numBlocks",
                          "used", "remaining", "blockScheduled", "blockPoolUsed", "blockPoolUsedPercent", "volfails"]
+                dns.add("http://"+info["infoAddr"]+"/jmx")
                 for item in items:
                     value = info[item] if item in info else 0
                     if item == "adminState":
@@ -404,6 +408,7 @@ def get_nninfo_metrics(self, bean):
                 item = re.sub('([a-z0-9])([A-Z])', r'\1_\2', item).lower()
                 key = "LiveNodes-" + item
                 self.hadoop_namenode_metrics["NameNodeInfo"][key].add_metric(label, value)
+            self.dns = dns
         elif "DeadNodes" in metric and "DeadNodes" in bean:
             dead_node_dict = yaml.safe_load(bean["DeadNodes"])
             self.hadoop_namenode_metrics["NameNodeInfo"]["DeadNodeCount"].add_metric([self.cluster, self.target], len(dead_node_dict))
```
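This is the discovery half for datanodes: while walking the `NameNodeInfo` bean's `LiveNodes` attribute, a JSON string mapping each live datanode to its addresses and stats, the collector derives one JMX URL per node from `infoAddr` and publishes the set as `self.dns`. A standalone sketch with made-up addresses:

```python
import yaml  # the collector parses the JSON string with yaml.safe_load

# Made-up LiveNodes payload, shaped like the NameNode's JMX attribute.
live_nodes_json = '''
{
  "dn1.example.com:9866": {"infoAddr": "10.0.0.11:50075", "xferaddr": "10.0.0.11:9866"},
  "dn2.example.com:9866": {"infoAddr": "10.0.0.12:50075", "xferaddr": "10.0.0.12:9866"}
}
'''

live_node_dict = yaml.safe_load(live_nodes_json)
dns = set("http://" + info["infoAddr"] + "/jmx" for info in live_node_dict.values())
# dns -> {"http://10.0.0.11:50075/jmx", "http://10.0.0.12:50075/jmx"}
```

Building a fresh local set and assigning `self.dns` only after the loop also means a partial or failed parse leaves the previously discovered targets in place.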
3 changes: 0 additions & 3 deletions utils.py
```diff
@@ -30,7 +30,6 @@ def get_module_logger(mod_name):
 
 logger = get_module_logger(__name__)
 
-
 def read_json_file(path_name, file_name):
     path = os.path.dirname(os.path.realpath(__file__))
     metric_path = os.path.join(path, "metrics", path_name)
@@ -64,9 +63,7 @@ def parse_args():
     parser.add_argument('-cluster', required=True, metavar='cluster_name', help='Hadoop cluster name (maybe HA name)')
     parser.add_argument('-queue', required=False, metavar='yarn_queue_regexp', help='Regular expression of queue name. default: root.*', default='root.*')
     parser.add_argument('-nns', required=False, metavar='namenode_jmx_url', help='Hadoop hdfs namenode jmx metrics URL.', nargs="*")
-    parser.add_argument('-dns', required=False, metavar='datanode_jmx_url', help='Hadoop datanode jmx metrics URL.', nargs="*")
     parser.add_argument('-rms', required=False, metavar='resourcemanager_jmx_url', help='Hadoop resourcemanager metrics jmx URL.', nargs="*")
-    parser.add_argument('-nms', required=False, metavar='nodemanager_jmx_url', help='Hadoop nodemanager jmx metrics URL.', nargs="*")
     parser.add_argument('-jns', required=False, metavar='journalnode_jmx_url', help='Hadoop journalnode jmx metrics URL.', nargs="*")
     parser.add_argument('-host', required=False, metavar='host', help='Listen on this address. default: 0.0.0.0', default='0.0.0.0')
     parser.add_argument('-port', required=False, metavar='port', type=int, help='Listen to this port. default: 6688', default=6688)
```
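Because `argparse` rejects unknown flags, existing invocations that still pass `-dns` or `-nms` will now fail fast with an "unrecognized arguments" error rather than being silently ignored.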
9 changes: 4 additions & 5 deletions yarn_nodemanager.py
```diff
@@ -13,21 +13,20 @@
 
 class NodeManagerMetricCollector(MetricCollector):
 
-    def __init__(self, cluster, urls):
-        MetricCollector.__init__(self, cluster, urls, "yarn", "nodemanager")
+    def __init__(self, cluster, rmc):
+        MetricCollector.__init__(self, cluster, "yarn", "nodemanager")
         self.target = "-"
+        self.rmc = rmc
 
         self.hadoop_nodemanager_metrics = {}
         for i in range(len(self.file_list)):
             self.hadoop_nodemanager_metrics.setdefault(self.file_list[i], {})
 
         self.common_metric_collector = CommonMetricCollector(cluster, "yarn", "nodemanager")
 
-        self.scrape_metrics = ScrapeMetrics(urls)
-
     def collect(self):
         isSetup = False
-        beans_list = self.scrape_metrics.scrape()
+        beans_list = ScrapeMetrics(self.rmc.nms).scrape()
         for beans in beans_list:
             if not isSetup:
                 self.common_metric_collector.setup_labels(beans)
```
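This mirrors the datanode change: targets come from `rmc.nms` on every scrape, so nodemanagers reported by the resourcemanager are picked up automatically. Unlike the datanode collector there is no explicit empty-set guard here; an empty `rmc.nms` presumably just yields an empty `beans_list`.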
6 changes: 5 additions & 1 deletion yarn_resourcemanager.py
```diff
@@ -24,9 +24,10 @@ class ResourceManagerMetricCollector(MetricCollector):
     }
 
     def __init__(self, cluster, urls, queue_regexp):
-        MetricCollector.__init__(self, cluster, urls, "yarn", "resourcemanager")
+        MetricCollector.__init__(self, cluster, "yarn", "resourcemanager")
         self.target = "-"
         self.queue_regexp = queue_regexp
+        self.nms = set()
 
         self.hadoop_resourcemanager_metrics = {}
         for i in range(len(self.file_list)):
@@ -178,8 +179,10 @@ def setup_metrics_labels(self, beans):
 
     def get_rmnminfo_metrics(self, bean):
         for metric in self.metrics['RMNMInfo']:
+            nms = set()
             live_nm_list = yaml.safe_load(bean['LiveNodeManagers'])
             for j in range(len(live_nm_list)):
+                nms.add("http://"+live_nm_list[j]["NodeHTTPAddress"]+"/jmx")
                 host = live_nm_list[j]['HostName']
                 version = live_nm_list[j]['NodeManagerVersion']
                 rack = live_nm_list[j]['Rack']
@@ -189,6 +192,7 @@ def get_rmnminfo_metrics(self, bean):
                 else:
                     value = live_nm_list[j][metric] if metric in live_nm_list[j] else 0.0
                 self.hadoop_resourcemanager_metrics['RMNMInfo'][metric].add_metric(label, value)
+            self.nms = nms
 
     def get_queue_metrics(self, bean):
         for metric in self.metrics['QueueMetrics']:
```
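And this is the discovery half for nodemanagers: `get_rmnminfo_metrics` already iterates the resourcemanager's `LiveNodeManagers` bean, so the commit piggybacks on that loop, deriving each node's JMX URL from `NodeHTTPAddress`. A standalone sketch with made-up hosts:

```python
import yaml  # LiveNodeManagers is a JSON string; yaml.safe_load parses it

# Made-up LiveNodeManagers payload, shaped like the ResourceManager's JMX attribute.
live_nms_json = '''
[
  {"HostName": "nm1.example.com", "NodeHTTPAddress": "nm1.example.com:8042"},
  {"HostName": "nm2.example.com", "NodeHTTPAddress": "nm2.example.com:8042"}
]
'''

live_nm_list = yaml.safe_load(live_nms_json)
nms = set("http://" + nm["NodeHTTPAddress"] + "/jmx" for nm in live_nm_list)
# nms -> {"http://nm1.example.com:8042/jmx", "http://nm2.example.com:8042/jmx"}
```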
