diff --git a/.copr/Makefile b/.copr/Makefile
index 03719ed0..84869420 100644
--- a/.copr/Makefile
+++ b/.copr/Makefile
@@ -25,11 +25,14 @@ COPR_PACKAGE_NAMES_SUFFIX:=1.8-4 2.0-0 2.2-0 latest
 #   make -f /home/fmontorsi/git/cmonitor/.copr/Makefile srpm outdir=/tmp/cmonitor-tools-rpm
 #
 # NOTE:
-#  https://docs.pagure.org/copr.copr/user_documentation.html#how-long-do-you-keep-the-builds
+#  see https://docs.pagure.org/copr.copr/user_documentation.html#how-long-do-you-keep-the-builds
+#      https://pagure.io/copr/copr/issue/2050
 #
 srpm:
 	@echo "Env variables in this COPR build:"
-	export
+	@export
+	@echo "Installing git client:"
+	dnf -y install git
 	@echo "Running srpm target from .copr/Makefile"
 	mkdir -p $(RPM_TMP_DIR)/ $(RPM_TARBALL_DIR)/
 	rm -rf $(RPM_TMP_DIR)/* $(RPM_TARBALL_DIR)/*
@@ -45,3 +48,14 @@ endif
 		curl -X POST https://copr.fedorainfracloud.org/webhooks/custom/24475/$(COPR_TOKEN)/cmonitor-collector-$${pkgname}/ ; \
 		curl -X POST https://copr.fedorainfracloud.org/webhooks/custom/24475/$(COPR_TOKEN)/cmonitor-tools-$${pkgname}/ ; \
 	done
+
+rebuild_latest_copr_packages:
+	# IMPORTANT: take the COPR_TOKEN from the webpage: https://copr.fedorainfracloud.org/coprs/f18m/cmonitor/integrations/
+ifndef COPR_TOKEN
+	@echo "*** ERROR: please call this makefile supplying explicitly the COPR_TOKEN variable"
+	@exit 1
+endif
+	for pkgname in latest; do \
+		curl -X POST https://copr.fedorainfracloud.org/webhooks/custom/24475/$(COPR_TOKEN)/cmonitor-collector-$${pkgname}/ ; \
+		curl -X POST https://copr.fedorainfracloud.org/webhooks/custom/24475/$(COPR_TOKEN)/cmonitor-tools-$${pkgname}/ ; \
+	done
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 948cabe7..8349f09a 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -13,7 +13,7 @@ jobs:
     steps:
     - uses: actions/checkout@v2
     - name: install deps
-      run: sudo apt install -y libgtest-dev libbenchmark-dev libfmt-dev tidy python3 python3-dateutil
+      run: sudo apt install -y libgtest-dev libbenchmark-dev libfmt-dev tidy python3 python3-dateutil git
     - name: make all
      run: make
    - name: make test
diff --git a/.travis.yml b/.travis.yml
index d6d0224e..453cfffe 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -15,6 +15,7 @@ addons:
     - libbenchmark-dev
     - libfmt-dev
     - tidy
+    - git
 
 install: pip install j2cli
 
diff --git a/collector/spec/collector.spec b/collector/spec/collector.spec
index 58caed45..9437567f 100644
--- a/collector/spec/collector.spec
+++ b/collector/spec/collector.spec
@@ -10,7 +10,7 @@ Requires: fmt
 # these are the requirements that we need on COPR builds:
 # IMPORTANT: cmonitor-collector RPM is built also on the 'old' Centos7 platform shipping fmt-devel-6.2.1
 #            so make sure not to use any feature of libfmt > 6.2.1
-BuildRequires: gcc-c++, make, gtest-devel, fmt-devel
+BuildRequires: gcc-c++, make, gtest-devel, fmt-devel, git
 
 # python3-pip works and gets installed but then it fails later for unknown reasons
 # Disable automatic debug package creation: it fails within Fedora 28, 29 and 30 for the lack
diff --git a/collector/src/Makefile b/collector/src/Makefile
index 786e684b..83aecbe7 100644
--- a/collector/src/Makefile
+++ b/collector/src/Makefile
@@ -11,7 +11,8 @@ DEFS=-DVERSION_STRING=\"$(CMONITOR_VERSION)-$(CMONITOR_RELEASE)\" -DCMONITOR_LAS
 # IMPORTANT: we do require c++11, i.e. GCC 4.8.1 or higher;
 #            Centos7 which is one of the oldest-supported distributions provides gcc 4.8.5 so we
 #            cannot move to e.g. c++14/17/20 yet
-CXXFLAGS=-Wall -Werror -Wno-switch-bool -std=c++11 $(DEFS)
+# IMPORTANT#2: -fPIC is required to build on fedora-rawhide
+CXXFLAGS=-Wall -Werror -Wno-switch-bool -std=c++11 -fPIC $(DEFS)
 
 ifeq ($(DEBUG),1)
 CXXFLAGS += -g -O0   #useful for debugging
diff --git a/collector/src/benchmarks/Makefile b/collector/src/benchmarks/Makefile
index 449ad7a5..ca3419fb 100644
--- a/collector/src/benchmarks/Makefile
+++ b/collector/src/benchmarks/Makefile
@@ -5,7 +5,7 @@ THIS_DIR:=$(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
 ROOT_DIR:=$(shell readlink -f $(THIS_DIR)/../../..)
 
-CXXFLAGS=-Wall -Werror -Wno-switch-bool -std=c++11 -DVERSION_STRING=\"$(RPM_VERSION)-$(RPM_RELEASE)\"
+CXXFLAGS=-Wall -Werror -Wno-switch-bool -std=c++11 -fPIC -DVERSION_STRING=\"$(RPM_VERSION)-$(RPM_RELEASE)\"
 
 ifeq ($(DEBUG),1)
 CXXFLAGS += -g -O0   #useful for debugging
diff --git a/collector/src/tests/Makefile b/collector/src/tests/Makefile
index cfaf22fb..7ef54691 100644
--- a/collector/src/tests/Makefile
+++ b/collector/src/tests/Makefile
@@ -5,7 +5,7 @@ THIS_DIR:=$(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
 ROOT_DIR:=$(shell readlink -f $(THIS_DIR)/../../..)
 
-CXXFLAGS=-Wall -Werror -Wno-switch-bool -std=c++11 -DVERSION_STRING=\"$(RPM_VERSION)-$(RPM_RELEASE)\"
+CXXFLAGS=-Wall -Werror -Wno-switch-bool -std=c++11 -fPIC -DVERSION_STRING=\"$(RPM_VERSION)-$(RPM_RELEASE)\"
 
 ifeq ($(DEBUG),1)
 CXXFLAGS += -g -O0   #useful for debugging
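For context, the new rebuild_latest_copr_packages target added above just fires the COPR custom webhooks for the "latest" package pair, mirroring the curl loop of the pre-existing target. A minimal Python sketch of the same call, assuming a hypothetical helper and the third-party requests package (the actual Makefile target uses curl):

import requests

def rebuild_latest_copr_packages(copr_token: str) -> None:
    # Hypothetical equivalent of the curl loop in .copr/Makefile;
    # COPR_TOKEN must be taken from
    # https://copr.fedorainfracloud.org/coprs/f18m/cmonitor/integrations/
    for pkgname in ("cmonitor-collector-latest", "cmonitor-tools-latest"):
        url = f"https://copr.fedorainfracloud.org/webhooks/custom/24475/{copr_token}/{pkgname}/"
        # COPR custom webhooks trigger a rebuild on a plain POST, no body needed:
        requests.post(url, timeout=30).raise_for_status()
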
diff --git a/examples/baremetal1.html b/examples/baremetal1.html
index 4e7c81fe..8b3888f5 100644
--- a/examples/baremetal1.html
+++ b/examples/baremetal1.html
@@ -4105,7 +4105,7 @@
 Monitoring Summary
diff --git a/examples/docker-collecting-baremetal-stats.html b/examples/docker-collecting-baremetal-stats.html
index 8365ab32..9cf9b7e3 100644
--- a/examples/docker-collecting-baremetal-stats.html
+++ b/examples/docker-collecting-baremetal-stats.html
@@ -2080,7 +2080,7 @@
 Monitoring Summary
diff --git a/examples/docker-collecting-docker-stats.html b/examples/docker-collecting-docker-stats.html
index c1831aec..bce125ad 100644
--- a/examples/docker-collecting-docker-stats.html
+++ b/examples/docker-collecting-docker-stats.html
@@ -713,7 +713,7 @@
 Monitoring Summary
diff --git a/examples/docker-stress-test-cpu.html b/examples/docker-stress-test-cpu.html
index e1bbcc70..7ae66c01 100644
--- a/examples/docker-stress-test-cpu.html
+++ b/examples/docker-stress-test-cpu.html
@@ -684,7 +684,7 @@
 Monitoring Summary
diff --git a/examples/docker-stress-test-mem.html b/examples/docker-stress-test-mem.html
index 7815e532..141c2c47 100644
--- a/examples/docker-stress-test-mem.html
+++ b/examples/docker-stress-test-mem.html
@@ -2306,7 +2306,7 @@
 Monitoring Summary
diff --git a/examples/docker-userapp.html b/examples/docker-userapp.html
index cac32ae7..7d27eea4 100644
--- a/examples/docker-userapp.html
+++ b/examples/docker-userapp.html
@@ -703,7 +703,7 @@
 Monitoring Summary
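Note: the regenerated example pages above embed each Google Charts data table as zlib-deflated, Base64-encoded JSON that the page inflates at runtime via the pako JS library (see the GoogleChartsTimeSeries code being moved out of cmonitor_chart.py below). A minimal sketch of the encoding side, loosely mirroring toDeflatedJSONBase64Encoded() and using only the stdlib modules the original code imports:

import binascii, json, zlib

def to_deflated_json_base64(rows) -> str:
    # Serialize the table compactly, deflate at maximum compression and
    # Base64-encode it for embedding inside the generated HTML page:
    json_bytes = json.dumps(rows, separators=(",", ":")).encode()
    return binascii.b2a_base64(zlib.compress(json_bytes, 9)).decode().strip()
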
diff --git a/tools/Makefile b/tools/Makefile
index 0d51e584..f13711cd 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -18,7 +18,7 @@ SYMLINKS = \
 	filter/cmonitor_filter \
 	statistics/cmonitor_statistics
 COMMON_CODE = \
-	common-code/cmonitor_loader.py \
+	$(wildcard common-code/*.py) \
 	common-code/cmonitor_version.py
 
 #
@@ -89,6 +89,7 @@ srpm_tarball:
 		mv cmonitor cmonitor-tools-$(CMONITOR_VERSION) && \
 		sed -i 's@__RPM_VERSION__@$(CMONITOR_VERSION)@g' cmonitor-tools-$(CMONITOR_VERSION)/tools/spec/tools.spec && \
 		sed -i 's@__RPM_RELEASE__@$(CMONITOR_RELEASE)@g' cmonitor-tools-$(CMONITOR_VERSION)/tools/spec/tools.spec && \
+		sed -i 's@__LAST_COMMIT_HASH__@$(CMONITOR_LAST_COMMIT_HASH)@g' cmonitor-tools-$(CMONITOR_VERSION)/tools/spec/tools.spec && \
 		tar cvzf $(RPM_TMP_DIR)/cmonitor-tools-$(CMONITOR_VERSION).tar.gz cmonitor-tools-$(CMONITOR_VERSION)/*
 
 #
 # This target is used by Fedora COPR to automatically produce RPMs for lots of distros.
diff --git a/tools/chart/cmonitor_chart.py b/tools/chart/cmonitor_chart.py
index 70c6b792..d0a3154e 100755
--- a/tools/chart/cmonitor_chart.py
+++ b/tools/chart/cmonitor_chart.py
@@ -21,2066 +21,13 @@ import time
 from cmonitor_loader import CmonitorCollectorJsonLoader
 from cmonitor_version import CmonitorToolVersion
-
-# =======================================================================================================
-# CONSTANTS
-# =======================================================================================================
-
-GRAPH_SOURCE_DATA_IS_BAREMETAL = 1
-GRAPH_SOURCE_DATA_IS_CGROUP_TOTALS = 2
-GRAPH_SOURCE_DATA_IS_CGROUP_PER_PROCESS = 3
-
-GRAPH_TYPE_AREA_CHART = 1
-GRAPH_TYPE_BUBBLE_CHART = 2
-
-SAVE_DEFLATED_JS_DATATABLES = True
-JS_INDENT_SIZE = 2
-
-# see https://developers.google.com/chart/interactive/docs/reference#dateformat
-# the idea is that cmonitor_chart will most likely be used to explore short time intervals
-# so that day/month/year part is not useful, just the time is useful; in tooltip we also
-# reach millisec accuracy:
-X_AXIS_DATEFORMAT = "HH:mm:ss"
-TOOLTIP_DATEFORMAT = "HH:mm:ss.SSS z"
-
+from cmonitor_chart_engine import CMonitorGraphGenerator
 
 # =======================================================================================================
 # GLOBALs
 # =======================================================================================================
 
 verbose = False
-g_num_generated_charts = 1
-g_next_graph_need_stacking = 0
-g_datetime = "localtz"  # can be changed to "UTC" with --utc; FIXME currently we always use just UTC, never localtz...
-
-# =======================================================================================================
-# GoogleChartsTimeSeries
-# =======================================================================================================
-
-
-class GoogleChartsTimeSeries(object):
-    """
-    GoogleChartsTimeSeries is a (N+1)xM table of
-        t_1;Y1_1;Y2_1;...;YN_1
-        t_2;Y1_2;Y2_2;...;YN_2
-        ...
- t_M;Y1_M;Y2_M;...;YN_M - data points for a GoogleCharts graph that is representing the evolution of N quantities over time - """ - - def __init__(self, column_names, column_units=None): - self.column_names = column_names # must be a LIST of strings - self.column_units = column_units - if self.column_units: - assert len(self.column_units) == len(self.column_names) - self.rows = [] # list of lists with values - - def ISOdatetimeToJSDate(self, date): - """convert ISO datetime strings like - "2017-08-21T20:12:30.123" - to strings like: - "Date(2017,8,21,20,12,30,123000)" - which are the datetime representation suitable for JS GoogleCharts, see - https://developers.google.com/chart/interactive/docs/datesandtimes - """ - dateAsPythonObj = datetime.datetime.strptime(date, "%Y-%m-%dT%H:%M:%S.%f") - - return "Date(%d,%d,%d,%d,%d,%d,%d)" % ( - dateAsPythonObj.year, - dateAsPythonObj.month, - dateAsPythonObj.day, - dateAsPythonObj.hour, - dateAsPythonObj.minute, - dateAsPythonObj.second, - dateAsPythonObj.microsecond / 1000, # NOTE: the JavaScript Date() object wants milliseconds - ) - - def addRow(self, row_data_list): - assert len(row_data_list) == len(self.column_names) - - # convert first column to a GoogleCharts-compatible datetime: - row_data_list[0] = self.ISOdatetimeToJSDate(row_data_list[0]) - self.rows.append(row_data_list) - - def getRow(self, index): - return self.rows[index] - - def getListColumnNames(self): - return self.column_names - - def getNumDataSeries(self): - # assuming first column is the timestamp, the number of "data series" - # present in this table is all remaining columns - assert len(self.column_names) >= 2 - return len(self.column_names) - 1 - - def getMaxValueDataSerie(self, column_index): - # WARNING: this looks very inefficient! - assert column_index >= 0 and column_index <= len(self.column_names) - 1 - ret = 0 - for r in self.rows: - ret = max(ret, r[1 + column_index]) - return ret - - def getDataSeriesIndexByName(self, column_name): - assert column_name != self.column_names[0] # the first column is not a "data serie", it's the timestamp column! 
- try: - col_idx = self.column_names.index(column_name) - assert col_idx >= 1 - return col_idx - 1 # the first data serie, with index 0, is the column immediately after the timestamp column - except ValueError: - # column name not found - return -1 - - def writeTo(self, file): - for r in self.rows: - # assume first column is always the timestamp: - row_text = "['Date(%s)'," % r[0] - row_text += ",".join(str(x) for x in r[1:]) - row_text += "],\n" - file.write(row_text) - - def toJSONForJS(self): - ret = "[[" # start 2D JSON array - - # convert 1st column: - assert self.column_names[0] == "Timestamp" - ret += '{"type":"datetime","label":"Datetime"},' - - # convert all other columns: - for colName in self.column_names[1:]: - ret += '"' + colName + '",' - ret = ret[:-1] - - # separe first line; start conversion of actual table data: - ret += "]," - - data = json.dumps(self.rows, separators=(",", ":")) - data = data[1:] - - return ret + data - - def toDeflatedJSONBase64Encoded(self): - """Returns this table in JSON format (for JS), deflated using zlib, and represented as a Base64-encoded ASCII string""" - json_string = self.toJSONForJS() - json_compressed_bytearray = zlib.compress(json_string.encode(), 9) - - ret = str(binascii.b2a_base64(json_compressed_bytearray)) - return ret[1:] - - def toGoogleChartTable(self, graphName): - """Writes in the given file the JavaScript GoogleCharts object representing this table""" - ret_string = "" - if SAVE_DEFLATED_JS_DATATABLES: - # to reduce the HTML size save the deflated, serialized JSON of the 2D JS array: - ret_string += "var deflated_data_base64_%s = %s;\n" % ( - graphName, - self.toDeflatedJSONBase64Encoded(), - ) - - # then convert it base64 -> JS binary string - ret_string += "var deflated_data_binary_%s = window.atob(deflated_data_base64_%s);\n" % (graphName, graphName) - - # now inflate it in the browser using "pako" library (https://github.com/nodeca/pako) - ret_string += "var inflated_data_%s = JSON.parse(pako.inflate(deflated_data_binary_%s, { to: 'string' }));\n" % (graphName, graphName) - else: - ret_string += "var inflated_data_%s = %s;\n" % ( - graphName, - self.toJSONForJS(), - ) - - # finally create the GoogleCharts table from it: - ret_string += "var data_%s = google.visualization.arrayToDataTable(inflated_data_%s);\n\n" % (graphName, graphName) - - # add DateFormatter to use custom formatting of the 1st column (like everywhere else we assume first column is the timestamp) - ret_string += "var date_formatter = new google.visualization.DateFormat({pattern: '%s'});\n" % (TOOLTIP_DATEFORMAT) - ret_string += "date_formatter.format(data_%s, 0);\n" % (graphName) - - if self.column_units: - column_units_strings = ["'" + v + "'" for v in self.column_units] - - # add Javascript code to set the formatted value on EACH and EVERY single entry of the table (except timestamp); - # this improves greatly the readability of TOOLTIPs generated by Google Charts: instead of showing very large numbers - # they will show up nice "k", "M" and "G" units - ret_string += """ -var column_unit = [%s] -for (var c=1; c < data_%s.getNumberOfColumns(); c++) { - for (var r=0; r < data_%s.getNumberOfRows(); r++) { - var v = data_%s.getValue(r, c); - data_%s.setFormattedValue(r, c, prettyPrinter(v) + column_unit[c]); - } -} - -""" % ( - ",".join(column_units_strings), - graphName, - graphName, - graphName, - graphName, - ) - - return ret_string - - -# ======================================================================================================= -# 
GoogleChartsGenericTable -# ======================================================================================================= - - -class GoogleChartsGenericTable(object): - """ - This is the NxM table of - Y1_1;Y2_1;...;YN_1 - ... - Y1_M;Y2_M;...;YN_M - data points for a GoogleCharts graph for M different objects characterized by N features. - This class is useful to create graphs which are NOT related to a measurement that evolves over TIME. - - Currently this class is used only for the generation of bubble charts, which are, by their nature, - suited to represent relationships among different features (in our case total IO, memory and CPU usage) - """ - - def __init__(self, column_names): - self.column_names = column_names # must be a LIST of strings - self.rows = [] # list of lists with values - - def addRow(self, row_data_list): - assert len(row_data_list) == len(self.column_names) - self.rows.append(row_data_list) - - def getRow(self, index): - return self.rows[index] - - def getListColumnNames(self): - return self.column_names - - def getNumDataSeries(self): - # assuming first column is the timestamp, the number of "data series" - # present in this table is all remaining columns - return len(self.column_names) - 1 - - def writeTo(self, file): - for r in self.rows: - file.write(",".join(r)) - - def toJSONForJS(self): - ret = "[[" # start 2D JSON array - - # convert all other columns: - for colName in self.column_names: - ret += '"' + colName + '",' - ret = ret[:-1] - - # separe first line; start conversion of actual table data: - ret += "]," - - data = json.dumps(self.rows, separators=(",", ":")) - data = data[1:] - - return ret + data - - def toDeflatedJSONBase64Encoded(self): - """Returns this table in JSON format (for JS), deflated using zlib, and represented as a Base64-encoded ASCII string""" - json_string = self.toJSONForJS() - json_compressed_bytearray = zlib.compress(json_string.encode(), 9) - - ret = str(binascii.b2a_base64(json_compressed_bytearray)) - return ret[1:] - - def toGoogleChartTable(self, graphName): - """Writes in the given file the JavaScript GoogleCharts object representing this table""" - ret_string = "" - if SAVE_DEFLATED_JS_DATATABLES: - # to reduce the HTML size save the deflated, serialized JSON of the 2D JS array: - ret_string += "var deflated_data_base64_%s = %s;\n" % ( - graphName, - self.toDeflatedJSONBase64Encoded(), - ) - - # then convert it base64 -> JS binary string - ret_string += "var deflated_data_binary_%s = window.atob(deflated_data_base64_%s);\n" % (graphName, graphName) - - # now inflate it in the browser using "pako" library (https://github.com/nodeca/pako) - ret_string += "var inflated_data_%s = JSON.parse(pako.inflate(deflated_data_binary_%s, { to: 'string' }));\n" % (graphName, graphName) - else: - ret_string += "var inflated_data_%s = %s;\n" % ( - graphName, - self.toJSONForJS(), - ) - - # finally create the GoogleCharts table from it: - ret_string += "var data_%s = google.visualization.arrayToDataTable(inflated_data_%s);\n" % (graphName, graphName) - return ret_string - - -# ======================================================================================================= -# GoogleChartsGraph -# ======================================================================================================= - - -class GoogleChartsGraph: - """ - This is a simple object that can generate a JavaScript snippet (to be embedded in HTML output page) - that will render at runtime a GoogleChart drawing inside a JavaScript-enabled browser of 
course. - - It supports Google AreaChart (see https://developers.google.com/chart/interactive/docs/gallery/areachart) - with 1 or 2 Y axes. The data series that are placed on the 2nd Y axis are higlighted automatically by - using a tick RED line. - """ - - def __init__( - self, - data=None, - button_label="", - combobox_label="", - combobox_entry="", - graph_source=GRAPH_SOURCE_DATA_IS_BAREMETAL, - graph_type=GRAPH_TYPE_AREA_CHART, - graph_title="", - stack_state=False, - y_axes_titles=[], - y_axes_max_value=[None], - columns_for_2nd_yaxis=None, - ): - self.data_table = data # of type GoogleChartsGenericTable or GoogleChartsTimeSeries - self.button_label = button_label - self.combobox_label = combobox_label - assert (len(self.button_label) == 0 and len(self.combobox_label) > 0) or (len(self.button_label) > 0 and len(self.combobox_label) == 0) - self.combobox_entry = combobox_entry - self.source_data = graph_source # one of GRAPH_TYPE_BAREMETAL or GRAPH_TYPE_CGROUP - self.stack_state = stack_state - self.graph_type = graph_type - self.graph_title = graph_title.strip() + (", STACKED graph" if self.stack_state else "") - self.y_axes_titles = y_axes_titles - self.columns_for_2nd_yaxis = columns_for_2nd_yaxis - self.y_axes_max_value = y_axes_max_value - - # generate new JS name for this graph - global g_num_generated_charts - self.js_name = "graph" + str(g_num_generated_charts) - g_num_generated_charts += 1 - - def __genGoogleChartJS_AreaChart(self): - """After the JavaScript line graph data is output, the data is terminated and the graph options set""" - global g_next_graph_need_stacking - - def __internalWriteSeries(series_indexes, target_axis_index): - ret = "" - for i, idx in enumerate(series_indexes, start=0): - if target_axis_index == 0: - ret += " %d: {targetAxisIndex:%d}" % (idx, target_axis_index) - else: - # IMPORTANT: the data series that go on the 2nd Y axis (typically just one) are drawn with a RED thick line - # to underline their importance; area opacity is removed to avoid clutter with data series on the first Y axis - ret += " %d: {targetAxisIndex:%d, lineWidth: 5, areaOpacity: 0, color: 'red', lineDashStyle: [10,2]}" % ( - idx, - target_axis_index, - ) - # print("i=%d, idx=%d, target_axis_index=%d" % (i,idx,target_axis_index)) - if i < len(series_indexes): - ret += ",\n" - else: - ret += "\n" - return ret - - def __internalWriteVAxis(v_axis_idx, max_value, title, data_series_indexes): - ret = "" - if max_value is None: - # let Google Chart automatically determine min/max on this axis - ret += ' %d: { title: "%s", format: "short" },\n' % (v_axis_idx, title) - elif max_value == 0: - # autocompute the best MAX - actual_max = 0 - for idx in data_series_indexes: - actual_max = max(actual_max, self.data_table.getMaxValueDataSerie(idx)) - ret += ' %d: { title: "%s", format: "short", minValue: -1, maxValue: %d },\n' % (v_axis_idx, title, actual_max * 5 + 10) - else: - ret += ' %d: { title: "%s", format: "short", minValue: -1, maxValue: %d },\n' % (v_axis_idx, title, max_value) - return ret - - ret_string = "" - ret_string += "var options_%s = {\n" % (self.js_name) - ret_string += ' chartArea: {left: "5%", width: "85%", top: "10%", height: "80%"},\n' - ret_string += ' title: "%s",\n' % (self.graph_title) - ret_string += ' focusTarget: "category",\n' - - # by default this tool plots the top 20 processes; in these cases both tooltips and legend will have up to 21 rows (including time) - # so we make the font a bit smaller to make it more likely to view all the lines - ret_string += " 
tooltip: { textStyle: { fontSize: 12 } },\n" - ret_string += " legend: { textStyle: { fontSize: 12 } },\n" - ret_string += ' explorer: { actions: ["dragToZoom", "rightClickToReset"], keepInBounds: true, maxZoomIn: 20.0 },\n' - - # HORIZONTAL AXIS - ret_string += ' hAxis: { format: "%s", gridlines: { color: "lightgrey", count: 30 } },\n' % X_AXIS_DATEFORMAT - - # VERTICAL AXIS (OR AXES) - if self.columns_for_2nd_yaxis: - # compute indexes of series that use the 2nd Y axis: - series_for_2nd_yaxis = [] - for colname in self.columns_for_2nd_yaxis: - idx = self.data_table.getDataSeriesIndexByName(colname) - assert idx != -1, f"Column named {colname} is not a column inside the data table!" - series_for_2nd_yaxis.append(idx) - # print("series_for_2nd_yaxis: %s" % ",".join(str(x) for x in series_for_2nd_yaxis)) - - # compute indexes of series that use 1st Y axis: - all_indexes = range(0, self.data_table.getNumDataSeries()) - series_for_1st_yaxis = [idx for idx in all_indexes if idx not in series_for_2nd_yaxis] - # print("series_for_1st_yaxis: %s" % ",".join(str(x) for x in series_for_1st_yaxis)) - - # assign data series to the 2 Y axes: - ret_string += " series: {\n" - ret_string += __internalWriteSeries(series_for_1st_yaxis, 0) - ret_string += __internalWriteSeries(series_for_2nd_yaxis, 1) - ret_string += " },\n" - - # check data - assert len(self.y_axes_titles) == 2 - assert len(self.y_axes_max_value) == 2, f"Got {self.y_axes_max_value}, but columns_for_2nd_yaxis={self.columns_for_2nd_yaxis}" - - # allocate 2 Y axes: - ret_string += " vAxes: {\n" - ret_string += __internalWriteVAxis(0, self.y_axes_max_value[0], self.y_axes_titles[0], series_for_1st_yaxis) - ret_string += __internalWriteVAxis(1, self.y_axes_max_value[1], self.y_axes_titles[1], series_for_2nd_yaxis) - ret_string += " },\n" - else: - # single vertical axis: - assert len(self.y_axes_titles) == 1 - ret_string += ' vAxis: { title: "%s", format: "short", gridlines: { color: "lightgrey", count: 11 } },\n' % str(self.y_axes_titles[0]) - - # graph stacking - g_next_graph_need_stacking = self.stack_state - if g_next_graph_need_stacking: - ret_string += " isStacked: 1\n" - g_next_graph_need_stacking = 0 - else: - ret_string += " isStacked: 0\n" - - ret_string += "};\n" # end of "options_%s" variable - ret_string += "\n" - ret_string += "set_main_chart_div_as_visible();\n" - ret_string += "if (g_chart && g_chart.clearChart)\n" - ret_string += " g_chart.clearChart();\n" - ret_string += 'g_chart = new google.visualization.AreaChart(document.getElementById("chart_master_div"));\n' - - # immediately before drawing the chart, add a listener to hack some ugly labeling by Google Charts - ret_string += "google.visualization.events.addListener(g_chart, 'ready', fix_vaxis_ticks);\n" - ret_string += "g_chart.draw(data_%s, options_%s);\n" % ( - self.js_name, - self.js_name, - ) - - ret_string += "g_current_data = data_%s;\n" % (self.js_name) - ret_string += "g_current_options = options_%s;\n" % (self.js_name) - - # this graph will be activated by either - # - a button that should reset all comboboxes of the page - # - a combo box entry that should reset all other comboboxes in the page - ret_string += 'reset_combo_boxes("%s");\n' % self.combobox_label - - return ret_string - - def __genGoogleChartJS_BubbleChart(self): - assert len(self.y_axes_titles) == 2 - ret_string = "" - ret_string += "var options_%s = {\n" % (self.js_name) - ret_string += ' explorer: { actions: ["dragToZoom", "rightClickToReset"], keepInBounds: true, maxZoomIn: 20.0 },\n' - 
ret_string += ' chartArea: { left: "5%", width: "85%", top: "10%", height: "80%" },\n' - ret_string += ' title: "%s",\n' % (self.graph_title) - ret_string += ' hAxis: { title:"%s" },\n' % str(self.y_axes_titles[0]) - ret_string += ' vAxis: { title:"%s", format:"short" },\n' % str(self.y_axes_titles[1]) - ret_string += " sizeAxis: { maxSize: 200 },\n" - ret_string += " bubble: { textStyle: {fontSize: 15} }\n" - ret_string += "};\n" # end of "options_%s" variable - ret_string += "\n" - ret_string += "if (g_chart && g_chart.clearChart)\n" - ret_string += " g_chart.clearChart();\n" - ret_string += "set_main_chart_div_as_visible();\n" - ret_string += 'g_chart = new google.visualization.BubbleChart(document.getElementById("chart_master_div"));\n' - ret_string += "g_chart.draw(data_%s, options_%s);\n" % ( - self.js_name, - self.js_name, - ) - ret_string += "g_current_data = data_%s;\n" % (self.js_name) - ret_string += "g_current_options = options_%s;\n" % (self.js_name) - return ret_string - - def toGoogleChartJS(self): - global g_next_graph_need_stacking - - # generate the JS - js_code_inner = self.data_table.toGoogleChartTable(self.js_name) - - if self.graph_type == GRAPH_TYPE_AREA_CHART: - js_code_inner += self.__genGoogleChartJS_AreaChart() - else: - js_code_inner += self.__genGoogleChartJS_BubbleChart() - - js_code = "function draw_%s() {\n" % (self.js_name) - js_code += textwrap.indent(js_code_inner, " " * JS_INDENT_SIZE) - js_code += "}\n" # end of draw_%s function - js_code += "\n" - - return js_code - - -# ======================================================================================================= -# HtmlOutputPage -# ======================================================================================================= - - -class HtmlOutputPage: - """ - This is able to produce a self-contained HTML page with embedded JavaScript to draw performance charts - """ - - def __init__(self, outfile, title): - self.title = title - self.outfile = outfile - self.file = open(outfile, "w") # Open the output file - self.graphs = [] - - def appendGoogleChart(self, chart): - assert isinstance(chart, GoogleChartsGraph) - self.graphs.append(chart) - - def writeHtmlHead(self): - """Write the head of the HTML webpage and start the JS section""" - self.file.write( - """ - - - {pageTitle}""".format( - pageTitle=self.title - ) - ) - - self.file.write( - """ - - - - \n") - self.file.write("\n") - - def startHtmlBody(self, cgroup_name, monitored_system, jheader, collected_threads): - self.file.write("\n") - self.file.write('

Data collected from ' + monitored_system + "

\n") - self.file.write('
\n') - self.file.write(' \n') - - # Table header row - self.file.write(" \n") - self.file.write(' \n') - self.file.write(' \n' % cgroup_name) - if collected_threads: - self.file.write(' \n') - else: - self.file.write(' \n') - self.file.write(' \n') - self.file.write(" \n") - - # Datarow - self.file.write(" \n") - self.file.write(' \n") - self.file.write("
Static InfoCGroup stats (Data collected from %s)CGroup per-thread stats (Data collected from cgroup and /proc)CGroup per-process stats (Data collected from cgroup and /proc)Baremetal stats (Data collected only from /proc)
\n') - self.file.write('
\n') - self.file.write('
\n') - - def write_buttons_for_graph_type(source_data): - nwritten_controls = 0 - - # find all graphs that will be activated through a combobox - graphs_combobox = {} - for num, graph in enumerate(self.graphs, start=1): - if graph.source_data == source_data and len(graph.combobox_label) > 0: - if graph.combobox_label not in graphs_combobox: - # add new dict entry as empty list - graphs_combobox[graph.combobox_label] = [] - - # add to the existing dict entry a new graph: - graphs_combobox[graph.combobox_label].append([graph.combobox_entry, graph.js_name]) - - # generate the CPU select box: - if len(graphs_combobox) > 0: - for combobox_label in graphs_combobox.keys(): - graph_list = graphs_combobox[combobox_label] - self.file.write(' \n") - nwritten_controls += 1 - - # find in all graphs registered so far all those related to the CGROUP - for num, graph in enumerate(self.graphs, start=1): - if graph.source_data == source_data: - if len(graph.combobox_label) > 0: - continue # skip - already drawn via \n') - write_buttons_for_graph_type(GRAPH_SOURCE_DATA_IS_CGROUP_PER_PROCESS) - self.file.write(' \n') - write_buttons_for_graph_type(GRAPH_SOURCE_DATA_IS_BAREMETAL) - - self.file.write("
\n") - self.file.write("
\n") - # self.file.write("

\n") - - # finally generate the MAIN div: i.e. where the selected chart is going to be drawn: - self.file.write( - '

...click on a button above to show a graph...

\n' - ) - - def configdump(jheader, section, displayName): - # newstr = '

' + displayName + '

\n' - newstr = " " + displayName + "\n" - config_dict = jheader[section] - for label in config_dict: - newstr += " %s%s\n" % ( - label.capitalize().replace("_", " "), - str(config_dict[label]), - ) - return newstr - - def aggregate_cpuinfo(jheader): - cpudict = {} - - # first take the unique strings about the CPU vendor/model - for field_name in ["vendor_id", "model_name"]: - cpudict[field_name] = set() - for cpu_name in jheader["cpuinfo"].keys(): - cpudict[field_name].add(jheader["cpuinfo"][cpu_name][field_name]) - - # secondly take the unique values of min/max frequency, MIPS, cache size - for field_name in ["scaling_min_freq_mhz", "scaling_max_freq_mhz", "bogomips", "cache_size_kb"]: - cpudict[field_name] = set() - for cpu_name in jheader["cpuinfo"].keys(): - cpuinfo_from_header = jheader["cpuinfo"][cpu_name] - if field_name in cpuinfo_from_header: # these fields are optionals: cmonitor_collector may not be able to populate them - cpudict[field_name].add(int(cpuinfo_from_header[field_name])) - - # now convert each dictionary entry from a set() to a simple string: - for field_name in cpudict.keys(): - the_list = [str(v) for v in cpudict[field_name]] - # join by comma each set() inside the dict: - if len(the_list) > 0: - cpudict[field_name] = ",".join(the_list) - else: - cpudict[field_name] = "Not Available" - - return cpudict - - # immediately after the MAIN div, the element where the configuration info are shown (when toggled): - self.file.write('
\n') - self.file.write("

Monitored System Details

\n") - self.file.write(" \n") - self.file.write(configdump(jheader, "identity", "Server Identity")) - self.file.write(configdump(jheader, "os_release", "Operating System Release")) - self.file.write(configdump(jheader, "proc_version", "Linux Kernel Version")) - if "cgroup_config" in jheader: # if cgroups are off, this section will not be present - self.file.write(configdump(jheader, "cgroup_config", "Linux Control Group (CGroup) Configuration")) - if "cpuinfo" in jheader: - jheader["cpu_summary"] = aggregate_cpuinfo(jheader) - self.file.write(configdump(jheader, "cpu_summary", "CPU Overview")) - if "numa_nodes" in jheader: - self.file.write(configdump(jheader, "numa_nodes", "NUMA Overview")) - if "proc_meminfo" in jheader: - self.file.write(configdump(jheader, "proc_meminfo", "Memory Overview")) - # self.file.write(configdump(jheader, "cpuinfo", "CPU Core Details") - self.file.write("
\n") - self.file.write("

CMonitor Collector

\n") - self.file.write(" \n") - self.file.write(configdump(jheader, "cmonitor", "Performance Stats Collector Configuration")) - if "custom_metadata" in jheader: - if len(jheader["custom_metadata"]) > 0: - self.file.write(configdump(jheader, "custom_metadata", "Custom Metadata")) - self.file.write("
\n") - self.file.write("
\n") # end of 'config_viewer_div' - - def appendHtmlTable(self, name, table_entries, div_class="bottom_div"): - self.file.write("
\n") - self.file.write("

" + name + "

\n") - self.file.write(" \n") - self.file.write(" \n") - self.file.write("
    \n") - for i, entry in enumerate(table_entries, start=1): - self.file.write("
  • " + entry[0] + " " + entry[1] + "
  • \n") - if (i % 4) == 0 and i < len(table_entries): - self.file.write("
    \n") - self.file.write("
\n") - self.file.write("
\n") - - def endHtmlBody(self): - self.file.write("\n") - self.file.write("\n") - self.file.close() - - -# ======================================================================================================= -# CMonitorGraphGenerator -# ======================================================================================================= - - -class CMonitorGraphGenerator: - """ - This is the main class of cmonitor_chart, able to read a JSON file produced by cmonitor_collector, - extract the most useful information and render them inside an HtmlOutputPage object. - """ - - def __init__(self, outfile, jheader, jdata): - self.jheader = jheader # a dictionary with cmonitor_collector "header" JSON object - self.jdata = jdata # a list of dictionaries with cmonitor_collector "samples" objects - - # in many places below we need to get "immutable" data that we know won't change across all samples - # like the names of network devices or the list of CPUs... - # since for some metrics the very sample does not contain any KPI (e.g. cgroup network traffic is generated - # only for samples after the first one) if possible we pick the 2nd sample and not the 1st one: - assert len(self.jdata) >= 2 - self.sample_template = self.jdata[1] - - # did we collect at PROCESS-level granularity or just at THREAD-level granularity? - string_collected_kpis = self.jheader["cmonitor"]["collecting"] # e.g. "cgroup_cpu,cgroup_memory,cgroup_threads" - self.collected_threads = "cgroup_threads" in string_collected_kpis - if verbose: - if self.collected_threads: - print("Per-thread stats (instead of per-process stats) have been collected in the input JSON file.") - else: - print("Per-process stats (instead of per-thread stats) have been collected in the input JSON file.") - - # detect num of CPUs: - self.baremetal_logical_cpus_indexes = [] - if "stat" in self.sample_template: - self.baremetal_logical_cpus_indexes = CMonitorGraphGenerator.collect_logical_cpu_indexes_from_section(self.sample_template, "stat") - if verbose: - print( - "Found %d CPUs in baremetal stats with logical indexes [%s]" - % ( - len(self.baremetal_logical_cpus_indexes), - ", ".join(str(x) for x in self.baremetal_logical_cpus_indexes), - ) - ) - - self.cgroup_logical_cpus_indexes = [] - if "cgroup_cpuacct_stats" in self.sample_template: - self.cgroup_logical_cpus_indexes = CMonitorGraphGenerator.collect_logical_cpu_indexes_from_section( - self.sample_template, "cgroup_cpuacct_stats" - ) - if verbose: - print( - "Found %d CPUs in cgroup stats with logical indexes [%s]" - % ( - len(self.cgroup_logical_cpus_indexes), - ", ".join(str(x) for x in self.cgroup_logical_cpus_indexes), - ) - ) - - # load IDENTITY of monitored system - self.monitored_system = "Unknown" - if "identity" in self.jheader: - if "hostname" in self.jheader["identity"]: - self.monitored_system = self.jheader["identity"]["hostname"] - if "custom_metadata" in self.jheader: - if "cmonitor_chart_name" in self.jheader["custom_metadata"]: - self.monitored_system = self.jheader["custom_metadata"]["cmonitor_chart_name"] - - # get the CGROUP name - self.cgroup_name = "None" - if "cgroup_config" in self.jheader and "name" in self.jheader["cgroup_config"]: - self.cgroup_name = self.jheader["cgroup_config"]["name"] - if "custom_metadata" in self.jheader: - if "cmonitor_chart_name" in self.jheader["custom_metadata"]: - self.cgroup_name = "docker/" + self.jheader["custom_metadata"]["cmonitor_chart_name"] - - # get the CGROUP version (v1 or v2 ?) 
- self.cgroup_ver = None - if "cgroup_config" in self.jheader and "version" in self.jheader["cgroup_config"]: - self.cgroup_ver = int(self.jheader["cgroup_config"]["version"]) - - # finally create the main HTML output page object - self.output_page = HtmlOutputPage(outfile, self.monitored_system) - - # ======================================================================================================= - # Private helpers - # ======================================================================================================= - - @staticmethod - def collect_logical_cpu_indexes_from_section(jsample, section_name): - """ - Walks over given JSON sample looking for keys 'cpuXYZ' and storing all 'XYZ' CPU indexes. - Returns a list of CPU indexes - """ - logical_cpus_indexes = [] - for key in jsample[section_name]: - if key.startswith("cpu") and key != "cpu_total" and key != "cpu_tot": - cpuIdx = int(key[3:]) - logical_cpus_indexes.append(cpuIdx) - # print("%s %s" %(key, cpuIdx)) - return logical_cpus_indexes - - @staticmethod - def sizeof_fmt(num, suffix="B"): - for unit in ["", "k", "M", "G", "T", "P", "E", "Z"]: - if abs(num) < 1000.0: - return "%3.1f%s%s" % (num, unit, suffix) - num /= 1000.0 - return "%.1f%s%s" % (num, "Y", suffix) - - def __make_jheader_nicer(self): - """ - This function just improves self.jheader by adding new sections in that dict and - adding measurement units where they are required. - This is useful because the - """ - - # provide some human-readable config files: - if "cgroup_config" in self.jheader: - avail_cpus = self.jheader["cgroup_config"]["cpus"].split(",") - self.jheader["cgroup_config"]["num_allowed_cpus"] = len(avail_cpus) - self.jheader["cgroup_config"]["cpus"] = self.jheader["cgroup_config"]["cpus"].replace(",", ", ") - - self.jheader["cgroup_config"]["memory_limit_bytes"] = self.__cgroup_get_memory_limit_human_friendly() - self.jheader["cgroup_config"]["cpu_quota_perc"] = self.__cgroup_get_cpu_quota_human_friendly() - - if "cmonitor" in self.jheader: - if self.jheader["cmonitor"]["sample_num"] == 0: - self.jheader["cmonitor"]["sample_num"] = "Infinite" - - if "proc_meminfo" in self.jheader: - self.jheader["proc_meminfo"]["MemTotal"] = CMonitorGraphGenerator.sizeof_fmt(int(self.jheader["proc_meminfo"]["MemTotal"])) - self.jheader["proc_meminfo"]["Hugepagesize"] = CMonitorGraphGenerator.sizeof_fmt(int(self.jheader["proc_meminfo"]["Hugepagesize"])) - - def __print_data_loading_stats(self, desc, n_invalid_samples): - if n_invalid_samples > 0: - print( - "While parsing %s statistics found %d/%d (%.1f%%) samples that did not contain some required JSON section." - % ( - desc, - n_invalid_samples, - len(self.jdata), - 100 * n_invalid_samples / len(self.jdata), - ) - ) - else: - print("Parsed correctly %d samples for [%s] category" % (len(self.jdata), desc)) - - def __cgroup_get_cpu_quota_percentage(self): - """ - Returns a number, in percentage, that indicates how much&many CPUs can be used. - E.g. possible values are 50%, 140%, 300% or -1 to indicate no limit. 
- """ - cpu_quota_perc = 100 - if "cpu_quota_perc" in self.jheader["cgroup_config"]: - cpu_quota_perc = 100 * self.jheader["cgroup_config"]["cpu_quota_perc"] - if cpu_quota_perc == -100: # means there's no CPU limit - cpu_quota_perc = -1 - return cpu_quota_perc - - def __cgroup_get_cpu_quota_human_friendly(self): - if "cpu_quota_perc" not in self.jheader["cgroup_config"]: - return "NO LIMIT" - if int(self.jheader["cgroup_config"]["cpu_quota_perc"]) == -1: - return "NO LIMIT" - cpu_quota_perc = 100 * self.jheader["cgroup_config"]["cpu_quota_perc"] - return f"cpu quota = {cpu_quota_perc}%" - - @staticmethod - def cgroup_get_cpu_throttling(s): - cpu_throttling = 0 - if "throttling" in s["cgroup_cpuacct_stats"]: - # throttling is new since cmonitor_collector 1.5-0 - nr_periods = s["cgroup_cpuacct_stats"]["throttling"]["nr_periods"] - if nr_periods: - cpu_throttling = 100 * s["cgroup_cpuacct_stats"]["throttling"]["nr_throttled"] / nr_periods - return cpu_throttling - - def __cgroup_get_memory_limit(self): - """ - Returns the cgroup memory limit in bytes; can be -1 if there's no limit - """ - cgroup_limit_bytes = -1 - if "memory_limit_bytes" in self.jheader["cgroup_config"]: - # IMPORTANT: this value could be -1 if there's no limit - cgroup_limit_bytes = int(self.jheader["cgroup_config"]["memory_limit_bytes"]) - - return cgroup_limit_bytes - - def __cgroup_get_memory_limit_human_friendly(self): - if "memory_limit_bytes" not in self.jheader["cgroup_config"]: - return "NO LIMIT" - if int(self.jheader["cgroup_config"]["memory_limit_bytes"]) == -1: - return "NO LIMIT" - cgroup_limit_bytes = CMonitorGraphGenerator.sizeof_fmt(self.jheader["cgroup_config"]["memory_limit_bytes"]) - return f"memory limit = {cgroup_limit_bytes}" - - @staticmethod - def __get_main_thread_associated_with(sample, tid): - json_key = "pid_%s" % tid - tgid = sample["cgroup_tasks"][json_key]["tgid"] - if tgid == sample["cgroup_tasks"][json_key]["pid"]: - # actually current entry is not a secondary thread but a PROCESS, append it: - return tid - else: - json_key_of_main_process = "pid_%s" % tgid - if json_key_of_main_process in sample["cgroup_tasks"]: - return tgid - else: - # the main thread / process associated with given THREAD ID is missing: - return None - - def __get_main_threads_only(self, tids_list_to_filter): - x = set() # use a Python set to automatically remove duplicates - for tid in tids_list_to_filter: - json_key = "pid_%s" % tid - - # first of all, find the first JSON sample that contains the current TID - n_sample = 0 - while n_sample < len(self.jdata) and json_key not in self.jdata[n_sample]["cgroup_tasks"]: - n_sample += 1 - - assert n_sample < len(self.jdata) # the TID comes from a processing of self.jdata itself... 
it must be there - pid = CMonitorGraphGenerator.__get_main_thread_associated_with(self.jdata[n_sample], tid) - if pid is None: - print(f"WARNING: the input JSON does not contain collected stats for PID [{pid}] associated with thread ID [{tid}]...") - else: - x.add(pid) - - return x - - def __generate_topN_procs_bubble_chart(self, process_dict, topN_pids_list, max_byte_value_dict, chart_desc_postfix): - cpu_label = "CPU time" - io_label = "I/O (B)" - thread_proc_label = "Thread" if self.collected_threads else "Process" - memory_label = "Memory (B)" - - def get_nice_process_or_thread_name(pid): - return "%s (%d)" % (process_dict[pid]["cmd"], pid) - - # now select the N top processes and put their data in a GoogleChart table: - topN_process_table = GoogleChartsGenericTable(["Command", cpu_label, io_label, thread_proc_label, memory_label]) - for i, pid in enumerate(topN_pids_list): - p = process_dict[pid] - nicecmd = get_nice_process_or_thread_name(pid) - if verbose: - print("Processing data for %d-th CPU-top-scorer process [%s]" % (i + 1, nicecmd)) - topN_process_table.addRow([p["cmd"], p["cpu"], int(p["io"]), nicecmd, int(p["mem"])]) - - # generate the bubble chart graph: - self.output_page.appendGoogleChart( - GoogleChartsGraph( - data=topN_process_table, - button_label="CPU/Memory/Disk Bubbles by Thread" if self.collected_threads else "CPU/Memory/Disk Bubbles by Process", - graph_source=GRAPH_SOURCE_DATA_IS_CGROUP_PER_PROCESS, - graph_type=GRAPH_TYPE_BUBBLE_CHART, - graph_title=f"CPU/disk total usage on X/Y axes; memory usage as bubble size {chart_desc_postfix}", - y_axes_titles=[cpu_label, io_label], - ) - ) - - def __generate_topN_procs_cpu_io_mem_vs_time(self, process_dict, topN_pids_list, max_byte_value_dict, chart_desc_postfix): - mem_limit_bytes = self.__cgroup_get_memory_limit() - cpu_quota_perc = self.__cgroup_get_cpu_quota_percentage() - - def get_nice_process_or_thread_name(pid): - return "%s (%d)" % (process_dict[pid]["cmd"], pid) - - chart_data = {} - - ## -- CPU -- - if cpu_quota_perc > 0: - # it is possible to compute the "idle" time inside this cgroup and it's possible that CPU is throttled... - # so in such case we add 2 more data series to the chart: - cpu_time_serie = GoogleChartsTimeSeries( - ["Timestamp"] + [get_nice_process_or_thread_name(pid) for pid in topN_pids_list] + ["Idle", "Throttling"], - [""] + ["%" for pid in topN_pids_list] + ["%", "%"], - ) - y_axes_max_value = [None, 0] - columns_for_2nd_yaxis = ["Throttling"] - y_axes_titles = ["CPU (%)", "CPU Throttling (%)"] - else: - # no CPU limit... creating an "idle" column (considering as max CPU all the CPUs available) likely produces weird - # results out-of-scale (imagine servers with hundreds of CPUs and cmonitor_collector monitoring just a Redis container!) - # so we do not place any "idle" column. The "throttling" column does not apply either. 
- assert cpu_quota_perc == -1 - cpu_time_serie = GoogleChartsTimeSeries( - ["Timestamp"] + [get_nice_process_or_thread_name(pid) for pid in topN_pids_list], - [""] + ["%" for pid in topN_pids_list], - ) - y_axes_max_value = [None] # let Google Charts autocompute the Y axes limits - columns_for_2nd_yaxis = None - y_axes_titles = ["CPU (%)"] - - # CPU by thread/process: - chart_data["cpu"] = GoogleChartsGraph( - data=cpu_time_serie, - graph_title=f"CPU usage ({self.__cgroup_get_cpu_quota_human_friendly()}) {chart_desc_postfix}", - button_label="CPU by Thread" if self.collected_threads else "CPU by Process", - graph_source=GRAPH_SOURCE_DATA_IS_CGROUP_PER_PROCESS, - # take any contribute of any thread/process and stack it together: this way it becomes easier to spot - # when the cgroup CPU limit was hit and due to which threads/processes - stack_state=True, - y_axes_titles=y_axes_titles, - # throttling should not be stacked to the CPU usage contributions, so move it on 2nd y axis: - columns_for_2nd_yaxis=columns_for_2nd_yaxis, - # make the 2 axes have the same identical Y scale to make it easier to read it: - y_axes_max_value=y_axes_max_value, - ) - - ## -- MEM -- - topN_pids_list_for_memory = self.__get_main_threads_only(topN_pids_list) - if verbose: - print( - f"While generating the per-process memory chart, the following MAIN PIDs were selected: {topN_pids_list_for_memory} from the list of top-CPU scorer processes {topN_pids_list}" - ) - if mem_limit_bytes > 0: - # it is possible to compute the "free" memory inside this cgroup and it's possible to have allocation failures - # so in such case we add 2 more data series to the chart: - mem_time_serie = GoogleChartsTimeSeries( - ["Timestamp"] + [get_nice_process_or_thread_name(pid) for pid in topN_pids_list_for_memory] + ["Free", "Alloc Failures"], - [""] + ["B" for pid in topN_pids_list_for_memory] + ["B", ""], - ) - y_axes_max_value = [None, 0] - columns_for_2nd_yaxis = ["Alloc Failures"] - y_axes_titles = ["Memory (B)", "Alloc Failures"] - else: - # no memory limit... creating the "free" data serie (considering all the system memory as actual limit) likely produces weird - # results out-of-scale so we do not place any "idle" column. The "alloc failures" column does not apply either. 
- assert mem_limit_bytes == -1 - mem_time_serie = GoogleChartsTimeSeries( - ["Timestamp"] + [get_nice_process_or_thread_name(pid) for pid in topN_pids_list_for_memory], - [""] + ["B" for pid in topN_pids_list_for_memory], - ) - y_axes_max_value = [None] # let Google Charts autocompute the Y axes limits - columns_for_2nd_yaxis = None - y_axes_titles = ["Memory (B)"] - - chart_data["mem"] = GoogleChartsGraph( - data=mem_time_serie, - graph_title=f"Memory usage ({self.__cgroup_get_memory_limit_human_friendly()}) {chart_desc_postfix}", - button_label="Memory by Process", - graph_source=GRAPH_SOURCE_DATA_IS_CGROUP_PER_PROCESS, - # take any contribute of any thread/process and stack it together: this way it becomes easier to spot - # when the cgroup MEMORY limit was hit and due to which threads/processes - stack_state=True, - y_axes_titles=y_axes_titles, - y_axes_max_value=y_axes_max_value, - # alloc failures should not be stacked to the memory usage contributions, so move it on 2nd y axis: - columns_for_2nd_yaxis=columns_for_2nd_yaxis, - ) - - ## -- IO -- - io_time_serie = GoogleChartsTimeSeries( - ["Timestamp"] + [get_nice_process_or_thread_name(pid) for pid in topN_pids_list], - [""] + ["B" for pid in topN_pids_list], - ) - chart_data["io"] = GoogleChartsGraph( - data=io_time_serie, - graph_title=f"I/O usage (from cgroup stats) {chart_desc_postfix}", - button_label="IO by Thread" if self.collected_threads else "IO by Process", - y_axes_titles=["IO Read+Write (B)"], - graph_source=GRAPH_SOURCE_DATA_IS_CGROUP_PER_PROCESS, - stack_state=False, - ) - - # now generate a table of CPU/IO/MEMORY usage over time, per process/thread: - for sample in self.jdata: - try: - row = {} - - # save the same timestamp in all charts - for key in ["cpu", "io", "mem"]: - row[key] = [sample["timestamp"]["UTC"]] - - # append CPU & IO samples - tot_cpu_usage_perc = 0 - for top_process_pid in topN_pids_list: - # print(top_process_pid) - json_key = "pid_%s" % top_process_pid - if json_key in sample["cgroup_tasks"]: - top_proc_sample = sample["cgroup_tasks"][json_key] - - cpu = top_proc_sample["cpu_usr"] + top_proc_sample["cpu_sys"] - io = int((top_proc_sample["io_rchar"] + top_proc_sample["io_wchar"])) - - tot_cpu_usage_perc += cpu - row["cpu"].append(cpu) - row["io"].append(io) - else: - # probably this process was born later or dead earlier than this timestamp - row["cpu"].append(0) - row["io"].append(0) - - # for memory chart, only include PROCESSES, never include SECONDARY THREADS since there - # is no distinction between memory of whole process and memory of secondary threads - tot_mem_usage_bytes = 0 - for top_process_pid in topN_pids_list_for_memory: - # print(top_process_pid) - json_key = "pid_%s" % top_process_pid - if json_key in sample["cgroup_tasks"]: - top_proc_sample = sample["cgroup_tasks"][json_key] - - tot_mem_usage_bytes += top_proc_sample["mem_rss_bytes"] - mem = int(top_proc_sample["mem_rss_bytes"]) - row["mem"].append(mem) - else: - # probably this process was born later or dead earlier than this timestamp - row["mem"].append(0) - - # CPU graph has - # - idle (if cpu_quota_perc > 0) - # - throttling - # as additional columns right after the last PID serie - if cpu_quota_perc > 0: - row["cpu"].append(max(cpu_quota_perc - tot_cpu_usage_perc, 0)) - row["cpu"].append(CMonitorGraphGenerator.cgroup_get_cpu_throttling(sample)) - - # Memory graph has - # - free mem - # - alloc failures - # as additional columns right after the timestamp - if mem_limit_bytes > 0: - row["mem"].append(max(int(mem_limit_bytes - 
tot_mem_usage_bytes), 0)) - if self.cgroup_ver == 1: - failcnt = sample["cgroup_memory_stats"]["events.failcnt"] - else: - failcnt = sample["cgroup_memory_stats"]["events.oom_kill"] - row["mem"].append(failcnt) - - for key in ["cpu", "io", "mem"]: - chart_data[key].data_table.addRow(row[key]) - except KeyError: # avoid crashing if a key is not present in the dictionary... - # print("Missing cgroup data while parsing sample %d" % i) - pass - - self.output_page.appendGoogleChart(chart_data["cpu"]) - self.output_page.appendGoogleChart(chart_data["mem"]) - self.output_page.appendGoogleChart(chart_data["io"]) - - # ======================================================================================================= - # Public API - # ======================================================================================================= - - def generate_cgroup_topN_procs(self, numProcsToShow=20): - # if process data was not collected, just return: - if "cgroup_tasks" not in self.sample_template: - return - - # build a dictionary containing cumulative metrics for CPU/IO/MEM data for each process - # along all collected samples - process_dict = {} - max_byte_value_dict = {} - max_byte_value_dict["mem_rss"] = 0 - max_byte_value_dict["io_total"] = 0 - n_invalid_samples = 0 - for i, sample in enumerate(self.jdata): - try: - for process in sample["cgroup_tasks"]: - # parse data from JSON - entry = sample["cgroup_tasks"][process] - cmd = entry["cmd"] - cputime = entry["cpu_usr_total_secs"] + entry["cpu_sys_total_secs"] - iobytes = entry["io_total_read"] + entry["io_total_write"] - membytes = entry["mem_rss_bytes"] # take RSS, more realistic/useful compared to the "mem_virtual_bytes" - thepid = entry["pid"] # can be the TID (thread ID) if cmonitor_collector was started with --collect=cgroup_threads - - # keep track of maxs: - max_byte_value_dict["mem_rss"] = max(membytes, max_byte_value_dict["mem_rss"]) - max_byte_value_dict["io_total"] = max(iobytes, max_byte_value_dict["io_total"]) - - try: # update the current entry - process_dict[thepid]["cpu"] = cputime - process_dict[thepid]["io"] = iobytes - process_dict[thepid]["mem"] = membytes - process_dict[thepid]["cmd"] = cmd - - # FIXME FIXME - # process_dict[thepid]["is_thread"] = - except: # no current entry so add one - process_dict.update( - { - thepid: { - "cpu": cputime, - "io": iobytes, - "mem": membytes, - "cmd": cmd, - } - } - ) - except KeyError as e: # avoid crashing if a key is not present in the dictionary... - print(f"Missing cgroup data while parsing {i}-th sample: {e}") - n_invalid_samples += 1 - pass - - self.__print_data_loading_stats("per-process", n_invalid_samples) - - # now sort all collected processes by the amount of CPU*memory used: - # NOTE: sorted() will return just the sorted list of KEYs = PIDs - def sort_key(d): - # return process_dict[d]['cpu'] * process_dict[d]['mem'] - return process_dict[d]["cpu"] - - topN_pids_list = sorted(process_dict, key=sort_key, reverse=True) - - # truncate to first N: - if numProcsToShow > 0: - topN_pids_list = topN_pids_list[0:numProcsToShow] - - # provide common chart description - chart_desc_postfix = "" - if numProcsToShow > 0: - if self.collected_threads: - chart_desc_postfix = f"of {numProcsToShow} top-CPU-utilizing threads" - else: - chart_desc_postfix = f"of {numProcsToShow} top-CPU-utilizing processes" - # else: if there's no filter on the processes to show, simply produce an empty postfix since it's - # weird to see e.g. 
"CPU usage of ALL top-CPU-utilizing processes" - - self.__generate_topN_procs_cpu_io_mem_vs_time(process_dict, topN_pids_list, max_byte_value_dict, chart_desc_postfix) - self.__generate_topN_procs_bubble_chart(process_dict, topN_pids_list, max_byte_value_dict, chart_desc_postfix) - - def generate_baremetal_disks_io(self): - # if disk data was not collected, just return: - if "disks" not in self.sample_template: - return - - all_disks = self.sample_template["disks"].keys() - if len(all_disks) == 0: - return - - # see https://www.kernel.org/doc/Documentation/iostats.txt - - diskcols = ["Timestamp"] - for device in all_disks: - # diskcols.append(str(device) + " Disk Time") - # diskcols.append(str(device) + " Reads") - # diskcols.append(str(device) + " Writes") - diskcols.append(str(device) + " Read MB") - diskcols.append(str(device) + " Write MB") - - # convert from kB to MB - divider = 1000 - - # - # MAIN LOOP - # Process JSON sample and fill the GoogleChartsTimeSeries() object - # - - disk_table = GoogleChartsTimeSeries(diskcols) - for i, s in enumerate(self.jdata): - if i == 0: - continue - - row = [] - row.append(s["timestamp"]["UTC"]) - for device in all_disks: - # row.append(s["disks"][device]["time"]) - # row.append(s["disks"][device]["reads"]) - # row.append(s["disks"][device]["writes"]) - row.append(s["disks"][device]["rkb"] / divider) - row.append(-s["disks"][device]["wkb"] / divider) - disk_table.addRow(row) - - self.output_page.appendGoogleChart( - GoogleChartsGraph( - data=disk_table, - button_label="Disk I/O", - graph_source=GRAPH_SOURCE_DATA_IS_BAREMETAL, - graph_title="Disk I/O (from baremetal stats)", - y_axes_titles=["MB"], - ) - ) - return - - # def generate_filesystems(self.output_page, self.jdata): - # global self.graphs - # fsstr = "" - # for fs in self.sample_template["filesystems"].keys(): - # fsstr = fsstr + "'" + fs + "'," - # fsstr = fsstr[:-1] - # writeHtmlHead_line_graph(self.output_page, fsstr) - # for i, s in enumerate(self.jdata): - # self.output_page.write(",['Date(%s)' " % (googledate(s['timestamp']["UTC"]))) - # for fs in s["filesystems"].keys(): - # self.output_page.write(", %.1f" % (s["filesystems"][fs]["fs_full_percent"])) - # self.output_page.write("]\n") - # self.output_page.appendGoogleChart(GoogleChartsGraph( 'File Systems Used percent') - # return - - def __generate_network_traffic_graphs(self, graph_source, sample_section_name, graph_desc): - # if network traffic data was not collected, just return: - if sample_section_name not in self.sample_template: - return - - all_netdevices = self.sample_template[sample_section_name].keys() - if len(all_netdevices) == 0: - return - - netcols = ["Timestamp"] - for device in all_netdevices: - netcols.append(str(device) + "+in") - netcols.append(str(device) + "-out") - - # convert from bytes to MB - divider = 1000 * 1000 - unit = "MB" - - # - # MAIN LOOP - # Process JSON sample and fill the GoogleChartsTimeSeries() object - # - - # MB/sec - - net_table = GoogleChartsTimeSeries(netcols, [unit for col in netcols]) - for i, s in enumerate(self.jdata): - if i == 0: - continue - - row = [s["timestamp"]["UTC"]] - for device in all_netdevices: - try: - row.append(+s[sample_section_name][device]["ibytes"] / divider) - row.append(-s[sample_section_name][device]["obytes"] / divider) - except KeyError: - if verbose: - print("Missing key '%s' while parsing sample %d" % (device, i)) - row.append(0) - row.append(0) - net_table.addRow(row) - - self.output_page.appendGoogleChart( - GoogleChartsGraph( - data=net_table, - 
graph_title=f"Network Traffic in MB/s {graph_desc}", - button_label="Network (MB/s)", - y_axes_titles=["MB/s"], - graph_source=graph_source, - stack_state=False, - ) - ) - - # PPS - - net_table = GoogleChartsTimeSeries(netcols) - for i, s in enumerate(self.jdata): - if i == 0: - continue - - row = [s["timestamp"]["UTC"]] - for device in all_netdevices: - try: - row.append(+s[sample_section_name][device]["ipackets"]) - row.append(-s[sample_section_name][device]["opackets"]) - except KeyError: - if verbose: - print("Missing key '%s' while parsing sample %d" % (device, i)) - row.append(0) - row.append(0) - net_table.addRow(row) - - self.output_page.appendGoogleChart( - GoogleChartsGraph( - data=net_table, - graph_title=f"Network Traffic in PPS {graph_desc}", - button_label="Network (PPS)", - y_axes_titles=["PPS"], - graph_source=graph_source, - stack_state=False, - ) - ) - return - - def generate_baremetal_network_traffic(self): - self.__generate_network_traffic_graphs(GRAPH_SOURCE_DATA_IS_BAREMETAL, "network_interfaces", "(from baremetal stats)") - - def generate_cgroup_network_traffic(self): - self.__generate_network_traffic_graphs(GRAPH_SOURCE_DATA_IS_CGROUP_TOTALS, "cgroup_network", "(from cgroup stats)") - - def generate_baremetal_cpus(self): - # if baremetal CPU data was not collected, just return: - if "stat" not in self.sample_template: - return - - # prepare empty tables - baremetal_cpu_stats = {} - for c in self.baremetal_logical_cpus_indexes: - baremetal_cpu_stats[c] = GoogleChartsTimeSeries( - [ - "Timestamp", - "User", - "Nice", - "System", - "Idle", - "I/O wait", - "Hard IRQ", - "Soft IRQ", - "Steal", - ], - [ - "", - "%", - "%", - "%", - "%", - "%", - "%", - "%", - "%", - ], - ) - - all_cpus_table = GoogleChartsTimeSeries( - ["Timestamp"] + [("CPU" + str(x)) for x in self.baremetal_logical_cpus_indexes], # force newline - [""] + ["%" for x in self.baremetal_logical_cpus_indexes], - ) - - # - # MAIN LOOP - # Process JSON sample and fill the GoogleChartsTimeSeries() object - # - - for i, s in enumerate(self.jdata): - if i == 0: - continue # skip first sample - - ts = s["timestamp"]["UTC"] - all_cpus_row = [ts] - for c in self.baremetal_logical_cpus_indexes: - cpu_stats = s["stat"]["cpu" + str(c)] - cpu_total = ( - cpu_stats["user"] - + cpu_stats["nice"] - + cpu_stats["sys"] - + cpu_stats["iowait"] - + cpu_stats["hardirq"] - + cpu_stats["softirq"] - + cpu_stats["steal"] - ) - baremetal_cpu_stats[c].addRow( - [ - ts, - cpu_stats["user"], - cpu_stats["nice"], - cpu_stats["sys"], - cpu_stats["idle"], - cpu_stats["iowait"], - cpu_stats["hardirq"], - cpu_stats["softirq"], - cpu_stats["steal"], - ] - ) - all_cpus_row.append(cpu_total) - - all_cpus_table.addRow(all_cpus_row) - - # Produce the javascript: - for c in self.baremetal_logical_cpus_indexes: - self.output_page.appendGoogleChart( - GoogleChartsGraph( - data=baremetal_cpu_stats[c], # Data - graph_title="Logical CPU " + str(c) + " (from baremetal stats)", - combobox_label="baremetal_cpus", - combobox_entry="CPU" + str(c), - y_axes_titles=["CPU (%)"], - graph_source=GRAPH_SOURCE_DATA_IS_BAREMETAL, - stack_state=True, - ) - ) - - # Also produce the "all CPUs" graph - self.output_page.appendGoogleChart( - GoogleChartsGraph( - data=all_cpus_table, # Data - graph_title="All logical CPUs (from baremetal stats)", - button_label="All CPUs", - y_axes_titles=["CPU (%)"], - graph_source=GRAPH_SOURCE_DATA_IS_BAREMETAL, - stack_state=False, - ) - ) - return - - def generate_cgroup_cpus(self): - if "cgroup_cpuacct_stats" not in 
self.sample_template: - return # cgroup mode not enabled at collection time! - - # prepare empty tables - cpu_stats_table = {} - for c in self.cgroup_logical_cpus_indexes: - cpu_stats_table[c] = GoogleChartsTimeSeries(["Timestamp", "User", "System"], ["", "%", "%"]) - - all_cpus_table = GoogleChartsTimeSeries( - ["Timestamp", "Limit/Quota", "Throttling"] + [("CPU" + str(x)) for x in self.cgroup_logical_cpus_indexes], - ["", "%", "%"] + ["%" for x in self.cgroup_logical_cpus_indexes], - ) - - # - # MAIN LOOP - # Process JSON sample and fill the GoogleChartsTimeSeries() object - # - - cpu_quota_perc = self.__cgroup_get_cpu_quota_percentage() - n_invalid_samples = 0 - # max_cpu_throttling = 0 - for i, s in enumerate(self.jdata): - if i == 0: - continue # skip first sample - - try: - ts = s["timestamp"]["UTC"] - - throttling = CMonitorGraphGenerator.cgroup_get_cpu_throttling(s) - # max_cpu_throttling = max(max_cpu_throttling, throttling) - all_cpus_row = [ts, cpu_quota_perc, throttling] - for c in self.cgroup_logical_cpus_indexes: - # get data: - cpu_stats = s["cgroup_cpuacct_stats"]["cpu" + str(c)] - if "sys" in cpu_stats: - cpu_sys = cpu_stats["sys"] - else: - cpu_sys = 0 - cpu_total = cpu_stats["user"] + cpu_sys - - # append data: - cpu_stats_table[c].addRow([ts, cpu_stats["user"], cpu_sys]) - all_cpus_row.append(cpu_total) - - all_cpus_table.addRow(all_cpus_row) - except KeyError: # avoid crashing if a key is not present in the dictionary... - # print("Missing cgroup data while parsing sample %d" % i) - n_invalid_samples += 1 - pass - - self.__print_data_loading_stats("cgroup CPU", n_invalid_samples) - - # Produce 1 graph for each CPU: - for c in self.cgroup_logical_cpus_indexes: - self.output_page.appendGoogleChart( - GoogleChartsGraph( - data=cpu_stats_table[c], # Data - graph_title="Logical CPU " + str(c) + " (from CGroup stats)", - combobox_label="cgroup_cpus", - combobox_entry="CPU" + str(c), - y_axes_titles=["CPU (%)"], - graph_source=GRAPH_SOURCE_DATA_IS_CGROUP_TOTALS, - stack_state=True, - ) - ) - - # Also produce the "all CPUs" graph that includes some very useful KPIs like - # - CPU limit imposed on Linux CFS scheduler - # - Amount of CPU throttling - # NOTE: when cgroups v2 are used, there's no per-CPU stat just the total CPU usage, - # so we change the title of the tab to reflect that - graph_title = "CPU usage by index of CPU available inside cgroup" if self.cgroup_ver == 1 else "CPU usage measured in the cgroup" - graph_title = f"{graph_title} ({self.__cgroup_get_cpu_quota_human_friendly()})" - - self.output_page.appendGoogleChart( - GoogleChartsGraph( - data=all_cpus_table, # Data - graph_title=graph_title, - button_label="All CPUs" if self.cgroup_ver == 1 else "CPU", - y_axes_titles=["CPU (%)", "CPU Throttling (%)"], - graph_source=GRAPH_SOURCE_DATA_IS_CGROUP_TOTALS, - stack_state=False, - # give evidence to CPU throttling by moving it on 2nd y axis: - columns_for_2nd_yaxis=["Throttling"], - y_axes_max_value=[None, 0], - ) - ) - - return - - def generate_baremetal_memory(self): - # if baremetal memory data was not collected, just return: - if "proc_meminfo" not in self.sample_template: - return - - # - # MAIN LOOP - # Process JSON sample and build Google Chart-compatible Javascript variable - # See https://developers.google.com/chart/interactive/docs/reference - # - - mem_total_bytes = self.sample_template["proc_meminfo"]["MemTotal"] - baremetal_memory_stats = GoogleChartsTimeSeries(["Timestamp", "Used", "Cached (DiskRead)", "Free"], ["", "B", "B", "B"]) - - for i, s in 
enumerate(self.jdata): - if i == 0: - continue # skip first sample - meminfo_stats = s["proc_meminfo"] - - if meminfo_stats["MemTotal"] != mem_total_bytes: - continue # this is impossible AFAIK (hot swap of memory is not handled!!) - - # - # NOTE: most tools like e.g. free -k just map: - # - # free output | corresponding /proc/meminfo fields - # --------------+--------------------------------------- - # Mem: total | MemTotal - # Mem: used | MemTotal - MemFree - Buffers - Cached - Slab - # Mem: free | MemFree ^^^^^^^^^ ^^^^ - # Buffers and Slab are close to zero 99% of the time - # - # see https://access.redhat.com/solutions/406773 - - mf = meminfo_stats["MemFree"] - mc = meminfo_stats["Cached"] - - baremetal_memory_stats.addRow( - [ - s["timestamp"]["UTC"], - int(mem_total_bytes - mf - mc), # compute used memory - int(mc), # cached - int(mf), # free - ] - ) - - # Produce the javascript: - self.output_page.appendGoogleChart( - GoogleChartsGraph( - data=baremetal_memory_stats, # Data - graph_title="Memory usage in Bytes (from baremetal stats)", - button_label="Memory", - y_axes_titles=["Memory (B)"], - graph_source=GRAPH_SOURCE_DATA_IS_BAREMETAL, - stack_state=True, - ) - ) - return - - def generate_cgroup_memory(self): - # if cgroup data was not collected, just return: - if "cgroup_memory_stats" not in self.sample_template: - return - - # - # MAIN LOOP - # Process JSON sample and build Google Chart-compatible Javascript variable - # See https://developers.google.com/chart/interactive/docs/reference - # - - mem_limit_bytes = self.__cgroup_get_memory_limit() - if mem_limit_bytes > 0: - # it is possible to compute the "free" memory inside this cgroup and it's possible to have allocation failures - # so in such case we add 2 more data series to the chart: - cgroup_memory_stats = GoogleChartsTimeSeries( - ["Timestamp", "Used", "Cached (DiskRead)", "Free", "Alloc Failures"], ["", "B", "B", "B", ""] - ) - y_axes_max_value = [None, 0] - columns_for_2nd_yaxis = ["Alloc Failures"] - y_axes_titles = ["Memory (B)", "Alloc Failures"] - else: - # no memory limit... creating the "free" series (considering all the system memory as actual limit) likely produces weird - # results out-of-scale so we do not place any "idle" column. The "alloc failures" column does not apply either. - assert mem_limit_bytes == -1 - cgroup_memory_stats = GoogleChartsTimeSeries(["Timestamp", "Used", "Cached (DiskRead)"], ["", "B", "B"]) - y_axes_max_value = [None] # use default GoogleChart logic - columns_for_2nd_yaxis = None - y_axes_titles = ["Memory (B)"] - - n_invalid_samples = 0 - # max_mfail = 0 - for i, s in enumerate(self.jdata): - if i == 0: - continue # skip first sample - - try: - # mu = memory actually Used - # mc = memory used as Cache - # mfail = memory alloc failures inside cgroup - if self.cgroup_ver == 1: - mu = s["cgroup_memory_stats"]["stat.rss"] - mc = s["cgroup_memory_stats"]["stat.cache"] - mfail = s["cgroup_memory_stats"]["events.failcnt"] - else: - # cgroups v2 - mu = s["cgroup_memory_stats"]["stat.anon"] - mc = s["cgroup_memory_stats"]["stat.file"] - mfail = s["cgroup_memory_stats"]["events.oom_kill"] - - mfree = mem_limit_bytes - mu - mc - # max_mfail = max(max_mfail, mfail) - - if mem_limit_bytes > 0: - cgroup_memory_stats.addRow( - [ - s["timestamp"]["UTC"], - int(mu), - int(mc), - int(mfree), - mfail, - ] - ) - else: - cgroup_memory_stats.addRow( - [ - s["timestamp"]["UTC"], - int(mu), - int(mc), - ] - ) - - except KeyError as e: # avoid crashing if a key is not present in the dictionary... 
- print(f"Missing cgroup data while parsing {i}-th sample: {e}") - n_invalid_samples += 1 - pass - - self.__print_data_loading_stats("cgroup memory", n_invalid_samples) - - # Produce the javascript: - # NOTE: on 2nd axis we try to keep the plotted line below the ones that belong to the first axis (to avoid cluttering) - # and we also add some offset to deal with the case where "max_mfail is zero" - # if mem_limit_bytes > 0: - # y_axes_max_value = [None, max_mfail * 5 + 10] - self.output_page.appendGoogleChart( - GoogleChartsGraph( - data=cgroup_memory_stats, # Data - graph_title=f"Used memory in Bytes measured inside cgroup ({self.__cgroup_get_memory_limit_human_friendly()})", - button_label="Memory", - graph_source=GRAPH_SOURCE_DATA_IS_CGROUP_TOTALS, - stack_state=True, - y_axes_titles=y_axes_titles, - columns_for_2nd_yaxis=columns_for_2nd_yaxis, - y_axes_max_value=y_axes_max_value, - ) - ) - - return - - def generate_baremetal_avg_load(self): - # - # MAIN LOOP - # Process JSON sample and build Google Chart-compatible Javascript variable - # See https://developers.google.com/chart/interactive/docs/reference - # - - num_baremetal_cpus = len(self.baremetal_logical_cpus_indexes) - if num_baremetal_cpus == 0: - num_baremetal_cpus = 1 - load_avg_stats = GoogleChartsTimeSeries(["Timestamp", "LoadAvg (1min)", "LoadAvg (5min)", "LoadAvg (15min)"]) - for i, s in enumerate(self.jdata): - if i == 0: - continue # skip first sample - - # - # See https://linux.die.net/man/5/proc - # and https://blog.appsignal.com/2018/03/28/understanding-system-load-and-load-averages.html - # - # "The load of a system is essentially the number of processes active at any given time. - # When idle, the load is 0. When a process starts, the load is incremented by 1. - # A terminating process decrements the load by 1. Besides running processes, - # any process that's queued up is also counted. So, when one process is actively using the CPU, - # and two are waiting their turn, the load is 3." - # ... - # "Generally, single-core CPU can handle one process at a time. An average load of 1.0 would mean - # that one core is busy 100% of the time. If the load average drops to 0.5, the CPU has been idle - # for 50% of the time." 
- - # since kernel reports a percentage in range [0-n], where n= number of cores, - # we remap that in range [0-100%] - - load_avg_stats.addRow( - [ - s["timestamp"]["UTC"], - 100 * float(s["proc_loadavg"]["load_avg_1min"]) / num_baremetal_cpus, - 100 * float(s["proc_loadavg"]["load_avg_5min"]) / num_baremetal_cpus, - 100 * float(s["proc_loadavg"]["load_avg_15min"]) / num_baremetal_cpus, - ] - ) - - # Produce the javascript: - self.output_page.appendGoogleChart( - GoogleChartsGraph( - data=load_avg_stats, # Data - graph_title="Average Load (from baremetal stats)", - button_label="Average Load", - y_axes_titles=["Load (%)"], - graph_source=GRAPH_SOURCE_DATA_IS_BAREMETAL, - stack_state=False, - ) - ) - return - - def generate_monitoring_summary(self): - monitoring_summary = [ - # ( "User:", self.jheader["cmonitor"]["username"] ), # not really useful - ("Collected:", self.jheader["cmonitor"]["collecting"].replace(",", ", ")), - # ( "Started sampling at:", self.sample_template["timestamp"]["datetime"] + " (Local)" ), # not really useful - ("Started sampling at:", self.jdata[0]["timestamp"]["UTC"] + " (UTC)"), - ("Samples:", str(len(self.jdata))), - ("Sampling Interval (s):", str(self.jheader["cmonitor"]["sample_interval_seconds"])), - ( - "Total time sampled (hh:mm:ss):", - str(datetime.timedelta(seconds=self.jheader["cmonitor"]["sample_interval_seconds"] * len(self.jdata))), - ), - ("Version (cmonitor_collector):", self.jheader["cmonitor"]["version"]), - ("Version (cmonitor_chart):", CmonitorToolVersion().get()), - ] - self.output_page.appendHtmlTable("Monitoring Summary", monitoring_summary) - - def __generate_monitored_summary_with_cpus(self, logical_cpus_indexes): - # NOTE: unfortunately some useful information like: - # - RAM memory model/speed - # - Disk model/speed - # - NIC model/speed - # will not be available from inside a container, which is where cmonitor_collector usually runs... 
- # so we mostly show CPU stats: - all_disks = [] - if "disks" in self.sample_template: - all_disks = self.sample_template["disks"].keys() - all_netdevices = [] - if "network_interfaces" in self.sample_template: - all_netdevices = self.sample_template["network_interfaces"].keys() - all_numanodes = [] - if "numa_nodes" in jheader: - all_numanodes = list(jheader["numa_nodes"].keys()) - all_numanodes = [v.replace("node", "") for v in all_numanodes] - - cpu_model = "Unknown" - bogomips = "Unknown" - if "cpuinfo" in self.jheader: - first_cpu = list(self.jheader["cpuinfo"].keys())[0] - cpu_model = self.jheader["cpuinfo"][first_cpu]["model_name"] - bogomips = str(self.jheader["cpuinfo"][first_cpu]["bogomips"]) - - monitored_summary = [ - ("Hostname:", self.jheader["identity"]["hostname"]), - ("OS:", self.jheader["os_release"]["pretty_name"]), - ("CPU:", cpu_model), - ("BogoMIPS:", bogomips), - ("Monitored CPUs:", str(len(logical_cpus_indexes))), - ("Monitored Disks:", str(len(all_disks))), - ("Monitored Network Devices:", str(len(all_netdevices))), - ("Monitored NUMA Nodes:", ",".join(all_numanodes)), - ] - return monitored_summary - - def generate_monitored_summary(self): - if len(self.baremetal_logical_cpus_indexes) > 0: - self.output_page.appendHtmlTable( - "Monitored System Summary", - self.__generate_monitored_summary_with_cpus(self.baremetal_logical_cpus_indexes), - ) - elif len(self.cgroup_logical_cpus_indexes) > 0: - self.output_page.appendHtmlTable( - "Monitored System Summary", - self.__generate_monitored_summary_with_cpus(self.cgroup_logical_cpus_indexes), - ) - - def generate_about_this(self): - about_this = [ - ("Zoom:", "use left-click and drag"), - ("Reset view:", "use right-click"), - ("Generated by", 'cmonitor'), - ] - self.output_page.appendHtmlTable("About this", about_this, div_class="bottom_about_div") - - def generate_html(self, top_scorer): - # baremetal stats: - self.generate_baremetal_cpus() - self.generate_baremetal_memory() - self.generate_baremetal_network_traffic() - self.generate_baremetal_disks_io() - self.generate_baremetal_avg_load() - - # cgroup stats: - self.generate_cgroup_cpus() - self.generate_cgroup_memory() - self.generate_cgroup_topN_procs(top_scorer) - self.generate_cgroup_network_traffic() - - # HTML HEAD -- generate all the JS code to draw all the graphs created so far - self.output_page.writeHtmlHead() - - # HTML BODY -- now we start actual HTML body which are just a few tables with buttons - # that invoke the JS code produced earlier inside the - self.__make_jheader_nicer() - self.output_page.startHtmlBody(self.cgroup_name, self.monitored_system, self.jheader, self.collected_threads) - - self.generate_monitoring_summary() - self.generate_monitored_summary() - self.generate_about_this() - - self.output_page.endHtmlBody() - # ======================================================================================================= # CLI options @@ -2112,11 +59,10 @@ def parse_command_line(): global verbose verbose = args.verbose - global g_datetime # instead of default 'datetime' which means local timezone # FIXME: currently the presence/absence of --utc flag is ignored... 
we need to add code that reads from the JSON header the timezone offset # and that applies the offset on top of the UTC timestamps - g_datetime = "UTC" + datetime = "UTC" if args.version: CmonitorToolVersion().print() @@ -2165,10 +111,11 @@ def parse_command_line(): if __name__ == "__main__": config = parse_command_line() start_time = time.time() + my_ver = CmonitorToolVersion().get() # load the JSON entry = CmonitorCollectorJsonLoader().load( - config["input_json"], this_tool_version=CmonitorToolVersion().get(), min_num_samples=2, be_verbose=verbose + config["input_json"], this_tool_version=my_ver, min_num_samples=2, be_verbose=verbose ) jheader = entry["header"] jdata = entry["samples"] @@ -2176,8 +123,8 @@ def parse_command_line(): print("Found %d data samples" % len(jdata)) print("Opening output file [%s]" % config["output_html"]) - graph_generator = CMonitorGraphGenerator(config["output_html"], jheader, jdata) - graph_generator.generate_html(config["top_scorer"]) + graph_generator = CMonitorGraphGenerator(config["output_html"], jheader, jdata, be_verbose=verbose) + graph_generator.generate_html(config["top_scorer"], version=my_ver) end_time = time.time() print("Completed processing of input JSON file of %d samples in %.3fsec. HTML output file is ready." % (len(jdata), end_time - start_time)) diff --git a/tools/common-code/cmonitor_chart_engine.py b/tools/common-code/cmonitor_chart_engine.py new file mode 100644 index 00000000..c93af63b --- /dev/null +++ b/tools/common-code/cmonitor_chart_engine.py @@ -0,0 +1,2079 @@ +#!/usr/bin/python3 + +# +# cmonitor_chart_engine.py +# Originally based on the "njmonchart_aix_v7.py" from Nigel project: http://nmon.sourceforge.net/ +# +# Author: Francesco Montorsi +# Created: April 2019 +# + +import sys +import json +import gzip +import datetime +import zlib +import binascii +import textwrap +import argparse +import getopt +import os +import time +from cmonitor_version import CmonitorToolVersion + +# ======================================================================================================= +# CONSTANTS +# ======================================================================================================= + +GRAPH_SOURCE_DATA_IS_BAREMETAL = 1 +GRAPH_SOURCE_DATA_IS_CGROUP_TOTALS = 2 +GRAPH_SOURCE_DATA_IS_CGROUP_PER_PROCESS = 3 + +GRAPH_TYPE_AREA_CHART = 1 +GRAPH_TYPE_BUBBLE_CHART = 2 + +SAVE_DEFLATED_JS_DATATABLES = True +JS_INDENT_SIZE = 2 + +# see https://developers.google.com/chart/interactive/docs/reference#dateformat +# the idea is that cmonitor_chart will most likely be used to explore short time intervals +# so that day/month/year part is not useful, just the time is useful; in tooltip we also +# reach millisec accuracy: +X_AXIS_DATEFORMAT = "HH:mm:ss" +TOOLTIP_DATEFORMAT = "HH:mm:ss.SSS z" + + +# ======================================================================================================= +# GoogleChartsTimeSeries +# ======================================================================================================= + + +class GoogleChartsTimeSeries(object): + """ + GoogleChartsTimeSeries is a (N+1)xM table of + t_1;Y1_1;Y2_1;...;YN_1 + t_2;Y1_2;Y2_2;...;YN_2 + ... 
+ t_M;Y1_M;Y2_M;...;YN_M + data points for a GoogleCharts graph that is representing the evolution of N quantities over time + """ + + def __init__(self, column_names, column_units=None): + self.column_names = column_names # must be a LIST of strings + self.column_units = column_units + if self.column_units: + assert len(self.column_units) == len(self.column_names) + self.rows = [] # list of lists with values + + def ISOdatetimeToJSDate(self, date): + """convert ISO datetime strings like + "2017-08-21T20:12:30.123" + to strings like: + "Date(2017,8,21,20,12,30,123000)" + which are the datetime representation suitable for JS GoogleCharts, see + https://developers.google.com/chart/interactive/docs/datesandtimes + """ + dateAsPythonObj = datetime.datetime.strptime(date, "%Y-%m-%dT%H:%M:%S.%f") + + return "Date(%d,%d,%d,%d,%d,%d,%d)" % ( + dateAsPythonObj.year, + dateAsPythonObj.month, + dateAsPythonObj.day, + dateAsPythonObj.hour, + dateAsPythonObj.minute, + dateAsPythonObj.second, + dateAsPythonObj.microsecond / 1000, # NOTE: the JavaScript Date() object wants milliseconds + ) + + def addRow(self, row_data_list): + assert len(row_data_list) == len(self.column_names) + + # convert first column to a GoogleCharts-compatible datetime: + row_data_list[0] = self.ISOdatetimeToJSDate(row_data_list[0]) + self.rows.append(row_data_list) + + def getRow(self, index): + return self.rows[index] + + def getListColumnNames(self): + return self.column_names + + def getNumDataSeries(self): + # assuming first column is the timestamp, the number of "data series" + # present in this table is all remaining columns + assert len(self.column_names) >= 2 + return len(self.column_names) - 1 + + def getMaxValueDataSerie(self, column_index): + # WARNING: this looks very inefficient! + assert column_index >= 0 and column_index <= len(self.column_names) - 1 + ret = 0 + for r in self.rows: + ret = max(ret, r[1 + column_index]) + return ret + + def getDataSeriesIndexByName(self, column_name): + assert column_name != self.column_names[0] # the first column is not a "data serie", it's the timestamp column! 
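+        # For instance (illustrative values, not from a real capture): with
+        # column_names == ["Timestamp", "User", "System"], looking up "System" finds
+        # list index 2 and returns data-series index 1, compensating for the leading
+        # timestamp column.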
+ try: + col_idx = self.column_names.index(column_name) + assert col_idx >= 1 + return col_idx - 1 # the first data serie, with index 0, is the column immediately after the timestamp column + except ValueError: + # column name not found + return -1 + + def writeTo(self, file): + for r in self.rows: + # assume first column is always the timestamp: + row_text = "['Date(%s)'," % r[0] + row_text += ",".join(str(x) for x in r[1:]) + row_text += "],\n" + file.write(row_text) + + def toJSONForJS(self): + ret = "[[" # start 2D JSON array + + # convert 1st column: + assert self.column_names[0] == "Timestamp" + ret += '{"type":"datetime","label":"Datetime"},' + + # convert all other columns: + for colName in self.column_names[1:]: + ret += '"' + colName + '",' + ret = ret[:-1] + + # separe first line; start conversion of actual table data: + ret += "]," + + data = json.dumps(self.rows, separators=(",", ":")) + data = data[1:] + + return ret + data + + def toDeflatedJSONBase64Encoded(self): + """Returns this table in JSON format (for JS), deflated using zlib, and represented as a Base64-encoded ASCII string""" + json_string = self.toJSONForJS() + json_compressed_bytearray = zlib.compress(json_string.encode(), 9) + + ret = str(binascii.b2a_base64(json_compressed_bytearray)) + return ret[1:] + + def toGoogleChartTable(self, graphName): + """Writes in the given file the JavaScript GoogleCharts object representing this table""" + ret_string = "" + if SAVE_DEFLATED_JS_DATATABLES: + # to reduce the HTML size save the deflated, serialized JSON of the 2D JS array: + ret_string += "var deflated_data_base64_%s = %s;\n" % ( + graphName, + self.toDeflatedJSONBase64Encoded(), + ) + + # then convert it base64 -> JS binary string + ret_string += "var deflated_data_binary_%s = window.atob(deflated_data_base64_%s);\n" % (graphName, graphName) + + # now inflate it in the browser using "pako" library (https://github.com/nodeca/pako) + ret_string += "var inflated_data_%s = JSON.parse(pako.inflate(deflated_data_binary_%s, { to: 'string' }));\n" % (graphName, graphName) + else: + ret_string += "var inflated_data_%s = %s;\n" % ( + graphName, + self.toJSONForJS(), + ) + + # finally create the GoogleCharts table from it: + ret_string += "var data_%s = google.visualization.arrayToDataTable(inflated_data_%s);\n\n" % (graphName, graphName) + + # add DateFormatter to use custom formatting of the 1st column (like everywhere else we assume first column is the timestamp) + ret_string += "var date_formatter = new google.visualization.DateFormat({pattern: '%s'});\n" % (TOOLTIP_DATEFORMAT) + ret_string += "date_formatter.format(data_%s, 0);\n" % (graphName) + + if self.column_units: + column_units_strings = ["'" + v + "'" for v in self.column_units] + + # add Javascript code to set the formatted value on EACH and EVERY single entry of the table (except timestamp); + # this improves greatly the readability of TOOLTIPs generated by Google Charts: instead of showing very large numbers + # they will show up nice "k", "M" and "G" units + ret_string += """ +var column_unit = [%s] +for (var c=1; c < data_%s.getNumberOfColumns(); c++) { + for (var r=0; r < data_%s.getNumberOfRows(); r++) { + var v = data_%s.getValue(r, c); + data_%s.setFormattedValue(r, c, prettyPrinter(v) + column_unit[c]); + } +} + +""" % ( + ",".join(column_units_strings), + graphName, + graphName, + graphName, + graphName, + ) + + return ret_string + + +# ======================================================================================================= +# 
GoogleChartsGenericTable +# ======================================================================================================= + + +class GoogleChartsGenericTable(object): + """ + This is the NxM table of + Y1_1;Y2_1;...;YN_1 + ... + Y1_M;Y2_M;...;YN_M + data points for a GoogleCharts graph for M different objects characterized by N features. + This class is useful to create graphs which are NOT related to a measurement that evolves over TIME. + + Currently this class is used only for the generation of bubble charts, which are, by their nature, + suited to represent relationships among different features (in our case total IO, memory and CPU usage) + """ + + def __init__(self, column_names): + self.column_names = column_names # must be a LIST of strings + self.rows = [] # list of lists with values + + def addRow(self, row_data_list): + assert len(row_data_list) == len(self.column_names) + self.rows.append(row_data_list) + + def getRow(self, index): + return self.rows[index] + + def getListColumnNames(self): + return self.column_names + + def getNumDataSeries(self): + # assuming first column is the timestamp, the number of "data series" + # present in this table is all remaining columns + return len(self.column_names) - 1 + + def writeTo(self, file): + for r in self.rows: + file.write(",".join(r)) + + def toJSONForJS(self): + ret = "[[" # start 2D JSON array + + # convert all other columns: + for colName in self.column_names: + ret += '"' + colName + '",' + ret = ret[:-1] + + # separe first line; start conversion of actual table data: + ret += "]," + + data = json.dumps(self.rows, separators=(",", ":")) + data = data[1:] + + return ret + data + + def toDeflatedJSONBase64Encoded(self): + """Returns this table in JSON format (for JS), deflated using zlib, and represented as a Base64-encoded ASCII string""" + json_string = self.toJSONForJS() + json_compressed_bytearray = zlib.compress(json_string.encode(), 9) + + ret = str(binascii.b2a_base64(json_compressed_bytearray)) + return ret[1:] + + def toGoogleChartTable(self, graphName): + """Writes in the given file the JavaScript GoogleCharts object representing this table""" + ret_string = "" + if SAVE_DEFLATED_JS_DATATABLES: + # to reduce the HTML size save the deflated, serialized JSON of the 2D JS array: + ret_string += "var deflated_data_base64_%s = %s;\n" % ( + graphName, + self.toDeflatedJSONBase64Encoded(), + ) + + # then convert it base64 -> JS binary string + ret_string += "var deflated_data_binary_%s = window.atob(deflated_data_base64_%s);\n" % (graphName, graphName) + + # now inflate it in the browser using "pako" library (https://github.com/nodeca/pako) + ret_string += "var inflated_data_%s = JSON.parse(pako.inflate(deflated_data_binary_%s, { to: 'string' }));\n" % (graphName, graphName) + else: + ret_string += "var inflated_data_%s = %s;\n" % ( + graphName, + self.toJSONForJS(), + ) + + # finally create the GoogleCharts table from it: + ret_string += "var data_%s = google.visualization.arrayToDataTable(inflated_data_%s);\n" % (graphName, graphName) + return ret_string + + +# ======================================================================================================= +# GoogleChartsGraph +# ======================================================================================================= + + +class GoogleChartsGraph: + """ + This is a simple object that can generate a JavaScript snippet (to be embedded in HTML output page) + that will render at runtime a GoogleChart drawing inside a JavaScript-enabled browser of 
course.
+
+    It supports Google AreaChart (see https://developers.google.com/chart/interactive/docs/gallery/areachart)
+    with 1 or 2 Y axes. The data series that are placed on the 2nd Y axis are highlighted
+    automatically by using a thick red line.
+    """
+
+    def __init__(
+        self,
+        js_name=None,
+        data=None,
+        button_label="",
+        combobox_label="",
+        combobox_entry="",
+        graph_source=GRAPH_SOURCE_DATA_IS_BAREMETAL,
+        graph_type=GRAPH_TYPE_AREA_CHART,
+        graph_title="",
+        stack_state=False,
+        y_axes_titles=[],
+        y_axes_max_value=[None],
+        columns_for_2nd_yaxis=None,
+    ):
+        self.data_table = data  # of type GoogleChartsGenericTable or GoogleChartsTimeSeries
+        self.button_label = button_label
+        self.combobox_label = combobox_label
+        assert (len(self.button_label) == 0 and len(self.combobox_label) > 0) or (len(self.button_label) > 0 and len(self.combobox_label) == 0)
+        self.combobox_entry = combobox_entry
+        self.source_data = graph_source  # one of the GRAPH_SOURCE_DATA_IS_* constants
+        self.stack_state = stack_state
+        self.graph_type = graph_type
+        self.graph_title = graph_title.strip() + (", STACKED graph" if self.stack_state else "")
+        self.y_axes_titles = y_axes_titles
+        self.columns_for_2nd_yaxis = columns_for_2nd_yaxis
+        self.y_axes_max_value = y_axes_max_value
+        self.js_name = js_name
+
+    def setJSName(self, js_name):
+        self.js_name = js_name
+
+    def __genGoogleChartJS_AreaChart(self):
+        """After the JavaScript line graph data is output, the data is terminated and the graph options are set"""
+
+        def __internalWriteSeries(series_indexes, target_axis_index):
+            ret = ""
+            for i, idx in enumerate(series_indexes, start=0):
+                if target_axis_index == 0:
+                    ret += " %d: {targetAxisIndex:%d}" % (idx, target_axis_index)
+                else:
+                    # IMPORTANT: the data series that go on the 2nd Y axis (typically just one) are drawn with a red thick line
+                    # to underline their importance; area opacity is removed to avoid clutter with data series on the first Y axis
+                    ret += " %d: {targetAxisIndex:%d, lineWidth: 5, areaOpacity: 0, color: 'red', lineDashStyle: [10,2]}" % (
+                        idx,
+                        target_axis_index,
+                    )
+                # print("i=%d, idx=%d, target_axis_index=%d" % (i, idx, target_axis_index))
+                if i < len(series_indexes):
+                    ret += ",\n"
+                else:
+                    ret += "\n"
+            return ret
+
+        def __internalWriteVAxis(v_axis_idx, max_value, title, data_series_indexes):
+            ret = ""
+            if max_value is None:
+                # let Google Chart automatically determine min/max on this axis
+                ret += ' %d: { title: "%s", format: "short" },\n' % (v_axis_idx, title)
+            elif max_value == 0:
+                # autocompute the best MAX
+                actual_max = 0
+                for idx in data_series_indexes:
+                    actual_max = max(actual_max, self.data_table.getMaxValueDataSerie(idx))
+                ret += ' %d: { title: "%s", format: "short", minValue: -1, maxValue: %d },\n' % (v_axis_idx, title, actual_max * 5 + 10)
+            else:
+                ret += ' %d: { title: "%s", format: "short", minValue: -1, maxValue: %d },\n' % (v_axis_idx, title, max_value)
+            return ret
+
+        ret_string = ""
+        ret_string += "var options_%s = {\n" % (self.js_name)
+        ret_string += ' chartArea: {left: "5%", width: "85%", top: "10%", height: "80%"},\n'
+        ret_string += ' title: "%s",\n' % (self.graph_title)
+        ret_string += ' focusTarget: "category",\n'
+
+        # by default this tool plots the top 20 processes; in that case both the tooltip and the legend will have up to 21 rows (including time)
+        # so we make the font a bit smaller, to make it more likely that all the lines are visible
+        ret_string += " tooltip: { textStyle: { fontSize: 12 } },\n"
+        ret_string += " legend: { textStyle: { fontSize: 12 
} },\n" + ret_string += ' explorer: { actions: ["dragToZoom", "rightClickToReset"], keepInBounds: true, maxZoomIn: 20.0 },\n' + + # HORIZONTAL AXIS + ret_string += ' hAxis: { format: "%s", gridlines: { color: "lightgrey", count: 30 } },\n' % X_AXIS_DATEFORMAT + + # VERTICAL AXIS (OR AXES) + if self.columns_for_2nd_yaxis: + # compute indexes of series that use the 2nd Y axis: + series_for_2nd_yaxis = [] + for colname in self.columns_for_2nd_yaxis: + idx = self.data_table.getDataSeriesIndexByName(colname) + assert idx != -1, f"Column named {colname} is not a column inside the data table!" + series_for_2nd_yaxis.append(idx) + # print("series_for_2nd_yaxis: %s" % ",".join(str(x) for x in series_for_2nd_yaxis)) + + # compute indexes of series that use 1st Y axis: + all_indexes = range(0, self.data_table.getNumDataSeries()) + series_for_1st_yaxis = [idx for idx in all_indexes if idx not in series_for_2nd_yaxis] + # print("series_for_1st_yaxis: %s" % ",".join(str(x) for x in series_for_1st_yaxis)) + + # assign data series to the 2 Y axes: + ret_string += " series: {\n" + ret_string += __internalWriteSeries(series_for_1st_yaxis, 0) + ret_string += __internalWriteSeries(series_for_2nd_yaxis, 1) + ret_string += " },\n" + + # check data + assert len(self.y_axes_titles) == 2 + assert len(self.y_axes_max_value) == 2, f"Got {self.y_axes_max_value}, but columns_for_2nd_yaxis={self.columns_for_2nd_yaxis}" + + # allocate 2 Y axes: + ret_string += " vAxes: {\n" + ret_string += __internalWriteVAxis(0, self.y_axes_max_value[0], self.y_axes_titles[0], series_for_1st_yaxis) + ret_string += __internalWriteVAxis(1, self.y_axes_max_value[1], self.y_axes_titles[1], series_for_2nd_yaxis) + ret_string += " },\n" + else: + # single vertical axis: + assert len(self.y_axes_titles) == 1 + ret_string += ' vAxis: { title: "%s", format: "short", gridlines: { color: "lightgrey", count: 11 } },\n' % str(self.y_axes_titles[0]) + + # graph stacking + if self.stack_state: + ret_string += " isStacked: 1\n" + else: + ret_string += " isStacked: 0\n" + + ret_string += "};\n" # end of "options_%s" variable + ret_string += "\n" + ret_string += "set_main_chart_div_as_visible();\n" + ret_string += "if (g_chart && g_chart.clearChart)\n" + ret_string += " g_chart.clearChart();\n" + ret_string += 'g_chart = new google.visualization.AreaChart(document.getElementById("chart_master_div"));\n' + + # immediately before drawing the chart, add a listener to hack some ugly labeling by Google Charts + ret_string += "google.visualization.events.addListener(g_chart, 'ready', fix_vaxis_ticks);\n" + ret_string += "g_chart.draw(data_%s, options_%s);\n" % ( + self.js_name, + self.js_name, + ) + + ret_string += "g_current_data = data_%s;\n" % (self.js_name) + ret_string += "g_current_options = options_%s;\n" % (self.js_name) + + # this graph will be activated by either + # - a button that should reset all comboboxes of the page + # - a combo box entry that should reset all other comboboxes in the page + ret_string += 'reset_combo_boxes("%s");\n' % self.combobox_label + + return ret_string + + def __genGoogleChartJS_BubbleChart(self): + assert len(self.y_axes_titles) == 2 + ret_string = "" + ret_string += "var options_%s = {\n" % (self.js_name) + ret_string += ' explorer: { actions: ["dragToZoom", "rightClickToReset"], keepInBounds: true, maxZoomIn: 20.0 },\n' + ret_string += ' chartArea: { left: "5%", width: "85%", top: "10%", height: "80%" },\n' + ret_string += ' title: "%s",\n' % (self.graph_title) + ret_string += ' hAxis: { title:"%s" },\n' % 
str(self.y_axes_titles[0]) + ret_string += ' vAxis: { title:"%s", format:"short" },\n' % str(self.y_axes_titles[1]) + ret_string += " sizeAxis: { maxSize: 200 },\n" + ret_string += " bubble: { textStyle: {fontSize: 15} }\n" + ret_string += "};\n" # end of "options_%s" variable + ret_string += "\n" + ret_string += "if (g_chart && g_chart.clearChart)\n" + ret_string += " g_chart.clearChart();\n" + ret_string += "set_main_chart_div_as_visible();\n" + ret_string += 'g_chart = new google.visualization.BubbleChart(document.getElementById("chart_master_div"));\n' + ret_string += "g_chart.draw(data_%s, options_%s);\n" % ( + self.js_name, + self.js_name, + ) + ret_string += "g_current_data = data_%s;\n" % (self.js_name) + ret_string += "g_current_options = options_%s;\n" % (self.js_name) + return ret_string + + def toGoogleChartJS(self): + # generate the JS + js_code_inner = self.data_table.toGoogleChartTable(self.js_name) + + if self.graph_type == GRAPH_TYPE_AREA_CHART: + js_code_inner += self.__genGoogleChartJS_AreaChart() + else: + js_code_inner += self.__genGoogleChartJS_BubbleChart() + + js_code = "function draw_%s() {\n" % (self.js_name) + js_code += textwrap.indent(js_code_inner, " " * JS_INDENT_SIZE) + js_code += "}\n" # end of draw_%s function + js_code += "\n" + + return js_code + + +# ======================================================================================================= +# HtmlOutputPage +# ======================================================================================================= + + +class HtmlOutputPage: + """ + This is able to produce a self-contained HTML page with embedded JavaScript to draw performance charts + """ + + def __init__(self, outfile, title, be_verbose=False): + self.title = title + self.outfile = outfile + self.file = open(outfile, "w") # Open the output file + self.graphs = [] + self.num_generated_charts = 1 + self.verbose = be_verbose + + def appendGoogleChart(self, chart): + assert isinstance(chart, GoogleChartsGraph) + + # set the unique name for this page + chart.setJSName("graph" + str(self.num_generated_charts)) + self.num_generated_charts += 1 + + # save the graph for later + self.graphs.append(chart) + + def writeHtmlHead(self): + """Write the head of the HTML webpage and start the JS section""" + self.file.write( + """ + + + {pageTitle}""".format( + pageTitle=self.title + ) + ) + + self.file.write( + """ + + + + \n") + self.file.write("\n") + + def startHtmlBody(self, cgroup_name, monitored_system, jheader, collected_threads): + self.file.write("\n") + self.file.write('

Data collected from ' + monitored_system + "

\n") + self.file.write('
\n') + self.file.write(' \n') + + # Table header row + self.file.write(" \n") + self.file.write(' \n') + self.file.write(' \n' % cgroup_name) + if collected_threads: + self.file.write(' \n') + else: + self.file.write(' \n') + self.file.write(' \n') + self.file.write(" \n") + + # Datarow + self.file.write(" \n") + self.file.write(' \n") + self.file.write("
Static InfoCGroup stats (Data collected from %s)CGroup per-thread stats (Data collected from cgroup and /proc)CGroup per-process stats (Data collected from cgroup and /proc)Baremetal stats (Data collected only from /proc)
\n') + self.file.write('
\n') + self.file.write('
\n') + + def write_buttons_for_graph_type(source_data): + nwritten_controls = 0 + + # find all graphs that will be activated through a combobox + graphs_combobox = {} + for num, graph in enumerate(self.graphs, start=1): + if graph.source_data == source_data and len(graph.combobox_label) > 0: + if graph.combobox_label not in graphs_combobox: + # add new dict entry as empty list + graphs_combobox[graph.combobox_label] = [] + + # add to the existing dict entry a new graph: + graphs_combobox[graph.combobox_label].append([graph.combobox_entry, graph.js_name]) + + # generate the CPU select box: + if len(graphs_combobox) > 0: + for combobox_label in graphs_combobox.keys(): + graph_list = graphs_combobox[combobox_label] + self.file.write(' \n") + nwritten_controls += 1 + + # find in all graphs registered so far all those related to the CGROUP + for num, graph in enumerate(self.graphs, start=1): + if graph.source_data == source_data: + if len(graph.combobox_label) > 0: + continue # skip - already drawn via \n') + write_buttons_for_graph_type(GRAPH_SOURCE_DATA_IS_CGROUP_PER_PROCESS) + self.file.write(' \n') + write_buttons_for_graph_type(GRAPH_SOURCE_DATA_IS_BAREMETAL) + + self.file.write("
\n") + self.file.write("
\n") + # self.file.write("

\n") + + # finally generate the MAIN div: i.e. where the selected chart is going to be drawn: + self.file.write( + '

...click on a button above to show a graph...

\n' + ) + + def configdump(jheader, section, displayName): + # newstr = '

' + displayName + '

\n' + newstr = " " + displayName + "\n" + config_dict = jheader[section] + for label in config_dict: + newstr += " %s%s\n" % ( + label.capitalize().replace("_", " "), + str(config_dict[label]), + ) + return newstr + + def aggregate_cpuinfo(jheader): + cpudict = {} + + # first take the unique strings about the CPU vendor/model + for field_name in ["vendor_id", "model_name"]: + cpudict[field_name] = set() + for cpu_name in jheader["cpuinfo"].keys(): + cpudict[field_name].add(jheader["cpuinfo"][cpu_name][field_name]) + + # secondly take the unique values of min/max frequency, MIPS, cache size + for field_name in ["scaling_min_freq_mhz", "scaling_max_freq_mhz", "bogomips", "cache_size_kb"]: + cpudict[field_name] = set() + for cpu_name in jheader["cpuinfo"].keys(): + cpuinfo_from_header = jheader["cpuinfo"][cpu_name] + if field_name in cpuinfo_from_header: # these fields are optionals: cmonitor_collector may not be able to populate them + cpudict[field_name].add(int(cpuinfo_from_header[field_name])) + + # now convert each dictionary entry from a set() to a simple string: + for field_name in cpudict.keys(): + the_list = [str(v) for v in cpudict[field_name]] + # join by comma each set() inside the dict: + if len(the_list) > 0: + cpudict[field_name] = ",".join(the_list) + else: + cpudict[field_name] = "Not Available" + + return cpudict + + # immediately after the MAIN div, the element where the configuration info are shown (when toggled): + self.file.write('
\n') + self.file.write("

Monitored System Details

\n") + self.file.write(" \n") + self.file.write(configdump(jheader, "identity", "Server Identity")) + self.file.write(configdump(jheader, "os_release", "Operating System Release")) + self.file.write(configdump(jheader, "proc_version", "Linux Kernel Version")) + if "cgroup_config" in jheader: # if cgroups are off, this section will not be present + self.file.write(configdump(jheader, "cgroup_config", "Linux Control Group (CGroup) Configuration")) + if "cpuinfo" in jheader: + jheader["cpu_summary"] = aggregate_cpuinfo(jheader) + self.file.write(configdump(jheader, "cpu_summary", "CPU Overview")) + if "numa_nodes" in jheader: + self.file.write(configdump(jheader, "numa_nodes", "NUMA Overview")) + if "proc_meminfo" in jheader: + self.file.write(configdump(jheader, "proc_meminfo", "Memory Overview")) + # self.file.write(configdump(jheader, "cpuinfo", "CPU Core Details") + self.file.write("
\n") + self.file.write("

CMonitor Collector

\n") + self.file.write(" \n") + self.file.write(configdump(jheader, "cmonitor", "Performance Stats Collector Configuration")) + if "custom_metadata" in jheader: + if len(jheader["custom_metadata"]) > 0: + self.file.write(configdump(jheader, "custom_metadata", "Custom Metadata")) + self.file.write("
\n") + self.file.write("
\n") # end of 'config_viewer_div' + + def appendHtmlTable(self, name, table_entries, div_class="bottom_div"): + self.file.write("
\n") + self.file.write("

" + name + "

\n") + self.file.write(" \n") + self.file.write(" \n") + self.file.write("
    \n") + for i, entry in enumerate(table_entries, start=1): + self.file.write("
  • " + entry[0] + " " + entry[1] + "
  • \n") + if (i % 4) == 0 and i < len(table_entries): + self.file.write("
    \n") + self.file.write("
\n") + self.file.write("
\n") + + def endHtmlBody(self): + self.file.write("\n") + self.file.write("\n") + self.file.close() + + +# ======================================================================================================= +# CMonitorGraphGenerator +# ======================================================================================================= + + +class CMonitorGraphGenerator: + """ + This is the main class of cmonitor_chart, able to read a JSON file produced by cmonitor_collector, + extract the most useful information and render them inside an HtmlOutputPage object. + """ + + def __init__(self, outfile, jheader, jdata, be_verbose=False): + self.jheader = jheader # a dictionary with cmonitor_collector "header" JSON object + self.jdata = jdata # a list of dictionaries with cmonitor_collector "samples" objects + self.verbose = be_verbose + + # in many places below we need to get "immutable" data that we know won't change across all samples + # like the names of network devices or the list of CPUs... + # since for some metrics the very sample does not contain any KPI (e.g. cgroup network traffic is generated + # only for samples after the first one) if possible we pick the 2nd sample and not the 1st one: + assert len(self.jdata) >= 2 + self.sample_template = self.jdata[1] + + # did we collect at PROCESS-level granularity or just at THREAD-level granularity? + string_collected_kpis = self.jheader["cmonitor"]["collecting"] # e.g. "cgroup_cpu,cgroup_memory,cgroup_threads" + self.collected_threads = "cgroup_threads" in string_collected_kpis + if self.verbose: + if self.collected_threads: + print("Per-thread stats (instead of per-process stats) have been collected in the input JSON file.") + else: + print("Per-process stats (instead of per-thread stats) have been collected in the input JSON file.") + + # detect num of CPUs: + self.baremetal_logical_cpus_indexes = [] + if "stat" in self.sample_template: + self.baremetal_logical_cpus_indexes = CMonitorGraphGenerator.collect_logical_cpu_indexes_from_section(self.sample_template, "stat") + if self.verbose: + print( + "Found %d CPUs in baremetal stats with logical indexes [%s]" + % ( + len(self.baremetal_logical_cpus_indexes), + ", ".join(str(x) for x in self.baremetal_logical_cpus_indexes), + ) + ) + + self.cgroup_logical_cpus_indexes = [] + if "cgroup_cpuacct_stats" in self.sample_template: + self.cgroup_logical_cpus_indexes = CMonitorGraphGenerator.collect_logical_cpu_indexes_from_section( + self.sample_template, "cgroup_cpuacct_stats" + ) + if self.verbose: + print( + "Found %d CPUs in cgroup stats with logical indexes [%s]" + % ( + len(self.cgroup_logical_cpus_indexes), + ", ".join(str(x) for x in self.cgroup_logical_cpus_indexes), + ) + ) + + # load IDENTITY of monitored system + self.monitored_system = "Unknown" + if "identity" in self.jheader: + if "hostname" in self.jheader["identity"]: + self.monitored_system = self.jheader["identity"]["hostname"] + if "custom_metadata" in self.jheader: + if "cmonitor_chart_name" in self.jheader["custom_metadata"]: + self.monitored_system = self.jheader["custom_metadata"]["cmonitor_chart_name"] + + # get the CGROUP name + self.cgroup_name = "None" + if "cgroup_config" in self.jheader and "name" in self.jheader["cgroup_config"]: + self.cgroup_name = self.jheader["cgroup_config"]["name"] + if "custom_metadata" in self.jheader: + if "cmonitor_chart_name" in self.jheader["custom_metadata"]: + self.cgroup_name = "docker/" + self.jheader["custom_metadata"]["cmonitor_chart_name"] + + # get the CGROUP version (v1 or 
v2?)
+        self.cgroup_ver = None
+        if "cgroup_config" in self.jheader and "version" in self.jheader["cgroup_config"]:
+            self.cgroup_ver = int(self.jheader["cgroup_config"]["version"])
+
+        # finally create the main HTML output page object
+        self.output_page = HtmlOutputPage(outfile, self.monitored_system, be_verbose=self.verbose)
+
+    # =======================================================================================================
+    # Private helpers
+    # =======================================================================================================
+
+    @staticmethod
+    def collect_logical_cpu_indexes_from_section(jsample, section_name):
+        """
+        Walks over the given JSON sample looking for keys 'cpuXYZ' and storing all the 'XYZ' CPU indexes.
+        Returns the list of CPU indexes.
+        """
+        logical_cpus_indexes = []
+        for key in jsample[section_name]:
+            if key.startswith("cpu") and key != "cpu_total" and key != "cpu_tot":
+                cpuIdx = int(key[3:])
+                logical_cpus_indexes.append(cpuIdx)
+                # print("%s %s" % (key, cpuIdx))
+        return logical_cpus_indexes
+
+    @staticmethod
+    def sizeof_fmt(num, suffix="B"):
+        for unit in ["", "k", "M", "G", "T", "P", "E", "Z"]:
+            if abs(num) < 1000.0:
+                return "%3.1f%s%s" % (num, unit, suffix)
+            num /= 1000.0
+        return "%.1f%s%s" % (num, "Y", suffix)
+
+    def __make_jheader_nicer(self):
+        """
+        Improves self.jheader by adding new sections to that dict and adding measurement
+        units where they are required.
+        This is useful because the header is dumped almost as-is into the HTML tables
+        describing the monitored system and the collector configuration.
+        """
+
+        # provide some human-readable config values:
+        if "cgroup_config" in self.jheader:
+            avail_cpus = self.jheader["cgroup_config"]["cpus"].split(",")
+            self.jheader["cgroup_config"]["num_allowed_cpus"] = len(avail_cpus)
+            self.jheader["cgroup_config"]["cpus"] = self.jheader["cgroup_config"]["cpus"].replace(",", ", ")
+
+            self.jheader["cgroup_config"]["memory_limit_bytes"] = self.__cgroup_get_memory_limit_human_friendly()
+            self.jheader["cgroup_config"]["cpu_quota_perc"] = self.__cgroup_get_cpu_quota_human_friendly()
+
+        if "cmonitor" in self.jheader:
+            if self.jheader["cmonitor"]["sample_num"] == 0:
+                self.jheader["cmonitor"]["sample_num"] = "Infinite"
+
+        if "proc_meminfo" in self.jheader:
+            self.jheader["proc_meminfo"]["MemTotal"] = CMonitorGraphGenerator.sizeof_fmt(int(self.jheader["proc_meminfo"]["MemTotal"]))
+            self.jheader["proc_meminfo"]["Hugepagesize"] = CMonitorGraphGenerator.sizeof_fmt(int(self.jheader["proc_meminfo"]["Hugepagesize"]))
+
+    def __print_data_loading_stats(self, desc, n_invalid_samples):
+        if n_invalid_samples > 0:
+            print(
+                "While parsing %s statistics, found %d/%d (%.1f%%) samples that did not contain some required JSON section."
+                % (
+                    desc,
+                    n_invalid_samples,
+                    len(self.jdata),
+                    100 * n_invalid_samples / len(self.jdata),
+                )
+            )
+        else:
+            print("Correctly parsed %d samples for the [%s] category" % (len(self.jdata), desc))
+
+    def __cgroup_get_cpu_quota_percentage(self):
+        """
+        Returns the CPU quota as a percentage, indicating how much CPU time (and, implicitly,
+        how many CPUs) can be used. E.g. possible values are 50%, 140%, 300%, or -1 to
+        indicate no limit.
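+
+        Example (illustrative values): if the collector stored
+        jheader["cgroup_config"]["cpu_quota_perc"] = 1.5, this method returns 150.0,
+        i.e. the cgroup may consume up to one and a half CPUs; a stored -1 becomes
+        -100 after scaling and is normalized back to -1 (no limit).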
+ """ + cpu_quota_perc = 100 + if "cpu_quota_perc" in self.jheader["cgroup_config"]: + cpu_quota_perc = 100 * self.jheader["cgroup_config"]["cpu_quota_perc"] + if cpu_quota_perc == -100: # means there's no CPU limit + cpu_quota_perc = -1 + return cpu_quota_perc + + def __cgroup_get_cpu_quota_human_friendly(self): + if "cpu_quota_perc" not in self.jheader["cgroup_config"]: + return "NO LIMIT" + if int(self.jheader["cgroup_config"]["cpu_quota_perc"]) == -1: + return "NO LIMIT" + cpu_quota_perc = 100 * self.jheader["cgroup_config"]["cpu_quota_perc"] + return f"cpu quota = {cpu_quota_perc}%" + + @staticmethod + def cgroup_get_cpu_throttling(s): + cpu_throttling = 0 + if "throttling" in s["cgroup_cpuacct_stats"]: + # throttling is new since cmonitor_collector 1.5-0 + nr_periods = s["cgroup_cpuacct_stats"]["throttling"]["nr_periods"] + if nr_periods: + cpu_throttling = 100 * s["cgroup_cpuacct_stats"]["throttling"]["nr_throttled"] / nr_periods + return cpu_throttling + + def __cgroup_get_memory_limit(self): + """ + Returns the cgroup memory limit in bytes; can be -1 if there's no limit + """ + cgroup_limit_bytes = -1 + if "memory_limit_bytes" in self.jheader["cgroup_config"]: + # IMPORTANT: this value could be -1 if there's no limit + cgroup_limit_bytes = int(self.jheader["cgroup_config"]["memory_limit_bytes"]) + + return cgroup_limit_bytes + + def __cgroup_get_memory_limit_human_friendly(self): + if "memory_limit_bytes" not in self.jheader["cgroup_config"]: + return "NO LIMIT" + if int(self.jheader["cgroup_config"]["memory_limit_bytes"]) == -1: + return "NO LIMIT" + cgroup_limit_bytes = CMonitorGraphGenerator.sizeof_fmt(self.jheader["cgroup_config"]["memory_limit_bytes"]) + return f"memory limit = {cgroup_limit_bytes}" + + @staticmethod + def __get_main_thread_associated_with(sample, tid): + json_key = "pid_%s" % tid + tgid = sample["cgroup_tasks"][json_key]["tgid"] + if tgid == sample["cgroup_tasks"][json_key]["pid"]: + # actually current entry is not a secondary thread but a PROCESS, append it: + return tid + else: + json_key_of_main_process = "pid_%s" % tgid + if json_key_of_main_process in sample["cgroup_tasks"]: + return tgid + else: + # the main thread / process associated with given THREAD ID is missing: + return None + + def __get_main_threads_only(self, tids_list_to_filter): + x = set() # use a Python set to automatically remove duplicates + for tid in tids_list_to_filter: + json_key = "pid_%s" % tid + + # first of all, find the first JSON sample that contains the current TID + n_sample = 0 + while n_sample < len(self.jdata) and json_key not in self.jdata[n_sample]["cgroup_tasks"]: + n_sample += 1 + + assert n_sample < len(self.jdata) # the TID comes from a processing of self.jdata itself... 
it must be there + pid = CMonitorGraphGenerator.__get_main_thread_associated_with(self.jdata[n_sample], tid) + if pid is None: + print(f"WARNING: the input JSON does not contain collected stats for PID [{pid}] associated with thread ID [{tid}]...") + else: + x.add(pid) + + return x + + def __generate_topN_procs_bubble_chart(self, process_dict, topN_pids_list, max_byte_value_dict, chart_desc_postfix): + cpu_label = "CPU time" + io_label = "I/O (B)" + thread_proc_label = "Thread" if self.collected_threads else "Process" + memory_label = "Memory (B)" + + def get_nice_process_or_thread_name(pid): + return "%s (%d)" % (process_dict[pid]["cmd"], pid) + + # now select the N top processes and put their data in a GoogleChart table: + topN_process_table = GoogleChartsGenericTable(["Command", cpu_label, io_label, thread_proc_label, memory_label]) + for i, pid in enumerate(topN_pids_list): + p = process_dict[pid] + nicecmd = get_nice_process_or_thread_name(pid) + if self.verbose: + print("Processing data for %d-th CPU-top-scorer process [%s]" % (i + 1, nicecmd)) + topN_process_table.addRow([p["cmd"], p["cpu"], int(p["io"]), nicecmd, int(p["mem"])]) + + # generate the bubble chart graph: + self.output_page.appendGoogleChart( + GoogleChartsGraph( + data=topN_process_table, + button_label="CPU/Memory/Disk Bubbles by Thread" if self.collected_threads else "CPU/Memory/Disk Bubbles by Process", + graph_source=GRAPH_SOURCE_DATA_IS_CGROUP_PER_PROCESS, + graph_type=GRAPH_TYPE_BUBBLE_CHART, + graph_title=f"CPU/disk total usage on X/Y axes; memory usage as bubble size {chart_desc_postfix}", + y_axes_titles=[cpu_label, io_label], + ) + ) + + def __generate_topN_procs_cpu_io_mem_vs_time(self, process_dict, topN_pids_list, max_byte_value_dict, chart_desc_postfix): + mem_limit_bytes = self.__cgroup_get_memory_limit() + cpu_quota_perc = self.__cgroup_get_cpu_quota_percentage() + + def get_nice_process_or_thread_name(pid): + return "%s (%d)" % (process_dict[pid]["cmd"], pid) + + chart_data = {} + + ## -- CPU -- + if cpu_quota_perc > 0: + # it is possible to compute the "idle" time inside this cgroup and it's possible that CPU is throttled... + # so in such case we add 2 more data series to the chart: + cpu_time_serie = GoogleChartsTimeSeries( + ["Timestamp"] + [get_nice_process_or_thread_name(pid) for pid in topN_pids_list] + ["Idle", "Throttling"], + [""] + ["%" for pid in topN_pids_list] + ["%", "%"], + ) + y_axes_max_value = [None, 0] + columns_for_2nd_yaxis = ["Throttling"] + y_axes_titles = ["CPU (%)", "CPU Throttling (%)"] + else: + # no CPU limit... creating an "idle" column (considering as max CPU all the CPUs available) likely produces weird + # results out-of-scale (imagine servers with hundreds of CPUs and cmonitor_collector monitoring just a Redis container!) + # so we do not place any "idle" column. The "throttling" column does not apply either. 
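+            # For reference, when a CPU quota IS set the two extra series are filled
+            # later by the row-building loop at the end of this method as:
+            #   Idle       = max(cpu_quota_perc - sum(per-process CPU%), 0)
+            #   Throttling = 100 * nr_throttled / nr_periods  (see cgroup_get_cpu_throttling)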
+            assert cpu_quota_perc == -1
+            cpu_time_serie = GoogleChartsTimeSeries(
+                ["Timestamp"] + [get_nice_process_or_thread_name(pid) for pid in topN_pids_list],
+                [""] + ["%" for pid in topN_pids_list],
+            )
+            y_axes_max_value = [None]  # let Google Charts autocompute the Y axes limits
+            columns_for_2nd_yaxis = None
+            y_axes_titles = ["CPU (%)"]
+
+        # CPU by thread/process:
+        chart_data["cpu"] = GoogleChartsGraph(
+            data=cpu_time_serie,
+            graph_title=f"CPU usage ({self.__cgroup_get_cpu_quota_human_friendly()}) {chart_desc_postfix}",
+            button_label="CPU by Thread" if self.collected_threads else "CPU by Process",
+            graph_source=GRAPH_SOURCE_DATA_IS_CGROUP_PER_PROCESS,
+            # stack the contribution of every thread/process together: this way it becomes easier to spot
+            # when the cgroup CPU limit was hit and due to which threads/processes
+            stack_state=True,
+            y_axes_titles=y_axes_titles,
+            # throttling should not be stacked onto the CPU usage contributions, so move it to the 2nd y axis:
+            columns_for_2nd_yaxis=columns_for_2nd_yaxis,
+            # make the 2 axes have the same identical Y scale to make the chart easier to read:
+            y_axes_max_value=y_axes_max_value,
+        )
+
+        ## -- MEM --
+        topN_pids_list_for_memory = self.__get_main_threads_only(topN_pids_list)
+        if self.verbose:
+            print(
+                f"While generating the per-process memory chart, the following MAIN PIDs were selected: {topN_pids_list_for_memory} from the list of top-CPU scorer processes {topN_pids_list}"
+            )
+        if mem_limit_bytes > 0:
+            # it is possible to compute the "free" memory inside this cgroup and it's possible to have allocation failures
+            # so in that case we add 2 more data series to the chart:
+            mem_time_serie = GoogleChartsTimeSeries(
+                ["Timestamp"] + [get_nice_process_or_thread_name(pid) for pid in topN_pids_list_for_memory] + ["Free", "Alloc Failures"],
+                [""] + ["B" for pid in topN_pids_list_for_memory] + ["B", ""],
+            )
+            y_axes_max_value = [None, 0]
+            columns_for_2nd_yaxis = ["Alloc Failures"]
+            y_axes_titles = ["Memory (B)", "Alloc Failures"]
+        else:
+            # no memory limit... creating the "free" data series (considering all the system memory as the actual limit)
+            # would likely produce weird, out-of-scale results, so we do not place any "free" column.
+            # The "alloc failures" column does not apply either.
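+            # (illustrative example, not in the original patch) e.g. with a 1 GB memory limit and two monitored
+            # processes using 300 MB and 200 MB at a given timestamp, the limited-memory branch above would emit
+            # a row like [ts, 300e6, 200e6, Free=1e9-500e6=500e6, AllocFailures=failcnt], whereas this unlimited
+            # branch emits just [ts, 300e6, 200e6]: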
+            assert mem_limit_bytes == -1
+            mem_time_serie = GoogleChartsTimeSeries(
+                ["Timestamp"] + [get_nice_process_or_thread_name(pid) for pid in topN_pids_list_for_memory],
+                [""] + ["B" for pid in topN_pids_list_for_memory],
+            )
+            y_axes_max_value = [None]  # let Google Charts autocompute the Y axes limits
+            columns_for_2nd_yaxis = None
+            y_axes_titles = ["Memory (B)"]
+
+        chart_data["mem"] = GoogleChartsGraph(
+            data=mem_time_serie,
+            graph_title=f"Memory usage ({self.__cgroup_get_memory_limit_human_friendly()}) {chart_desc_postfix}",
+            button_label="Memory by Process",
+            graph_source=GRAPH_SOURCE_DATA_IS_CGROUP_PER_PROCESS,
+            # stack the contribution of every thread/process together: this way it becomes easier to spot
+            # when the cgroup MEMORY limit was hit and due to which threads/processes
+            stack_state=True,
+            y_axes_titles=y_axes_titles,
+            y_axes_max_value=y_axes_max_value,
+            # alloc failures should not be stacked onto the memory usage contributions, so move them to the 2nd y axis:
+            columns_for_2nd_yaxis=columns_for_2nd_yaxis,
+        )
+
+        ## -- IO --
+        io_time_serie = GoogleChartsTimeSeries(
+            ["Timestamp"] + [get_nice_process_or_thread_name(pid) for pid in topN_pids_list],
+            [""] + ["B" for pid in topN_pids_list],
+        )
+        chart_data["io"] = GoogleChartsGraph(
+            data=io_time_serie,
+            graph_title=f"I/O usage (from cgroup stats) {chart_desc_postfix}",
+            button_label="IO by Thread" if self.collected_threads else "IO by Process",
+            y_axes_titles=["IO Read+Write (B)"],
+            graph_source=GRAPH_SOURCE_DATA_IS_CGROUP_PER_PROCESS,
+            stack_state=False,
+        )
+
+        # now generate a table of CPU/IO/MEMORY usage over time, per process/thread:
+        for sample in self.jdata:
+            try:
+                row = {}
+
+                # save the same timestamp in all charts
+                for key in ["cpu", "io", "mem"]:
+                    row[key] = [sample["timestamp"]["UTC"]]
+
+                # append CPU & IO samples
+                tot_cpu_usage_perc = 0
+                for top_process_pid in topN_pids_list:
+                    # print(top_process_pid)
+                    json_key = "pid_%s" % top_process_pid
+                    if json_key in sample["cgroup_tasks"]:
+                        top_proc_sample = sample["cgroup_tasks"][json_key]
+
+                        cpu = top_proc_sample["cpu_usr"] + top_proc_sample["cpu_sys"]
+                        io = int((top_proc_sample["io_rchar"] + top_proc_sample["io_wchar"]))
+
+                        tot_cpu_usage_perc += cpu
+                        row["cpu"].append(cpu)
+                        row["io"].append(io)
+                    else:
+                        # probably this process was born later or died earlier than this timestamp
+                        row["cpu"].append(0)
+                        row["io"].append(0)
+
+                # for the memory chart, only include PROCESSES, never include SECONDARY THREADS since there
+                # is no distinction between the memory of the whole process and the memory of secondary threads
+                tot_mem_usage_bytes = 0
+                for top_process_pid in topN_pids_list_for_memory:
+                    # print(top_process_pid)
+                    json_key = "pid_%s" % top_process_pid
+                    if json_key in sample["cgroup_tasks"]:
+                        top_proc_sample = sample["cgroup_tasks"][json_key]
+
+                        tot_mem_usage_bytes += top_proc_sample["mem_rss_bytes"]
+                        mem = int(top_proc_sample["mem_rss_bytes"])
+                        row["mem"].append(mem)
+                    else:
+                        # probably this process was born later or died earlier than this timestamp
+                        row["mem"].append(0)
+
+                # the CPU graph has
+                #  - idle (if cpu_quota_perc > 0)
+                #  - throttling
+                # as additional columns right after the last PID series
+                if cpu_quota_perc > 0:
+                    row["cpu"].append(max(cpu_quota_perc - tot_cpu_usage_perc, 0))
+                    row["cpu"].append(CMonitorGraphGenerator.cgroup_get_cpu_throttling(sample))
+
+                # the Memory graph has
+                #  - free mem
+                #  - alloc failures
+                # as additional columns right after the timestamp
+                if mem_limit_bytes > 0:
+                    row["mem"].append(max(int(mem_limit_bytes - tot_mem_usage_bytes), 0))
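+                    # (illustrative note, not in the original patch) the extra column appended below is the
+                    # allocation-failure counter: with cgroups v1 this is "events.failcnt"; cgroups v2 do not
+                    # expose a failcnt, so the OOM-kill event counter is used as the closest equivalent: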
+                    if self.cgroup_ver == 1:
+                        failcnt = sample["cgroup_memory_stats"]["events.failcnt"]
+                    else:
+                        failcnt = sample["cgroup_memory_stats"]["events.oom_kill"]
+                    row["mem"].append(failcnt)
+
+                for key in ["cpu", "io", "mem"]:
+                    chart_data[key].data_table.addRow(row[key])
+            except KeyError:  # avoid crashing if a key is not present in the dictionary...
+                # print("Missing cgroup data while parsing sample %d" % i)
+                pass
+
+        self.output_page.appendGoogleChart(chart_data["cpu"])
+        self.output_page.appendGoogleChart(chart_data["mem"])
+        self.output_page.appendGoogleChart(chart_data["io"])
+
+    # =======================================================================================================
+    # Public API
+    # =======================================================================================================
+
+    def generate_cgroup_topN_procs(self, numProcsToShow=20):
+        # if process data was not collected, just return:
+        if "cgroup_tasks" not in self.sample_template:
+            return
+
+        # build a dictionary containing cumulative metrics for CPU/IO/MEM data for each process
+        # along all collected samples
+        process_dict = {}
+        max_byte_value_dict = {}
+        max_byte_value_dict["mem_rss"] = 0
+        max_byte_value_dict["io_total"] = 0
+        n_invalid_samples = 0
+        for i, sample in enumerate(self.jdata):
+            try:
+                for process in sample["cgroup_tasks"]:
+                    # parse data from JSON
+                    entry = sample["cgroup_tasks"][process]
+                    cmd = entry["cmd"]
+                    cputime = entry["cpu_usr_total_secs"] + entry["cpu_sys_total_secs"]
+                    iobytes = entry["io_total_read"] + entry["io_total_write"]
+                    membytes = entry["mem_rss_bytes"]  # take RSS, more realistic/useful compared to the "mem_virtual_bytes"
+                    thepid = entry["pid"]  # can be the TID (thread ID) if cmonitor_collector was started with --collect=cgroup_threads
+
+                    # keep track of maxs:
+                    max_byte_value_dict["mem_rss"] = max(membytes, max_byte_value_dict["mem_rss"])
+                    max_byte_value_dict["io_total"] = max(iobytes, max_byte_value_dict["io_total"])
+
+                    try:  # update the current entry
+                        process_dict[thepid]["cpu"] = cputime
+                        process_dict[thepid]["io"] = iobytes
+                        process_dict[thepid]["mem"] = membytes
+                        process_dict[thepid]["cmd"] = cmd
+
+                        # FIXME FIXME
+                        # process_dict[thepid]["is_thread"] =
+                    except KeyError:  # no entry yet for this PID, so add one
+                        process_dict.update(
+                            {
+                                thepid: {
+                                    "cpu": cputime,
+                                    "io": iobytes,
+                                    "mem": membytes,
+                                    "cmd": cmd,
+                                }
+                            }
+                        )
+            except KeyError as e:  # avoid crashing if a key is not present in the dictionary...
+                print(f"Missing cgroup data while parsing {i}-th sample: {e}")
+                n_invalid_samples += 1
+
+        self.__print_data_loading_stats("per-process", n_invalid_samples)
+
+        # now sort all collected processes by the amount of CPU used:
+        # NOTE: sorted() will return just the sorted list of KEYs = PIDs
+        def sort_key(d):
+            # return process_dict[d]['cpu'] * process_dict[d]['mem']
+            return process_dict[d]["cpu"]
+
+        topN_pids_list = sorted(process_dict, key=sort_key, reverse=True)
+
+        # truncate to the first N:
+        if numProcsToShow > 0:
+            topN_pids_list = topN_pids_list[0:numProcsToShow]
+
+        # provide a common chart description
+        chart_desc_postfix = ""
+        if numProcsToShow > 0:
+            if self.collected_threads:
+                chart_desc_postfix = f"of {numProcsToShow} top-CPU-utilizing threads"
+            else:
+                chart_desc_postfix = f"of {numProcsToShow} top-CPU-utilizing processes"
+        # else: if there's no filter on the processes to show, simply produce an empty postfix since it's
+        # weird to see e.g.
"CPU usage of ALL top-CPU-utilizing processes" + + self.__generate_topN_procs_cpu_io_mem_vs_time(process_dict, topN_pids_list, max_byte_value_dict, chart_desc_postfix) + self.__generate_topN_procs_bubble_chart(process_dict, topN_pids_list, max_byte_value_dict, chart_desc_postfix) + + def generate_baremetal_disks_io(self): + # if disk data was not collected, just return: + if "disks" not in self.sample_template: + return + + all_disks = self.sample_template["disks"].keys() + if len(all_disks) == 0: + return + + # see https://www.kernel.org/doc/Documentation/iostats.txt + + diskcols = ["Timestamp"] + for device in all_disks: + # diskcols.append(str(device) + " Disk Time") + # diskcols.append(str(device) + " Reads") + # diskcols.append(str(device) + " Writes") + diskcols.append(str(device) + " Read MB") + diskcols.append(str(device) + " Write MB") + + # convert from kB to MB + divider = 1000 + + # + # MAIN LOOP + # Process JSON sample and fill the GoogleChartsTimeSeries() object + # + + disk_table = GoogleChartsTimeSeries(diskcols) + for i, s in enumerate(self.jdata): + if i == 0: + continue + + row = [] + row.append(s["timestamp"]["UTC"]) + for device in all_disks: + # row.append(s["disks"][device]["time"]) + # row.append(s["disks"][device]["reads"]) + # row.append(s["disks"][device]["writes"]) + row.append(s["disks"][device]["rkb"] / divider) + row.append(-s["disks"][device]["wkb"] / divider) + disk_table.addRow(row) + + self.output_page.appendGoogleChart( + GoogleChartsGraph( + data=disk_table, + button_label="Disk I/O", + graph_source=GRAPH_SOURCE_DATA_IS_BAREMETAL, + graph_title="Disk I/O (from baremetal stats)", + y_axes_titles=["MB"], + ) + ) + return + + # def generate_filesystems(self.output_page, self.jdata): + # global self.graphs + # fsstr = "" + # for fs in self.sample_template["filesystems"].keys(): + # fsstr = fsstr + "'" + fs + "'," + # fsstr = fsstr[:-1] + # writeHtmlHead_line_graph(self.output_page, fsstr) + # for i, s in enumerate(self.jdata): + # self.output_page.write(",['Date(%s)' " % (googledate(s['timestamp']["UTC"]))) + # for fs in s["filesystems"].keys(): + # self.output_page.write(", %.1f" % (s["filesystems"][fs]["fs_full_percent"])) + # self.output_page.write("]\n") + # self.output_page.appendGoogleChart(GoogleChartsGraph( 'File Systems Used percent') + # return + + def __generate_network_traffic_graphs(self, graph_source, sample_section_name, graph_desc): + # if network traffic data was not collected, just return: + if sample_section_name not in self.sample_template: + return + + all_netdevices = self.sample_template[sample_section_name].keys() + if len(all_netdevices) == 0: + return + + netcols = ["Timestamp"] + for device in all_netdevices: + netcols.append(str(device) + "+in") + netcols.append(str(device) + "-out") + + # convert from bytes to MB + divider = 1000 * 1000 + unit = "MB" + + # + # MAIN LOOP + # Process JSON sample and fill the GoogleChartsTimeSeries() object + # + + # MB/sec + + net_table = GoogleChartsTimeSeries(netcols, [unit for col in netcols]) + for i, s in enumerate(self.jdata): + if i == 0: + continue + + row = [s["timestamp"]["UTC"]] + for device in all_netdevices: + try: + row.append(+s[sample_section_name][device]["ibytes"] / divider) + row.append(-s[sample_section_name][device]["obytes"] / divider) + except KeyError: + if self.verbose: + print("Missing key '%s' while parsing sample %d" % (device, i)) + row.append(0) + row.append(0) + net_table.addRow(row) + + self.output_page.appendGoogleChart( + GoogleChartsGraph( + data=net_table, + 
graph_title=f"Network Traffic in MB/s {graph_desc}", + button_label="Network (MB/s)", + y_axes_titles=["MB/s"], + graph_source=graph_source, + stack_state=False, + ) + ) + + # PPS + + net_table = GoogleChartsTimeSeries(netcols) + for i, s in enumerate(self.jdata): + if i == 0: + continue + + row = [s["timestamp"]["UTC"]] + for device in all_netdevices: + try: + row.append(+s[sample_section_name][device]["ipackets"]) + row.append(-s[sample_section_name][device]["opackets"]) + except KeyError: + if self.verbose: + print("Missing key '%s' while parsing sample %d" % (device, i)) + row.append(0) + row.append(0) + net_table.addRow(row) + + self.output_page.appendGoogleChart( + GoogleChartsGraph( + data=net_table, + graph_title=f"Network Traffic in PPS {graph_desc}", + button_label="Network (PPS)", + y_axes_titles=["PPS"], + graph_source=graph_source, + stack_state=False, + ) + ) + return + + def generate_baremetal_network_traffic(self): + self.__generate_network_traffic_graphs(GRAPH_SOURCE_DATA_IS_BAREMETAL, "network_interfaces", "(from baremetal stats)") + + def generate_cgroup_network_traffic(self): + self.__generate_network_traffic_graphs(GRAPH_SOURCE_DATA_IS_CGROUP_TOTALS, "cgroup_network", "(from cgroup stats)") + + def generate_baremetal_cpus(self): + # if baremetal CPU data was not collected, just return: + if "stat" not in self.sample_template: + return + + # prepare empty tables + baremetal_cpu_stats = {} + for c in self.baremetal_logical_cpus_indexes: + baremetal_cpu_stats[c] = GoogleChartsTimeSeries( + [ + "Timestamp", + "User", + "Nice", + "System", + "Idle", + "I/O wait", + "Hard IRQ", + "Soft IRQ", + "Steal", + ], + [ + "", + "%", + "%", + "%", + "%", + "%", + "%", + "%", + "%", + ], + ) + + all_cpus_table = GoogleChartsTimeSeries( + ["Timestamp"] + [("CPU" + str(x)) for x in self.baremetal_logical_cpus_indexes], # force newline + [""] + ["%" for x in self.baremetal_logical_cpus_indexes], + ) + + # + # MAIN LOOP + # Process JSON sample and fill the GoogleChartsTimeSeries() object + # + + for i, s in enumerate(self.jdata): + if i == 0: + continue # skip first sample + + ts = s["timestamp"]["UTC"] + all_cpus_row = [ts] + for c in self.baremetal_logical_cpus_indexes: + cpu_stats = s["stat"]["cpu" + str(c)] + cpu_total = ( + cpu_stats["user"] + + cpu_stats["nice"] + + cpu_stats["sys"] + + cpu_stats["iowait"] + + cpu_stats["hardirq"] + + cpu_stats["softirq"] + + cpu_stats["steal"] + ) + baremetal_cpu_stats[c].addRow( + [ + ts, + cpu_stats["user"], + cpu_stats["nice"], + cpu_stats["sys"], + cpu_stats["idle"], + cpu_stats["iowait"], + cpu_stats["hardirq"], + cpu_stats["softirq"], + cpu_stats["steal"], + ] + ) + all_cpus_row.append(cpu_total) + + all_cpus_table.addRow(all_cpus_row) + + # Produce the javascript: + for c in self.baremetal_logical_cpus_indexes: + self.output_page.appendGoogleChart( + GoogleChartsGraph( + data=baremetal_cpu_stats[c], # Data + graph_title="Logical CPU " + str(c) + " (from baremetal stats)", + combobox_label="baremetal_cpus", + combobox_entry="CPU" + str(c), + y_axes_titles=["CPU (%)"], + graph_source=GRAPH_SOURCE_DATA_IS_BAREMETAL, + stack_state=True, + ) + ) + + # Also produce the "all CPUs" graph + self.output_page.appendGoogleChart( + GoogleChartsGraph( + data=all_cpus_table, # Data + graph_title="All logical CPUs (from baremetal stats)", + button_label="All CPUs", + y_axes_titles=["CPU (%)"], + graph_source=GRAPH_SOURCE_DATA_IS_BAREMETAL, + stack_state=False, + ) + ) + return + + def generate_cgroup_cpus(self): + if "cgroup_cpuacct_stats" not in 
self.sample_template: + return # cgroup mode not enabled at collection time! + + # prepare empty tables + cpu_stats_table = {} + for c in self.cgroup_logical_cpus_indexes: + cpu_stats_table[c] = GoogleChartsTimeSeries(["Timestamp", "User", "System"], ["", "%", "%"]) + + all_cpus_table = GoogleChartsTimeSeries( + ["Timestamp", "Limit/Quota", "Throttling"] + [("CPU" + str(x)) for x in self.cgroup_logical_cpus_indexes], + ["", "%", "%"] + ["%" for x in self.cgroup_logical_cpus_indexes], + ) + + # + # MAIN LOOP + # Process JSON sample and fill the GoogleChartsTimeSeries() object + # + + cpu_quota_perc = self.__cgroup_get_cpu_quota_percentage() + n_invalid_samples = 0 + # max_cpu_throttling = 0 + for i, s in enumerate(self.jdata): + if i == 0: + continue # skip first sample + + try: + ts = s["timestamp"]["UTC"] + + throttling = CMonitorGraphGenerator.cgroup_get_cpu_throttling(s) + # max_cpu_throttling = max(max_cpu_throttling, throttling) + all_cpus_row = [ts, cpu_quota_perc, throttling] + for c in self.cgroup_logical_cpus_indexes: + # get data: + cpu_stats = s["cgroup_cpuacct_stats"]["cpu" + str(c)] + if "sys" in cpu_stats: + cpu_sys = cpu_stats["sys"] + else: + cpu_sys = 0 + cpu_total = cpu_stats["user"] + cpu_sys + + # append data: + cpu_stats_table[c].addRow([ts, cpu_stats["user"], cpu_sys]) + all_cpus_row.append(cpu_total) + + all_cpus_table.addRow(all_cpus_row) + except KeyError: # avoid crashing if a key is not present in the dictionary... + # print("Missing cgroup data while parsing sample %d" % i) + n_invalid_samples += 1 + pass + + self.__print_data_loading_stats("cgroup CPU", n_invalid_samples) + + # Produce 1 graph for each CPU: + for c in self.cgroup_logical_cpus_indexes: + self.output_page.appendGoogleChart( + GoogleChartsGraph( + data=cpu_stats_table[c], # Data + graph_title="Logical CPU " + str(c) + " (from CGroup stats)", + combobox_label="cgroup_cpus", + combobox_entry="CPU" + str(c), + y_axes_titles=["CPU (%)"], + graph_source=GRAPH_SOURCE_DATA_IS_CGROUP_TOTALS, + stack_state=True, + ) + ) + + # Also produce the "all CPUs" graph that includes some very useful KPIs like + # - CPU limit imposed on Linux CFS scheduler + # - Amount of CPU throttling + # NOTE: when cgroups v2 are used, there's no per-CPU stat just the total CPU usage, + # so we change the title of the tab to reflect that + graph_title = "CPU usage by index of CPU available inside cgroup" if self.cgroup_ver == 1 else "CPU usage measured in the cgroup" + graph_title = f"{graph_title} ({self.__cgroup_get_cpu_quota_human_friendly()})" + + self.output_page.appendGoogleChart( + GoogleChartsGraph( + data=all_cpus_table, # Data + graph_title=graph_title, + button_label="All CPUs" if self.cgroup_ver == 1 else "CPU", + y_axes_titles=["CPU (%)", "CPU Throttling (%)"], + graph_source=GRAPH_SOURCE_DATA_IS_CGROUP_TOTALS, + stack_state=False, + # give evidence to CPU throttling by moving it on 2nd y axis: + columns_for_2nd_yaxis=["Throttling"], + y_axes_max_value=[None, 0], + ) + ) + + return + + def generate_baremetal_memory(self): + # if baremetal memory data was not collected, just return: + if "proc_meminfo" not in self.sample_template: + return + + # + # MAIN LOOP + # Process JSON sample and build Google Chart-compatible Javascript variable + # See https://developers.google.com/chart/interactive/docs/reference + # + + mem_total_bytes = self.sample_template["proc_meminfo"]["MemTotal"] + baremetal_memory_stats = GoogleChartsTimeSeries(["Timestamp", "Used", "Cached (DiskRead)", "Free"], ["", "B", "B", "B"]) + + for i, s in 
enumerate(self.jdata): + if i == 0: + continue # skip first sample + meminfo_stats = s["proc_meminfo"] + + if meminfo_stats["MemTotal"] != mem_total_bytes: + continue # this is impossible AFAIK (hot swap of memory is not handled!!) + + # + # NOTE: most tools like e.g. free -k just map: + # + # free output | corresponding /proc/meminfo fields + # --------------+--------------------------------------- + # Mem: total | MemTotal + # Mem: used | MemTotal - MemFree - Buffers - Cached - Slab + # Mem: free | MemFree ^^^^^^^^^ ^^^^ + # Buffers and Slab are close to zero 99% of the time + # + # see https://access.redhat.com/solutions/406773 + + mf = meminfo_stats["MemFree"] + mc = meminfo_stats["Cached"] + + baremetal_memory_stats.addRow( + [ + s["timestamp"]["UTC"], + int(mem_total_bytes - mf - mc), # compute used memory + int(mc), # cached + int(mf), # free + ] + ) + + # Produce the javascript: + self.output_page.appendGoogleChart( + GoogleChartsGraph( + data=baremetal_memory_stats, # Data + graph_title="Memory usage in Bytes (from baremetal stats)", + button_label="Memory", + y_axes_titles=["Memory (B)"], + graph_source=GRAPH_SOURCE_DATA_IS_BAREMETAL, + stack_state=True, + ) + ) + return + + def generate_cgroup_memory(self): + # if cgroup data was not collected, just return: + if "cgroup_memory_stats" not in self.sample_template: + return + + # + # MAIN LOOP + # Process JSON sample and build Google Chart-compatible Javascript variable + # See https://developers.google.com/chart/interactive/docs/reference + # + + mem_limit_bytes = self.__cgroup_get_memory_limit() + if mem_limit_bytes > 0: + # it is possible to compute the "free" memory inside this cgroup and it's possible to have allocation failures + # so in such case we add 2 more data series to the chart: + cgroup_memory_stats = GoogleChartsTimeSeries( + ["Timestamp", "Used", "Cached (DiskRead)", "Free", "Alloc Failures"], ["", "B", "B", "B", ""] + ) + y_axes_max_value = [None, 0] + columns_for_2nd_yaxis = ["Alloc Failures"] + y_axes_titles = ["Memory (B)", "Alloc Failures"] + else: + # no memory limit... creating the "free" series (considering all the system memory as actual limit) likely produces weird + # results out-of-scale so we do not place any "idle" column. The "alloc failures" column does not apply either. + assert mem_limit_bytes == -1 + cgroup_memory_stats = GoogleChartsTimeSeries(["Timestamp", "Used", "Cached (DiskRead)"], ["", "B", "B"]) + y_axes_max_value = [None] # use default GoogleChart logic + columns_for_2nd_yaxis = None + y_axes_titles = ["Memory (B)"] + + n_invalid_samples = 0 + # max_mfail = 0 + for i, s in enumerate(self.jdata): + if i == 0: + continue # skip first sample + + try: + # mu = memory actually Used + # mc = memory used as Cache + # mfail = memory alloc failures inside cgroup + if self.cgroup_ver == 1: + mu = s["cgroup_memory_stats"]["stat.rss"] + mc = s["cgroup_memory_stats"]["stat.cache"] + mfail = s["cgroup_memory_stats"]["events.failcnt"] + else: + # cgroups v2 + mu = s["cgroup_memory_stats"]["stat.anon"] + mc = s["cgroup_memory_stats"]["stat.file"] + mfail = s["cgroup_memory_stats"]["events.oom_kill"] + + mfree = mem_limit_bytes - mu - mc + # max_mfail = max(max_mfail, mfail) + + if mem_limit_bytes > 0: + cgroup_memory_stats.addRow( + [ + s["timestamp"]["UTC"], + int(mu), + int(mc), + int(mfree), + mfail, + ] + ) + else: + cgroup_memory_stats.addRow( + [ + s["timestamp"]["UTC"], + int(mu), + int(mc), + ] + ) + + except KeyError as e: # avoid crashing if a key is not present in the dictionary... 
+ print(f"Missing cgroup data while parsing {i}-th sample: {e}") + n_invalid_samples += 1 + pass + + self.__print_data_loading_stats("cgroup memory", n_invalid_samples) + + # Produce the javascript: + # NOTE: on 2nd axis we try to keep the plotted line below the ones that belong to the first axis (to avoid cluttering) + # and we also add some offset to deal with the case where "max_mfail is zero" + # if mem_limit_bytes > 0: + # y_axes_max_value = [None, max_mfail * 5 + 10] + self.output_page.appendGoogleChart( + GoogleChartsGraph( + data=cgroup_memory_stats, # Data + graph_title=f"Used memory in Bytes measured inside cgroup ({self.__cgroup_get_memory_limit_human_friendly()})", + button_label="Memory", + graph_source=GRAPH_SOURCE_DATA_IS_CGROUP_TOTALS, + stack_state=True, + y_axes_titles=y_axes_titles, + columns_for_2nd_yaxis=columns_for_2nd_yaxis, + y_axes_max_value=y_axes_max_value, + ) + ) + + return + + def generate_baremetal_avg_load(self): + # + # MAIN LOOP + # Process JSON sample and build Google Chart-compatible Javascript variable + # See https://developers.google.com/chart/interactive/docs/reference + # + + num_baremetal_cpus = len(self.baremetal_logical_cpus_indexes) + if num_baremetal_cpus == 0: + num_baremetal_cpus = 1 + load_avg_stats = GoogleChartsTimeSeries(["Timestamp", "LoadAvg (1min)", "LoadAvg (5min)", "LoadAvg (15min)"]) + for i, s in enumerate(self.jdata): + if i == 0: + continue # skip first sample + + # + # See https://linux.die.net/man/5/proc + # and https://blog.appsignal.com/2018/03/28/understanding-system-load-and-load-averages.html + # + # "The load of a system is essentially the number of processes active at any given time. + # When idle, the load is 0. When a process starts, the load is incremented by 1. + # A terminating process decrements the load by 1. Besides running processes, + # any process that's queued up is also counted. So, when one process is actively using the CPU, + # and two are waiting their turn, the load is 3." + # ... + # "Generally, single-core CPU can handle one process at a time. An average load of 1.0 would mean + # that one core is busy 100% of the time. If the load average drops to 0.5, the CPU has been idle + # for 50% of the time." 
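+            # (illustrative example, not in the original patch) e.g. a 1-minute load average of 2.0 on an
+            # 8-CPU host is plotted below as 100 * 2.0 / 8 = 25%.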
+
+            # since the kernel reports the load average as a value in the range [0-n], where n = number of cores,
+            # we remap it into the range [0-100%]
+
+            load_avg_stats.addRow(
+                [
+                    s["timestamp"]["UTC"],
+                    100 * float(s["proc_loadavg"]["load_avg_1min"]) / num_baremetal_cpus,
+                    100 * float(s["proc_loadavg"]["load_avg_5min"]) / num_baremetal_cpus,
+                    100 * float(s["proc_loadavg"]["load_avg_15min"]) / num_baremetal_cpus,
+                ]
+            )
+
+        # Produce the javascript:
+        self.output_page.appendGoogleChart(
+            GoogleChartsGraph(
+                data=load_avg_stats,  # Data
+                graph_title="Average Load (from baremetal stats)",
+                button_label="Average Load",
+                y_axes_titles=["Load (%)"],
+                graph_source=GRAPH_SOURCE_DATA_IS_BAREMETAL,
+                stack_state=False,
+            )
+        )
+        return
+
+    def generate_monitoring_summary(self, version):
+        monitoring_summary = [
+            # ( "User:", self.jheader["cmonitor"]["username"] ),  # not really useful
+            ("Collected:", self.jheader["cmonitor"]["collecting"].replace(",", ", ")),
+            # ( "Started sampling at:", self.sample_template["timestamp"]["datetime"] + " (Local)" ),  # not really useful
+            ("Started sampling at:", self.jdata[0]["timestamp"]["UTC"] + " (UTC)"),
+            ("Samples:", str(len(self.jdata))),
+            ("Sampling Interval (s):", str(self.jheader["cmonitor"]["sample_interval_seconds"])),
+            (
+                "Total time sampled (hh:mm:ss):",
+                str(datetime.timedelta(seconds=self.jheader["cmonitor"]["sample_interval_seconds"] * len(self.jdata))),
+            ),
+            ("Version (cmonitor_collector):", self.jheader["cmonitor"]["version"]),
+            ("Version (cmonitor_chart):", version),
+        ]
+        self.output_page.appendHtmlTable("Monitoring Summary", monitoring_summary)
+
+    def __generate_monitored_summary_with_cpus(self, logical_cpus_indexes):
+        # NOTE: unfortunately some useful information like:
+        #  - RAM memory model/speed
+        #  - Disk model/speed
+        #  - NIC model/speed
+        # will not be available from inside a container, which is where cmonitor_collector usually runs...
+        # so we mostly show CPU stats:
+        all_disks = []
+        if "disks" in self.sample_template:
+            all_disks = self.sample_template["disks"].keys()
+        all_netdevices = []
+        if "network_interfaces" in self.sample_template:
+            all_netdevices = self.sample_template["network_interfaces"].keys()
+        all_numanodes = []
+        if "numa_nodes" in self.jheader:
+            all_numanodes = list(self.jheader["numa_nodes"].keys())
+            all_numanodes = [v.replace("node", "") for v in all_numanodes]
+
+        cpu_model = "Unknown"
+        bogomips = "Unknown"
+        if "cpuinfo" in self.jheader:
+            first_cpu = list(self.jheader["cpuinfo"].keys())[0]
+            cpu_model = self.jheader["cpuinfo"][first_cpu]["model_name"]
+            bogomips = str(self.jheader["cpuinfo"][first_cpu]["bogomips"])
+
+        monitored_summary = [
+            ("Hostname:", self.jheader["identity"]["hostname"]),
+            ("OS:", self.jheader["os_release"]["pretty_name"]),
+            ("CPU:", cpu_model),
+            ("BogoMIPS:", bogomips),
+            ("Monitored CPUs:", str(len(logical_cpus_indexes))),
+            ("Monitored Disks:", str(len(all_disks))),
+            ("Monitored Network Devices:", str(len(all_netdevices))),
+            ("Monitored NUMA Nodes:", ",".join(all_numanodes)),
+        ]
+        return monitored_summary
+
+    def generate_monitored_summary(self):
+        if len(self.baremetal_logical_cpus_indexes) > 0:
+            self.output_page.appendHtmlTable(
+                "Monitored System Summary",
+                self.__generate_monitored_summary_with_cpus(self.baremetal_logical_cpus_indexes),
+            )
+        elif len(self.cgroup_logical_cpus_indexes) > 0:
+            self.output_page.appendHtmlTable(
+                "Monitored System Summary",
+                self.__generate_monitored_summary_with_cpus(self.cgroup_logical_cpus_indexes),
+            )
+
+    def generate_about_this(self):
+        about_this = [
+            ("Zoom:", "use left-click and drag"),
+            ("Reset view:", "use right-click"),
+            ("Generated by", 'cmonitor'),
+        ]
+        self.output_page.appendHtmlTable("About this", about_this, div_class="bottom_about_div")
+
+    def generate_html(self, top_scorer, version):
+        """
+        Main API linking the generation of all other charts together
+        """
+        # baremetal stats:
+        self.generate_baremetal_cpus()
+        self.generate_baremetal_memory()
+        self.generate_baremetal_network_traffic()
+        self.generate_baremetal_disks_io()
+        self.generate_baremetal_avg_load()
+
+        # cgroup stats:
+        self.generate_cgroup_cpus()
+        self.generate_cgroup_memory()
+        self.generate_cgroup_topN_procs(top_scorer)
+        self.generate_cgroup_network_traffic()
+
+        # HTML HEAD -- generate all the JS code to draw all the graphs created so far
+        self.output_page.writeHtmlHead()
+
+        # HTML BODY -- now we start the actual HTML body, which is just a few tables with buttons
+        # that invoke the JS code produced earlier inside the HTML head
+        self.__make_jheader_nicer()
+        self.output_page.startHtmlBody(self.cgroup_name, self.monitored_system, self.jheader, self.collected_threads)
+
+        self.generate_monitoring_summary(version)
+        self.generate_monitored_summary()
+        self.generate_about_this()
+
+        self.output_page.endHtmlBody()
diff --git a/tools/common-code/cmonitor_filter_engine.py b/tools/common-code/cmonitor_filter_engine.py
new file mode 100644
index 00000000..fc03c6a5
--- /dev/null
+++ b/tools/common-code/cmonitor_filter_engine.py
@@ -0,0 +1,122 @@
+#
+# cmonitor_filter_engine.py
+#
+# Author: Francesco Montorsi, Marco Zizzi
+# Created: January 2022
+#
+
+import argparse
+import json
+import os
+import sys
+import datetime
+
+# this introduces as a dependency the "python-dateutil" package >= 2.7.0;
+# this is better than using datetime.fromisoformat(), which would require Python >= 3.7,
+# not available on Centos7
+import dateutil.parser as datetime_parser
+
+# =======================================================================================================
+# CmonitorFilterEngine
+# =======================================================================================================
+class CmonitorFilterEngine:
+    def __init__(self, json_data, output_file=None, be_verbose=False) -> None:
+        self.output_file = output_file
+        self.json_data = json_data
+        self.verbose = be_verbose
+
+    def write_output_file(self):
+        """
+        Write the filtered JSON to a file or stdout
+        """
+        n_samples = len(self.json_data["samples"])
+        if self.output_file:  # the user has provided an output file... dump on disk:
+            dest_dir = os.path.dirname(self.output_file)
+            if not os.path.exists(dest_dir):
+                os.makedirs(dest_dir)
+            with open(self.output_file, "w") as f:
+                json.dump(self.json_data, f)
+            if self.verbose:
+                print(f"Wrote {n_samples} samples into {self.output_file}")
+        else:
+            print(json.dumps(self.json_data))
+            if self.verbose:
+                print(f"Wrote {n_samples} samples on standard output")
+
+    def filter_by_time(self, start_timestamp=None, end_timestamp=None):
+        """
+        Filter samples outside the given interval.
+        One of the two timestamps can be None.
+        """
+
+        assert start_timestamp is None or isinstance(start_timestamp, datetime.datetime)
+        assert end_timestamp is None or isinstance(end_timestamp, datetime.datetime)
+
+        n_removed_samples = 0
+
+        def _filter_by_both_starttime_endtime(sample):
+            nonlocal n_removed_samples
+            # convert from string to datetime object:
+            sample_datetime = datetime.datetime.strptime(sample["timestamp"]["UTC"], "%Y-%m-%dT%H:%M:%S.%f")
+            # filter:
+            if not (start_timestamp <= sample_datetime <= end_timestamp):
+                self.json_data["samples"].remove(sample)
+                n_removed_samples += 1
+
+        def _filter_only_by_starttime(sample):
+            nonlocal n_removed_samples
+            # convert from string to datetime object:
+            sample_datetime = datetime.datetime.strptime(sample["timestamp"]["UTC"], "%Y-%m-%dT%H:%M:%S.%f")
+            # filter:
+            if not (start_timestamp <= sample_datetime):
+                self.json_data["samples"].remove(sample)
+                n_removed_samples += 1
+
+        def _filter_only_by_endtime(sample):
+            nonlocal n_removed_samples
+            # convert from string to datetime object:
+            sample_datetime = datetime.datetime.strptime(sample["timestamp"]["UTC"], "%Y-%m-%dT%H:%M:%S.%f")
+            # filter:
+            if not (sample_datetime <= end_timestamp):
+                self.json_data["samples"].remove(sample)
+                n_removed_samples += 1
+
+        # NOTE: the filter helpers remove entries from the samples list, so iterate over a copy:
+        if start_timestamp and end_timestamp:
+            for sample in self.json_data["samples"].copy():
+                _filter_by_both_starttime_endtime(sample)
+            if self.verbose:
+                print(f"Filtering samples by start and end timestamp [{start_timestamp}-{end_timestamp}]. Removed {n_removed_samples} samples.")
+        elif start_timestamp:
+            for sample in self.json_data["samples"].copy():
+                _filter_only_by_starttime(sample)
+            if self.verbose:
+                print(f"Filtering samples by start timestamp [{start_timestamp}]. Removed {n_removed_samples} samples.")
+        elif end_timestamp:
+            for sample in self.json_data["samples"].copy():
+                _filter_only_by_endtime(sample)
+            if self.verbose:
+                print(f"Filtering samples by end timestamp [{end_timestamp}]. Removed {n_removed_samples} samples.")
+        else:
+            assert False
+
+    def filter_by_task_name(self, task_name: str):
+        """
+        Filter tasks by the given name
+        """
+
+        # we cannot iterate over a container while deleting entries from it,
+        # so iterate over a copy of the samples list:
+        samples_copy = self.json_data["samples"].copy()
+        n_removed_tasks = 0
+
+        for sample_idx, sample in enumerate(samples_copy):
+            if "cgroup_tasks" in sample:
+                for pid_sample in sample["cgroup_tasks"].copy():
+                    if task_name not in sample["cgroup_tasks"][pid_sample]["cmd"]:
+                        del self.json_data["samples"][sample_idx]["cgroup_tasks"][pid_sample]
+                        n_removed_tasks += 1
+
+        if self.verbose:
+            print(f"Filtering samples by task name [{task_name}]. Removed {n_removed_tasks} tasks.")
diff --git a/tools/common-code/cmonitor_loader.py b/tools/common-code/cmonitor_loader.py
index 8d48573c..15661b30 100644
--- a/tools/common-code/cmonitor_loader.py
+++ b/tools/common-code/cmonitor_loader.py
@@ -13,6 +13,9 @@
 import gzip
 
+# =======================================================================================================
+# CmonitorCollectorJsonLoader
+# =======================================================================================================
 class CmonitorCollectorJsonLoader:
     def __init__(self):
         self.input_file = ""
diff --git a/tools/common-code/cmonitor_statistics_engine.py b/tools/common-code/cmonitor_statistics_engine.py
new file mode 100644
index 00000000..496582e5
--- /dev/null
+++ b/tools/common-code/cmonitor_statistics_engine.py
@@ -0,0 +1,218 @@
+#!/usr/bin/python3
+
+#
+# cmonitor_statistics_engine.py
+#
+# Author: Gajanan Khandake
+# Created: April 2021
+#
+
+import argparse
+import json
+import os
+import sys
+import gzip
+from statistics import mean, median, mode, StatisticsError
+
+# =======================================================================================================
+# Helper classes
+# =======================================================================================================
+class GenericStatisticsCalculator:
+    """
+    Provides basic statistical analysis over a number of samples that represent a KPI changing over time.
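+
+    Illustrative usage (example added for clarity, not in the original patch):
+
+        calc = GenericStatisticsCalculator("%")
+        for v in (1, 2, 2):
+            calc.insert_stat(v)
+        calc.get_min()   # -> 1
+        calc.get_max()   # -> 2
+        calc.get_mode()  # -> 2
+        calc.dump_json(verbose=False)
+        # -> {"minimum": 1, "maximum": 2, "mean": 1.666..., "median": 2, "mode": 2, "unit": "%"}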
+    """
+
+    def __init__(self, unit: str) -> None:
+        self.__stats = list()
+        self.__unit = unit
+
+    def insert_stat(self, value) -> None:
+        self.__stats.append(value)
+
+    def get_min(self):
+        return min(self.__stats)
+
+    def get_max(self):
+        return max(self.__stats)
+
+    def get_mean(self):
+        return mean(self.__stats)
+
+    def get_median(self):
+        return median(self.__stats)
+
+    def get_mode(self):
+        try:
+            return mode(self.__stats)
+        except StatisticsError:
+            return "no unique mode"
+
+    def get_unit(self):
+        return self.__unit
+
+    def get_all_stats(self):
+        return self.__stats
+
+    def dump_json(self, verbose) -> dict:
+        statistics = dict()
+
+        if len(self.__stats) > 0:
+            statistics["minimum"] = self.get_min()
+            statistics["maximum"] = self.get_max()
+            statistics["mean"] = self.get_mean()
+            statistics["median"] = self.get_median()
+            statistics["mode"] = self.get_mode()
+            statistics["unit"] = self.get_unit()
+            if verbose:
+                statistics["stats"] = self.__stats
+                statistics["samples"] = len(self.__stats)
+
+        return statistics
+
+
+class CgroupTasksStatistics:
+    """
+    Stores all important statistical information associated with a Linux Cgroup
+    """
+
+    def __init__(self) -> None:
+        self.cpu = GenericStatisticsCalculator("%")
+        self.memory = GenericStatisticsCalculator("bytes")
+        self.io = GenericStatisticsCalculator("bytes")
+        self.memory_failcnt = GenericStatisticsCalculator("")
+        self.cpu_throttle = GenericStatisticsCalculator("%")
+
+    def insert_cpu_stats(self, stats: dict, sample_index: int) -> None:
+        if "cpu_tot" in stats:
+            self.cpu.insert_stat(stats["cpu_tot"]["user"] + stats["cpu_tot"]["sys"])
+        else:
+            print(f"WARNING: The JSON file provided does not contain the 'cpu_tot' measurement for sample #{sample_index}. Skipping this sample.")
+
+        if "throttling" in stats:
+            cpu_throttle_percentage = 0
+            if stats["throttling"]["nr_periods"] > 0:
+                cpu_throttle_percentage = (stats["throttling"]["nr_throttled"] * 100) / stats["throttling"]["nr_periods"]
+            self.cpu_throttle.insert_stat(cpu_throttle_percentage)
+        else:
+            print(f"WARNING: The JSON file provided does not contain the 'throttling' measurement for sample #{sample_index}. Skipping this sample.")
+
+    def dump_cpu_stats(self, verbose) -> dict:
+        return self.cpu.dump_json(verbose)
+
+    def dump_cpu_throttle_stats(self, verbose) -> dict:
+        return self.cpu_throttle.dump_json(verbose)
+
+    def insert_memory_stats(self, stats: dict, sample_index: int) -> None:
+        if "stat.rss" in stats:
+            self.memory.insert_stat(stats["stat.rss"])
+        else:
+            print(f"WARNING: The JSON file provided does not contain the 'stat.rss' measurement for sample #{sample_index}. Skipping this sample.")
+        if "events.failcnt" in stats:
+            self.memory_failcnt.insert_stat(stats["events.failcnt"])
+        else:
+            print(
+                f"WARNING: The JSON file provided does not contain the 'events.failcnt' measurement for sample #{sample_index}. Skipping this sample."
+            )
+
+    def dump_memory_stats(self, verbose) -> dict:
+        return self.memory.dump_json(verbose)
+
+    def dump_memory_failcnt_stats(self, verbose) -> dict:
+        return self.memory_failcnt.dump_json(verbose)
+
+    # cgroup_blkio not yet available
+    # def insert_io_stats(self, stats: dict) -> None:
+    #    self.io.insert_stat(stats["io_rchar"] + stats["io_wchar"])
+    # def dump_io_stats(self) -> None:
+    #    return self.io.dump_json()
+
+
+# =======================================================================================================
+# CmonitorStatisticsEngine
+# =======================================================================================================
+class CmonitorStatisticsEngine:
+    """
+    Interface between the JSON structure collected by cmonitor_collector and the statistical calculators
+    """
+
+    def __init__(self, be_verbose=False) -> None:
+        self.cgroup_statistics = CgroupTasksStatistics()
+        self.verbose = be_verbose
+
+    def process(self, json_data) -> None:
+        """
+        Loads the provided JSON data and runs all statistical analyses on it.
+        """
+        if "samples" not in json_data:
+            print("Unexpected JSON format. Aborting.")
+            sys.exit(1)
+        if len(json_data["samples"]) <= 2:
+            print("This tool requires at least 3 samples in the input JSON file. Aborting.")
+            sys.exit(1)
+
+        # skip sample 0 because it contains fewer statistics due to the differential logic that requires some
+        # initialization sample for most of the stats
+        first_sample = json_data["samples"][1]
+        do_cpu_stats = True
+        if "cgroup_cpuacct_stats" not in first_sample:
+            do_cpu_stats = False
+            print(
+                "WARNING: The JSON file provided does not contain measurements for the 'cpuacct' cgroup. Please use '--collect=cgroup_cpu' when launching cmonitor_collector."
+            )
+        elif "cpu_tot" not in first_sample["cgroup_cpuacct_stats"]:
+            do_cpu_stats = False
+            print(
+                "WARNING: The JSON file provided does not contain the 'cpu_tot' measurement. Probably it was produced by cmonitor version 1.7-0 or earlier. Skipping CPU statistics."
+            )
+
+        do_memory_stats = True
+        if "cgroup_memory_stats" not in first_sample:
+            do_memory_stats = False
+            print(
+                "WARNING: The JSON file provided does not contain measurements for the 'memory' cgroup. Please use '--collect=cgroup_memory' when launching cmonitor_collector."
+ ) + + for sample in json_data["samples"][1:]: + try: + nsample = sample["timestamp"]["sample_index"] + except KeyError: + nsample = -1 + if do_cpu_stats: + self.cgroup_statistics.insert_cpu_stats(sample["cgroup_cpuacct_stats"], nsample) + if do_memory_stats: + self.cgroup_statistics.insert_memory_stats(sample["cgroup_memory_stats"], nsample) + + # self.cgroup_statistics.insert_io_stats(stats) # cgroup_blkio not yet available + + def __dump_json_to_file( + self, + statistics: dict, + outfile: str, + ) -> None: + print(f"Opening output file {outfile}") + with open(outfile, "w") as of: + json.dump(statistics, of) + + def dump_statistics_json(self, output_file=None) -> None: + """ + Writes the result of the statistical analysis on a file or to stdout + """ + statistics = { + "statistics": { + "cpu": self.cgroup_statistics.dump_cpu_stats(self.verbose), + "cpu_throttle": self.cgroup_statistics.dump_cpu_throttle_stats(self.verbose), + "memory": self.cgroup_statistics.dump_memory_stats(self.verbose), + "memory_failcnt": self.cgroup_statistics.dump_memory_failcnt_stats(self.verbose), + # "io": self.cgroup_statistics.dump_io_stats(), + } + } + + if output_file: + self.__dump_json_to_file(statistics, output_file) + else: + print("Result of analysis:") + print(json.dumps(statistics, indent=4, sort_keys=True)) + + def get_cgroup_statistics(self): + return self.cgroup_statistics diff --git a/tools/filter/cmonitor_filter.py b/tools/filter/cmonitor_filter.py index 660fba77..0297ffa8 100755 --- a/tools/filter/cmonitor_filter.py +++ b/tools/filter/cmonitor_filter.py @@ -20,6 +20,7 @@ from cmonitor_loader import CmonitorCollectorJsonLoader from cmonitor_version import CmonitorToolVersion +from cmonitor_filter_engine import CmonitorFilterEngine # ======================================================================================================= # GLOBALs @@ -27,114 +28,6 @@ verbose = False -# ======================================================================================================= -# CLASS -# ======================================================================================================= -class CmonitorFilter: - def __init__(self, input_file, output_file) -> None: - global verbose - self.input_file = input_file - self.output_file = output_file - self.json_data = CmonitorCollectorJsonLoader().load(self.input_file, this_tool_version=CmonitorToolVersion().get(), be_verbose=verbose) - - def __write_output_file(self): - global verbose - - n_samples = len(self.json_data["samples"]) - if self.output_file: # use has provided an output file... dump on disk: - dest_dir = os.path.dirname(self.output_file) - if not os.path.exists(dest_dir): - os.makedirs(dest_dir) - with open(self.output_file, "w") as f: - json.dump(self.json_data, f) - if verbose: - print(f"Wrote {n_samples} samples into {self.output_file}") - else: - print(json.dumps(self.json_data)) - if verbose: - print(f"Wrote {n_samples} samples on standard output") - - def filter_by_time(self, start_timestamp=None, end_timestamp=None): - """ - Filter samples outside the given interval. - One of the two timestamps can be None. 
- """ - - assert start_timestamp is None or isinstance(start_timestamp, datetime.datetime) - assert end_timestamp is None or isinstance(end_timestamp, datetime.datetime) - - n_removed_samples = 0 - - def _filter_by_both_starttime_endtime(sample): - nonlocal n_removed_samples - # convert from string to datetime object: - sample_datetime = datetime.datetime.strptime(sample["timestamp"]["UTC"], "%Y-%m-%dT%H:%M:%S.%f") - # filter: - if not (start_timestamp <= sample_datetime <= end_timestamp): - self.json_data["samples"].remove(sample) - n_removed_samples += 1 - - def _filter_only_by_starttime(sample): - nonlocal n_removed_samples - # convert from string to datetime object: - sample_datetime = datetime.datetime.strptime(sample["timestamp"]["UTC"], "%Y-%m-%dT%H:%M:%S.%f") - print(sample_datetime) - # filter: - if not (start_timestamp <= sample_datetime): - self.json_data["samples"].remove(sample) - n_removed_samples += 1 - - def _filter_only_by_endtime(sample): - nonlocal n_removed_samples - # convert from string to datetime object: - sample_datetime = datetime.datetime.strptime(sample["timestamp"]["UTC"], "%Y-%m-%dT%H:%M:%S.%f") - # filter: - if not (sample_datetime <= end_timestamp): - self.json_data["samples"].remove(sample) - n_removed_samples += 1 - - if start_timestamp and end_timestamp: - for sample in self.json_data["samples"]: - _filter_by_both_starttime_endtime(sample) - if verbose: - print(f"Filtering samples by start and end timestamp [{start_timestamp}-{end_timestamp}]. Removed {n_removed_samples} samples.") - elif start_timestamp: - for sample in self.json_data["samples"]: - _filter_only_by_starttime(sample) - if verbose: - print(f"Filtering samples by start timestamp [{start_timestamp}]. Removed {n_removed_samples} samples.") - elif end_timestamp: - for sample in self.json_data["samples"]: - _filter_only_by_endtime(sample) - if verbose: - print(f"Filtering samples by end timestamp [{end_timestamp}]. Removed {n_removed_samples} samples.") - else: - assert False - - self.__write_output_file() - - def filter_by_task_name(self, task_name: str): - """ - Filter tasks by given name - """ - - # we cannot iterate over a list on which we're - # original_data = - - samples_copy = self.json_data["samples"].copy() - n_removed_tasks = 0 - - for sample_idx, sample in enumerate(samples_copy): - if "cgroup_tasks" in sample: - for pid_sample in sample["cgroup_tasks"].copy(): - if task_name not in sample["cgroup_tasks"][pid_sample]["cmd"]: - del self.json_data["samples"][sample_idx]["cgroup_tasks"][pid_sample] - n_removed_tasks += 1 - - if verbose: - print(f"Filtering samples by task name [{task_name}]. 
Removed {n_removed_tasks} tasks.") - self.__write_output_file() - # ======================================================================================================= # MAIN HELPERS @@ -228,12 +121,16 @@ def parse_command_line(): if __name__ == "__main__": config = parse_command_line() - filter_engine = CmonitorFilter(config["input_json"], config["output_file"]) + + json_data = CmonitorCollectorJsonLoader().load(config["input_json"], this_tool_version=CmonitorToolVersion().get(), be_verbose=verbose) + filter_engine = CmonitorFilterEngine(json_data, config["output_file"], be_verbose=verbose) if config["start_timestamp"] or config["end_timestamp"]: filter_engine.filter_by_time(config["start_timestamp"], config["end_timestamp"]) + filter_engine.write_output_file() elif config["task_name"]: filter_engine.filter_by_task_name(config["task_name"]) + filter_engine.write_output_file() else: print("Please provide at least one filter criteria using CLI options. Use --help for more info.") sys.exit(os.EX_USAGE) diff --git a/tools/spec/tools.spec b/tools/spec/tools.spec index 8e68838a..f41b820a 100644 --- a/tools/spec/tools.spec +++ b/tools/spec/tools.spec @@ -34,13 +34,11 @@ containers in real-time. %install rm -rf %{buildroot} -%make_install -C tools BINDIR=%{_bindir} PYTHON3_SITELIB=%{python3_sitelib} +# this command invokes the root Makefile of cmonitor repo, from inside the source tarball +# produced by COPR that will pass all the options listed here to tools/Makefile +%make_install -C tools BINDIR=%{_bindir} PYTHON3_SITELIB=%{python3_sitelib} CMONITOR_LAST_COMMIT_HASH=__LAST_COMMIT_HASH__ %files -%{_bindir}/cmonitor_chart -%{_bindir}/cmonitor_filter -%{_bindir}/cmonitor_statistics -%{python3_sitelib}/cmonitor_loader.py -%{python3_sitelib}/cmonitor_version.py -%{python3_sitelib}//__pycache__/cmonitor_loader.*.pyc -%{python3_sitelib}//__pycache__/cmonitor_version.*.pyc +%{_bindir}/cmonitor_* +%{python3_sitelib}/cmonitor_*.py +%{python3_sitelib}//__pycache__/cmonitor_*.*.pyc diff --git a/tools/statistics/cmonitor_statistics.py b/tools/statistics/cmonitor_statistics.py index 9305acc5..a8deb8c6 100755 --- a/tools/statistics/cmonitor_statistics.py +++ b/tools/statistics/cmonitor_statistics.py @@ -15,6 +15,7 @@ from statistics import mean, median, mode, StatisticsError from cmonitor_loader import CmonitorCollectorJsonLoader from cmonitor_version import CmonitorToolVersion +from cmonitor_statistics_engine import CmonitorStatisticsEngine # ======================================================================================================= # GLOBALs @@ -22,186 +23,6 @@ verbose = False -# ======================================================================================================= -# CLASS -# ======================================================================================================= -class CmonitorStatistics: - class Statistics: - def __init__(self, unit: str) -> None: - self.__stats = list() - self.__unit = unit - - def insert_stat(self, value) -> None: - self.__stats.append(value) - - def __min(self): - return min(self.__stats) - - def __max(self): - return max(self.__stats) - - def __mean(self): - return mean(self.__stats) - - def __median(self): - return median(self.__stats) - - def __mode(self): - try: - return mode(self.__stats) - except StatisticsError: - return "no unique mode" - - def dump_json(self) -> dict: - global verbose - statistics = dict() - - if len(self.__stats) > 0: - statistics["minimum"] = self.__min() - statistics["maximum"] = self.__max() - 
statistics["mean"] = self.__mean() - statistics["median"] = self.__median() - statistics["mode"] = self.__mode() - statistics["unit"] = self.__unit - if verbose: - statistics["stats"] = self.__stats - statistics["samples"] = len(self.__stats) - - return statistics - - class CgroupTasksStatistics: - def __init__(self) -> None: - self.cpu = CmonitorStatistics.Statistics("%") - self.memory = CmonitorStatistics.Statistics("bytes") - self.io = CmonitorStatistics.Statistics("bytes") - self.memory_failcnt = CmonitorStatistics.Statistics("") - self.cpu_throttle = CmonitorStatistics.Statistics("%") - - def insert_cpu_stats(self, stats: dict, sample_index: int) -> None: - if "cpu_tot" in stats: - self.cpu.insert_stat(stats["cpu_tot"]["user"] + stats["cpu_tot"]["sys"]) - else: - print(f"WARNING: The JSON file provided does not contain the 'cpu_tot' measurement for sample #{sample_index}. Skipping this sample.") - - if "throttling" in stats: - cpu_throttle_percentage = 0 - if stats["throttling"]["nr_periods"] > 0: - cpu_throttle_percentage = (stats["throttling"]["nr_throttled"] * 100) / stats["throttling"]["nr_periods"] - self.cpu_throttle.insert_stat(cpu_throttle_percentage) - else: - print( - f"WARNING: The JSON file provided does not contain the 'throttling' measurement for sample #{sample_index}. Skipping this sample." - ) - - def dump_cpu_stats(self) -> None: - return self.cpu.dump_json() - - def dump_cpu_throttle_stats(self) -> None: - return self.cpu_throttle.dump_json() - - def insert_memory_stats(self, stats: dict, sample_index: int) -> None: - if "stat.rss" in stats: - self.memory.insert_stat(stats["stat.rss"]) - else: - print( - f"WARNING: The JSON file provided does not contain the 'stat.rss' measurement for sample #{sample_index}. Skipping this sample." - ) - if "events.failcnt" in stats: - self.memory_failcnt.insert_stat(stats["events.failcnt"]) - else: - print( - f"WARNING: The JSON file provided does not contain the 'events.failcnt' measurement for sample #{sample_index}. Skipping this sample." - ) - - def dump_memory_stats(self) -> None: - return self.memory.dump_json() - - def dump_memory_failcnt_stats(self) -> None: - return self.memory_failcnt.dump_json() - - # cgroup_blkio not yet available - # def insert_io_stats(self, stats: dict) -> None: - # self.io.insert_stat(stats["io_rchar"] + stats["io_wchar"]) - # def dump_io_stats(self) -> None: - # return self.io.dump_json() - - def __init__(self) -> None: - self.cgroup_statistics = self.CgroupTasksStatistics() - pass - - def process(self, input_json: str, output_file: str) -> None: - global verbose - json_data = CmonitorCollectorJsonLoader().load(input_json, this_tool_version=CmonitorToolVersion().get(), be_verbose=verbose) - if "samples" not in json_data: - print("Unexpected JSON format. Aborting.") - sys.exit(1) - if len(json_data["samples"]) <= 2: - print("This tool requires at least 3 samples in the input JSON file. Aborting.") - sys.exit(1) - - # skip sample 0 because it contains less statistics due to the differential logic that requires some - # initialization sample for most of the stats - first_sample = json_data["samples"][1] - do_cpu_stats = True - if "cgroup_cpuacct_stats" not in first_sample: - do_cpu_stats = False - print( - "WARNING: The JSON file provided does not contain measurements for the 'cpuacct' cgroup. Please use '--collect=cgroup_cpu' when launching cmonitor_collector." 
- ) - elif "cpu_tot" not in first_sample["cgroup_cpuacct_stats"]: - do_cpu_stats = False - print( - "WARNING: The JSON file provided does not contain the 'cpu_tot' measurement. Probably it was produced by cmonitor version 1.7-0 or earlier. Skipping CPU statistics." - ) - - do_memory_stats = True - if "cgroup_memory_stats" not in first_sample: - do_memory_stats = False - print( - "WARNING: The JSON file provided does not contain measurements for the 'memory' cgroup. Please use '--collect=cgroup_memory' when launching cmonitor_collector." - ) - - for sample in json_data["samples"][1:]: - try: - nsample = sample["timestamp"]["sample_index"] - except KeyError: - nsample = -1 - if do_cpu_stats: - self.cgroup_statistics.insert_cpu_stats(sample["cgroup_cpuacct_stats"], nsample) - if do_memory_stats: - self.cgroup_statistics.insert_memory_stats(sample["cgroup_memory_stats"], nsample) - - # self.cgroup_statistics.insert_io_stats(stats) # cgroup_blkio not yet available - - self.dump_statistics_json(output_file) - - def __dump_json_to_file( - self, - statistics: dict, - outfile: str, - ) -> None: - print(f"Opening output file {outfile}") - with open(outfile, "w") as of: - json.dump(statistics, of) - - def dump_statistics_json(self, output_file="") -> None: - statistics = { - "statistics": { - "cpu": self.cgroup_statistics.dump_cpu_stats(), - "cpu_throttle": self.cgroup_statistics.dump_cpu_throttle_stats(), - "memory": self.cgroup_statistics.dump_memory_stats(), - "memory_failcnt": self.cgroup_statistics.dump_memory_failcnt_stats(), - # "io": self.cgroup_statistics.dump_io_stats(), - } - } - - if output_file: - self.__dump_json_to_file(statistics, output_file) - else: - print("Result of analysis:") - print(json.dumps(statistics, indent=4, sort_keys=True)) - - # ======================================================================================================= # MAIN HELPERS # ======================================================================================================= @@ -254,4 +75,7 @@ def parse_command_line(): if __name__ == "__main__": config = parse_command_line() - CmonitorStatistics().process(config["input_json"], config["output_file"]) + json_data = CmonitorCollectorJsonLoader().load(config["input_json"], this_tool_version=CmonitorToolVersion().get(), be_verbose=verbose) + engine = CmonitorStatisticsEngine() + engine.process(json_data) + engine.dump_statistics_json(config["output_file"])
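NOTE: after this refactoring both CLI tools reduce to "load JSON, run engine, dump result". A minimal
usage sketch of the extracted engines follows (illustrative only; the file names "run.json.gz",
"stats.json", "filtered.json" and the task name "redis" are example values, not part of the patch):

    from cmonitor_loader import CmonitorCollectorJsonLoader
    from cmonitor_version import CmonitorToolVersion
    from cmonitor_statistics_engine import CmonitorStatisticsEngine
    from cmonitor_filter_engine import CmonitorFilterEngine

    json_data = CmonitorCollectorJsonLoader().load("run.json.gz", this_tool_version=CmonitorToolVersion().get(), be_verbose=True)

    # statistics: min/max/mean/median/mode for the cgroup CPU, CPU-throttling and memory KPIs
    engine = CmonitorStatisticsEngine(be_verbose=True)
    engine.process(json_data)
    engine.dump_statistics_json("stats.json")  # pass None to print to stdout instead

    # filtering: drop tasks whose command line does not contain "redis", then write the result
    filter_engine = CmonitorFilterEngine(json_data, output_file="filtered.json", be_verbose=True)
    filter_engine.filter_by_task_name("redis")
    filter_engine.write_output_file()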