From 09c4c456cd0069606cf373b623662af3ea3e8fb5 Mon Sep 17 00:00:00 2001
From: shirady <57721533+shirady@users.noreply.github.com>
Date: Wed, 27 Nov 2024 13:33:41 +0200
Subject: [PATCH] NC | NSFS | Disable Prometheus reporting

1. The function clusterMetrics might throw an error (with the error message
   "Operation timed out."). Since it is not wrapped in a try-catch block, a
   failure ends with an uncaughtException. To avoid that we wrap it with a
   try-catch block, log an error message, and return from the function (we
   cannot proceed with an undefined metrics variable). Also added a JSON
   response with the details of the error in case it happens.
2. I noticed that the call prom_reporting.start_server(metrics_port, true); in
   fork_utils.js was missing an await and added it - this required making
   start_workers async, updating the @returns part of its JSDoc, and
   separating the call from the condition in endpoint.js.
3. Add a comment in the endpoint main for developers regarding implementation
   (running on the main process, running with multiple forks).
4. Change the handling in the function gather_metrics so we print the JSON
   error object that we returned from the server.
5. Change the printing of "_create_nsfs_report: nsfs_report" so we can see the
   object (we used to see: "core.server.analytic_services.prometheus_reporting::
   _create_nsfs_report: nsfs_report [object Object]").
6. Rename a function (we had a minor typo) from nsfs_io_state_handler to
   nsfs_io_stats_handler.
7. Change the order inside the function start_server in prometheus_reporting.js
   and handle the if (req.url === '/metrics/nsfs_stats') { branch first,
   because it does not use metrics.

Signed-off-by: shirady <57721533+shirady@users.noreply.github.com>
---
 src/endpoint/endpoint.js                      | 12 ++++++++-
 src/manage_nsfs/diagnose.js                   | 12 +++++++--
 .../analytic_services/prometheus_reporting.js | 27 ++++++++++++++-----
 src/util/fork_utils.js                        | 10 +++----
 4 files changed, 46 insertions(+), 15 deletions(-)

diff --git a/src/endpoint/endpoint.js b/src/endpoint/endpoint.js
index 3025bc05f1..c4c817500e 100755
--- a/src/endpoint/endpoint.js
+++ b/src/endpoint/endpoint.js
@@ -118,7 +118,17 @@ async function main(options = {}) {
         // the primary just forks and returns, workers will continue to serve
         fork_count = options.forks ?? config.ENDPOINT_FORKS;
         const metrics_port = options.metrics_port || config.EP_METRICS_SERVER_PORT;
-        if (fork_utils.start_workers(metrics_port, fork_count)) return;
+        /**
+         * Please notice that we can run the main in 2 states:
+         * 1. Only the primary process runs the main (fork is 0 or undefined) - everything that
+         *    is implemented here would be run by this process.
+         * 2. A primary process with multiple forks (IMPORTANT) - if there is an implementation that
+         *    is only relevant to the primary process it should be implemented in
+         *    fork_utils.start_workers, because the primary process returns after start_workers
+         *    and the forks will continue executing the code lines in this function
+         * */
+        const is_workers_started_from_primary = await fork_utils.start_workers(metrics_port, fork_count);
+        if (is_workers_started_from_primary) return;
 
         const endpoint_group_id = process.env.ENDPOINT_GROUP_ID || 'default-endpoint-group';
 
diff --git a/src/manage_nsfs/diagnose.js b/src/manage_nsfs/diagnose.js
index 56d65266d2..2e87015b31 100644
--- a/src/manage_nsfs/diagnose.js
+++ b/src/manage_nsfs/diagnose.js
@@ -59,9 +59,17 @@ async function gather_metrics() {
             const buffer = await buffer_utils.read_stream_join(res);
             const body = buffer.toString('utf8');
             metrics_output = JSON.parse(body);
+            if (!metrics_output) throw new Error('received empty metrics response', { cause: res.statusCode });
+            write_stdout_response(ManageCLIResponse.MetricsStatus, metrics_output);
+        } else if (res.statusCode >= 500 && res.rawHeaders.includes('application/json')) {
+            const buffer = await buffer_utils.read_stream_join(res);
+            const body = buffer.toString('utf8');
+            const error_output = JSON.parse(body);
+            if (!error_output) throw new Error('received empty metrics response', { cause: res.statusCode });
+            throw_cli_error({ ...ManageCLIError.MetricsStatusFailed, ...error_output });
+        } else {
+            throw new Error('received empty metrics response', { cause: res.statusCode });
         }
-        if (!metrics_output) throw new Error('recieved empty metrics response', { cause: res.statusCode });
-        write_stdout_response(ManageCLIResponse.MetricsStatus, metrics_output);
     } catch (err) {
         dbg.warn('could not receive metrics response', err);
         throw_cli_error({ ...ManageCLIError.MetricsStatusFailed, cause: err?.errors?.[0] || err });
diff --git a/src/server/analytic_services/prometheus_reporting.js b/src/server/analytic_services/prometheus_reporting.js
index 164e4d502b..caee6bf86b 100644
--- a/src/server/analytic_services/prometheus_reporting.js
+++ b/src/server/analytic_services/prometheus_reporting.js
@@ -61,12 +61,7 @@ async function start_server(
     const server = http.createServer(async (req, res) => {
         // Serve all metrics on the root path for system that do have one or more fork running.
         if (fork_enabled) {
-            const metrics = await aggregatorRegistry.clusterMetrics();
-            if (req.url === '' || req.url === '/') {
-                res.writeHead(200, { 'Content-Type': aggregatorRegistry.contentType });
-                res.end(metrics);
-                return;
-            }
+            // we would like this part to be first as clusterMetrics might fail.
             if (req.url === '/metrics/nsfs_stats') {
                 res.writeHead(200, { 'Content-Type': 'text/plain' });
                 const nsfs_report = {
@@ -77,6 +72,24 @@ async function start_server(
                 res.end(JSON.stringify(nsfs_report));
                 return;
             }
+            let metrics;
+            try {
+                metrics = await aggregatorRegistry.clusterMetrics();
+            } catch (err) {
+                dbg.error('start_server: Could not get the metrics, got an error', err);
+                res.writeHead(504, { 'Content-Type': 'application/json' });
+                const reply = JSON.stringify({
+                    error: 'Internal server error - timeout',
+                    message: 'Looks like the server is taking a long time to respond (Could not get the metrics)',
+                });
+                res.end(reply);
+                return;
+            }
+            if (req.url === '' || req.url === '/') {
+                res.writeHead(200, { 'Content-Type': aggregatorRegistry.contentType });
+                res.end(metrics);
+                return;
+            }
             // Serve report's metrics on the report name path
             const report_name = req.url.substr(1);
             const single_metrics = export_single_metrics(metrics, report_name);
@@ -165,7 +178,7 @@ async function metrics_nsfs_stats_handler() {
         op_stats_counters: op_stats_counters,
         fs_worker_stats_counters: fs_worker_stats_counters
     };
-    dbg.log1(`_create_nsfs_report: nsfs_report ${nsfs_report}`);
+    dbg.log1('_create_nsfs_report: nsfs_report', nsfs_report);
     return JSON.stringify(nsfs_report);
 }
diff --git a/src/util/fork_utils.js b/src/util/fork_utils.js
index 5b6e6378e8..52c00d7b60 100644
--- a/src/util/fork_utils.js
+++ b/src/util/fork_utils.js
@@ -30,9 +30,9 @@ const fs_workers_stats = {};
  *
  * @param {number?} count number of workers to start.
  * @param {number?} metrics_port prometheus metris port.
- * @returns {boolean} true if workers were started.
+ * @returns {Promise} true if workers were started.
  */
-function start_workers(metrics_port, count = 0) {
+async function start_workers(metrics_port, count = 0) {
     const exit_events = [];
     if (cluster.isPrimary && count > 0) {
         for (let i = 0; i < count; ++i) {
@@ -68,12 +68,12 @@ function start_workers(metrics_port, count = 0) {
         });
         for (const id in cluster.workers) {
             if (id) {
-                cluster.workers[id].on('message', nsfs_io_state_handler);
+                cluster.workers[id].on('message', nsfs_io_stats_handler);
             }
         }
         if (metrics_port > 0) {
             dbg.log0('Starting metrics server', metrics_port);
-            prom_reporting.start_server(metrics_port, true);
+            await prom_reporting.start_server(metrics_port, true);
             dbg.log0('Started metrics server successfully');
         }
         return true;
@@ -82,7 +82,7 @@
     return false;
 }
 
-function nsfs_io_state_handler(msg) {
+function nsfs_io_stats_handler(msg) {
     if (msg.io_stats) {
        for (const [key, value] of Object.entries(msg.io_stats)) {
            io_stats[key] += value;
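
For reference, below is a minimal standalone sketch of the error-handling pattern described in
items 1 and 7 above. It assumes prom-client's AggregatorRegistry (the library behind
aggregatorRegistry in prometheus_reporting.js); the port number, the placeholder nsfs_stats
payload, and the standalone server wiring are illustrative only and not part of this patch.

    'use strict';
    // Run in the cluster primary; clusterMetrics() aggregates the registries of the worker processes.
    const http = require('http');
    const { AggregatorRegistry } = require('prom-client');
    const aggregatorRegistry = new AggregatorRegistry();

    http.createServer(async (req, res) => {
        // Routes that do not depend on aggregation are served first,
        // so a clusterMetrics() timeout cannot take them down.
        if (req.url === '/metrics/nsfs_stats') {
            res.writeHead(200, { 'Content-Type': 'text/plain' });
            res.end(JSON.stringify({ nsfs_counters: {} })); // placeholder payload for the sketch
            return;
        }
        let metrics;
        try {
            // May reject (e.g. "Operation timed out.") if a worker does not answer in time.
            metrics = await aggregatorRegistry.clusterMetrics();
        } catch (err) {
            // Reply with a JSON error body instead of letting the rejection escape the handler
            // and crash the process.
            console.error('could not aggregate metrics from workers', err);
            res.writeHead(504, { 'Content-Type': 'application/json' });
            res.end(JSON.stringify({ error: 'Internal server error - timeout' }));
            return;
        }
        res.writeHead(200, { 'Content-Type': aggregatorRegistry.contentType });
        res.end(metrics);
    }).listen(9090);

Serving the nsfs_stats route before the aggregation call keeps it available even when collecting
metrics across forks times out, which is the reason for the reordering in item 7.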