diff --git a/CHANGELOG.md b/CHANGELOG.md index 644fb05900ceb..2211f59574718 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Introduce new dynamic cluster setting to control slice computation for concurrent segment search ([#9107](https://github.com/opensearch-project/OpenSearch/pull/9107)) - Implement on behalf of token passing for extensions ([#8679](https://github.com/opensearch-project/OpenSearch/pull/8679)) - Provide service accounts tokens to extensions ([#9618](https://github.com/opensearch-project/OpenSearch/pull/9618)) +- [Admission control] Add Resource usage collector service and resource usage tracker ([#9890](https://github.com/opensearch-project/OpenSearch/pull/9890)) ### Dependencies - Bump `log4j-core` from 2.18.0 to 2.19.0 diff --git a/server/src/main/java/org/opensearch/action/admin/cluster/node/stats/NodeStats.java b/server/src/main/java/org/opensearch/action/admin/cluster/node/stats/NodeStats.java index dd36b3b8db3ab..69efea186d927 100644 --- a/server/src/main/java/org/opensearch/action/admin/cluster/node/stats/NodeStats.java +++ b/server/src/main/java/org/opensearch/action/admin/cluster/node/stats/NodeStats.java @@ -56,6 +56,7 @@ import org.opensearch.monitor.os.OsStats; import org.opensearch.monitor.process.ProcessStats; import org.opensearch.node.AdaptiveSelectionStats; +import org.opensearch.node.NodesResourceUsageStats; import org.opensearch.script.ScriptCacheStats; import org.opensearch.script.ScriptStats; import org.opensearch.search.backpressure.stats.SearchBackpressureStats; @@ -142,6 +143,9 @@ public class NodeStats extends BaseNodeResponse implements ToXContentFragment { @Nullable private SearchPipelineStats searchPipelineStats; + @Nullable + private NodesResourceUsageStats resourceUsageStats; + public NodeStats(StreamInput in) throws IOException { super(in); timestamp = in.readVLong(); @@ -198,6 +202,11 @@ public NodeStats(StreamInput in) throws IOException { } else { searchPipelineStats = null; } + if (in.getVersion().onOrAfter(Version.V_3_0_0)) { // make it 2.12 when we backport + resourceUsageStats = in.readOptionalWriteable(NodesResourceUsageStats::new); + } else { + resourceUsageStats = null; + } } public NodeStats( @@ -216,6 +225,7 @@ public NodeStats( @Nullable DiscoveryStats discoveryStats, @Nullable IngestStats ingestStats, @Nullable AdaptiveSelectionStats adaptiveSelectionStats, + @Nullable NodesResourceUsageStats resourceUsageStats, @Nullable ScriptCacheStats scriptCacheStats, @Nullable IndexingPressureStats indexingPressureStats, @Nullable ShardIndexingPressureStats shardIndexingPressureStats, @@ -241,6 +251,7 @@ public NodeStats( this.discoveryStats = discoveryStats; this.ingestStats = ingestStats; this.adaptiveSelectionStats = adaptiveSelectionStats; + this.resourceUsageStats = resourceUsageStats; this.scriptCacheStats = scriptCacheStats; this.indexingPressureStats = indexingPressureStats; this.shardIndexingPressureStats = shardIndexingPressureStats; @@ -344,6 +355,11 @@ public AdaptiveSelectionStats getAdaptiveSelectionStats() { return adaptiveSelectionStats; } + @Nullable + public NodesResourceUsageStats getResourceUsageStats() { + return resourceUsageStats; + } + @Nullable public ScriptCacheStats getScriptCacheStats() { return scriptCacheStats; @@ -430,6 +446,9 @@ public void writeTo(StreamOutput out) throws IOException { if (out.getVersion().onOrAfter(Version.V_2_9_0)) { out.writeOptionalWriteable(searchPipelineStats); } + if (out.getVersion().onOrAfter(Version.V_3_0_0)) { // make it 2.12 when we backport + out.writeOptionalWriteable(resourceUsageStats); + } } @Override @@ -520,7 +539,9 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws if (getSearchPipelineStats() != null) { getSearchPipelineStats().toXContent(builder, params); } - + if (getResourceUsageStats() != null) { + getResourceUsageStats().toXContent(builder, params); + } return builder; } } diff --git a/server/src/main/java/org/opensearch/action/admin/cluster/node/stats/NodesStatsRequest.java b/server/src/main/java/org/opensearch/action/admin/cluster/node/stats/NodesStatsRequest.java index b0caa469033eb..99c9fb2d1e26a 100644 --- a/server/src/main/java/org/opensearch/action/admin/cluster/node/stats/NodesStatsRequest.java +++ b/server/src/main/java/org/opensearch/action/admin/cluster/node/stats/NodesStatsRequest.java @@ -213,7 +213,8 @@ public enum Metric { WEIGHTED_ROUTING_STATS("weighted_routing"), FILE_CACHE_STATS("file_cache"), TASK_CANCELLATION("task_cancellation"), - SEARCH_PIPELINE("search_pipeline"); + SEARCH_PIPELINE("search_pipeline"), + RESOURCE_USAGE_STATS("resource_usage_stats"); private String metricName; diff --git a/server/src/main/java/org/opensearch/action/admin/cluster/node/stats/TransportNodesStatsAction.java b/server/src/main/java/org/opensearch/action/admin/cluster/node/stats/TransportNodesStatsAction.java index 615abbaef845d..204157236a282 100644 --- a/server/src/main/java/org/opensearch/action/admin/cluster/node/stats/TransportNodesStatsAction.java +++ b/server/src/main/java/org/opensearch/action/admin/cluster/node/stats/TransportNodesStatsAction.java @@ -124,7 +124,8 @@ protected NodeStats nodeOperation(NodeStatsRequest nodeStatsRequest) { NodesStatsRequest.Metric.WEIGHTED_ROUTING_STATS.containedIn(metrics), NodesStatsRequest.Metric.FILE_CACHE_STATS.containedIn(metrics), NodesStatsRequest.Metric.TASK_CANCELLATION.containedIn(metrics), - NodesStatsRequest.Metric.SEARCH_PIPELINE.containedIn(metrics) + NodesStatsRequest.Metric.SEARCH_PIPELINE.containedIn(metrics), + NodesStatsRequest.Metric.RESOURCE_USAGE_STATS.containedIn(metrics) ); } diff --git a/server/src/main/java/org/opensearch/action/admin/cluster/stats/TransportClusterStatsAction.java b/server/src/main/java/org/opensearch/action/admin/cluster/stats/TransportClusterStatsAction.java index 18098bc31432f..d8323e209be23 100644 --- a/server/src/main/java/org/opensearch/action/admin/cluster/stats/TransportClusterStatsAction.java +++ b/server/src/main/java/org/opensearch/action/admin/cluster/stats/TransportClusterStatsAction.java @@ -168,6 +168,7 @@ protected ClusterStatsNodeResponse nodeOperation(ClusterStatsNodeRequest nodeReq false, false, false, + false, false ); List shardsStats = new ArrayList<>(); diff --git a/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java b/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java index ae1bbe5e5f40c..89414cad064ad 100644 --- a/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java +++ b/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java @@ -130,6 +130,7 @@ import org.opensearch.node.Node.DiscoverySettings; import org.opensearch.node.NodeRoleSettings; import org.opensearch.node.remotestore.RemoteStoreNodeService; +import org.opensearch.node.resource.tracker.ResourceTrackerSettings; import org.opensearch.persistent.PersistentTasksClusterService; import org.opensearch.persistent.decider.EnableAssignmentDecider; import org.opensearch.plugins.PluginsService; @@ -657,6 +658,10 @@ public void apply(Settings value, Settings current, Settings previous) { SegmentReplicationPressureService.MAX_REPLICATION_LIMIT_STALE_REPLICA_SETTING, SegmentReplicationPressureService.MAX_ALLOWED_STALE_SHARDS, + // Settings related to resource trackers + ResourceTrackerSettings.GLOBAL_CPU_USAGE_AC_WINDOW_DURATION_SETTING, + ResourceTrackerSettings.GLOBAL_JVM_USAGE_AC_WINDOW_DURATION_SETTING, + // Settings related to Searchable Snapshots Node.NODE_SEARCH_CACHE_SIZE_SETTING, FileCache.DATA_TO_FILE_CACHE_SIZE_RATIO_SETTING, diff --git a/server/src/main/java/org/opensearch/node/Node.java b/server/src/main/java/org/opensearch/node/Node.java index ea2d16c2b172c..13ad4cf19c138 100644 --- a/server/src/main/java/org/opensearch/node/Node.java +++ b/server/src/main/java/org/opensearch/node/Node.java @@ -168,6 +168,7 @@ import org.opensearch.monitor.fs.FsProbe; import org.opensearch.monitor.jvm.JvmInfo; import org.opensearch.node.remotestore.RemoteStoreNodeService; +import org.opensearch.node.resource.tracker.NodeResourceUsageTracker; import org.opensearch.persistent.PersistentTasksClusterService; import org.opensearch.persistent.PersistentTasksExecutor; import org.opensearch.persistent.PersistentTasksExecutorRegistry; @@ -806,7 +807,6 @@ protected Node( remoteStoreStatsTrackerFactory, recoverySettings ); - final AliasValidator aliasValidator = new AliasValidator(); final ShardLimitValidator shardLimitValidator = new ShardLimitValidator(settings, clusterService, systemIndices); @@ -1071,6 +1071,16 @@ protected Node( transportService.getTaskManager(), taskCancellationMonitoringSettings ); + final NodeResourceUsageTracker nodeResourceUsageTracker = new NodeResourceUsageTracker( + threadPool, + settings, + clusterService.getClusterSettings() + ); + final ResourceUsageCollectorService resourceUsageCollectorService = new ResourceUsageCollectorService( + nodeResourceUsageTracker, + clusterService, + threadPool + ); this.nodeService = new NodeService( settings, threadPool, @@ -1092,7 +1102,8 @@ protected Node( searchBackpressureService, searchPipelineService, fileCache, - taskCancellationMonitoringService + taskCancellationMonitoringService, + resourceUsageCollectorService ); final SearchService searchService = newSearchService( @@ -1213,6 +1224,8 @@ protected Node( b.bind(RerouteService.class).toInstance(rerouteService); b.bind(ShardLimitValidator.class).toInstance(shardLimitValidator); b.bind(FsHealthService.class).toInstance(fsHealthService); + b.bind(NodeResourceUsageTracker.class).toInstance(nodeResourceUsageTracker); + b.bind(ResourceUsageCollectorService.class).toInstance(resourceUsageCollectorService); b.bind(SystemIndices.class).toInstance(systemIndices); b.bind(IdentityService.class).toInstance(identityService); b.bind(Tracer.class).toInstance(tracer); @@ -1330,6 +1343,8 @@ public Node start() throws NodeValidationException { injector.getInstance(RepositoriesService.class).start(); injector.getInstance(SearchService.class).start(); injector.getInstance(FsHealthService.class).start(); + injector.getInstance(NodeResourceUsageTracker.class).start(); + injector.getInstance(ResourceUsageCollectorService.class).start(); nodeService.getMonitorService().start(); nodeService.getSearchBackpressureService().start(); nodeService.getTaskCancellationMonitoringService().start(); @@ -1492,6 +1507,8 @@ private Node stop() { injector.getInstance(ClusterService.class).stop(); injector.getInstance(NodeConnectionsService.class).stop(); injector.getInstance(FsHealthService.class).stop(); + injector.getInstance(NodeResourceUsageTracker.class).stop(); + injector.getInstance(ResourceUsageCollectorService.class).stop(); nodeService.getMonitorService().stop(); nodeService.getSearchBackpressureService().stop(); injector.getInstance(GatewayService.class).stop(); @@ -1555,6 +1572,10 @@ public synchronized void close() throws IOException { toClose.add(nodeService.getSearchBackpressureService()); toClose.add(() -> stopWatch.stop().start("fsHealth")); toClose.add(injector.getInstance(FsHealthService.class)); + toClose.add(() -> stopWatch.stop().start("resource_usage_tracker")); + toClose.add(injector.getInstance(NodeResourceUsageTracker.class)); + toClose.add(() -> stopWatch.stop().start("resource_usage_collector")); + toClose.add(injector.getInstance(ResourceUsageCollectorService.class)); toClose.add(() -> stopWatch.stop().start("gateway")); toClose.add(injector.getInstance(GatewayService.class)); toClose.add(() -> stopWatch.stop().start("search")); diff --git a/server/src/main/java/org/opensearch/node/NodeResourceUsageStats.java b/server/src/main/java/org/opensearch/node/NodeResourceUsageStats.java new file mode 100644 index 0000000000000..6ef66d4ac1914 --- /dev/null +++ b/server/src/main/java/org/opensearch/node/NodeResourceUsageStats.java @@ -0,0 +1,81 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.node; + +import org.opensearch.core.common.io.stream.StreamInput; +import org.opensearch.core.common.io.stream.StreamOutput; +import org.opensearch.core.common.io.stream.Writeable; + +import java.io.IOException; +import java.util.Locale; + +/** + * This represents the resource usage stats of a node along with the timestamp at which the stats object was created + * in the respective node + */ +public class NodeResourceUsageStats implements Writeable { + final String nodeId; + long timestamp; + double cpuUtilizationPercent; + double memoryUtilizationPercent; + + public NodeResourceUsageStats(String nodeId, long timestamp, double memoryUtilizationPercent, double cpuUtilizationPercent) { + this.nodeId = nodeId; + this.timestamp = timestamp; + this.cpuUtilizationPercent = cpuUtilizationPercent; + this.memoryUtilizationPercent = memoryUtilizationPercent; + } + + public NodeResourceUsageStats(StreamInput in) throws IOException { + this.nodeId = in.readString(); + this.timestamp = in.readLong(); + this.cpuUtilizationPercent = in.readDouble(); + this.memoryUtilizationPercent = in.readDouble(); + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + out.writeString(this.nodeId); + out.writeLong(this.timestamp); + out.writeDouble(this.cpuUtilizationPercent); + out.writeDouble(this.memoryUtilizationPercent); + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder("NodeResourceUsageStats["); + sb.append(nodeId).append("]("); + sb.append("Timestamp: ").append(timestamp); + sb.append(", CPU utilization percent: ").append(String.format(Locale.ROOT, "%.1f", cpuUtilizationPercent)); + sb.append(", Memory utilization percent: ").append(String.format(Locale.ROOT, "%.1f", memoryUtilizationPercent)); + sb.append(")"); + return sb.toString(); + } + + NodeResourceUsageStats(NodeResourceUsageStats nodeResourceUsageStats) { + this( + nodeResourceUsageStats.nodeId, + nodeResourceUsageStats.timestamp, + nodeResourceUsageStats.memoryUtilizationPercent, + nodeResourceUsageStats.cpuUtilizationPercent + ); + } + + public double getMemoryUtilizationPercent() { + return memoryUtilizationPercent; + } + + public double getCpuUtilizationPercent() { + return cpuUtilizationPercent; + } + + public long getTimestamp() { + return timestamp; + } +} diff --git a/server/src/main/java/org/opensearch/node/NodeService.java b/server/src/main/java/org/opensearch/node/NodeService.java index 2688b894cb9a7..9bb07080fa717 100644 --- a/server/src/main/java/org/opensearch/node/NodeService.java +++ b/server/src/main/java/org/opensearch/node/NodeService.java @@ -83,6 +83,7 @@ public class NodeService implements Closeable { private final ScriptService scriptService; private final HttpServerTransport httpServerTransport; private final ResponseCollectorService responseCollectorService; + private final ResourceUsageCollectorService resourceUsageCollectorService; private final SearchTransportService searchTransportService; private final IndexingPressureService indexingPressureService; private final AggregationUsageService aggregationUsageService; @@ -114,7 +115,8 @@ public class NodeService implements Closeable { SearchBackpressureService searchBackpressureService, SearchPipelineService searchPipelineService, FileCache fileCache, - TaskCancellationMonitoringService taskCancellationMonitoringService + TaskCancellationMonitoringService taskCancellationMonitoringService, + ResourceUsageCollectorService resourceUsageCollectorService ) { this.settings = settings; this.threadPool = threadPool; @@ -137,6 +139,7 @@ public class NodeService implements Closeable { this.clusterService = clusterService; this.fileCache = fileCache; this.taskCancellationMonitoringService = taskCancellationMonitoringService; + this.resourceUsageCollectorService = resourceUsageCollectorService; clusterService.addStateApplier(ingestService); clusterService.addStateApplier(searchPipelineService); } @@ -217,7 +220,8 @@ public NodeStats stats( boolean weightedRoutingStats, boolean fileCacheStats, boolean taskCancellation, - boolean searchPipelineStats + boolean searchPipelineStats, + boolean resourceUsageStats ) { // for indices stats we want to include previous allocated shards stats as well (it will // only be applied to the sensible ones to use, like refresh/merge/flush/indexing stats) @@ -237,6 +241,7 @@ public NodeStats stats( discoveryStats ? discovery.stats() : null, ingest ? ingestService.stats() : null, adaptiveSelection ? responseCollectorService.getAdaptiveStats(searchTransportService.getPendingSearchRequests()) : null, + resourceUsageStats ? resourceUsageCollectorService.stats() : null, scriptCache ? scriptService.cacheStats() : null, indexingPressure ? this.indexingPressureService.nodeStats() : null, shardIndexingPressure ? this.indexingPressureService.shardStats(indices) : null, diff --git a/server/src/main/java/org/opensearch/node/NodesResourceUsageStats.java b/server/src/main/java/org/opensearch/node/NodesResourceUsageStats.java new file mode 100644 index 0000000000000..3dff9a27f71a8 --- /dev/null +++ b/server/src/main/java/org/opensearch/node/NodesResourceUsageStats.java @@ -0,0 +1,69 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.node; + +import org.opensearch.core.common.io.stream.StreamInput; +import org.opensearch.core.common.io.stream.StreamOutput; +import org.opensearch.core.common.io.stream.Writeable; +import org.opensearch.core.xcontent.ToXContentFragment; +import org.opensearch.core.xcontent.XContentBuilder; + +import java.io.IOException; +import java.util.Locale; +import java.util.Map; + +/** + * This class represents resource usage stats such as CPU, Memory and IO resource usage of each node along with the + * timestamp of the stats recorded. + */ +public class NodesResourceUsageStats implements Writeable, ToXContentFragment { + + // Map of node id to resource usage stats of the corresponding node. + private final Map nodeIdToResourceUsageStatsMap; + + public NodesResourceUsageStats(Map nodeIdToResourceUsageStatsMap) { + this.nodeIdToResourceUsageStatsMap = nodeIdToResourceUsageStatsMap; + } + + public NodesResourceUsageStats(StreamInput in) throws IOException { + this.nodeIdToResourceUsageStatsMap = in.readMap(StreamInput::readString, NodeResourceUsageStats::new); + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + out.writeMap(this.nodeIdToResourceUsageStatsMap, StreamOutput::writeString, (stream, stats) -> stats.writeTo(stream)); + } + + /** + * Returns map of node id to resource usage stats of the corresponding node. + */ + public Map getNodeIdToResourceUsageStatsMap() { + return nodeIdToResourceUsageStatsMap; + } + + @Override + public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { + builder.startObject("resource_usage_stats"); + for (String nodeId : nodeIdToResourceUsageStatsMap.keySet()) { + builder.startObject(nodeId); + NodeResourceUsageStats resourceUsageStats = nodeIdToResourceUsageStatsMap.get(nodeId); + if (resourceUsageStats != null) { + builder.field("timestamp", resourceUsageStats.timestamp); + builder.field("cpu_utilization_percent", String.format(Locale.ROOT, "%.1f", resourceUsageStats.cpuUtilizationPercent)); + builder.field( + "memory_utilization_percent", + String.format(Locale.ROOT, "%.1f", resourceUsageStats.memoryUtilizationPercent) + ); + } + builder.endObject(); + } + builder.endObject(); + return builder; + } +} diff --git a/server/src/main/java/org/opensearch/node/ResourceUsageCollectorService.java b/server/src/main/java/org/opensearch/node/ResourceUsageCollectorService.java new file mode 100644 index 0000000000000..f1c763e09f147 --- /dev/null +++ b/server/src/main/java/org/opensearch/node/ResourceUsageCollectorService.java @@ -0,0 +1,160 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.node; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.opensearch.cluster.ClusterChangedEvent; +import org.opensearch.cluster.ClusterStateListener; +import org.opensearch.cluster.node.DiscoveryNode; +import org.opensearch.cluster.service.ClusterService; +import org.opensearch.common.lifecycle.AbstractLifecycleComponent; +import org.opensearch.common.unit.TimeValue; +import org.opensearch.common.util.concurrent.ConcurrentCollections; +import org.opensearch.node.resource.tracker.NodeResourceUsageTracker; +import org.opensearch.threadpool.Scheduler; +import org.opensearch.threadpool.ThreadPool; + +import java.util.HashMap; +import java.util.Map; +import java.util.Optional; +import java.util.concurrent.ConcurrentMap; + +/** + * This collects node level resource usage statistics such as cpu, memory, IO of each node and makes it available for + * coordinator node to aid in throttling, ranking etc + */ +public class ResourceUsageCollectorService extends AbstractLifecycleComponent implements ClusterStateListener { + + /** + * This refresh interval denotes the polling interval of ResourceUsageCollectorService to refresh the resource usage + * stats from local node + */ + private static long REFRESH_INTERVAL_IN_MILLIS = 1000; + + private static final Logger logger = LogManager.getLogger(ResourceUsageCollectorService.class); + private final ConcurrentMap nodeIdToResourceUsageStats = ConcurrentCollections.newConcurrentMap(); + + private ThreadPool threadPool; + private volatile Scheduler.Cancellable scheduledFuture; + + private NodeResourceUsageTracker nodeResourceUsageTracker; + private ClusterService clusterService; + + public ResourceUsageCollectorService( + NodeResourceUsageTracker nodeResourceUsageTracker, + ClusterService clusterService, + ThreadPool threadPool + ) { + this.threadPool = threadPool; + this.nodeResourceUsageTracker = nodeResourceUsageTracker; + this.clusterService = clusterService; + clusterService.addListener(this); + } + + @Override + public void clusterChanged(ClusterChangedEvent event) { + if (event.nodesRemoved()) { + for (DiscoveryNode removedNode : event.nodesDelta().removedNodes()) { + removeNodeResourceUsageStats(removedNode.getId()); + } + } + } + + void removeNodeResourceUsageStats(String nodeId) { + nodeIdToResourceUsageStats.remove(nodeId); + } + + /** + * Collect node resource usage stats along with the timestamp + */ + public void collectNodeResourceUsageStats( + String nodeId, + long timestamp, + double memoryUtilizationPercent, + double cpuUtilizationPercent + ) { + nodeIdToResourceUsageStats.compute(nodeId, (id, resourceUsageStats) -> { + if (resourceUsageStats == null) { + return new NodeResourceUsageStats(nodeId, timestamp, memoryUtilizationPercent, cpuUtilizationPercent); + } else { + resourceUsageStats.cpuUtilizationPercent = cpuUtilizationPercent; + resourceUsageStats.memoryUtilizationPercent = memoryUtilizationPercent; + resourceUsageStats.timestamp = timestamp; + return resourceUsageStats; + } + }); + } + + /** + * Get all node resource usage statistics which will be used for node stats + */ + public Map getAllNodeStatistics() { + Map nodeStats = new HashMap<>(nodeIdToResourceUsageStats.size()); + nodeIdToResourceUsageStats.forEach((nodeId, resourceUsageStats) -> { + nodeStats.put(nodeId, new NodeResourceUsageStats(resourceUsageStats)); + }); + return nodeStats; + } + + /** + * Optionally return a {@code NodeResourceUsageStats} for the given nodeid, if + * resource usage stats information exists for the given node. Returns an empty + * {@code Optional} if the node was not found. + */ + public Optional getNodeStatistics(final String nodeId) { + return Optional.ofNullable(nodeIdToResourceUsageStats.get(nodeId)) + .map(resourceUsageStats -> new NodeResourceUsageStats(resourceUsageStats)); + } + + /** + * Returns collected resource usage statistics of all nodes + */ + public NodesResourceUsageStats stats() { + return new NodesResourceUsageStats(getAllNodeStatistics()); + } + + /** + * Fetch local node resource usage statistics and add it to store along with the current timestamp + */ + private void collectLocalNodeResourceUsageStats() { + if (nodeResourceUsageTracker.isReady() && clusterService.state() != null) { + collectNodeResourceUsageStats( + clusterService.state().nodes().getLocalNodeId(), + System.currentTimeMillis(), + nodeResourceUsageTracker.getMemoryUtilizationPercent(), + nodeResourceUsageTracker.getCpuUtilizationPercent() + ); + } + } + + @Override + protected void doStart() { + /** + * Fetch local node resource usage statistics every second + */ + scheduledFuture = threadPool.scheduleWithFixedDelay(() -> { + try { + collectLocalNodeResourceUsageStats(); + } catch (Exception e) { + logger.warn("failure in ResourceUsageCollectorService", e); + } + }, new TimeValue(REFRESH_INTERVAL_IN_MILLIS), ThreadPool.Names.GENERIC); + } + + @Override + protected void doStop() { + if (scheduledFuture != null) { + scheduledFuture.cancel(); + } + } + + @Override + protected void doClose() {} +} diff --git a/server/src/main/java/org/opensearch/node/resource/tracker/AbstractAverageUsageTracker.java b/server/src/main/java/org/opensearch/node/resource/tracker/AbstractAverageUsageTracker.java new file mode 100644 index 0000000000000..f83a1b7f9fc05 --- /dev/null +++ b/server/src/main/java/org/opensearch/node/resource/tracker/AbstractAverageUsageTracker.java @@ -0,0 +1,100 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.node.resource.tracker; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.opensearch.common.lifecycle.AbstractLifecycleComponent; +import org.opensearch.common.unit.TimeValue; +import org.opensearch.common.util.MovingAverage; +import org.opensearch.threadpool.Scheduler; +import org.opensearch.threadpool.ThreadPool; + +import java.util.concurrent.atomic.AtomicReference; + +/** + * Base class for sliding window resource usage trackers + */ +public abstract class AbstractAverageUsageTracker extends AbstractLifecycleComponent { + private static final Logger LOGGER = LogManager.getLogger(AbstractAverageUsageTracker.class); + + private final ThreadPool threadPool; + private final TimeValue pollingInterval; + private TimeValue windowDuration; + private final AtomicReference observations = new AtomicReference<>(); + + private volatile Scheduler.Cancellable scheduledFuture; + + public AbstractAverageUsageTracker(ThreadPool threadPool, TimeValue pollingInterval, TimeValue windowDuration) { + this.threadPool = threadPool; + this.pollingInterval = pollingInterval; + this.windowDuration = windowDuration; + this.setWindowSize(windowDuration); + } + + public abstract long getUsage(); + + /** + * Returns the moving average of the datapoints + */ + public double getAverage() { + return observations.get().getAverage(); + } + + /** + * Checks if we have datapoints more than or equal to the window size + */ + public boolean isReady() { + return observations.get().isReady(); + } + + /** + * Creates a new instance of MovingAverage with a new window size based on WindowDuration + */ + public void setWindowSize(TimeValue windowDuration) { + this.windowDuration = windowDuration; + int windowSize = (int) (windowDuration.nanos() / pollingInterval.nanos()); + LOGGER.debug("updated window size: {}", windowSize); + observations.set(new MovingAverage(windowSize)); + } + + public TimeValue getPollingInterval() { + return pollingInterval; + } + + public TimeValue getWindowDuration() { + return windowDuration; + } + + public long getWindowSize() { + return observations.get().getCount(); + } + + public void recordUsage(long usage) { + observations.get().record(usage); + } + + @Override + protected void doStart() { + scheduledFuture = threadPool.scheduleWithFixedDelay(() -> { + long usage = getUsage(); + recordUsage(usage); + }, pollingInterval, ThreadPool.Names.GENERIC); + } + + @Override + protected void doStop() { + if (scheduledFuture != null) { + scheduledFuture.cancel(); + } + } + + @Override + protected void doClose() {} +} diff --git a/server/src/main/java/org/opensearch/node/resource/tracker/AverageCpuUsageTracker.java b/server/src/main/java/org/opensearch/node/resource/tracker/AverageCpuUsageTracker.java new file mode 100644 index 0000000000000..160d385762eb0 --- /dev/null +++ b/server/src/main/java/org/opensearch/node/resource/tracker/AverageCpuUsageTracker.java @@ -0,0 +1,38 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.node.resource.tracker; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.opensearch.common.unit.TimeValue; +import org.opensearch.monitor.process.ProcessProbe; +import org.opensearch.threadpool.ThreadPool; + +/** + * AverageCpuUsageTracker tracks the average CPU usage by polling the CPU usage every (pollingInterval) + * and keeping track of the rolling average over a defined time window (windowDuration). + */ +public class AverageCpuUsageTracker extends AbstractAverageUsageTracker { + private static final Logger LOGGER = LogManager.getLogger(AverageCpuUsageTracker.class); + + public AverageCpuUsageTracker(ThreadPool threadPool, TimeValue pollingInterval, TimeValue windowDuration) { + super(threadPool, pollingInterval, windowDuration); + } + + /** + * Returns the process CPU usage in percent + */ + @Override + public long getUsage() { + long usage = ProcessProbe.getInstance().getProcessCpuPercent(); + LOGGER.debug("Recording cpu usage: {}%", usage); + return usage; + } + +} diff --git a/server/src/main/java/org/opensearch/node/resource/tracker/AverageMemoryUsageTracker.java b/server/src/main/java/org/opensearch/node/resource/tracker/AverageMemoryUsageTracker.java new file mode 100644 index 0000000000000..c1d1c83656859 --- /dev/null +++ b/server/src/main/java/org/opensearch/node/resource/tracker/AverageMemoryUsageTracker.java @@ -0,0 +1,42 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.node.resource.tracker; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.opensearch.common.unit.TimeValue; +import org.opensearch.threadpool.ThreadPool; + +import java.lang.management.ManagementFactory; +import java.lang.management.MemoryMXBean; + +/** + * AverageMemoryUsageTracker tracks the average JVM usage by polling the JVM usage every (pollingInterval) + * and keeping track of the rolling average over a defined time window (windowDuration). + */ +public class AverageMemoryUsageTracker extends AbstractAverageUsageTracker { + + private static final Logger LOGGER = LogManager.getLogger(AverageMemoryUsageTracker.class); + + private static final MemoryMXBean MEMORY_MX_BEAN = ManagementFactory.getMemoryMXBean(); + + public AverageMemoryUsageTracker(ThreadPool threadPool, TimeValue pollingInterval, TimeValue windowDuration) { + super(threadPool, pollingInterval, windowDuration); + } + + /** + * Get current memory usage percentage calculated against max heap memory + */ + @Override + public long getUsage() { + long usage = MEMORY_MX_BEAN.getHeapMemoryUsage().getUsed() * 100 / MEMORY_MX_BEAN.getHeapMemoryUsage().getMax(); + LOGGER.debug("Recording memory usage: {}%", usage); + return usage; + } +} diff --git a/server/src/main/java/org/opensearch/node/resource/tracker/NodeResourceUsageTracker.java b/server/src/main/java/org/opensearch/node/resource/tracker/NodeResourceUsageTracker.java new file mode 100644 index 0000000000000..cf5f38c1b004c --- /dev/null +++ b/server/src/main/java/org/opensearch/node/resource/tracker/NodeResourceUsageTracker.java @@ -0,0 +1,118 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.node.resource.tracker; + +import org.opensearch.common.lifecycle.AbstractLifecycleComponent; +import org.opensearch.common.settings.ClusterSettings; +import org.opensearch.common.settings.Settings; +import org.opensearch.common.unit.TimeValue; +import org.opensearch.threadpool.ThreadPool; + +/** + * This tracks the usage of node resources such as CPU, IO and memory + */ +public class NodeResourceUsageTracker extends AbstractLifecycleComponent { + private ThreadPool threadPool; + private final ClusterSettings clusterSettings; + private AverageCpuUsageTracker cpuUsageTracker; + private AverageMemoryUsageTracker memoryUsageTracker; + + private ResourceTrackerSettings resourceTrackerSettings; + + public NodeResourceUsageTracker(ThreadPool threadPool, Settings settings, ClusterSettings clusterSettings) { + this.threadPool = threadPool; + this.clusterSettings = clusterSettings; + this.resourceTrackerSettings = new ResourceTrackerSettings(settings); + initialize(); + } + + /** + * Return CPU utilization average if we have enough datapoints, otherwise return 0 + */ + public double getCpuUtilizationPercent() { + if (cpuUsageTracker.isReady()) { + return cpuUsageTracker.getAverage(); + } + return 0.0; + } + + /** + * Return memory utilization average if we have enough datapoints, otherwise return 0 + */ + public double getMemoryUtilizationPercent() { + if (memoryUsageTracker.isReady()) { + return memoryUsageTracker.getAverage(); + } + return 0.0; + } + + /** + * Checks if all of the resource usage trackers are ready + */ + public boolean isReady() { + return memoryUsageTracker.isReady() && cpuUsageTracker.isReady(); + } + + void initialize() { + cpuUsageTracker = new AverageCpuUsageTracker( + threadPool, + resourceTrackerSettings.getCpuPollingInterval(), + resourceTrackerSettings.getCpuWindowDuration() + ); + clusterSettings.addSettingsUpdateConsumer( + ResourceTrackerSettings.GLOBAL_CPU_USAGE_AC_WINDOW_DURATION_SETTING, + this::setCpuWindowDuration + ); + + memoryUsageTracker = new AverageMemoryUsageTracker( + threadPool, + resourceTrackerSettings.getMemoryPollingInterval(), + resourceTrackerSettings.getMemoryWindowDuration() + ); + clusterSettings.addSettingsUpdateConsumer( + ResourceTrackerSettings.GLOBAL_JVM_USAGE_AC_WINDOW_DURATION_SETTING, + this::setMemoryWindowDuration + ); + } + + private void setMemoryWindowDuration(TimeValue windowDuration) { + memoryUsageTracker.setWindowSize(windowDuration); + resourceTrackerSettings.setMemoryWindowDuration(windowDuration); + } + + private void setCpuWindowDuration(TimeValue windowDuration) { + cpuUsageTracker.setWindowSize(windowDuration); + resourceTrackerSettings.setCpuWindowDuration(windowDuration); + } + + /** + * Visible for testing + */ + ResourceTrackerSettings getResourceTrackerSettings() { + return resourceTrackerSettings; + } + + @Override + protected void doStart() { + cpuUsageTracker.doStart(); + memoryUsageTracker.doStart(); + } + + @Override + protected void doStop() { + cpuUsageTracker.doStop(); + memoryUsageTracker.doStop(); + } + + @Override + protected void doClose() { + cpuUsageTracker.doClose(); + memoryUsageTracker.doClose(); + } +} diff --git a/server/src/main/java/org/opensearch/node/resource/tracker/ResourceTrackerSettings.java b/server/src/main/java/org/opensearch/node/resource/tracker/ResourceTrackerSettings.java new file mode 100644 index 0000000000000..f81b008ba7e8b --- /dev/null +++ b/server/src/main/java/org/opensearch/node/resource/tracker/ResourceTrackerSettings.java @@ -0,0 +1,90 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.node.resource.tracker; + +import org.opensearch.common.settings.Setting; +import org.opensearch.common.settings.Settings; +import org.opensearch.common.unit.TimeValue; + +/** + * Settings related to resource usage trackers such as polling interval, window duration etc + */ +public class ResourceTrackerSettings { + + private static class Defaults { + /** + * This is the default polling interval of usage trackers to get the resource utilization data + */ + private static final long POLLING_INTERVAL_IN_MILLIS = 500; + /** + * This is the default window duration on which the average resource utilization values will be calculated + */ + private static final long WINDOW_DURATION_IN_SECONDS = 30; + } + + public static final Setting GLOBAL_CPU_USAGE_AC_POLLING_INTERVAL_SETTING = Setting.positiveTimeSetting( + "node.resource.tracker.global_cpu_usage.polling_interval", + TimeValue.timeValueMillis(Defaults.POLLING_INTERVAL_IN_MILLIS), + Setting.Property.NodeScope + ); + public static final Setting GLOBAL_CPU_USAGE_AC_WINDOW_DURATION_SETTING = Setting.positiveTimeSetting( + "node.resource.tracker.global_cpu_usage.window_duration", + TimeValue.timeValueSeconds(Defaults.WINDOW_DURATION_IN_SECONDS), + Setting.Property.Dynamic, + Setting.Property.NodeScope + ); + + public static final Setting GLOBAL_JVM_USAGE_AC_POLLING_INTERVAL_SETTING = Setting.positiveTimeSetting( + "node.resource.tracker.global_jvmmp.polling_interval", + TimeValue.timeValueMillis(Defaults.POLLING_INTERVAL_IN_MILLIS), + Setting.Property.NodeScope + ); + + public static final Setting GLOBAL_JVM_USAGE_AC_WINDOW_DURATION_SETTING = Setting.positiveTimeSetting( + "node.resource.tracker.global_jvmmp.window_duration", + TimeValue.timeValueSeconds(Defaults.WINDOW_DURATION_IN_SECONDS), + Setting.Property.Dynamic, + Setting.Property.NodeScope + ); + private volatile TimeValue cpuWindowDuration; + private volatile TimeValue cpuPollingInterval; + private volatile TimeValue memoryWindowDuration; + private volatile TimeValue memoryPollingInterval; + + public ResourceTrackerSettings(Settings settings) { + this.cpuPollingInterval = GLOBAL_CPU_USAGE_AC_POLLING_INTERVAL_SETTING.get(settings); + this.cpuWindowDuration = GLOBAL_CPU_USAGE_AC_WINDOW_DURATION_SETTING.get(settings); + this.memoryPollingInterval = GLOBAL_JVM_USAGE_AC_POLLING_INTERVAL_SETTING.get(settings); + this.memoryWindowDuration = GLOBAL_JVM_USAGE_AC_WINDOW_DURATION_SETTING.get(settings); + } + + public TimeValue getCpuWindowDuration() { + return this.cpuWindowDuration; + } + + public TimeValue getCpuPollingInterval() { + return cpuPollingInterval; + } + + public TimeValue getMemoryPollingInterval() { + return memoryPollingInterval; + } + + public TimeValue getMemoryWindowDuration() { + return memoryWindowDuration; + } + + public void setCpuWindowDuration(TimeValue cpuWindowDuration) { + this.cpuWindowDuration = cpuWindowDuration; + } + + public void setMemoryWindowDuration(TimeValue memoryWindowDuration) { + this.memoryWindowDuration = memoryWindowDuration; + } +} diff --git a/server/src/main/java/org/opensearch/node/resource/tracker/package-info.java b/server/src/main/java/org/opensearch/node/resource/tracker/package-info.java new file mode 100644 index 0000000000000..aace2a019973e --- /dev/null +++ b/server/src/main/java/org/opensearch/node/resource/tracker/package-info.java @@ -0,0 +1,12 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +/** + * Node level resource usage stats tracker package + */ +package org.opensearch.node.resource.tracker; diff --git a/server/src/test/java/org/opensearch/action/admin/cluster/node/stats/NodeStatsTests.java b/server/src/test/java/org/opensearch/action/admin/cluster/node/stats/NodeStatsTests.java index e0b35c69cc3c0..7a1b6f113d0e8 100644 --- a/server/src/test/java/org/opensearch/action/admin/cluster/node/stats/NodeStatsTests.java +++ b/server/src/test/java/org/opensearch/action/admin/cluster/node/stats/NodeStatsTests.java @@ -59,6 +59,8 @@ import org.opensearch.monitor.os.OsStats; import org.opensearch.monitor.process.ProcessStats; import org.opensearch.node.AdaptiveSelectionStats; +import org.opensearch.node.NodeResourceUsageStats; +import org.opensearch.node.NodesResourceUsageStats; import org.opensearch.node.ResponseCollectorService; import org.opensearch.script.ScriptCacheStats; import org.opensearch.script.ScriptStats; @@ -393,6 +395,24 @@ public void testSerialization() throws IOException { assertEquals(aStats.responseTime, bStats.responseTime, 0.01); }); } + NodesResourceUsageStats resourceUsageStats = nodeStats.getResourceUsageStats(); + NodesResourceUsageStats deserializedResourceUsageStats = deserializedNodeStats.getResourceUsageStats(); + if (resourceUsageStats == null) { + assertNull(deserializedResourceUsageStats); + } else { + resourceUsageStats.getNodeIdToResourceUsageStatsMap().forEach((k, v) -> { + NodeResourceUsageStats aResourceUsageStats = resourceUsageStats.getNodeIdToResourceUsageStatsMap().get(k); + NodeResourceUsageStats bResourceUsageStats = deserializedResourceUsageStats.getNodeIdToResourceUsageStatsMap() + .get(k); + assertEquals( + aResourceUsageStats.getMemoryUtilizationPercent(), + bResourceUsageStats.getMemoryUtilizationPercent(), + 0.0 + ); + assertEquals(aResourceUsageStats.getCpuUtilizationPercent(), bResourceUsageStats.getCpuUtilizationPercent(), 0.0); + assertEquals(aResourceUsageStats.getTimestamp(), bResourceUsageStats.getTimestamp()); + }); + } ScriptCacheStats scriptCacheStats = nodeStats.getScriptCacheStats(); ScriptCacheStats deserializedScriptCacheStats = deserializedNodeStats.getScriptCacheStats(); if (scriptCacheStats == null) { @@ -756,6 +776,30 @@ public static NodeStats createNodeStats(boolean remoteStoreStats) { } adaptiveSelectionStats = new AdaptiveSelectionStats(nodeConnections, nodeStats); } + NodesResourceUsageStats nodesResourceUsageStats = null; + if (frequently()) { + int numNodes = randomIntBetween(0, 10); + Map nodeConnections = new HashMap<>(); + Map resourceUsageStatsMap = new HashMap<>(); + for (int i = 0; i < numNodes; i++) { + String nodeId = randomAlphaOfLengthBetween(3, 10); + // add outgoing connection info + if (frequently()) { + nodeConnections.put(nodeId, randomLongBetween(0, 100)); + } + // add node calculations + if (frequently()) { + NodeResourceUsageStats stats = new NodeResourceUsageStats( + nodeId, + System.currentTimeMillis(), + randomDoubleBetween(1.0, 100.0, true), + randomDoubleBetween(1.0, 100.0, true) + ); + resourceUsageStatsMap.put(nodeId, stats); + } + } + nodesResourceUsageStats = new NodesResourceUsageStats(resourceUsageStatsMap); + } ClusterManagerThrottlingStats clusterManagerThrottlingStats = null; if (frequently()) { clusterManagerThrottlingStats = new ClusterManagerThrottlingStats(); @@ -787,6 +831,7 @@ public static NodeStats createNodeStats(boolean remoteStoreStats) { discoveryStats, ingestStats, adaptiveSelectionStats, + nodesResourceUsageStats, scriptCacheStats, null, null, diff --git a/server/src/test/java/org/opensearch/cluster/DiskUsageTests.java b/server/src/test/java/org/opensearch/cluster/DiskUsageTests.java index 8ba965b3df1ab..64949cf861f70 100644 --- a/server/src/test/java/org/opensearch/cluster/DiskUsageTests.java +++ b/server/src/test/java/org/opensearch/cluster/DiskUsageTests.java @@ -190,6 +190,7 @@ public void testFillDiskUsage() { null, null, null, + null, null ), new NodeStats( @@ -216,6 +217,7 @@ public void testFillDiskUsage() { null, null, null, + null, null ), new NodeStats( @@ -242,6 +244,7 @@ public void testFillDiskUsage() { null, null, null, + null, null ) ); @@ -299,6 +302,7 @@ public void testFillDiskUsageSomeInvalidValues() { null, null, null, + null, null ), new NodeStats( @@ -325,6 +329,7 @@ public void testFillDiskUsageSomeInvalidValues() { null, null, null, + null, null ), new NodeStats( @@ -351,6 +356,7 @@ public void testFillDiskUsageSomeInvalidValues() { null, null, null, + null, null ) ); diff --git a/server/src/test/java/org/opensearch/node/ResourceUsageCollectorServiceTests.java b/server/src/test/java/org/opensearch/node/ResourceUsageCollectorServiceTests.java new file mode 100644 index 0000000000000..b2fa884afab69 --- /dev/null +++ b/server/src/test/java/org/opensearch/node/ResourceUsageCollectorServiceTests.java @@ -0,0 +1,190 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.node; + +import org.opensearch.cluster.ClusterChangedEvent; +import org.opensearch.cluster.ClusterName; +import org.opensearch.cluster.ClusterState; +import org.opensearch.cluster.node.DiscoveryNode; +import org.opensearch.cluster.node.DiscoveryNodes; +import org.opensearch.cluster.service.ClusterService; +import org.opensearch.common.settings.ClusterSettings; +import org.opensearch.common.settings.Settings; +import org.opensearch.common.unit.TimeValue; +import org.opensearch.core.common.transport.TransportAddress; +import org.opensearch.node.resource.tracker.NodeResourceUsageTracker; +import org.opensearch.node.resource.tracker.ResourceTrackerSettings; +import org.opensearch.test.OpenSearchSingleNodeTestCase; +import org.opensearch.threadpool.TestThreadPool; +import org.opensearch.threadpool.ThreadPool; +import org.junit.After; +import org.junit.Before; + +import java.util.Map; +import java.util.Optional; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; + +import static org.opensearch.test.ClusterServiceUtils.createClusterService; +import static org.hamcrest.Matchers.greaterThan; + +/** + * Tests for ResourceUsageCollectorService where we test collect method, get method and whether schedulers + * are working as expected + */ +public class ResourceUsageCollectorServiceTests extends OpenSearchSingleNodeTestCase { + + private ClusterService clusterService; + private ResourceUsageCollectorService collector; + private ThreadPool threadpool; + NodeResourceUsageTracker tracker; + + @Before + public void setUp() throws Exception { + super.setUp(); + + threadpool = new TestThreadPool("resource_usage_collector_tests"); + + clusterService = createClusterService(threadpool); + + Settings settings = Settings.builder() + .put(ResourceTrackerSettings.GLOBAL_JVM_USAGE_AC_WINDOW_DURATION_SETTING.getKey(), new TimeValue(500, TimeUnit.MILLISECONDS)) + .build(); + tracker = new NodeResourceUsageTracker( + threadpool, + settings, + new ClusterSettings(settings, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS) + ); + collector = new ResourceUsageCollectorService(tracker, clusterService, threadpool); + tracker.start(); + collector.start(); + } + + @After + public void tearDown() throws Exception { + super.tearDown(); + threadpool.shutdownNow(); + clusterService.close(); + collector.stop(); + tracker.stop(); + collector.close(); + tracker.close(); + } + + public void testResourceUsageStats() { + collector.collectNodeResourceUsageStats("node1", System.currentTimeMillis(), 97, 99); + Map nodeStats = collector.getAllNodeStatistics(); + assertTrue(nodeStats.containsKey("node1")); + assertEquals(99.0, nodeStats.get("node1").cpuUtilizationPercent, 0.0); + assertEquals(97.0, nodeStats.get("node1").memoryUtilizationPercent, 0.0); + + Optional nodeResourceUsageStatsOptional = collector.getNodeStatistics("node1"); + + assertNotNull(nodeResourceUsageStatsOptional.get()); + assertEquals(99.0, nodeResourceUsageStatsOptional.get().cpuUtilizationPercent, 0.0); + assertEquals(97.0, nodeResourceUsageStatsOptional.get().memoryUtilizationPercent, 0.0); + + nodeResourceUsageStatsOptional = collector.getNodeStatistics("node2"); + assertTrue(nodeResourceUsageStatsOptional.isEmpty()); + } + + public void testScheduler() throws Exception { + /** + * Wait for cluster state to be ready so that localNode().getId() is ready and we add the values to the map + */ + assertBusy(() -> assertTrue(collector.getNodeStatistics(clusterService.localNode().getId()).isPresent()), 1, TimeUnit.MINUTES); + assertTrue(collector.getNodeStatistics(clusterService.localNode().getId()).isPresent()); + /** + * Wait for memory utilization to be reported greater than 0 + */ + assertBusy( + () -> assertThat( + collector.getNodeStatistics(clusterService.localNode().getId()).get().getMemoryUtilizationPercent(), + greaterThan(0.0) + ), + 5, + TimeUnit.SECONDS + ); + assertTrue(collector.getNodeStatistics("Invalid").isEmpty()); + } + + /* + * Test that concurrently adding values and removing nodes does not cause exceptions + */ + public void testConcurrentAddingAndRemovingNodes() throws Exception { + String[] nodes = new String[] { "a", "b", "c", "d" }; + + final CountDownLatch latch = new CountDownLatch(5); + + Runnable f = () -> { + latch.countDown(); + try { + latch.await(); + } catch (InterruptedException e) { + fail("should not be interrupted"); + } + for (int i = 0; i < randomIntBetween(100, 200); i++) { + if (randomBoolean()) { + collector.removeNodeResourceUsageStats(randomFrom(nodes)); + } + collector.collectNodeResourceUsageStats( + randomFrom(nodes), + System.currentTimeMillis(), + randomIntBetween(1, 100), + randomIntBetween(1, 100) + ); + } + }; + + Thread t1 = new Thread(f); + Thread t2 = new Thread(f); + Thread t3 = new Thread(f); + Thread t4 = new Thread(f); + + t1.start(); + t2.start(); + t3.start(); + t4.start(); + latch.countDown(); + t1.join(); + t2.join(); + t3.join(); + t4.join(); + + final Map nodeStats = collector.getAllNodeStatistics(); + for (String nodeId : nodes) { + if (nodeStats.containsKey(nodeId)) { + assertThat(nodeStats.get(nodeId).memoryUtilizationPercent, greaterThan(0.0)); + assertThat(nodeStats.get(nodeId).cpuUtilizationPercent, greaterThan(0.0)); + } + } + } + + public void testNodeRemoval() { + collector.collectNodeResourceUsageStats("node1", System.currentTimeMillis(), randomIntBetween(1, 100), randomIntBetween(1, 100)); + collector.collectNodeResourceUsageStats("node2", System.currentTimeMillis(), randomIntBetween(1, 100), randomIntBetween(1, 100)); + + ClusterState previousState = ClusterState.builder(new ClusterName("cluster")) + .nodes( + DiscoveryNodes.builder() + .add(DiscoveryNode.createLocal(Settings.EMPTY, new TransportAddress(TransportAddress.META_ADDRESS, 9200), "node1")) + .add(DiscoveryNode.createLocal(Settings.EMPTY, new TransportAddress(TransportAddress.META_ADDRESS, 9201), "node2")) + ) + .build(); + ClusterState newState = ClusterState.builder(previousState) + .nodes(DiscoveryNodes.builder(previousState.nodes()).remove("node2")) + .build(); + ClusterChangedEvent event = new ClusterChangedEvent("test", newState, previousState); + + collector.clusterChanged(event); + final Map nodeStats = collector.getAllNodeStatistics(); + assertTrue(nodeStats.containsKey("node1")); + assertFalse(nodeStats.containsKey("node2")); + } +} diff --git a/server/src/test/java/org/opensearch/node/resource/tracker/AverageUsageTrackerTests.java b/server/src/test/java/org/opensearch/node/resource/tracker/AverageUsageTrackerTests.java new file mode 100644 index 0000000000000..374c993a264d4 --- /dev/null +++ b/server/src/test/java/org/opensearch/node/resource/tracker/AverageUsageTrackerTests.java @@ -0,0 +1,99 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.node.resource.tracker; + +import org.opensearch.common.unit.TimeValue; +import org.opensearch.test.OpenSearchTestCase; +import org.opensearch.threadpool.TestThreadPool; +import org.opensearch.threadpool.ThreadPool; +import org.junit.After; +import org.junit.Before; + +import java.util.concurrent.TimeUnit; + +/** + * Tests to validate AverageMemoryUsageTracker and AverageCpuUsageTracker implementation + */ +public class AverageUsageTrackerTests extends OpenSearchTestCase { + ThreadPool threadPool; + AverageMemoryUsageTracker averageMemoryUsageTracker; + AverageCpuUsageTracker averageCpuUsageTracker; + + @Before + public void setup() { + threadPool = new TestThreadPool(getClass().getName()); + averageMemoryUsageTracker = new AverageMemoryUsageTracker( + threadPool, + new TimeValue(500, TimeUnit.MILLISECONDS), + new TimeValue(1000, TimeUnit.MILLISECONDS) + ); + averageCpuUsageTracker = new AverageCpuUsageTracker( + threadPool, + new TimeValue(500, TimeUnit.MILLISECONDS), + new TimeValue(1000, TimeUnit.MILLISECONDS) + ); + } + + @After + public void cleanup() { + ThreadPool.terminate(threadPool, 5, TimeUnit.SECONDS); + } + + public void testBasicUsage() { + + assertAverageUsageStats(averageMemoryUsageTracker); + assertAverageUsageStats(averageCpuUsageTracker); + } + + public void testUpdateWindowSize() { + assertUpdateWindowSize(averageMemoryUsageTracker); + assertUpdateWindowSize(averageCpuUsageTracker); + } + + private void assertAverageUsageStats(AbstractAverageUsageTracker usageTracker) { + usageTracker.recordUsage(1); + assertFalse(usageTracker.isReady()); + usageTracker.recordUsage(2); + assertTrue(usageTracker.isReady()); + assertEquals(2, usageTracker.getWindowSize()); + assertEquals(1.5, usageTracker.getAverage(), 0.0); + usageTracker.recordUsage(5); + // ( 2 + 5 ) / 2 = 3.5 + assertEquals(3.5, usageTracker.getAverage(), 0.0); + } + + private void assertUpdateWindowSize(AbstractAverageUsageTracker usageTracker) { + usageTracker.recordUsage(1); + usageTracker.recordUsage(2); + + assertEquals(2, usageTracker.getWindowSize()); + assertEquals(1.5, usageTracker.getAverage(), 0.0); + usageTracker.recordUsage(5); + // ( 2 + 5 ) / 2 = 3.5 + assertEquals(3.5, usageTracker.getAverage(), 0.0); + + usageTracker.setWindowSize(new TimeValue(2000, TimeUnit.MILLISECONDS)); + assertEquals(0, usageTracker.getWindowSize()); + assertEquals(0.0, usageTracker.getAverage(), 0.0); + // verify 2000/500 = 4 is the window size and average is calculated on window size of 4 + usageTracker.recordUsage(1); + usageTracker.recordUsage(2); + usageTracker.recordUsage(1); + assertFalse(usageTracker.isReady()); + usageTracker.recordUsage(2); + assertTrue(usageTracker.isReady()); + assertEquals(4, usageTracker.getWindowSize()); + // (1 + 2 + 1 + 2 ) / 4 = 1.5 + assertEquals(1.5, usageTracker.getAverage(), 0.0); + usageTracker.recordUsage(2); + assertTrue(usageTracker.isReady()); + // ( 2 + 1 + 2 + 2 ) / 4 = 1.75 + assertEquals(1.75, usageTracker.getAverage(), 0.0); + } +} diff --git a/server/src/test/java/org/opensearch/node/resource/tracker/NodeResourceUsageTrackerTests.java b/server/src/test/java/org/opensearch/node/resource/tracker/NodeResourceUsageTrackerTests.java new file mode 100644 index 0000000000000..1ce68b9f29062 --- /dev/null +++ b/server/src/test/java/org/opensearch/node/resource/tracker/NodeResourceUsageTrackerTests.java @@ -0,0 +1,96 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.node.resource.tracker; + +import org.opensearch.action.admin.cluster.settings.ClusterUpdateSettingsResponse; +import org.opensearch.common.settings.ClusterSettings; +import org.opensearch.common.settings.Settings; +import org.opensearch.common.unit.TimeValue; +import org.opensearch.test.OpenSearchSingleNodeTestCase; +import org.opensearch.threadpool.TestThreadPool; +import org.opensearch.threadpool.ThreadPool; +import org.junit.After; +import org.junit.Before; + +import java.util.concurrent.TimeUnit; + +import static org.opensearch.test.hamcrest.OpenSearchAssertions.assertAcked; +import static org.hamcrest.Matchers.greaterThan; + +/** + * Tests to assert resource usage trackers retrieving resource utilization averages + */ +public class NodeResourceUsageTrackerTests extends OpenSearchSingleNodeTestCase { + ThreadPool threadPool; + + @Before + public void setup() { + threadPool = new TestThreadPool(getClass().getName()); + } + + @After + public void cleanup() { + ThreadPool.terminate(threadPool, 5, TimeUnit.SECONDS); + assertAcked( + client().admin() + .cluster() + .prepareUpdateSettings() + .setPersistentSettings(Settings.builder().putNull("*")) + .setTransientSettings(Settings.builder().putNull("*")) + ); + } + + public void testStats() throws Exception { + Settings settings = Settings.builder() + .put(ResourceTrackerSettings.GLOBAL_JVM_USAGE_AC_WINDOW_DURATION_SETTING.getKey(), new TimeValue(500, TimeUnit.MILLISECONDS)) + .build(); + NodeResourceUsageTracker tracker = new NodeResourceUsageTracker( + threadPool, + settings, + new ClusterSettings(settings, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS) + ); + tracker.start(); + /** + * Asserting memory utilization to be greater than 0 + * cpu percent used is mostly 0, so skipping assertion for that + */ + assertBusy(() -> assertThat(tracker.getMemoryUtilizationPercent(), greaterThan(0.0)), 5, TimeUnit.SECONDS); + tracker.stop(); + tracker.close(); + } + + public void testUpdateSettings() { + NodeResourceUsageTracker tracker = new NodeResourceUsageTracker( + threadPool, + Settings.EMPTY, + new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS) + ); + + assertEquals(tracker.getResourceTrackerSettings().getCpuWindowDuration().getSeconds(), 30); + assertEquals(tracker.getResourceTrackerSettings().getMemoryWindowDuration().getSeconds(), 30); + + Settings settings = Settings.builder() + .put(ResourceTrackerSettings.GLOBAL_CPU_USAGE_AC_WINDOW_DURATION_SETTING.getKey(), "10s") + .build(); + ClusterUpdateSettingsResponse response = client().admin().cluster().prepareUpdateSettings().setPersistentSettings(settings).get(); + assertEquals( + "10s", + response.getPersistentSettings().get(ResourceTrackerSettings.GLOBAL_CPU_USAGE_AC_WINDOW_DURATION_SETTING.getKey()) + ); + + Settings jvmsettings = Settings.builder() + .put(ResourceTrackerSettings.GLOBAL_JVM_USAGE_AC_WINDOW_DURATION_SETTING.getKey(), "5s") + .build(); + response = client().admin().cluster().prepareUpdateSettings().setPersistentSettings(jvmsettings).get(); + assertEquals( + "5s", + response.getPersistentSettings().get(ResourceTrackerSettings.GLOBAL_JVM_USAGE_AC_WINDOW_DURATION_SETTING.getKey()) + ); + } +} diff --git a/test/framework/src/main/java/org/opensearch/cluster/MockInternalClusterInfoService.java b/test/framework/src/main/java/org/opensearch/cluster/MockInternalClusterInfoService.java index 6354cf18e8b62..a520b6278ea47 100644 --- a/test/framework/src/main/java/org/opensearch/cluster/MockInternalClusterInfoService.java +++ b/test/framework/src/main/java/org/opensearch/cluster/MockInternalClusterInfoService.java @@ -112,6 +112,7 @@ List adjustNodesStats(List nodesStats) { nodeStats.getDiscoveryStats(), nodeStats.getIngestStats(), nodeStats.getAdaptiveSelectionStats(), + nodeStats.getResourceUsageStats(), nodeStats.getScriptCacheStats(), nodeStats.getIndexingPressureStats(), nodeStats.getShardIndexingPressureStats(), diff --git a/test/framework/src/main/java/org/opensearch/test/InternalTestCluster.java b/test/framework/src/main/java/org/opensearch/test/InternalTestCluster.java index acdc5b94804c6..3c7423f73685f 100644 --- a/test/framework/src/main/java/org/opensearch/test/InternalTestCluster.java +++ b/test/framework/src/main/java/org/opensearch/test/InternalTestCluster.java @@ -2718,6 +2718,7 @@ public void ensureEstimatedStats() { false, false, false, + false, false ); assertThat(