From 5d93ce07698907ec928e0121016ac8ca69674a8f Mon Sep 17 00:00:00 2001 From: Ayush Saxena Date: Thu, 9 Apr 2020 23:36:53 +0530 Subject: [PATCH 1/4] HDFS-15261. RBF: Add Block Related Metrics. Contributed by Ayush Saxena. --- .../federation/metrics/FederationMBean.java | 41 +++++++ .../metrics/NamenodeBeanMetrics.java | 32 +++++- .../server/federation/metrics/RBFMetrics.java | 29 +++++ .../resolver/MembershipNamenodeResolver.java | 9 ++ .../resolver/NamenodeStatusReport.java | 84 +++++++++++++- .../router/NamenodeHeartbeatService.java | 105 ++++++++++++------ .../store/records/MembershipStats.java | 24 ++++ .../impl/pb/MembershipStatsPBImpl.java | 56 ++++++++++ .../src/main/proto/FederationProtocol.proto | 5 + .../federation/metrics/TestRBFMetrics.java | 22 ++++ .../router/TestRouterNamenodeMonitoring.java | 4 +- .../store/records/TestMembershipState.java | 22 ++++ 12 files changed, 396 insertions(+), 37 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationMBean.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationMBean.java index e78ae4c4fa07c..b9ea8709e90f9 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationMBean.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationMBean.java @@ -302,4 +302,45 @@ public interface FederationMBean { */ @Deprecated boolean isSecurityEnabled(); + + /** + * Get the number of corrupts files. + * + * @return the total number of corrupt files. + */ + int getCorruptFilesCount(); + + /** + * Blocks scheduled for replication. + * + * @return num of blocks scheduled for replication. + */ + long getScheduledReplicationBlocks(); + + /** + * Gets the total number of missing blocks on the cluster with + * replication factor 1. 
+ * + * @return the total number of missing blocks on the cluster with + * replication factor 1. + */ + long getNumberOfMissingBlocksWithReplicationFactorOne(); + + /** + * Gets the total number of replicated low redundancy blocks on the cluster + * with the highest risk of loss. + * + * @return the total number of low redundancy blocks on the cluster + * with the highest risk of loss. + */ + long getHighestPriorityLowRedundancyReplicatedBlocks(); + + /** + * Gets the total number of erasure coded low redundancy blocks on the cluster + * with the highest risk of loss. + * + * @return the total number of low redundancy blocks on the cluster + * with the highest risk of loss. + */ + long getHighestPriorityLowRedundancyECBlocks(); } diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/NamenodeBeanMetrics.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/NamenodeBeanMetrics.java index 9fdccad46e6a6..0e6e12a0d3e14 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/NamenodeBeanMetrics.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/NamenodeBeanMetrics.java @@ -366,21 +366,46 @@ public long getPendingDeletionBlocks() { @Override public long getScheduledReplicationBlocks() { - return -1; + try { + return getRBFMetrics().getScheduledReplicationBlocks(); + } catch (IOException e) { + LOG.debug("Failed to get number of scheduled replication blocks.", + e); + } + return 0; } @Override public long getNumberOfMissingBlocksWithReplicationFactorOne() { + try { + return getRBFMetrics().getNumberOfMissingBlocksWithReplicationFactorOne(); + } catch (IOException e) { + LOG.debug("Failed to get number of missing blocks with replication " + + "factor one.", e); + } return 0; } @Override public long 
getHighestPriorityLowRedundancyReplicatedBlocks() { + try { + return getRBFMetrics().getHighestPriorityLowRedundancyReplicatedBlocks(); + } catch (IOException e) { + LOG.debug("Failed to get number of highest priority low redundancy " + + "replicated blocks.", e); + } return 0; } @Override public long getHighestPriorityLowRedundancyECBlocks() { + try { + return getRBFMetrics().getHighestPriorityLowRedundancyECBlocks(); + } catch (IOException e) { + LOG.debug("Failed to get number of highest priority low redundancy EC " + + "blocks.", + e); + } return 0; } @@ -391,6 +416,11 @@ public String getCorruptFiles() { @Override public int getCorruptFilesCount() { + try { + return getRBFMetrics().getCorruptFilesCount(); + } catch (IOException e) { + LOG.debug("Failed to get number of corrupt files.", e); + } return 0; } diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/RBFMetrics.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/RBFMetrics.java index 1eae105b82127..a8422f17be7ca 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/RBFMetrics.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/RBFMetrics.java @@ -670,6 +670,35 @@ public boolean isSecurityEnabled() { return UserGroupInformation.isSecurityEnabled(); } + @Override + public int getCorruptFilesCount() { + return getNameserviceAggregatedInt(MembershipStats::getCorruptFilesCount); + } + + @Override + public long getScheduledReplicationBlocks() { + return getNameserviceAggregatedLong( + MembershipStats::getScheduledReplicationBlocks); + } + + @Override + public long getNumberOfMissingBlocksWithReplicationFactorOne() { + return getNameserviceAggregatedLong( + MembershipStats::getNumberOfMissingBlocksWithReplicationFactorOne); + } + + @Override + 
public long getHighestPriorityLowRedundancyReplicatedBlocks() { + return getNameserviceAggregatedLong( + MembershipStats::getHighestPriorityLowRedundancyReplicatedBlocks); + } + + @Override + public long getHighestPriorityLowRedundancyECBlocks() { + return getNameserviceAggregatedLong( + MembershipStats::getHighestPriorityLowRedundancyECBlocks); + } + @Override public String getSafemode() { if (this.router.isRouterState(RouterServiceState.SAFEMODE)) { diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/resolver/MembershipNamenodeResolver.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/resolver/MembershipNamenodeResolver.java index 9d2dd1651a02d..a31247c1c8077 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/resolver/MembershipNamenodeResolver.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/resolver/MembershipNamenodeResolver.java @@ -301,6 +301,15 @@ public boolean registerNamenode(NamenodeStatusReport report) report.getNumInMaintenanceDeadDataNodes()); stats.setNumOfEnteringMaintenanceDataNodes( report.getNumEnteringMaintenanceDataNodes()); + stats.setCorruptFilesCount(report.getCorruptFilesCount()); + stats.setScheduledReplicationBlocks( + report.getScheduledReplicationBlocks()); + stats.setNumberOfMissingBlocksWithReplicationFactorOne( + report.getNumberOfMissingBlocksWithReplicationFactorOne()); + stats.setHighestPriorityLowRedundancyReplicatedBlocks( + report.getHighestPriorityLowRedundancyReplicatedBlocks()); + stats.setHighestPriorityLowRedundancyECBlocks( + report.getHighestPriorityLowRedundancyECBlocks()); record.setStats(stats); } diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/resolver/NamenodeStatusReport.java 
b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/resolver/NamenodeStatusReport.java index 7b05e4842350a..feb5a86dba83b 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/resolver/NamenodeStatusReport.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/resolver/NamenodeStatusReport.java @@ -70,6 +70,11 @@ public class NamenodeStatusReport { private long numOfBlocksPendingDeletion = -1; private long totalSpace = -1; private long providedSpace = -1; + private int corruptFilesCount = -1; + private long scheduledReplicationBlocks = -1; + private long numberOfMissingBlocksWithReplicationFactorOne = -1; + private long highestPriorityLowRedundancyReplicatedBlocks = -1; + private long highestPriorityLowRedundancyECBlocks = -1; /** If the fields are valid. */ private boolean registrationValid = false; @@ -251,11 +256,12 @@ public boolean getSafemode() { * @param numInMaintenanceLive Number of in maintenance live nodes. * @param numInMaintenanceDead Number of in maintenance dead nodes. * @param numEnteringMaintenance Number of entering maintenance nodes. + * @param numScheduledReplicationBlocks Number of scheduled rep. blocks. 
*/ public void setDatanodeInfo(int numLive, int numDead, int numStale, int numDecom, int numLiveDecom, int numDeadDecom, int numInMaintenanceLive, int numInMaintenanceDead, - int numEnteringMaintenance) { + int numEnteringMaintenance, long numScheduledReplicationBlocks) { this.liveDatanodes = numLive; this.deadDatanodes = numDead; this.staleDatanodes = numStale; @@ -266,6 +272,7 @@ public void setDatanodeInfo(int numLive, int numDead, int numStale, this.inMaintenanceDeadDataNodes = numInMaintenanceDead; this.enteringMaintenanceDataNodes = numEnteringMaintenance; this.statsValid = true; + this.scheduledReplicationBlocks = numScheduledReplicationBlocks; } /** @@ -378,6 +385,81 @@ public void setNamesystemInfo(long available, long total, this.providedSpace = providedSpace; } + /** + * Set the namenode blocks information. + * + * @param numCorruptFiles number of corrupt files. + * @param numOfMissingBlocksWithReplicationFactorOne number of missing + * blocks with rep one. + * @param highestPriorityLowRedundancyRepBlocks number of high priority low + * redundancy rep blocks. + * @param highPriorityLowRedundancyECBlocks number of high priority low + * redundancy EC blocks. + */ + public void setNamenodeInfo(int numCorruptFiles, + long numOfMissingBlocksWithReplicationFactorOne, + long highestPriorityLowRedundancyRepBlocks, + long highPriorityLowRedundancyECBlocks) { + this.corruptFilesCount = numCorruptFiles; + this.numberOfMissingBlocksWithReplicationFactorOne = + numOfMissingBlocksWithReplicationFactorOne; + this.highestPriorityLowRedundancyReplicatedBlocks = + highestPriorityLowRedundancyRepBlocks; + this.highestPriorityLowRedundancyECBlocks = + highPriorityLowRedundancyECBlocks; + } + + /** + * Get the number of corrupt files. + * + * @return the total number of corrupt files + */ + public int getCorruptFilesCount() { + return this.corruptFilesCount; + } + + /** + * Blocks scheduled for replication. 
+ * + * @return - num of blocks scheduled for replication + */ + public long getScheduledReplicationBlocks() { + return this.scheduledReplicationBlocks; + } + + /** + * Gets the total number of missing blocks on the cluster with + * replication factor 1. + * + * @return the total number of missing blocks on the cluster with + * replication factor 1. + */ + public long getNumberOfMissingBlocksWithReplicationFactorOne() { + return this.numberOfMissingBlocksWithReplicationFactorOne; + } + + /** + * Gets the total number of replicated low redundancy blocks on the cluster + * with the highest risk of loss. + * + * @return the total number of low redundancy blocks on the cluster + * with the highest risk of loss. + */ + public long getHighestPriorityLowRedundancyReplicatedBlocks() { + return this.highestPriorityLowRedundancyReplicatedBlocks; + } + + /** + * Gets the total number of erasure coded low redundancy blocks on the cluster + * with the highest risk of loss. + * + * @return the total number of low redundancy blocks on the cluster + * with the highest risk of loss. + */ + public long getHighestPriorityLowRedundancyECBlocks() { + return this.highestPriorityLowRedundancyECBlocks; + } + /** * Get the number of blocks. 
* diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/NamenodeHeartbeatService.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/NamenodeHeartbeatService.java index ffae90e84895c..8cf51386587e5 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/NamenodeHeartbeatService.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/NamenodeHeartbeatService.java @@ -42,6 +42,7 @@ import org.apache.hadoop.hdfs.tools.NNHAServiceTarget; import org.apache.hadoop.hdfs.web.URLConnectionFactory; import org.codehaus.jettison.json.JSONArray; +import org.codehaus.jettison.json.JSONException; import org.codehaus.jettison.json.JSONObject; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -348,44 +349,82 @@ private void updateJMXParameters( String address, NamenodeStatusReport report) { try { // TODO part of this should be moved to its own utility - String query = "Hadoop:service=NameNode,name=FSNamesystem*"; - JSONArray aux = FederationUtil.getJmx( - query, address, connectionFactory, scheme); - if (aux != null) { - for (int i = 0; i < aux.length(); i++) { - JSONObject jsonObject = aux.getJSONObject(i); - String name = jsonObject.getString("name"); - if (name.equals("Hadoop:service=NameNode,name=FSNamesystemState")) { - report.setDatanodeInfo( - jsonObject.getInt("NumLiveDataNodes"), - jsonObject.getInt("NumDeadDataNodes"), - jsonObject.getInt("NumStaleDataNodes"), - jsonObject.getInt("NumDecommissioningDataNodes"), - jsonObject.getInt("NumDecomLiveDataNodes"), - jsonObject.getInt("NumDecomDeadDataNodes"), - jsonObject.optInt("NumInMaintenanceLiveDataNodes"), - jsonObject.optInt("NumInMaintenanceDeadDataNodes"), - jsonObject.optInt("NumEnteringMaintenanceDataNodes")); - } else if (name.equals( - "Hadoop:service=NameNode,name=FSNamesystem")) { - 
report.setNamesystemInfo( - jsonObject.getLong("CapacityRemaining"), - jsonObject.getLong("CapacityTotal"), - jsonObject.getLong("FilesTotal"), - jsonObject.getLong("BlocksTotal"), - jsonObject.getLong("MissingBlocks"), - jsonObject.getLong("PendingReplicationBlocks"), - jsonObject.getLong("UnderReplicatedBlocks"), - jsonObject.getLong("PendingDeletionBlocks"), - jsonObject.optLong("ProvidedCapacityTotal")); - } - } - } + getFsNamesystemMetrics(address, report); + getNamenodeInfoMetrics(address, report); } catch (Exception e) { LOG.error("Cannot get stat from {} using JMX", getNamenodeDesc(), e); } } + /** + * Fetches NamenodeInfo metrics from namenode. + * @param address Web interface of the Namenode to monitor. + * @param report Namenode status report to update with JMX data. + * @throws JSONException + */ + private void getNamenodeInfoMetrics(String address, + NamenodeStatusReport report) throws JSONException { + String query = "Hadoop:service=NameNode,name=NameNodeInfo"; + JSONArray aux = + FederationUtil.getJmx(query, address, connectionFactory, scheme); + if (aux != null && aux.length() > 0) { + JSONObject jsonObject = aux.getJSONObject(0); + String name = jsonObject.getString("name"); + if (name.equals("Hadoop:service=NameNode,name=NameNodeInfo")) { + report.setNamenodeInfo(jsonObject.optInt("CorruptFilesCount"), + jsonObject + .optLong("NumberOfMissingBlocksWithReplicationFactorOne"), + jsonObject + .optLong("HighestPriorityLowRedundancyReplicatedBlocks"), + jsonObject.optLong("HighestPriorityLowRedundancyECBlocks")); + } + } + } + + /** + * Fetches FSNamesystem* metrics from namenode. + * @param address Web interface of the Namenode to monitor. + * @param report Namenode status report to update with JMX data. 
+ * @throws JSONException + */ + private void getFsNamesystemMetrics(String address, + NamenodeStatusReport report) throws JSONException { + String query = "Hadoop:service=NameNode,name=FSNamesystem*"; + JSONArray aux = FederationUtil.getJmx( + query, address, connectionFactory, scheme); + if (aux != null) { + for (int i = 0; i < aux.length(); i++) { + JSONObject jsonObject = aux.getJSONObject(i); + String name = jsonObject.getString("name"); + if (name.equals("Hadoop:service=NameNode,name=FSNamesystemState")) { + report.setDatanodeInfo( + jsonObject.getInt("NumLiveDataNodes"), + jsonObject.getInt("NumDeadDataNodes"), + jsonObject.getInt("NumStaleDataNodes"), + jsonObject.getInt("NumDecommissioningDataNodes"), + jsonObject.getInt("NumDecomLiveDataNodes"), + jsonObject.getInt("NumDecomDeadDataNodes"), + jsonObject.optInt("NumInMaintenanceLiveDataNodes"), + jsonObject.optInt("NumInMaintenanceDeadDataNodes"), + jsonObject.optInt("NumEnteringMaintenanceDataNodes"), + jsonObject.optLong("ScheduledReplicationBlocks")); + } else if (name.equals( + "Hadoop:service=NameNode,name=FSNamesystem")) { + report.setNamesystemInfo( + jsonObject.getLong("CapacityRemaining"), + jsonObject.getLong("CapacityTotal"), + jsonObject.getLong("FilesTotal"), + jsonObject.getLong("BlocksTotal"), + jsonObject.getLong("MissingBlocks"), + jsonObject.getLong("PendingReplicationBlocks"), + jsonObject.getLong("UnderReplicatedBlocks"), + jsonObject.getLong("PendingDeletionBlocks"), + jsonObject.optLong("ProvidedCapacityTotal")); + } + } + } + } + @Override protected void serviceStop() throws Exception { LOG.info("Stopping NamenodeHeartbeat service for, NS {} NN {} ", diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/store/records/MembershipStats.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/store/records/MembershipStats.java index 95c790cc95c59..21c8c2f79fce4 100644 --- 
a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/store/records/MembershipStats.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/store/records/MembershipStats.java @@ -109,6 +109,30 @@ public static MembershipStats newInstance() throws IOException { public abstract int getNumOfEnteringMaintenanceDataNodes(); + public abstract void setCorruptFilesCount(int num); + + public abstract int getCorruptFilesCount(); + + public abstract void setScheduledReplicationBlocks(long blocks); + + public abstract long getScheduledReplicationBlocks(); + + public abstract void setNumberOfMissingBlocksWithReplicationFactorOne( + long blocks); + + public abstract long getNumberOfMissingBlocksWithReplicationFactorOne(); + + public abstract void setHighestPriorityLowRedundancyReplicatedBlocks( + long blocks); + + public abstract long getHighestPriorityLowRedundancyReplicatedBlocks(); + + + public abstract void setHighestPriorityLowRedundancyECBlocks( + long blocks); + + public abstract long getHighestPriorityLowRedundancyECBlocks(); + @Override public SortedMap getPrimaryKeys() { // This record is not stored directly, no key needed diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/store/records/impl/pb/MembershipStatsPBImpl.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/store/records/impl/pb/MembershipStatsPBImpl.java index 73d3e969ebf56..2caa59dfca7e5 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/store/records/impl/pb/MembershipStatsPBImpl.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/store/records/impl/pb/MembershipStatsPBImpl.java @@ -241,4 +241,60 @@ public int getNumOfEnteringMaintenanceDataNodes() { return this.translator.getProtoOrBuilder() 
.getNumOfEnteringMaintenanceDataNodes(); } + + @Override + public void setCorruptFilesCount(int num) { + this.translator.getBuilder().setCorruptFilesCount(num); + } + + @Override + public int getCorruptFilesCount() { + return this.translator.getProtoOrBuilder().getCorruptFilesCount(); + } + + @Override + public void setScheduledReplicationBlocks(long blocks) { + this.translator.getBuilder().setScheduledReplicationBlocks(blocks); + } + + @Override + public long getScheduledReplicationBlocks() { + return this.translator.getProtoOrBuilder().getScheduledReplicationBlocks(); + } + + @Override + public void setNumberOfMissingBlocksWithReplicationFactorOne(long blocks) { + this.translator.getBuilder() + .setNumberOfMissingBlocksWithReplicationFactorOne(blocks); + } + + @Override + public long getNumberOfMissingBlocksWithReplicationFactorOne() { + return this.translator.getProtoOrBuilder() + .getNumberOfMissingBlocksWithReplicationFactorOne(); + } + + @Override + public void setHighestPriorityLowRedundancyReplicatedBlocks(long blocks) { + this.translator.getBuilder() + .setHighestPriorityLowRedundancyReplicatedBlocks(blocks); + } + + @Override + public long getHighestPriorityLowRedundancyReplicatedBlocks() { + return this.translator.getProtoOrBuilder() + .getHighestPriorityLowRedundancyReplicatedBlocks(); + } + + @Override + public void setHighestPriorityLowRedundancyECBlocks(long blocks) { + this.translator.getBuilder() + .setHighestPriorityLowRedundancyECBlocks(blocks); + } + + @Override + public long getHighestPriorityLowRedundancyECBlocks() { + return this.translator.getProtoOrBuilder() + .getHighestPriorityLowRedundancyECBlocks(); + } } \ No newline at end of file diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/proto/FederationProtocol.proto b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/proto/FederationProtocol.proto index ad391e72eb0fe..00deff4ed1e42 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/proto/FederationProtocol.proto +++ 
b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/proto/FederationProtocol.proto @@ -49,6 +49,11 @@ message NamenodeMembershipStatsRecordProto { optional uint32 numOfInMaintenanceLiveDataNodes = 26; optional uint32 numOfInMaintenanceDeadDataNodes = 27; optional uint32 numOfEnteringMaintenanceDataNodes = 28; + optional uint32 corruptFilesCount = 29; + optional uint64 scheduledReplicationBlocks = 30; + optional uint64 numberOfMissingBlocksWithReplicationFactorOne = 31; + optional uint64 highestPriorityLowRedundancyReplicatedBlocks = 32; + optional uint64 HighestPriorityLowRedundancyECBlocks = 33; } message NamenodeMembershipRecordProto { diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/metrics/TestRBFMetrics.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/metrics/TestRBFMetrics.java index eed41c7ba396b..25473f8df9233 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/metrics/TestRBFMetrics.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/metrics/TestRBFMetrics.java @@ -290,6 +290,11 @@ private void validateClusterStatsFederationBean(FederationMBean bean) { long numInMaintenanceLive = 0; long numInMaintenanceDead = 0; long numEnteringMaintenance = 0; + int numCorruptsFilesCount = 0; + long scheduledReplicationBlocks = 0; + long numberOfMissingBlocksWithReplicationFactorOne = 0; + long highestPriorityLowRedundancyReplicatedBlocks = 0; + long highestPriorityLowRedundancyECBlocks = 0; long numFiles = 0; for (MembershipState mock : getActiveMemberships()) { MembershipStats stats = mock.getStats(); @@ -303,6 +308,14 @@ private void validateClusterStatsFederationBean(FederationMBean bean) { numInMaintenanceLive += stats.getNumOfInMaintenanceLiveDataNodes(); numInMaintenanceDead += stats.getNumOfInMaintenanceLiveDataNodes(); numEnteringMaintenance += 
stats.getNumOfEnteringMaintenanceDataNodes(); + numCorruptsFilesCount += stats.getCorruptFilesCount(); + scheduledReplicationBlocks += stats.getScheduledReplicationBlocks(); + numberOfMissingBlocksWithReplicationFactorOne += + stats.getNumberOfMissingBlocksWithReplicationFactorOne(); + highestPriorityLowRedundancyReplicatedBlocks += + stats.getHighestPriorityLowRedundancyReplicatedBlocks(); + highestPriorityLowRedundancyECBlocks += + stats.getHighestPriorityLowRedundancyECBlocks(); } assertEquals(numBlocks, bean.getNumBlocks()); @@ -320,6 +333,15 @@ private void validateClusterStatsFederationBean(FederationMBean bean) { assertEquals(getActiveMemberships().size() + getStandbyMemberships().size(), bean.getNumNamenodes()); assertEquals(getNameservices().size(), bean.getNumNameservices()); + assertEquals(numCorruptsFilesCount, bean.getCorruptFilesCount()); + assertEquals(scheduledReplicationBlocks, + bean.getScheduledReplicationBlocks()); + assertEquals(numberOfMissingBlocksWithReplicationFactorOne, + bean.getNumberOfMissingBlocksWithReplicationFactorOne()); + assertEquals(highestPriorityLowRedundancyReplicatedBlocks, + bean.getHighestPriorityLowRedundancyReplicatedBlocks()); + assertEquals(highestPriorityLowRedundancyECBlocks, + bean.getHighestPriorityLowRedundancyECBlocks()); } private void validateClusterStatsRouterBean(RouterMBean bean) { diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestRouterNamenodeMonitoring.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestRouterNamenodeMonitoring.java index b5f5b6701f3e9..d2b337c1b7a3b 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestRouterNamenodeMonitoring.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestRouterNamenodeMonitoring.java @@ -320,10 +320,10 @@ private void 
verifyUrlSchemes(String scheme) { heartbeatService.getNamenodeStatusReport(); } if (HttpConfig.Policy.HTTPS_ONLY.name().equals(scheme)) { - assertEquals(1, appender.countLinesWithMessage("JMX URL: https://")); + assertEquals(2, appender.countLinesWithMessage("JMX URL: https://")); assertEquals(0, appender.countLinesWithMessage("JMX URL: http://")); } else { - assertEquals(1, appender.countLinesWithMessage("JMX URL: http://")); + assertEquals(2, appender.countLinesWithMessage("JMX URL: http://")); assertEquals(0, appender.countLinesWithMessage("JMX URL: https://")); } } diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/store/records/TestMembershipState.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/store/records/TestMembershipState.java index 02a42c4703af9..4f5cdca3a0caa 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/store/records/TestMembershipState.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/store/records/TestMembershipState.java @@ -57,6 +57,11 @@ public class TestMembershipState { private static final int NUM_MAIN_DEAD = 303; private static final int NUM_ENTER_MAIN = 144; private static final long NUM_BLOCK_MISSING = 1000; + private static final int CORRUPT_FILES_COUNT = 123; + private static final long SCHEDULED_REPLICATION_BLOCKS = 112; + private static final long MISSING_BLOCK_WITH_REPLICATION_ONE = 221; + private static final long HIGHEST_PRIORITY_LOW_REDUNDANCY_REPL_BLOCK = 212; + private static final long HIGHEST_PRIORITY_LOW_REDUNDANCY_EC_BLOCK = 122; private static final long TOTAL_SPACE = 1100; private static final long AVAILABLE_SPACE = 1200; @@ -88,6 +93,14 @@ private MembershipState createRecord() throws IOException { stats.setNumOfBlocksMissing(NUM_BLOCK_MISSING); stats.setTotalSpace(TOTAL_SPACE); 
stats.setAvailableSpace(AVAILABLE_SPACE); + stats.setCorruptFilesCount(CORRUPT_FILES_COUNT); + stats.setScheduledReplicationBlocks(SCHEDULED_REPLICATION_BLOCKS); + stats.setNumberOfMissingBlocksWithReplicationFactorOne( + MISSING_BLOCK_WITH_REPLICATION_ONE); + stats.setHighestPriorityLowRedundancyReplicatedBlocks( + HIGHEST_PRIORITY_LOW_REDUNDANCY_REPL_BLOCK); + stats.setHighestPriorityLowRedundancyECBlocks( + HIGHEST_PRIORITY_LOW_REDUNDANCY_EC_BLOCK); record.setStats(stats); return record; } @@ -120,6 +133,15 @@ private void validateRecord(MembershipState record) throws IOException { assertEquals(NUM_ENTER_MAIN, stats.getNumOfEnteringMaintenanceDataNodes()); assertEquals(TOTAL_SPACE, stats.getTotalSpace()); assertEquals(AVAILABLE_SPACE, stats.getAvailableSpace()); + assertEquals(CORRUPT_FILES_COUNT, stats.getCorruptFilesCount()); + assertEquals(SCHEDULED_REPLICATION_BLOCKS, + stats.getScheduledReplicationBlocks()); + assertEquals(MISSING_BLOCK_WITH_REPLICATION_ONE, + stats.getNumberOfMissingBlocksWithReplicationFactorOne()); + assertEquals(HIGHEST_PRIORITY_LOW_REDUNDANCY_REPL_BLOCK, + stats.getHighestPriorityLowRedundancyReplicatedBlocks()); + assertEquals(HIGHEST_PRIORITY_LOW_REDUNDANCY_EC_BLOCK, + stats.getHighestPriorityLowRedundancyECBlocks()); } @Test From 60bc95a2db7fd90bbb6f0ab81c540d44e96c4459 Mon Sep 17 00:00:00 2001 From: Prateek Sane <61072843+PrateekSane@users.noreply.github.com> Date: Mon, 25 Nov 2024 11:58:02 -0800 Subject: [PATCH 2/4] HDFS-17641. 
Add badly distributed blocks metric (#7123) --- .../src/site/markdown/Metrics.md | 1 + .../hdfs/protocol/ECBlockGroupStats.java | 26 +++++++++--- .../hdfs/protocol/ReplicatedBlockStats.java | 23 +++++++--- .../hdfs/protocolPB/PBHelperClient.java | 13 ++++-- .../main/proto/ClientNamenodeProtocol.proto | 2 + .../federation/metrics/FederationMBean.java | 7 ++++ .../metrics/NamenodeBeanMetrics.java | 10 +++++ .../server/federation/metrics/RBFMetrics.java | 6 +++ .../resolver/MembershipNamenodeResolver.java | 2 + .../resolver/NamenodeStatusReport.java | 15 +++++++ .../router/NamenodeHeartbeatService.java | 5 ++- .../store/records/MembershipStats.java | 7 +++- .../impl/pb/MembershipStatsPBImpl.java | 14 ++++++- .../src/main/proto/FederationProtocol.proto | 3 +- .../federation/metrics/TestRBFMetrics.java | 4 ++ .../server/blockmanagement/BlockManager.java | 12 +++++- .../blockmanagement/LowRedundancyBlocks.java | 20 ++++++++- .../hdfs/server/namenode/FSNamesystem.java | 22 +++++++++- .../hdfs/server/namenode/NameNodeMXBean.java | 8 ++++ .../metrics/ReplicatedBlocksMBean.java | 5 +++ .../apache/hadoop/hdfs/tools/DFSAdmin.java | 6 ++- .../TestLowRedundancyBlockQueues.java | 42 ++++++++++++------- .../namenode/metrics/TestNameNodeMetrics.java | 7 ++++ 23 files changed, 218 insertions(+), 42 deletions(-) diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md b/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md index fac26cee7c878..3ba53ce3c6538 100644 --- a/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md +++ b/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md @@ -285,6 +285,7 @@ Each metrics record contains tags such as HAState and Hostname as additional inf | `StaleDataNodes` | Current number of DataNodes marked stale due to delayed heartbeat | | `NumStaleStorages` | Number of storages marked as content stale (after NameNode restart/failover before first block report is received) | | `MissingReplOneBlocks` 
| Current number of missing blocks with replication factor 1 | +| `BadlyDistributedBlocks` | Current number of blocks that are badly distributed across racks. | | `HighestPriorityLowRedundancyReplicatedBlocks` | Current number of non-corrupt, low redundancy replicated blocks with the highest risk of loss (have 0 or 1 replica). Will be recovered with the highest priority. | | `HighestPriorityLowRedundancyECBlocks` | Current number of non-corrupt, low redundancy EC blocks with the highest risk of loss. Will be recovered with the highest priority. | | `NumFilesUnderConstruction` | Current number of files under construction | diff --git a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/protocol/ECBlockGroupStats.java b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/protocol/ECBlockGroupStats.java index 1ead5c1fd3421..da156d136a906 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/protocol/ECBlockGroupStats.java +++ b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/protocol/ECBlockGroupStats.java @@ -38,24 +38,28 @@ public final class ECBlockGroupStats { private final long missingBlockGroups; private final long bytesInFutureBlockGroups; private final long pendingDeletionBlocks; + private final long badlyDistributedBlocks; private final Long highestPriorityLowRedundancyBlocks; public ECBlockGroupStats(long lowRedundancyBlockGroups, long corruptBlockGroups, long missingBlockGroups, - long bytesInFutureBlockGroups, long pendingDeletionBlocks) { + long bytesInFutureBlockGroups, long pendingDeletionBlocks, + long badlyDistributedBlocks) { this(lowRedundancyBlockGroups, corruptBlockGroups, missingBlockGroups, - bytesInFutureBlockGroups, pendingDeletionBlocks, null); + bytesInFutureBlockGroups, pendingDeletionBlocks, + badlyDistributedBlocks, null); } public ECBlockGroupStats(long lowRedundancyBlockGroups, long corruptBlockGroups, long missingBlockGroups, 
long bytesInFutureBlockGroups, long pendingDeletionBlocks, - Long highestPriorityLowRedundancyBlocks) { + long badlyDistributedBlocks, Long highestPriorityLowRedundancyBlocks) { this.lowRedundancyBlockGroups = lowRedundancyBlockGroups; this.corruptBlockGroups = corruptBlockGroups; this.missingBlockGroups = missingBlockGroups; this.bytesInFutureBlockGroups = bytesInFutureBlockGroups; this.pendingDeletionBlocks = pendingDeletionBlocks; + this.badlyDistributedBlocks = badlyDistributedBlocks; this.highestPriorityLowRedundancyBlocks = highestPriorityLowRedundancyBlocks; } @@ -80,6 +84,10 @@ public long getPendingDeletionBlocks() { return pendingDeletionBlocks; } + public long getBadlyDistributedBlocks() { + return badlyDistributedBlocks; + } + public boolean hasHighestPriorityLowRedundancyBlocks() { return getHighestPriorityLowRedundancyBlocks() != null; } @@ -99,7 +107,8 @@ public String toString() { .append(", BytesInFutureBlockGroups=").append( getBytesInFutureBlockGroups()) .append(", PendingDeletionBlocks=").append( - getPendingDeletionBlocks()); + getPendingDeletionBlocks()) + .append(" , BadlyDistributedBlocks=").append(getBadlyDistributedBlocks()); if (hasHighestPriorityLowRedundancyBlocks()) { statsBuilder.append(", HighestPriorityLowRedundancyBlocks=") .append(getHighestPriorityLowRedundancyBlocks()); @@ -116,6 +125,7 @@ public int hashCode() { .append(missingBlockGroups) .append(bytesInFutureBlockGroups) .append(pendingDeletionBlocks) + .append(badlyDistributedBlocks) .append(highestPriorityLowRedundancyBlocks) .toHashCode(); } @@ -135,6 +145,7 @@ public boolean equals(Object o) { .append(missingBlockGroups, other.missingBlockGroups) .append(bytesInFutureBlockGroups, other.bytesInFutureBlockGroups) .append(pendingDeletionBlocks, other.pendingDeletionBlocks) + .append(badlyDistributedBlocks, other.badlyDistributedBlocks) .append(highestPriorityLowRedundancyBlocks, other.highestPriorityLowRedundancyBlocks) .isEquals(); @@ -151,6 +162,7 @@ public static 
ECBlockGroupStats merge(Collection stats) { long missingBlockGroups = 0; long bytesInFutureBlockGroups = 0; long pendingDeletionBlocks = 0; + long badlyDistributedBlocks = 0; long highestPriorityLowRedundancyBlocks = 0; boolean hasHighestPriorityLowRedundancyBlocks = false; @@ -160,6 +172,7 @@ public static ECBlockGroupStats merge(Collection stats) { missingBlockGroups += stat.getMissingBlockGroups(); bytesInFutureBlockGroups += stat.getBytesInFutureBlockGroups(); pendingDeletionBlocks += stat.getPendingDeletionBlocks(); + badlyDistributedBlocks += stat.getBadlyDistributedBlocks(); if (stat.hasHighestPriorityLowRedundancyBlocks()) { hasHighestPriorityLowRedundancyBlocks = true; highestPriorityLowRedundancyBlocks += @@ -169,9 +182,10 @@ public static ECBlockGroupStats merge(Collection stats) { if (hasHighestPriorityLowRedundancyBlocks) { return new ECBlockGroupStats(lowRedundancyBlockGroups, corruptBlockGroups, missingBlockGroups, bytesInFutureBlockGroups, pendingDeletionBlocks, - highestPriorityLowRedundancyBlocks); + badlyDistributedBlocks, highestPriorityLowRedundancyBlocks); } return new ECBlockGroupStats(lowRedundancyBlockGroups, corruptBlockGroups, - missingBlockGroups, bytesInFutureBlockGroups, pendingDeletionBlocks); + missingBlockGroups, bytesInFutureBlockGroups, pendingDeletionBlocks, + badlyDistributedBlocks); } } diff --git a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/protocol/ReplicatedBlockStats.java b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/protocol/ReplicatedBlockStats.java index d878a27168bdc..800161eab1823 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/protocol/ReplicatedBlockStats.java +++ b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/protocol/ReplicatedBlockStats.java @@ -37,27 +37,30 @@ public final class ReplicatedBlockStats { private final long missingReplicationOneBlocks; private final long 
bytesInFutureBlocks; private final long pendingDeletionBlocks; + private final long badlyDistributedBlocks; private final Long highestPriorityLowRedundancyBlocks; public ReplicatedBlockStats(long lowRedundancyBlocks, long corruptBlocks, long missingBlocks, long missingReplicationOneBlocks, long bytesInFutureBlocks, - long pendingDeletionBlocks) { + long pendingDeletionBlocks, long badlyDistributedBlocks) { this(lowRedundancyBlocks, corruptBlocks, missingBlocks, missingReplicationOneBlocks, bytesInFutureBlocks, pendingDeletionBlocks, - null); + badlyDistributedBlocks, null); } public ReplicatedBlockStats(long lowRedundancyBlocks, long corruptBlocks, long missingBlocks, long missingReplicationOneBlocks, long bytesInFutureBlocks, - long pendingDeletionBlocks, Long highestPriorityLowRedundancyBlocks) { + long pendingDeletionBlocks, long badlyDistributedBlocks, + Long highestPriorityLowRedundancyBlocks) { this.lowRedundancyBlocks = lowRedundancyBlocks; this.corruptBlocks = corruptBlocks; this.missingBlocks = missingBlocks; this.missingReplicationOneBlocks = missingReplicationOneBlocks; this.bytesInFutureBlocks = bytesInFutureBlocks; this.pendingDeletionBlocks = pendingDeletionBlocks; + this.badlyDistributedBlocks = badlyDistributedBlocks; this.highestPriorityLowRedundancyBlocks = highestPriorityLowRedundancyBlocks; } @@ -86,6 +89,10 @@ public long getPendingDeletionBlocks() { return pendingDeletionBlocks; } + public long getBadlyDistributedBlocks() { + return badlyDistributedBlocks; + } + public boolean hasHighestPriorityLowRedundancyBlocks() { return getHighestPriorityLowRedundancyBlocks() != null; } @@ -94,6 +101,7 @@ public Long getHighestPriorityLowRedundancyBlocks(){ return highestPriorityLowRedundancyBlocks; } + @Override public String toString() { StringBuilder statsBuilder = new StringBuilder(); @@ -105,7 +113,8 @@ public String toString() { getMissingReplicationOneBlocks()) .append(", BytesInFutureBlocks=").append(getBytesInFutureBlocks()) .append(", 
PendingDeletionBlocks=").append( - getPendingDeletionBlocks()); + getPendingDeletionBlocks()) + .append(" , badlyDistributedBlocks=").append(getBadlyDistributedBlocks()); if (hasHighestPriorityLowRedundancyBlocks()) { statsBuilder.append(", HighestPriorityLowRedundancyBlocks=").append( getHighestPriorityLowRedundancyBlocks()); @@ -127,6 +136,7 @@ public static ReplicatedBlockStats merge( long missingReplicationOneBlocks = 0; long bytesInFutureBlocks = 0; long pendingDeletionBlocks = 0; + long badlyDistributedBlocks = 0; long highestPriorityLowRedundancyBlocks = 0; boolean hasHighestPriorityLowRedundancyBlocks = false; @@ -138,6 +148,7 @@ public static ReplicatedBlockStats merge( missingReplicationOneBlocks += stat.getMissingReplicationOneBlocks(); bytesInFutureBlocks += stat.getBytesInFutureBlocks(); pendingDeletionBlocks += stat.getPendingDeletionBlocks(); + badlyDistributedBlocks += stat.getBadlyDistributedBlocks(); if (stat.hasHighestPriorityLowRedundancyBlocks()) { hasHighestPriorityLowRedundancyBlocks = true; highestPriorityLowRedundancyBlocks += @@ -147,10 +158,10 @@ public static ReplicatedBlockStats merge( if (hasHighestPriorityLowRedundancyBlocks) { return new ReplicatedBlockStats(lowRedundancyBlocks, corruptBlocks, missingBlocks, missingReplicationOneBlocks, bytesInFutureBlocks, - pendingDeletionBlocks, highestPriorityLowRedundancyBlocks); + pendingDeletionBlocks, badlyDistributedBlocks, highestPriorityLowRedundancyBlocks); } return new ReplicatedBlockStats(lowRedundancyBlocks, corruptBlocks, missingBlocks, missingReplicationOneBlocks, bytesInFutureBlocks, - pendingDeletionBlocks); + pendingDeletionBlocks, badlyDistributedBlocks); } } diff --git a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/protocolPB/PBHelperClient.java b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/protocolPB/PBHelperClient.java index fb0fce76bb6c5..9a8ab046897dd 100644 --- 
a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/protocolPB/PBHelperClient.java +++ b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/protocolPB/PBHelperClient.java @@ -1975,13 +1975,13 @@ public static ReplicatedBlockStats convert( return new ReplicatedBlockStats(res.getLowRedundancy(), res.getCorruptBlocks(), res.getMissingBlocks(), res.getMissingReplOneBlocks(), res.getBlocksInFuture(), - res.getPendingDeletionBlocks(), + res.getPendingDeletionBlocks(), res.getBadlyDistributedBlocks(), res.getHighestPrioLowRedundancyBlocks()); } return new ReplicatedBlockStats(res.getLowRedundancy(), res.getCorruptBlocks(), res.getMissingBlocks(), res.getMissingReplOneBlocks(), res.getBlocksInFuture(), - res.getPendingDeletionBlocks()); + res.getPendingDeletionBlocks(), res.getBadlyDistributedBlocks()); } public static ECBlockGroupStats convert( @@ -1990,11 +1990,12 @@ public static ECBlockGroupStats convert( return new ECBlockGroupStats(res.getLowRedundancy(), res.getCorruptBlocks(), res.getMissingBlocks(), res.getBlocksInFuture(), res.getPendingDeletionBlocks(), - res.getHighestPrioLowRedundancyBlocks()); + res.getBadlyDistributedBlocks(), res.getHighestPrioLowRedundancyBlocks()); } return new ECBlockGroupStats(res.getLowRedundancy(), res.getCorruptBlocks(), res.getMissingBlocks(), - res.getBlocksInFuture(), res.getPendingDeletionBlocks()); + res.getBlocksInFuture(), res.getPendingDeletionBlocks(), + res.getBadlyDistributedBlocks()); } public static DatanodeReportTypeProto convert(DatanodeReportType t) { @@ -2440,6 +2441,8 @@ public static GetFsReplicatedBlockStatsResponseProto convert( replicatedBlockStats.getBytesInFutureBlocks()); result.setPendingDeletionBlocks( replicatedBlockStats.getPendingDeletionBlocks()); + result.setBadlyDistributedBlocks( + replicatedBlockStats.getBadlyDistributedBlocks()); if (replicatedBlockStats.hasHighestPriorityLowRedundancyBlocks()) { result.setHighestPrioLowRedundancyBlocks(
replicatedBlockStats.getHighestPriorityLowRedundancyBlocks()); @@ -2459,6 +2462,8 @@ public static GetFsECBlockGroupStatsResponseProto convert( ecBlockGroupStats.getBytesInFutureBlockGroups()); result.setPendingDeletionBlocks( ecBlockGroupStats.getPendingDeletionBlocks()); + result.setBadlyDistributedBlocks( + ecBlockGroupStats.getBadlyDistributedBlocks()); if (ecBlockGroupStats.hasHighestPriorityLowRedundancyBlocks()) { result.setHighestPrioLowRedundancyBlocks( ecBlockGroupStats.getHighestPriorityLowRedundancyBlocks()); diff --git a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/proto/ClientNamenodeProtocol.proto b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/proto/ClientNamenodeProtocol.proto index df47bb7c5b7e9..1392a46c491eb 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/proto/ClientNamenodeProtocol.proto +++ b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/proto/ClientNamenodeProtocol.proto @@ -363,6 +363,7 @@ message GetFsReplicatedBlockStatsResponseProto { required uint64 blocks_in_future = 5; required uint64 pending_deletion_blocks = 6; optional uint64 highest_prio_low_redundancy_blocks = 7; + optional uint64 badly_distributed_blocks = 8; } @@ -376,6 +377,7 @@ message GetFsECBlockGroupStatsResponseProto { required uint64 blocks_in_future = 4; required uint64 pending_deletion_blocks = 5; optional uint64 highest_prio_low_redundancy_blocks = 6; + optional uint64 badly_distributed_blocks = 7; } enum DatanodeReportTypeProto { // type of the datanode report diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationMBean.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationMBean.java index b9ea8709e90f9..4bc80ab64ca00 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationMBean.java +++
b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationMBean.java @@ -326,6 +326,13 @@ public interface FederationMBean { */ long getNumberOfMissingBlocksWithReplicationFactorOne(); + /** + * Gets the total number of badly distributed blocks. + * + * @return the total number of badly distributed blocks. + */ + long getNumberOfBadlyDistributedBlocks(); + /** * Gets the total number of replicated low redundancy blocks on the cluster * with the highest risk of loss. diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/NamenodeBeanMetrics.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/NamenodeBeanMetrics.java index 0e6e12a0d3e14..3d79f84f1414f 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/NamenodeBeanMetrics.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/NamenodeBeanMetrics.java @@ -386,6 +386,16 @@ public long getNumberOfMissingBlocksWithReplicationFactorOne() { return 0; } + @Override + public long getNumberOfBadlyDistributedBlocks() { + try { + return getRBFMetrics().getNumberOfBadlyDistributedBlocks(); + } catch (IOException e) { + LOG.debug("Failed to get number of badly distributed blocks", e); + } + return 0; + } + @Override public long getHighestPriorityLowRedundancyReplicatedBlocks() { try { diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/RBFMetrics.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/RBFMetrics.java index a8422f17be7ca..2c5c1146232ac 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/RBFMetrics.java +++
b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/RBFMetrics.java @@ -693,6 +693,12 @@ public long getHighestPriorityLowRedundancyReplicatedBlocks() { MembershipStats::getHighestPriorityLowRedundancyReplicatedBlocks); } + @Override + public long getNumberOfBadlyDistributedBlocks() { + return getNameserviceAggregatedLong( + MembershipStats::getNumberOfBadlyDistributedBlocks); + } + @Override public long getHighestPriorityLowRedundancyECBlocks() { return getNameserviceAggregatedLong( diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/resolver/MembershipNamenodeResolver.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/resolver/MembershipNamenodeResolver.java index a31247c1c8077..60c8e4e7194ae 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/resolver/MembershipNamenodeResolver.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/resolver/MembershipNamenodeResolver.java @@ -306,6 +306,8 @@ public boolean registerNamenode(NamenodeStatusReport report) report.getScheduledReplicationBlocks()); stats.setNumberOfMissingBlocksWithReplicationFactorOne( report.getNumberOfMissingBlocksWithReplicationFactorOne()); + stats.setNumberOfBadlyDistributedBlocks( + report.getNumberOfBadlyDistributedBlocks()); stats.setHighestPriorityLowRedundancyReplicatedBlocks( report.getHighestPriorityLowRedundancyReplicatedBlocks()); stats.setHighestPriorityLowRedundancyECBlocks( diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/resolver/NamenodeStatusReport.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/resolver/NamenodeStatusReport.java index feb5a86dba83b..2fbf7843b6e41 100644 --- 
a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/resolver/NamenodeStatusReport.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/resolver/NamenodeStatusReport.java @@ -73,6 +73,7 @@ public class NamenodeStatusReport { private int corruptFilesCount = -1; private long scheduledReplicationBlocks = -1; private long numberOfMissingBlocksWithReplicationFactorOne = -1; + private long numberOfBadlyDistributedBlocks = -1; private long highestPriorityLowRedundancyReplicatedBlocks = -1; private long highestPriorityLowRedundancyECBlocks = -1; @@ -391,6 +392,7 @@ public void setNamesystemInfo(long available, long total, * @param numCorruptFiles number of corrupt files. * @param numOfMissingBlocksWithReplicationFactorOne number of missing * blocks with rep one. + * @param numOfBadlyDistributedBlocks number of badly distributed blocks * @param highestPriorityLowRedundancyRepBlocks number of high priority low * redundancy rep blocks. * @param highPriorityLowRedundancyECBlocks number of high priority low @@ -398,11 +400,14 @@ public void setNamesystemInfo(long available, long total, */ public void setNamenodeInfo(int numCorruptFiles, long numOfMissingBlocksWithReplicationFactorOne, + long numOfBadlyDistributedBlocks, long highestPriorityLowRedundancyRepBlocks, long highPriorityLowRedundancyECBlocks) { this.corruptFilesCount = numCorruptFiles; this.numberOfMissingBlocksWithReplicationFactorOne = numOfMissingBlocksWithReplicationFactorOne; + this.numberOfBadlyDistributedBlocks = + numOfBadlyDistributedBlocks; this.highestPriorityLowRedundancyReplicatedBlocks = highestPriorityLowRedundancyRepBlocks; this.highestPriorityLowRedundancyECBlocks = @@ -438,6 +443,16 @@ public long getNumberOfMissingBlocksWithReplicationFactorOne() { return this.numberOfMissingBlocksWithReplicationFactorOne; } + /** + * Gets the total number of badly distributed blocks. 
+ * + * @return the total number of badly distributed blocks. + */ + public long getNumberOfBadlyDistributedBlocks() { + return this.numberOfBadlyDistributedBlocks; + } + + /** * Gets the total number of replicated low redundancy blocks on the cluster * with the highest risk of loss. diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/NamenodeHeartbeatService.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/NamenodeHeartbeatService.java index 8cf51386587e5..342917176a8f4 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/NamenodeHeartbeatService.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/NamenodeHeartbeatService.java @@ -376,7 +376,8 @@ private void getNamenodeInfoMetrics(String address, .optLong("NumberOfMissingBlocksWithReplicationFactorOne"), jsonObject - .optLong("HighestPriorityLowRedundancyReplicatedBlocks"), - jsonObject.optLong("HighestPriorityLowRedundancyECBlocks")); + .optLong("BadlyDistributedBlocks"), jsonObject + .optLong("HighestPriorityLowRedundancyReplicatedBlocks"), + jsonObject.optLong("HighestPriorityLowRedundancyECBlocks")); } } } @@ -434,4 +435,4 @@ protected void serviceStop() throws Exception { } super.serviceStop(); } -} \ No newline at end of file +} diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/store/records/MembershipStats.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/store/records/MembershipStats.java index 21c8c2f79fce4..615c16a65e8f2 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/store/records/MembershipStats.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/store/records/MembershipStats.java @@ -122,6 +122,11 @@ public abstract void
setNumberOfMissingBlocksWithReplicationFactorOne( public abstract long getNumberOfMissingBlocksWithReplicationFactorOne(); + public abstract void setNumberOfBadlyDistributedBlocks( + long blocks); + + public abstract long getNumberOfBadlyDistributedBlocks(); + public abstract void setHighestPriorityLowRedundancyReplicatedBlocks( long blocks); @@ -167,4 +172,4 @@ public long getDateCreated() { // We don't store this record directly return 0; } -} \ No newline at end of file +} diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/store/records/impl/pb/MembershipStatsPBImpl.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/store/records/impl/pb/MembershipStatsPBImpl.java index 2caa59dfca7e5..7c7e2cd52183a 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/store/records/impl/pb/MembershipStatsPBImpl.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/store/records/impl/pb/MembershipStatsPBImpl.java @@ -274,6 +274,18 @@ public long getNumberOfMissingBlocksWithReplicationFactorOne() { .getNumberOfMissingBlocksWithReplicationFactorOne(); } + @Override + public void setNumberOfBadlyDistributedBlocks(long blocks) { + this.translator.getBuilder() + .setBadlyDistributedBlocks(blocks); + } + + @Override + public long getNumberOfBadlyDistributedBlocks() { + return this.translator.getProtoOrBuilder() + .getBadlyDistributedBlocks(); + } + @Override public void setHighestPriorityLowRedundancyReplicatedBlocks(long blocks) { this.translator.getBuilder() @@ -297,4 +309,4 @@ public long getHighestPriorityLowRedundancyECBlocks() { return this.translator.getProtoOrBuilder() .getHighestPriorityLowRedundancyECBlocks(); } -} \ No newline at end of file +} diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/proto/FederationProtocol.proto 
b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/proto/FederationProtocol.proto index 00deff4ed1e42..02c69ad321bb7 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/proto/FederationProtocol.proto +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/proto/FederationProtocol.proto @@ -54,6 +54,7 @@ message NamenodeMembershipStatsRecordProto { optional uint64 numberOfMissingBlocksWithReplicationFactorOne = 31; optional uint64 highestPriorityLowRedundancyReplicatedBlocks = 32; optional uint64 HighestPriorityLowRedundancyECBlocks = 33; + optional uint64 badlyDistributedBlocks = 35; } message NamenodeMembershipRecordProto { @@ -323,4 +324,4 @@ message GetDisabledNameservicesResponseProto { */ message RouterFederatedStateProto { map namespaceStateIds = 1; // Last seen state IDs for multiple namespaces. -} \ No newline at end of file +} diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/metrics/TestRBFMetrics.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/metrics/TestRBFMetrics.java index 25473f8df9233..a6d855d1ed078 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/metrics/TestRBFMetrics.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/metrics/TestRBFMetrics.java @@ -293,6 +293,7 @@ private void validateClusterStatsFederationBean(FederationMBean bean) { int numCorruptsFilesCount = 0; long scheduledReplicationBlocks = 0; long numberOfMissingBlocksWithReplicationFactorOne = 0; + long numberOfBadlyDistributedBlocks = 0; long highestPriorityLowRedundancyReplicatedBlocks = 0; long highestPriorityLowRedundancyECBlocks = 0; long numFiles = 0; @@ -312,6 +313,7 @@ private void validateClusterStatsFederationBean(FederationMBean bean) { scheduledReplicationBlocks += stats.getScheduledReplicationBlocks(); numberOfMissingBlocksWithReplicationFactorOne += 
stats.getNumberOfMissingBlocksWithReplicationFactorOne(); + numberOfBadlyDistributedBlocks += stats.getNumberOfBadlyDistributedBlocks(); highestPriorityLowRedundancyReplicatedBlocks += stats.getHighestPriorityLowRedundancyReplicatedBlocks(); highestPriorityLowRedundancyECBlocks += @@ -338,6 +340,8 @@ private void validateClusterStatsFederationBean(FederationMBean bean) { bean.getScheduledReplicationBlocks()); assertEquals(numberOfMissingBlocksWithReplicationFactorOne, bean.getNumberOfMissingBlocksWithReplicationFactorOne()); + assertEquals(numberOfBadlyDistributedBlocks, + bean.getNumberOfBadlyDistributedBlocks()); assertEquals(highestPriorityLowRedundancyReplicatedBlocks, bean.getHighestPriorityLowRedundancyReplicatedBlocks()); assertEquals(highestPriorityLowRedundancyECBlocks, diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java index b786de53ffa2b..1098bb4d143c2 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java @@ -262,6 +262,11 @@ public long getMissingReplicationOneBlocks() { return neededReconstruction.getCorruptReplicationOneBlocks(); } + /** Used by metrics. */ + public long getBadlyDistributedBlocks() { + return neededReconstruction.getBadlyDistributedBlocks(); + } + /** Used by metrics. 
*/ public long getPendingDeletionReplicatedBlocks() { return invalidateBlocks.getBlocks(); @@ -714,7 +719,7 @@ public BlockStoragePolicySuite getStoragePolicySuite() { return storagePolicySuite; } - /** get the BlockTokenSecretManager */ + /** @return get the BlockTokenSecretManager */ @VisibleForTesting public BlockTokenSecretManager getBlockTokenSecretManager() { return blockTokenSecretManager; @@ -4877,6 +4882,11 @@ public long getMissingReplOneBlocksCount() { return this.neededReconstruction.getCorruptReplicationOneBlockSize(); } + public long getBadlyDistributedBlocksCount() { + // not locking + return this.neededReconstruction.getBadlyDistributedBlocks(); + } + public long getHighestPriorityReplicatedBlockCount(){ return this.neededReconstruction.getHighestPriorityReplicatedBlockCount(); } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/LowRedundancyBlocks.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/LowRedundancyBlocks.java index 77480db71ed8f..bbc7d6bc8da96 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/LowRedundancyBlocks.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/LowRedundancyBlocks.java @@ -93,6 +93,7 @@ class LowRedundancyBlocks implements Iterable { private final LongAdder corruptReplicationOneBlocks = new LongAdder(); private final LongAdder lowRedundancyECBlockGroups = new LongAdder(); private final LongAdder corruptECBlockGroups = new LongAdder(); + private final LongAdder badlyDistributedBlocks = new LongAdder(); private final LongAdder highestPriorityLowRedundancyReplicatedBlocks = new LongAdder(); private final LongAdder highestPriorityLowRedundancyECBlocks @@ -167,6 +168,11 @@ long getCorruptReplicationOneBlocks() { return corruptReplicationOneBlocks.longValue(); } + /** Return badly distributed block count. 
*/ + long getBadlyDistributedBlocks() { + return badlyDistributedBlocks.longValue(); + } + /** Return the number of under replicated blocks * with the highest priority to recover */ long getHighestPriorityReplicatedBlockCount() { @@ -320,6 +326,9 @@ private void incrementBlockStat(BlockInfo blockInfo, int priLevel, if (priLevel == QUEUE_HIGHEST_PRIORITY) { highestPriorityLowRedundancyECBlocks.increment(); } + if (priLevel == QUEUE_REPLICAS_BADLY_DISTRIBUTED) { + badlyDistributedBlocks.increment(); + } } else { lowRedundancyBlocks.increment(); if (priLevel == QUEUE_WITH_CORRUPT_BLOCKS) { @@ -331,6 +340,9 @@ private void incrementBlockStat(BlockInfo blockInfo, int priLevel, if (priLevel == QUEUE_HIGHEST_PRIORITY) { highestPriorityLowRedundancyReplicatedBlocks.increment(); } + if (priLevel == QUEUE_REPLICAS_BADLY_DISTRIBUTED) { + badlyDistributedBlocks.increment(); + } } } @@ -407,6 +419,9 @@ private void decrementBlockStat(BlockInfo blockInfo, int priLevel, if (priLevel == QUEUE_HIGHEST_PRIORITY) { highestPriorityLowRedundancyECBlocks.decrement(); } + if (priLevel == QUEUE_REPLICAS_BADLY_DISTRIBUTED) { + badlyDistributedBlocks.decrement(); + } } else { lowRedundancyBlocks.decrement(); if (priLevel == QUEUE_WITH_CORRUPT_BLOCKS) { @@ -421,6 +436,9 @@ private void decrementBlockStat(BlockInfo blockInfo, int priLevel, if (priLevel == QUEUE_HIGHEST_PRIORITY) { highestPriorityLowRedundancyReplicatedBlocks.decrement(); } + if (priLevel == QUEUE_REPLICAS_BADLY_DISTRIBUTED) { + badlyDistributedBlocks.decrement(); + } } } @@ -588,4 +606,4 @@ public void remove() { } }; } -} \ No newline at end of file +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java index 5ffe50caa61fd..4cb5ab9a612d6 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java +++ 
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java @@ -4617,6 +4617,12 @@ public long getMissingReplOneBlocksCount() { // not locking return blockManager.getMissingReplOneBlocksCount(); } + + @Metric({"BadlyDistBlocks", "Number of Badly Distributed Blocks"}) + public long getBadlyDistributedBlocksCount() { + // not locking + return blockManager.getBadlyDistributedBlocksCount(); + } @Metric(value = {"ExpiredHeartbeats", "Number of expired heartbeats"}, type = Metric.Type.COUNTER) @@ -4695,7 +4701,8 @@ ReplicatedBlockStats getReplicatedBlockStats() { getCorruptReplicatedBlocks(), getMissingReplicatedBlocks(), getMissingReplicationOneBlocks(), getBytesInFutureReplicatedBlocks(), getPendingDeletionReplicatedBlocks(), - getHighestPriorityLowRedundancyReplicatedBlocks()); + getBadlyDistributedBlocks(), + getHighestPriorityLowRedundancyReplicatedBlocks()); } /** @@ -4708,7 +4715,7 @@ ECBlockGroupStats getECBlockGroupStats() { return new ECBlockGroupStats(getLowRedundancyECBlockGroups(), getCorruptECBlockGroups(), getMissingECBlockGroups(), getBytesInFutureECBlockGroups(), getPendingDeletionECBlocks(), - getHighestPriorityLowRedundancyECBlocks()); + getBadlyDistributedBlocks(), getHighestPriorityLowRedundancyECBlocks()); } @Override // FSNamesystemMBean @@ -5349,6 +5356,12 @@ public long getMissingReplicationOneBlocks() { return blockManager.getMissingReplicationOneBlocks(); } + @Override // ReplicatedBlocksMBean + @Metric({"BadlyDistributedBlocks", "Number of badly distributed blocks"}) + public long getBadlyDistributedBlocks() { + return blockManager.getBadlyDistributedBlocks(); + } + @Override // ReplicatedBlocksMBean @Metric({"HighestPriorityLowRedundancyReplicatedBlocks", "Number of " + "replicated blocks which have the highest risk of loss."}) @@ -6465,6 +6478,11 @@ public long getNumberOfMissingBlocksWithReplicationFactorOne() { return getMissingReplOneBlocksCount(); } + @Override // NameNodeMXBean + public long
getNumberOfBadlyDistributedBlocks() { + return getBadlyDistributedBlocks(); + } + @Override // NameNodeMXBean public int getThreads() { return ManagementFactory.getThreadMXBean().getThreadCount(); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeMXBean.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeMXBean.java index 69fa01023a59b..797fdaedbcb19 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeMXBean.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeMXBean.java @@ -171,6 +171,14 @@ public interface NameNodeMXBean { */ long getNumberOfMissingBlocksWithReplicationFactorOne(); + + /** + * Gets the total number of badly distributed blocks. + * + * @return the total number of badly distributed blocks. + */ + long getNumberOfBadlyDistributedBlocks(); + /** * Gets the total number of replicated low redundancy blocks on the cluster * with the highest risk of loss. diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/metrics/ReplicatedBlocksMBean.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/metrics/ReplicatedBlocksMBean.java index a20dd4c0bb2da..6b08e4c0fa7e6 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/metrics/ReplicatedBlocksMBean.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/metrics/ReplicatedBlocksMBean.java @@ -51,6 +51,11 @@ public interface ReplicatedBlocksMBean { */ long getMissingReplicationOneBlocks(); + /** + * Return count of badly distributed blocks. + */ + long getBadlyDistributedBlocks(); + /** * Return total bytes of future blocks.
*/ diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSAdmin.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSAdmin.java index 640af66e4b7f0..4df3e25bf5b6d 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSAdmin.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSAdmin.java @@ -120,7 +120,7 @@ public class DFSAdmin extends FsShell { private static final Logger LOG = LoggerFactory.getLogger(DFSAdmin.class); /** - * An abstract class for the execution of a file system command + * An abstract class for the execution of a file system command. */ abstract private static class DFSAdminCommand extends Command { protected DistributedFileSystem dfs; @@ -547,6 +547,8 @@ public void report(String[] argv, int i) throws IOException { replicatedBlockStats.getMissingReplicaBlocks()); System.out.println("\tMissing blocks (with replication factor 1): " + replicatedBlockStats.getMissingReplicationOneBlocks()); + System.out.println("\tBadly Distributed Blocks: " + + replicatedBlockStats.getBadlyDistributedBlocks()); if (replicatedBlockStats.hasHighestPriorityLowRedundancyBlocks()) { System.out.println("\tLow redundancy blocks with highest priority " + "to recover: " + @@ -564,6 +566,8 @@ public void report(String[] argv, int i) throws IOException { ecBlockGroupStats.getCorruptBlockGroups()); System.out.println("\tMissing block groups: " + ecBlockGroupStats.getMissingBlockGroups()); + System.out.println("\tBadly Distributed Blocks: " + + ecBlockGroupStats.getBadlyDistributedBlocks()); if (ecBlockGroupStats.hasHighestPriorityLowRedundancyBlocks()) { System.out.println("\tLow redundancy blocks with highest priority " + "to recover: " + diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestLowRedundancyBlockQueues.java 
b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestLowRedundancyBlockQueues.java index e33e24fe28559..cdd40b961ee45 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestLowRedundancyBlockQueues.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestLowRedundancyBlockQueues.java @@ -75,7 +75,7 @@ private void verifyBlockStats(LowRedundancyBlocks queues, int lowRedundancyReplicaCount, int corruptReplicaCount, int corruptReplicationOneCount, int lowRedundancyStripedCount, int corruptStripedCount, int highestPriorityReplicatedBlockCount, - int highestPriorityECBlockCount) { + int highestPriorityECBlockCount, int badlyDistributedBlockCount) { assertEquals("Low redundancy replica count incorrect!", lowRedundancyReplicaCount, queues.getLowRedundancyBlocks()); assertEquals("Corrupt replica count incorrect!", @@ -93,6 +93,8 @@ private void verifyBlockStats(LowRedundancyBlocks queues, assertEquals("LowRedundancyBlocks queue size incorrect!", (lowRedundancyReplicaCount + corruptReplicaCount + lowRedundancyStripedCount + corruptStripedCount), queues.size()); + assertEquals("Badly Distributed Blocks queue size incorrect!", + badlyDistributedBlockCount, queues.getBadlyDistributedBlocks()); assertEquals("Highest priority replicated low redundancy " + "blocks count is incorrect!", highestPriorityReplicatedBlockCount, @@ -177,50 +179,58 @@ public void testBlockPriorities() throws Throwable { BlockInfo block_very_low_redundancy = genBlockInfo(3); BlockInfo block_corrupt = genBlockInfo(4); BlockInfo block_corrupt_repl_one = genBlockInfo(5); + BlockInfo blockBadlyDistributed = genBlockInfo(6); // Add a block with a single entry assertAdded(queues, block1, 1, 0, 3); assertInLevel(queues, block1, LowRedundancyBlocks.QUEUE_HIGHEST_PRIORITY); - verifyBlockStats(queues, 1, 0, 0, 0, 0, 1, 0); + verifyBlockStats(queues, 1, 0, 0, 0, 0, 1, 
0, 0); // Repeated additions fail assertFalse(queues.add(block1, 1, 0, 0, 3)); - verifyBlockStats(queues, 1, 0, 0, 0, 0, 1, 0); + verifyBlockStats(queues, 1, 0, 0, 0, 0, 1, 0, 0); // Add a second block with two replicas assertAdded(queues, block2, 2, 0, 3); assertInLevel(queues, block2, LowRedundancyBlocks.QUEUE_LOW_REDUNDANCY); - verifyBlockStats(queues, 2, 0, 0, 0, 0, 1, 0); + verifyBlockStats(queues, 2, 0, 0, 0, 0, 1, 0, 0); // Now try to add a block that is corrupt assertAdded(queues, block_corrupt, 0, 0, 3); assertInLevel(queues, block_corrupt, LowRedundancyBlocks.QUEUE_WITH_CORRUPT_BLOCKS); - verifyBlockStats(queues, 2, 1, 0, 0, 0, 1, 0); + verifyBlockStats(queues, 2, 1, 0, 0, 0, 1, 0, 0); // Insert a very insufficiently redundancy block assertAdded(queues, block_very_low_redundancy, 4, 0, 25); assertInLevel(queues, block_very_low_redundancy, LowRedundancyBlocks.QUEUE_VERY_LOW_REDUNDANCY); - verifyBlockStats(queues, 3, 1, 0, 0, 0, 1, 0); + verifyBlockStats(queues, 3, 1, 0, 0, 0, 1, 0, 0); // Insert a corrupt block with replication factor 1 assertAdded(queues, block_corrupt_repl_one, 0, 0, 1); - verifyBlockStats(queues, 3, 2, 1, 0, 0, 1, 0); + verifyBlockStats(queues, 3, 2, 1, 0, 0, 1, 0, 0); // Bump up the expected count for corrupt replica one block from 1 to 3 queues.update(block_corrupt_repl_one, 0, 0, 0, 3, 0, 2); - verifyBlockStats(queues, 3, 2, 0, 0, 0, 1, 0); + verifyBlockStats(queues, 3, 2, 0, 0, 0, 1, 0, 0); // Reduce the expected replicas to 1 queues.update(block_corrupt, 0, 0, 0, 1, 0, -2); - verifyBlockStats(queues, 3, 2, 1, 0, 0, 1, 0); + verifyBlockStats(queues, 3, 2, 1, 0, 0, 1, 0, 0); queues.update(block_very_low_redundancy, 0, 0, 0, 1, -4, -24); - verifyBlockStats(queues, 2, 3, 2, 0, 0, 1, 0); + verifyBlockStats(queues, 2, 3, 2, 0, 0, 1, 0, 0); // Reduce the expected replicas to 1 for block1 queues.update(block1, 1, 0, 0, 1, 0, 0); - verifyBlockStats(queues, 2, 3, 2, 0, 0, 0, 0); + // expect 1 badly distributed block + 
verifyBlockStats(queues, 2, 3, 2, 0, 0, 0, 0, 1); + + // insert a block with too many replicas to make badly distributed + assertAdded(queues, blockBadlyDistributed, 2, 0, 1); + assertInLevel(queues, blockBadlyDistributed, + LowRedundancyBlocks.QUEUE_REPLICAS_BADLY_DISTRIBUTED); + verifyBlockStats(queues, 3, 3, 2, 0, 0, 0, 0, 2); } @Test @@ -230,12 +240,12 @@ public void testRemoveWithWrongPriority() { assertAdded(queues, corruptBlock, 0, 0, 3); assertInLevel(queues, corruptBlock, LowRedundancyBlocks.QUEUE_WITH_CORRUPT_BLOCKS); - verifyBlockStats(queues, 0, 1, 0, 0, 0, 0, 0); + verifyBlockStats(queues, 0, 1, 0, 0, 0, 0, 0, 0); // Remove with wrong priority queues.remove(corruptBlock, LowRedundancyBlocks.QUEUE_LOW_REDUNDANCY); // Verify the number of corrupt block is decremented - verifyBlockStats(queues, 0, 0, 0, 0, 0, 0, 0); + verifyBlockStats(queues, 0, 0, 0, 0, 0, 0, 0, 0); } @Test @@ -271,17 +281,17 @@ private void doTestStripedBlockPriorities(int dataBlkNum, int parityBlkNum) assertInLevel(queues, block, LowRedundancyBlocks.QUEUE_LOW_REDUNDANCY); } - verifyBlockStats(queues, 0, 0, 0, numUR, 0, 0, 1); + verifyBlockStats(queues, 0, 0, 0, numUR, 0, 0, 1, 0); } // add a corrupted block BlockInfo block_corrupt = genStripedBlockInfo(-10, numBytes); assertEquals(numCorrupt, queues.getCorruptBlockSize()); - verifyBlockStats(queues, 0, 0, 0, numUR, numCorrupt, 0, 1); + verifyBlockStats(queues, 0, 0, 0, numUR, numCorrupt, 0, 1, 0); assertAdded(queues, block_corrupt, dataBlkNum - 1, 0, groupSize); numCorrupt++; - verifyBlockStats(queues, 0, 0, 0, numUR, numCorrupt, 0, 1); + verifyBlockStats(queues, 0, 0, 0, numUR, numCorrupt, 0, 1, 0); assertInLevel(queues, block_corrupt, LowRedundancyBlocks.QUEUE_WITH_CORRUPT_BLOCKS); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java index 
aaedb8288e410..c1d07d6ca6820 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java @@ -457,6 +457,9 @@ private void verifyAggregatedMetricsTally() throws Exception { assertEquals("Missing blocks with replication factor one not matching!", namesystem.getMissingReplOneBlocksCount(), namesystem.getMissingReplicationOneBlocks()); + assertEquals("Blocks with badly distributed are not matching!", + namesystem.getBadlyDistributedBlocksCount(), + namesystem.getBadlyDistributedBlocks()); assertEquals("Bytes in future blocks metrics not matching!", namesystem.getBytesInFuture(), namesystem.getBytesInFutureReplicatedBlocks() + @@ -507,6 +510,7 @@ public void testCorruptBlock() throws Exception { assertGauge("LowRedundancyReplicatedBlocks", 1L, rb); assertGauge("CorruptReplicatedBlocks", 1L, rb); assertGauge("HighestPriorityLowRedundancyReplicatedBlocks", 1L, rb); + assertGauge("BadlyDistributedBlocks", 0L, rb); // Verify striped blocks metrics assertGauge("LowRedundancyECBlockGroups", 0L, rb); assertGauge("CorruptECBlockGroups", 0L, rb); @@ -534,6 +538,7 @@ public void testCorruptBlock() throws Exception { assertGauge("LowRedundancyReplicatedBlocks", 0L, rb); assertGauge("CorruptReplicatedBlocks", 0L, rb); assertGauge("HighestPriorityLowRedundancyReplicatedBlocks", 0L, rb); + assertGauge("BadlyDistributedBlocks", 0L, rb); // Verify striped blocks metrics assertGauge("LowRedundancyECBlockGroups", 0L, rb); assertGauge("CorruptECBlockGroups", 0L, rb); @@ -599,6 +604,7 @@ public void testStripedFileCorruptBlocks() throws Exception { assertGauge("LowRedundancyReplicatedBlocks", 0L, rb); assertGauge("CorruptReplicatedBlocks", 0L, rb); assertGauge("HighestPriorityLowRedundancyReplicatedBlocks", 0L, rb); + assertGauge("BadlyDistributedBlocks", 0L, rb); // Verify striped block groups 
metrics assertGauge("LowRedundancyECBlockGroups", 1L, rb); assertGauge("CorruptECBlockGroups", 1L, rb); @@ -692,6 +698,7 @@ public void testMissingBlock() throws Exception { assertGauge("MissingReplOneBlocks", 1L, rb); assertGauge("HighestPriorityLowRedundancyReplicatedBlocks", 0L, rb); assertGauge("HighestPriorityLowRedundancyECBlocks", 0L, rb); + assertGauge("BadlyDistributedBlocks", 0L, rb); fs.delete(file, true); waitForDnMetricValue(NS_METRICS, "UnderReplicatedBlocks", 0L); } From b7a3139e78f420a4c79eae7fe189c1c0da55fee6 Mon Sep 17 00:00:00 2001 From: PrateekSane Date: Thu, 5 Dec 2024 10:37:42 -0800 Subject: [PATCH 3/4] Trigger Build From e68c0ac5ef6d5b91d0178cc4c793895ed395a7cf Mon Sep 17 00:00:00 2001 From: PrateekSane Date: Tue, 10 Dec 2024 11:06:54 -0800 Subject: [PATCH 4/4] Trigger Build