diff --git a/dockers/dn1/hdfs-site.xml b/dockers/dn1/hdfs-site.xml
index 90712dff08189..b6d0e79be0fd0 100644
--- a/dockers/dn1/hdfs-site.xml
+++ b/dockers/dn1/hdfs-site.xml
@@ -5,7 +5,7 @@
<name>dfs.datanode.data.dir</name>
- <value>[ZFS]/data/dataNode1</value>
+ <value>[ZFS]/data/dataNode1,[ZFS_RECON_BUFFER]/data/recon_buffer1</value>
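+ <!-- ZFS_RECON_BUFFER appears to be a staging area for reconstructed data while the ZFS pool's IO is suspended during repair (see the BlockManager/BPServiceActor changes below) -->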
<name>dfs.namenode.heartbeat.recheck-interval</name>
diff --git a/dockers/dn2/hdfs-site.xml b/dockers/dn2/hdfs-site.xml
index a49dd22508370..9201f45d21de3 100644
--- a/dockers/dn2/hdfs-site.xml
+++ b/dockers/dn2/hdfs-site.xml
@@ -17,7 +17,7 @@
<name>dfs.datanode.data.dir</name>
- <value>[DISK]/data/dataNode2</value>
+ <value>[DISK]/data/dataNode2,[ZFS_RECON_BUFFER]/data/recon_buffer2</value>
<name>dfs.namenode.heartbeat.recheck-interval</name>
diff --git a/dockers/dn3/hdfs-site.xml b/dockers/dn3/hdfs-site.xml
index 008a3235a5d16..777a6aa4546d2 100644
--- a/dockers/dn3/hdfs-site.xml
+++ b/dockers/dn3/hdfs-site.xml
@@ -17,7 +17,7 @@
<name>dfs.datanode.data.dir</name>
- <value>[DISK]/data/dataNode3</value>
+ <value>[DISK]/data/dataNode3,[ZFS_RECON_BUFFER]/data/recon_buffer3</value>
<name>dfs.namenode.heartbeat.recheck-interval</name>
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
index 585dcb90d21a2..e29e47d18bb18 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
@@ -2187,7 +2187,7 @@ int computeReconstructionWorkForBlocks(
// Check whether rw is ZFS
Set<StorageType> blockStorageTypes = new HashSet<>();
rw.getBlock().getStorageInfos().forEachRemaining(info -> blockStorageTypes.add(info.getStorageType()));
- if (blockStorageTypes.size() == 1 && blockStorageTypes.contains(StorageType.ZFS)) {
+ if (blockStorageTypes.contains(StorageType.ZFS)) {
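+ // Relaxed condition: any block with at least one ZFS replica now takes this path, not only blocks stored exclusively on ZFS.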
// The block has a ZFS replica, so we need to restore it back to the same datanode it came from.
// However, we cannot write to it directly: that IO pool would be suspended and the write would fail.
// We need to instruct the writer to call a different API.
@@ -2316,7 +2316,9 @@ BlockReconstructionWork scheduleReconstruction(BlockInfo block,
if (block.isStriped()) {
BlockInfoStriped stripedBlock = (BlockInfoStriped) block;
if (stripedBlock.getRealDataBlockNum() > srcNodes.length) {
- LOG.info("Block {} cannot be reconstructed due to shortage of source datanodes ", block);
+ LOG.info("Block {} cannot be reconstructed due to shortage of source datanodes, " +
+ "real data block num {}, src nodes {}", block, stripedBlock.getRealDataBlockNum(), srcNodes.length);
+ LOG.info("Source nodes {}", srcNodes);
NameNode.getNameNodeMetrics().incNumTimesReReplicationNotScheduled();
return null;
}
@@ -2660,6 +2662,7 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block,
}
for (DatanodeStorageInfo storage : blocksMap.getStorages(block)) {
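+ // Diagnostic logging: record every storage considered as a reconstruction source.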
+ LOG.info("Checking source storage {}", storage);
final DatanodeDescriptor node = getDatanodeDescriptorFromStorage(storage);
final StoredReplicaState state = checkReplicaOnStorage(numReplicas, block,
storage, corruptReplicas.getNodes(block), false);
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockStoragePolicySuite.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockStoragePolicySuite.java
index 167e95af8e645..19edf4464b7a5 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockStoragePolicySuite.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockStoragePolicySuite.java
@@ -86,7 +86,7 @@ public static BlockStoragePolicySuite createDefaultSuite(
policies[hotId] = new BlockStoragePolicy(hotId,
HdfsConstants.StoragePolicy.HOT.name(),
new StorageType[]{StorageType.DISK, StorageType.ZFS},
- new StorageType[]{StorageType.ZFS},
+ new StorageType[]{StorageType.DISK, StorageType.ZFS},
new StorageType[]{StorageType.ARCHIVE, StorageType.ZFS});
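+ // The trailing arrays are storageTypes, creationFallbacks, and replicationFallbacks; adding DISK to the
+ // HOT policy's creation fallback presumably lets new replicas land on plain DISK volumes when ZFS cannot accept the write.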
final byte warmId = HdfsConstants.StoragePolicy.WARM.value();
policies[warmId] = new BlockStoragePolicy(warmId,
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPServiceActor.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPServiceActor.java
index 6e8c35d042772..e321e26200647 100755
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPServiceActor.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPServiceActor.java
@@ -558,55 +558,60 @@ HeartbeatResponse sendHeartBeat(boolean requestBlockReportLease)
LOG.info("**********Heartbeating************");
// If there is a repair on-going, we do not call failed chunks
// This is to work around the kernel panic on the dn->dn_holds on the ZFS side
- if (ErasureCodingWorker.ongoingRepairs.isEmpty()) {
- List<DnodeAttributes> dnodes = new Tools().getFailedChunks("pool");
- // This means that there is ZFS local failure
- for (DnodeAttributes dnode : dnodes) {
- // Enable this during real testing
-// if (dnode.numFailedCols() == 0) {
-// continue;
-// }
-
- // Check whether this dnode is actually on this datanode instance
- DatanodeVolumeInfo volumeInfo = this.dn.getVolumeReport().stream()
- .filter(vi -> vi.getStorageType() == StorageType.ZFS)
- .collect(Collectors.toList())
- .get(0);
-
- // 1. Check whether the dnode actually belongs to this datanode volume (for local testing env)
- Path path = Paths.get(volumeInfo.getPath(), dnode.path);
- if (!Files.exists(path)) {
- continue;
- }
+ // Loop around all the volumes on this datanode, and check for ZFS failures
+ for (DatanodeVolumeInfo volInfo : dn.getVolumeReport()) {
+ if (volInfo.getStorageType() == StorageType.ZFS) {
+ if (ErasureCodingWorker.ongoingRepairs.isEmpty()) {
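+ // Ask ZFS which dnodes have failed chunks; the pool name looks hardcoded for the docker test setup.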
+ List<DnodeAttributes> dnodes = new Tools().getFailedChunks("pool");
+ // This means that there is ZFS local failure
+ for (DnodeAttributes dnode : dnodes) {
+ // Enable this during real testing
+ // if (dnode.numFailedCols() == 0) {
+ // continue;
+ // }
+
+ // 1. Check whether the dnode actually belongs to this datanode volume (for local testing env).
+ // Use volInfo from the enclosing loop directly; re-filtering the volume report for the first
+ // ZFS volume (as the old code did) would be wrong once several ZFS volumes are configured.
+ Path path = Paths.get(volInfo.getPath(), dnode.path);
+ if (!Files.exists(path)) {
+ continue;
+ }
- // 2. Check whether this dnode is a hdfs block (rather than directory, metadata, etc)
- Optional<Matcher> matcher = ZfsFailureTuple.isHdfsBlock(dnode);
- if (!matcher.isPresent()) {
- continue;
- }
+ // 2. Check whether this dnode is a hdfs block (rather than directory, metadata, etc)
+ Optional<Matcher> matcher = ZfsFailureTuple.isHdfsBlock(dnode);
+ if (!matcher.isPresent()) {
+ continue;
+ }
- // This means that this is a block file, not directory, not anything else
- // Get the block from the file name
- long hdfsBlockId = Long.parseLong(matcher.get().group(1));
+ // This means that this is a block file, not directory, not anything else
+ // Get the block from the file name
+ long hdfsBlockId = Long.parseLong(matcher.get().group(1));
- // If this is already a known issue, we ignore
- if (this.mlecDnMgmt.knownFailures.containsKey(hdfsBlockId)) {
- continue;
- }
+ // If this is already a known issue, we ignore
+ if (this.mlecDnMgmt.knownFailures.containsKey(hdfsBlockId)) {
+ continue;
+ }
- LOG.info("Failed hdfs block on {}:{} {} corresponding to zfs dn {}", dn.getDatanodeHostname(), dn.getHttpPort(),
- hdfsBlockId, dnode);
+ LOG.info("Failed hdfs block on {}:{} {} corresponding to zfs dn {}", dn.getDatanodeHostname(), dn.getHttpPort(),
+ hdfsBlockId, dnode);
- // MLEC: testing purpose
- dnode.childStatus.set(0, 1);
+ // MLEC: testing purpose
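+ // (forces the first child column into the failed state so the repair path can be exercised without real corruption)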
+ dnode.childStatus.set(0, 1);
- ZfsFailureTuple failureTuple = new ZfsFailureTuple(hdfsBlockId, dnode.childStatus);
- zfsReport.getFailedHdfsBlocks().add(failureTuple);
+ ZfsFailureTuple failureTuple = new ZfsFailureTuple(hdfsBlockId, dnode.childStatus);
+ zfsReport.getFailedHdfsBlocks().add(failureTuple);
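+ // The tuples presumably ride along with this heartbeat's zfsReport so the namenode can schedule reconstruction.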
- this.mlecDnMgmt.knownFailures.put(hdfsBlockId, failureTuple);
+ this.mlecDnMgmt.knownFailures.put(hdfsBlockId, failureTuple);
+ }
+ } else {
+ LOG.warn("REPAIR ON-GOING, skipping");
+ }
}
- } else {
- LOG.warn("REPAIR ON-GOING, skipping");
}
HeartbeatResponse response = bpNamenode.sendHeartbeat(bpRegistration,