diff --git a/dockers/dn1/hdfs-site.xml b/dockers/dn1/hdfs-site.xml
index 90712dff08189..b6d0e79be0fd0 100644
--- a/dockers/dn1/hdfs-site.xml
+++ b/dockers/dn1/hdfs-site.xml
@@ -5,7 +5,7 @@
 
   <property>
     <name>dfs.datanode.data.dir</name>
-    <value>[ZFS]/data/dataNode1</value>
+    <value>[ZFS]/data/dataNode1,[ZFS_RECON_BUFFER]/data/recon_buffer1</value>
   </property>
   <property>
     <name>dfs.namenode.heartbeat.recheck-interval</name>
diff --git a/dockers/dn2/hdfs-site.xml b/dockers/dn2/hdfs-site.xml
index a49dd22508370..9201f45d21de3 100644
--- a/dockers/dn2/hdfs-site.xml
+++ b/dockers/dn2/hdfs-site.xml
@@ -17,7 +17,7 @@
 
   <property>
     <name>dfs.datanode.data.dir</name>
-    <value>[DISK]/data/dataNode2</value>
+    <value>[DISK]/data/dataNode2,[ZFS_RECON_BUFFER]/data/recon_buffer2</value>
   </property>
   <property>
     <name>dfs.namenode.heartbeat.recheck-interval</name>
diff --git a/dockers/dn3/hdfs-site.xml b/dockers/dn3/hdfs-site.xml
index 008a3235a5d16..777a6aa4546d2 100644
--- a/dockers/dn3/hdfs-site.xml
+++ b/dockers/dn3/hdfs-site.xml
@@ -17,7 +17,7 @@
 
   <property>
     <name>dfs.datanode.data.dir</name>
-    <value>[DISK]/data/dataNode3</value>
+    <value>[DISK]/data/dataNode3,[ZFS_RECON_BUFFER]/data/recon_buffer3</value>
   </property>
   <property>
     <name>dfs.namenode.heartbeat.recheck-interval</name>
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
index 585dcb90d21a2..e29e47d18bb18 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
@@ -2187,7 +2187,7 @@ int computeReconstructionWorkForBlocks(
       // Check whether rw is ZFS
       Set<StorageType> blockStorageTypes = new HashSet<>();
       rw.getBlock().getStorageInfos().forEachRemaining(info -> blockStorageTypes.add(info.getStorageType()));
-      if (blockStorageTypes.size() == 1 && blockStorageTypes.contains(StorageType.ZFS)) {
+      if (blockStorageTypes.contains(StorageType.ZFS)) {
         // The block is ZFS, we would need to restore it back to the same datanode that it is coming from
         // However, we cannot directly write, because that IO pool would be suspended, the write will fail
         // We need to instruct the writer to call a different API
@@ -2316,7 +2316,9 @@ BlockReconstructionWork scheduleReconstruction(BlockInfo block,
     if (block.isStriped()) {
       BlockInfoStriped stripedBlock = (BlockInfoStriped) block;
       if (stripedBlock.getRealDataBlockNum() > srcNodes.length) {
-        LOG.info("Block {} cannot be reconstructed due to shortage of source datanodes ", block);
+        LOG.info("Block {} cannot be reconstructed due to shortage of source datanodes, "
+            + "real data block num {}, src nodes {}", block, stripedBlock.getRealDataBlockNum(), srcNodes.length);
+        LOG.info("Source nodes {}", srcNodes);
         NameNode.getNameNodeMetrics().incNumTimesReReplicationNotScheduled();
         return null;
       }
@@ -2660,6 +2662,7 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block,
     }
 
     for (DatanodeStorageInfo storage : blocksMap.getStorages(block)) {
+      LOG.info("Checking source storage {}", storage);
       final DatanodeDescriptor node = getDatanodeDescriptorFromStorage(storage);
       final StoredReplicaState state = checkReplicaOnStorage(numReplicas, block,
           storage, corruptReplicas.getNodes(block), false);
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockStoragePolicySuite.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockStoragePolicySuite.java
index 167e95af8e645..19edf4464b7a5 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockStoragePolicySuite.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockStoragePolicySuite.java
@@ -86,7 +86,7 @@ public static BlockStoragePolicySuite createDefaultSuite(
     policies[hotId] = new BlockStoragePolicy(hotId,
         HdfsConstants.StoragePolicy.HOT.name(),
         new StorageType[]{StorageType.DISK, StorageType.ZFS},
-        new StorageType[]{StorageType.ZFS},
+        new StorageType[]{StorageType.DISK, StorageType.ZFS},
         new StorageType[]{StorageType.ARCHIVE, StorageType.ZFS});
     final byte warmId = HdfsConstants.StoragePolicy.WARM.value();
     policies[warmId] = new BlockStoragePolicy(warmId,
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPServiceActor.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPServiceActor.java
index 6e8c35d042772..e321e26200647 100755
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPServiceActor.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPServiceActor.java
@@ -558,55 +558,60 @@ HeartbeatResponse sendHeartBeat(boolean requestBlockReportLease)
     LOG.info("**********Heartbeating************");
     // If there is a repair on-going, we do not call failed chunks
     // This is to work around the kernel panic on the dn->dn_holds on the ZFS side
-    if (ErasureCodingWorker.ongoingRepairs.isEmpty()) {
-      List<DnodeAttributes> dnodes = new Tools().getFailedChunks("pool");
-      // This means that there is ZFS local failure
-      for (DnodeAttributes dnode : dnodes) {
-        // Enable this during real testing
-//        if (dnode.numFailedCols() == 0) {
-//          continue;
-//        }
-
-        // Check whether this dnode is actually on this datanode instance
-        DatanodeVolumeInfo volumeInfo = this.dn.getVolumeReport().stream()
-            .filter(vi -> vi.getStorageType() == StorageType.ZFS)
-            .collect(Collectors.toList())
-            .get(0);
-
-        // 1. Check whether the dnode actually belongs to this datanode volume (for local testing env)
-        Path path = Paths.get(volumeInfo.getPath(), dnode.path);
-        if (!Files.exists(path)) {
-          continue;
-        }
+    // Loop around all the volumes on this datanode, and check for ZFS failures
+    for (DatanodeVolumeInfo volInfo : dn.getVolumeReport()) {
+      if (volInfo.getStorageType() == StorageType.ZFS) {
+        if (ErasureCodingWorker.ongoingRepairs.isEmpty()) {
+          List<DnodeAttributes> dnodes = new Tools().getFailedChunks("pool");
+          // This means that there is ZFS local failure
+          for (DnodeAttributes dnode : dnodes) {
+            // Enable this during real testing
+            // if (dnode.numFailedCols() == 0) {
+            //   continue;
+            // }
+
+            // Check whether this dnode is actually on this datanode instance
+            DatanodeVolumeInfo volumeInfo = this.dn.getVolumeReport().stream()
+                .filter(vi -> vi.getStorageType() == StorageType.ZFS)
+                .collect(Collectors.toList())
+                .get(0);
+
+            // 1. Check whether the dnode actually belongs to this datanode volume (for local testing env)
+            Path path = Paths.get(volumeInfo.getPath(), dnode.path);
+            if (!Files.exists(path)) {
+              continue;
+            }
 
-        // 2. Check whether this dnode is a hdfs block (rather than directory, metadata, etc)
-        Optional<Matcher> matcher = ZfsFailureTuple.isHdfsBlock(dnode);
-        if (!matcher.isPresent()) {
-          continue;
-        }
+            // 2. Check whether this dnode is a hdfs block (rather than directory, metadata, etc)
+            Optional<Matcher> matcher = ZfsFailureTuple.isHdfsBlock(dnode);
+            if (!matcher.isPresent()) {
+              continue;
+            }
 
-        // This means that this is a block file, not directory, not anything else
-        // Get the block from the file name
-        long hdfsBlockId = Long.parseLong(matcher.get().group(1));
+            // This means that this is a block file, not directory, not anything else
+            // Get the block from the file name
+            long hdfsBlockId = Long.parseLong(matcher.get().group(1));
 
-        // If this is already a known issue, we ignore
-        if (this.mlecDnMgmt.knownFailures.containsKey(hdfsBlockId)) {
-          continue;
-        }
+            // If this is already a known issue, we ignore
+            if (this.mlecDnMgmt.knownFailures.containsKey(hdfsBlockId)) {
+              continue;
+            }
 
-        LOG.info("Failed hdfs block on {}:{} {} corresponding to zfs dn {}", dn.getDatanodeHostname(), dn.getHttpPort(),
-            hdfsBlockId, dnode);
+            LOG.info("Failed hdfs block on {}:{} {} corresponding to zfs dn {}", dn.getDatanodeHostname(), dn.getHttpPort(),
+                hdfsBlockId, dnode);
 
-        // MLEC: testing purpose
-        dnode.childStatus.set(0, 1);
+            // MLEC: testing purpose
+            dnode.childStatus.set(0, 1);
 
-        ZfsFailureTuple failureTuple = new ZfsFailureTuple(hdfsBlockId, dnode.childStatus);
-        zfsReport.getFailedHdfsBlocks().add(failureTuple);
+            ZfsFailureTuple failureTuple = new ZfsFailureTuple(hdfsBlockId, dnode.childStatus);
+            zfsReport.getFailedHdfsBlocks().add(failureTuple);
 
-        this.mlecDnMgmt.knownFailures.put(hdfsBlockId, failureTuple);
-      }
-    } else {
-      LOG.warn("REPAIR ON-GOING, skipping");
-    }
+            this.mlecDnMgmt.knownFailures.put(hdfsBlockId, failureTuple);
+          }
+        } else {
+          LOG.warn("REPAIR ON-GOING, skipping");
+        }
+      }
+    }
 
     HeartbeatResponse response = bpNamenode.sendHeartbeat(bpRegistration,
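A note on the data-dir entries above: the [TYPE]/path prefix is the stock HDFS convention for tagging each entry of dfs.datanode.data.dir with a storage type, and this change adds a second, ZFS_RECON_BUFFER-tagged directory next to each existing ZFS/DISK volume. Below is a minimal sketch of that parsing convention, assuming the usual bracket-prefix format; it is illustrative, not the actual HDFS StorageLocation code.

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class DataDirEntry {
  // "[TYPE]/path" -> group 1 is the storage-type tag, group 2 the path
  private static final Pattern PREFIX = Pattern.compile("^\\[(\\w+)\\](.+)$");

  final String storageType; // e.g. "ZFS", "ZFS_RECON_BUFFER", "DISK"
  final String path;        // e.g. "/data/dataNode1"

  DataDirEntry(String entry) {
    Matcher m = PREFIX.matcher(entry.trim());
    if (m.matches()) {
      storageType = m.group(1);
      path = m.group(2);
    } else {
      // HDFS treats untagged directories as DISK
      storageType = "DISK";
      path = entry.trim();
    }
  }

  public static void main(String[] args) {
    for (String e : "[ZFS]/data/dataNode1,[ZFS_RECON_BUFFER]/data/recon_buffer1".split(",")) {
      DataDirEntry d = new DataDirEntry(e);
      System.out.println(d.storageType + " -> " + d.path);
    }
  }
}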
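The computeReconstructionWorkForBlocks change relaxes when the ZFS reconstruction path is taken: the old condition required every replica of the block to sit on ZFS storage, while the new one fires as soon as any replica does. A self-contained illustration of the two predicates follows; the StorageType enum here is a stand-in, not Hadoop's.

import java.util.EnumSet;
import java.util.Set;

public class ZfsCheckDemo {
  enum StorageType { DISK, ZFS, ZFS_RECON_BUFFER }

  // Old check: the block's replicas live exclusively on ZFS
  static boolean oldCheck(Set<StorageType> types) {
    return types.size() == 1 && types.contains(StorageType.ZFS);
  }

  // New check: at least one replica lives on ZFS
  static boolean newCheck(Set<StorageType> types) {
    return types.contains(StorageType.ZFS);
  }

  public static void main(String[] args) {
    Set<StorageType> mixed = EnumSet.of(StorageType.DISK, StorageType.ZFS);
    // A block with replicas on both DISK and ZFS is now routed through
    // the ZFS-specific reconstruction path, which the old check skipped.
    System.out.println("old: " + oldCheck(mixed)); // false
    System.out.println("new: " + newCheck(mixed)); // true
  }
}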
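For the BlockStoragePolicySuite change, the three arrays passed to the BlockStoragePolicy constructor are the preferred storage types, the creation fallbacks, and the replication fallbacks. Adding DISK to the HOT policy's creation fallbacks means a new replica whose preferred ZFS target is unavailable can fall back to a DISK volume instead of failing placement. A simplified model of that fallback lookup (not the real BlockStoragePolicy.chooseStorageTypes logic):

import java.util.EnumSet;
import java.util.Set;

public class HotPolicyDemo {
  enum StorageType { DISK, ZFS, ARCHIVE }

  // HDFS repeats the last listed type when there are more replicas
  // than entries in the policy's preferred list.
  static StorageType typeForReplica(int i, StorageType[] preferred) {
    return preferred[Math.min(i, preferred.length - 1)];
  }

  static StorageType choose(int replicaIdx, Set<StorageType> usable,
                            StorageType[] preferred, StorageType[] creationFallbacks) {
    StorageType want = typeForReplica(replicaIdx, preferred);
    if (usable.contains(want)) {
      return want;
    }
    for (StorageType t : creationFallbacks) {
      if (usable.contains(t)) {
        return t;
      }
    }
    return null; // placement fails
  }

  public static void main(String[] args) {
    StorageType[] preferred = {StorageType.DISK, StorageType.ZFS};
    StorageType[] oldFallbacks = {StorageType.ZFS};
    StorageType[] newFallbacks = {StorageType.DISK, StorageType.ZFS};

    // Second replica wants ZFS, but only DISK volumes can accept writes
    // (e.g. the ZFS pool is suspended during reconstruction).
    Set<StorageType> usable = EnumSet.of(StorageType.DISK);
    System.out.println(choose(1, usable, preferred, oldFallbacks)); // null
    System.out.println(choose(1, usable, preferred, newFallbacks)); // DISK
  }
}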
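ZfsFailureTuple.isHdfsBlock is introduced elsewhere in this patch set and not shown in the diff, but its call site implies it matches a dnode path against the HDFS block-file naming convention (blk_<blockId>) and exposes the id as capture group 1, which the heartbeat code then feeds to Long.parseLong(matcher.get().group(1)). A hypothetical stand-in with that shape:

import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class BlockNameProbe {
  // HDFS block files are named "blk_<blockId>"; meta files carry a
  // "_<genstamp>.meta" suffix and are rejected by the trailing anchor.
  private static final Pattern BLOCK_FILE = Pattern.compile("blk_(\\d+)$");

  static Optional<Matcher> isHdfsBlock(String dnodePath) {
    Matcher m = BLOCK_FILE.matcher(dnodePath);
    return m.find() ? Optional.of(m) : Optional.empty();
  }

  public static void main(String[] args) {
    isHdfsBlock("/data/dataNode1/current/BP-1/current/finalized/subdir0/blk_1073741825")
        .ifPresent(m -> System.out.println("block id " + Long.parseLong(m.group(1))));
    System.out.println(isHdfsBlock("/data/dataNode1/current/VERSION").isPresent()); // false
  }
}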