Skip to content

Commit

Permalink
Fix bug with min/max chunk size, improve PatchSummary experience
Browse files Browse the repository at this point in the history
  • Loading branch information
Daniel Tischner committed Aug 11, 2020
1 parent 68c4099 commit 49167a1
Show file tree
Hide file tree
Showing 2 changed files with 142 additions and 74 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,15 @@
import java.io.UncheckedIOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.Duration;
import java.util.*;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Consumer;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
Expand Down Expand Up @@ -63,16 +65,16 @@ public static void main(final String[] args) {
.setHashTableOption(HashTableOption.NLFIEDLER_RUST)
.setExpectedChunkSize(1024 * 1024)
.build());
descriptionToChunker.put("FastCDC 1MB NlFiedlerRust",
new ChunkerBuilder().setChunkerOption(ChunkerOption.FAST_CDC)
.setHashTableOption(HashTableOption.NLFIEDLER_RUST)
.setExpectedChunkSize(1024 * 1024)
.build());
descriptionToChunker.put("NlFiedlerRust 1MB RTPal",
new ChunkerBuilder().setChunkerOption(ChunkerOption.NLFIEDLER_RUST)
.setHashTableOption(HashTableOption.RTPAL)
.setExpectedChunkSize(1024 * 1024)
.build());
// descriptionToChunker.put("FastCDC 1MB NlFiedlerRust",
// new ChunkerBuilder().setChunkerOption(ChunkerOption.FAST_CDC)
// .setHashTableOption(HashTableOption.NLFIEDLER_RUST)
// .setExpectedChunkSize(1024 * 1024)
// .build());
// descriptionToChunker.put("NlFiedlerRust 1MB RTPal",
// new ChunkerBuilder().setChunkerOption(ChunkerOption.NLFIEDLER_RUST)
// .setHashTableOption(HashTableOption.RTPAL)
// .setExpectedChunkSize(1024 * 1024)
// .build());

System.out.printf("Summary for patching from previous (%s) to current (%s):%n", previousBuild, currentBuild);
System.out.println();
Expand All @@ -81,30 +83,68 @@ public static void main(final String[] args) {
currentBuild));
}

private static void chunkPath(final Chunker chunker, final Path path, final Consumer<Chunk> chunkAction) {
AtomicLong processedBytesTotal = new AtomicLong(0);
AtomicLong processedBytesSincePrint = new AtomicLong(0);
AtomicLong timeStart = new AtomicLong(System.nanoTime());
ScheduledExecutorService service = Executors.newSingleThreadScheduledExecutor();
final long nanosPerSecond = 1_000_000_000L;
Runnable statPrinter = () -> {
AtomicLong timeEnd = new AtomicLong(System.nanoTime());
long timeDiff = timeEnd.get() - timeStart.get();
if (timeDiff < nanosPerSecond) {
return;
}
timeStart.set(timeEnd.get());
long bytesPerSecond = processedBytesSincePrint.get() / (timeDiff / nanosPerSecond);

System.out.printf("\t%12d b/s, %12d total%n", bytesPerSecond, processedBytesTotal.get());

processedBytesSincePrint.set(0);
};
var statPrintTask = service.scheduleAtFixedRate(statPrinter, 0, 1, TimeUnit.SECONDS);
/**
 * Formats a byte count as a human-readable string using decimal (SI) units,
 * e.g. {@code "1.50 MB"}.
 *
 * @param bytes The amount of bytes to format, interpreted as a non-negative count
 *
 * @return A human-readable representation of the given byte count
 */
private static String bytesToReadable(long bytes) {
    if (bytes < 1_000) {
        return bytes + " B";
    }

    double kiloBytes = bytes / 1_000.0;
    if (kiloBytes < 1_000) {
        return String.format("%.2f", kiloBytes) + " KB";
    }

    double megaBytes = kiloBytes / 1_000.0;
    if (megaBytes < 1_000) {
        return String.format("%.2f", megaBytes) + " MB";
    }

    double gigaBytes = megaBytes / 1_000.0;
    if (gigaBytes < 1_000) {
        return String.format("%.2f", gigaBytes) + " GB";
    }

    // Bugfix: previously values of 1 TB and above fell through and returned
    // the empty string, silently losing the figure. TB is the final unit,
    // so no further range check is needed.
    double teraBytes = gigaBytes / 1_000.0;
    return String.format("%.2f", teraBytes) + " TB";
}

private static void chunkPath(final Chunker chunker, final Path path, final Consumer<Chunk> chunkAction) {
try {
Files.walk(path)
.parallel()
List<Path> files = Files.walk(path)
.filter(Files::isRegularFile)
.collect(Collectors.toList());

long totalBytes = files.stream()
.mapToLong(file -> {
try {
return Files.size(file);
} catch (IOException e) {
throw new UncheckedIOException(e);
}
})
.sum();
AtomicLong processedBytesTotal = new AtomicLong(0);
AtomicLong processedBytesSincePrint = new AtomicLong(0);
AtomicLong timeStart = new AtomicLong(System.nanoTime());
ScheduledExecutorService service = Executors.newSingleThreadScheduledExecutor();
final long nanosPerSecond = 1_000_000_000L;
Runnable statPrinter = () -> {
AtomicLong timeEnd = new AtomicLong(System.nanoTime());
long timeDiff = timeEnd.get() - timeStart.get();
if (timeDiff < nanosPerSecond) {
return;
}
timeStart.set(timeEnd.get());
long bytesPerSecond = processedBytesSincePrint.get() / (timeDiff / nanosPerSecond);
long bytesLeft = totalBytes - processedBytesTotal.get();
long secondsLeft = bytesLeft / (bytesPerSecond == 0 ? 1 : bytesPerSecond);

System.out.printf("\t%12s/s, %12s ETC, %12s processed, %12s total\r", bytesToReadable(bytesPerSecond),
secondsToReadable(secondsLeft), bytesToReadable(processedBytesTotal.get()),
bytesToReadable(totalBytes));

processedBytesSincePrint.set(0);
};
var statPrintTask = service.scheduleAtFixedRate(statPrinter, 0, 1, TimeUnit.SECONDS);

files.parallelStream()
.filter(Files::isRegularFile)
.map(chunker::chunk)
.forEach(chunks -> chunks.forEach(chunk -> {
Expand All @@ -113,11 +153,11 @@ private static void chunkPath(final Chunker chunker, final Path path, final Cons

chunkAction.accept(chunk);
}));
statPrintTask.cancel(false);
service.shutdown();
} catch (IOException e) {
throw new UncheckedIOException(e);
}

statPrintTask.cancel(false);
}

private static void executePatchSummary(final String description, final Chunker chunker, final Path previousBuild,
Expand All @@ -132,18 +172,21 @@ private static void executePatchSummary(final String description, final Chunker

final PatchSummary summary = new PatchSummary(previousBuildSummary, currentBuildSummary);
System.out.println("==== " + description);
System.out.printf("%-25s %12d total size, %12d total chunks, %12d unique size, %12d unique chunks%n",
"Build summary previous:", previousBuildSummary.getTotalSize(),
previousBuildSummary.getTotalChunksCount(), previousBuildSummary.getTotalUniqueSize(),
System.out.printf("%-25s %12s total size, %12d total chunks, %12s unique size, %12d unique chunks%n",
"Build summary previous:", bytesToReadable(previousBuildSummary.getTotalSize()),
previousBuildSummary.getTotalChunksCount(), bytesToReadable(previousBuildSummary.getTotalUniqueSize()),
previousBuildSummary.getUniqueChunksCount());
System.out.printf("%-25s %12d total size, %12d total chunks, %12d unique size, %12d unique chunks%n",
"Build summary current:", currentBuildSummary.getTotalSize(), currentBuildSummary.getTotalChunksCount(),
currentBuildSummary.getTotalUniqueSize(), currentBuildSummary.getUniqueChunksCount());
System.out.printf("%-25s %12d average chunk size, %12.2f%% deduplication ratio%n", "Build metrics previous:",
previousBuildSummary.getAverageChunkSize(), previousBuildSummary.getDeduplicationRatio());
System.out.printf("%-25s %12d average chunk size, %12.2f%% deduplication ratio%n", "Build metrics current:",
currentBuildSummary.getAverageChunkSize(), currentBuildSummary.getDeduplicationRatio());
System.out.printf("%-25s %12d%n", "Patch size:", summary.getPatchSize());
System.out.printf("%-25s %12s total size, %12d total chunks, %12s unique size, %12d unique chunks%n",
"Build summary current:", bytesToReadable(currentBuildSummary.getTotalSize()),
currentBuildSummary.getTotalChunksCount(), bytesToReadable(currentBuildSummary.getTotalUniqueSize()),
currentBuildSummary.getUniqueChunksCount());
System.out.printf("%-25s %12s average chunk size, %12.2f%% deduplication ratio%n", "Build metrics previous:",
bytesToReadable(previousBuildSummary.getAverageChunkSize()),
previousBuildSummary.getDeduplicationRatio());
System.out.printf("%-25s %12s average chunk size, %12.2f%% deduplication ratio%n", "Build metrics current:",
bytesToReadable(currentBuildSummary.getAverageChunkSize()),
currentBuildSummary.getDeduplicationRatio());
System.out.printf("%-25s %12s%n", "Patch size:", bytesToReadable(summary.getPatchSize()));
System.out.printf("%-25s %12d%n", "Chunks to add:", summary.getChunksToAdd()
.size());
System.out.printf("%-25s %12d%n", "Chunks to remove:", summary.getChunksToRemove()
Expand All @@ -155,6 +198,36 @@ private static void executePatchSummary(final String description, final Chunker
System.out.println();
}

/**
 * Formats an amount of seconds as a human-readable duration string such as
 * {@code "1d 2h 3m 4s"}, omitting leading units that are zero.
 *
 * @param seconds The amount of seconds to format
 *
 * @return A human-readable representation of the given duration
 */
private static String secondsToReadable(long seconds) {
    Duration duration = Duration.ofSeconds(seconds);
    long dayCount = duration.toDays();
    int hourCount = duration.toHoursPart();
    int minuteCount = duration.toMinutesPart();

    StringBuilder result = new StringBuilder();
    if (dayCount != 0) {
        result.append(dayCount)
                .append("d ");
    }
    // Once a larger unit has been printed, smaller units are always shown,
    // even when zero, so the output stays unambiguous (e.g. "1d 0h 1m 0s").
    if (hourCount != 0 || result.length() > 0) {
        result.append(hourCount)
                .append("h ");
    }
    if (minuteCount != 0 || result.length() > 0) {
        result.append(minuteCount)
                .append("m ");
    }
    // Seconds are always part of the output.
    result.append(duration.toSecondsPart())
            .append("s");
    return result.toString();
}

private final List<ChunkMetadata> chunksToAdd = new ArrayList<>();
private final List<ChunkMetadata> chunksToMove = new ArrayList<>();
private final List<ChunkMetadata> chunksToRemove = new ArrayList<>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
* {@link #setChunkerOption(ChunkerOption)} can be used to choose from the predefined algorithms.
* <p>
* The algorithms will try to strive for an expected chunk size given by {@link #setExpectedChunkSize(int)},
* a minimal chunk size given by {@link #setMinimalChunkSize(int)} and a maximal chunk size given by {@link #setMaximalChunkSize(int)}.
* a minimal chunk size given by {@link #setMinimalChunkSizeFactor(double)} and a maximal chunk size given by {@link #setMaximalChunkSizeFactor(double)}.
* <p>
* Most of the algorithms internally use a hash table as source for predicted noise to steer the algorithm, a custom
* table can be provided by {@link #setHashTable(long[])}.
Expand All @@ -44,8 +44,8 @@
* <ul>
* <li>Chunker option: {@link ChunkerOption#FAST_CDC}</li>
* <li>Expected size: {@code 8 * 1024}</li>
* <li>Minimal size: {@code 2 * 1024}</li>
* <li>Maximal size: {@code 64 * 1024}</li>
* <li>Minimal size factor: {@code 0.25}</li>
* <li>Maximal size factor: {@code 8}</li>
* <li>Hash table option: {@link HashTableOption#RTPAL}</li>
* <li>Mask generation seed: {@code 941568351}</li>
* <li>Mask option: {@link MaskOption#FAST_CDC}</li>
Expand Down Expand Up @@ -128,15 +128,13 @@ public final class ChunkerBuilder {
*/
private Long maskSmall;
/**
* The expected size of chunks, in bytes.
* The factor to apply to the expected chunk size to receive the maximal chunk size.
*/
private int maximalChunkSize =
(int) (ChunkerBuilder.DEFAULT_EXPECTED_CHUNK_SIZE * ChunkerBuilder.DEFAULT_MAX_SIZE_FACTOR);
private double maximalChunkSizeFactor = ChunkerBuilder.DEFAULT_MAX_SIZE_FACTOR;
/**
* The expected size of chunks, in bytes.
* The factor to apply to the expected chunk size to receive the minimal chunk size.
*/
private int minimalChunkSize =
(int) (ChunkerBuilder.DEFAULT_EXPECTED_CHUNK_SIZE * ChunkerBuilder.DEFAULT_MIN_SIZE_FACTOR);
private double minimalChunkSizeFactor = ChunkerBuilder.DEFAULT_MIN_SIZE_FACTOR;
/**
* The normalization level to use for choosing the masks in certain chunkers.
*/
Expand Down Expand Up @@ -164,6 +162,9 @@ public Chunker build() {
long maskSmallToUse = maskSmall != null ? maskSmall : maskGenerator.generateSmallMask();
long maskLargeToUse = maskLarge != null ? maskLarge : maskGenerator.generateLargeMask();

int minimalChunkSize = (int) (expectedChunkSize * minimalChunkSizeFactor);
int maximalChunkSize = (int) (expectedChunkSize * maximalChunkSizeFactor);

final IterativeStreamChunkerCore coreToUse = chunkerCore != null ? chunkerCore : switch (chunkerOption) {
case FAST_CDC -> new FastCdcChunkerCore(expectedChunkSize, minimalChunkSize, maximalChunkSize,
hashTableToUse, maskSmallToUse, maskLargeToUse);
Expand All @@ -183,8 +184,8 @@ public ChunkerBuilder fastCdc() {
chunkerOption = ChunkerOption.FAST_CDC;
hashMethod = ChunkerBuilder.DEFAULT_HASH_METHOD;
expectedChunkSize = ChunkerBuilder.DEFAULT_EXPECTED_CHUNK_SIZE;
minimalChunkSize = (int) (ChunkerBuilder.DEFAULT_EXPECTED_CHUNK_SIZE * ChunkerBuilder.DEFAULT_MIN_SIZE_FACTOR);
maximalChunkSize = (int) (ChunkerBuilder.DEFAULT_EXPECTED_CHUNK_SIZE * ChunkerBuilder.DEFAULT_MAX_SIZE_FACTOR);
minimalChunkSizeFactor = ChunkerBuilder.DEFAULT_MIN_SIZE_FACTOR;
maximalChunkSizeFactor = ChunkerBuilder.DEFAULT_MAX_SIZE_FACTOR;
hashTableOption = HashTableOption.RTPAL;
normalizationLevel = 2;
maskOption = MaskOption.FAST_CDC;
Expand All @@ -201,8 +202,8 @@ public ChunkerBuilder fsc() {
chunkerOption = ChunkerOption.FIXED_SIZE_CHUNKING;
hashMethod = ChunkerBuilder.DEFAULT_HASH_METHOD;
expectedChunkSize = ChunkerBuilder.DEFAULT_EXPECTED_CHUNK_SIZE;
minimalChunkSize = (int) (ChunkerBuilder.DEFAULT_EXPECTED_CHUNK_SIZE * ChunkerBuilder.DEFAULT_MIN_SIZE_FACTOR);
maximalChunkSize = (int) (ChunkerBuilder.DEFAULT_EXPECTED_CHUNK_SIZE * ChunkerBuilder.DEFAULT_MAX_SIZE_FACTOR);
minimalChunkSizeFactor = ChunkerBuilder.DEFAULT_MIN_SIZE_FACTOR;
maximalChunkSizeFactor = ChunkerBuilder.DEFAULT_MAX_SIZE_FACTOR;
return this;
}

Expand All @@ -215,8 +216,8 @@ public ChunkerBuilder nlFiedlerRust() {
chunkerOption = ChunkerOption.NLFIEDLER_RUST;
hashMethod = ChunkerBuilder.DEFAULT_HASH_METHOD;
expectedChunkSize = ChunkerBuilder.DEFAULT_EXPECTED_CHUNK_SIZE;
minimalChunkSize = (int) (ChunkerBuilder.DEFAULT_EXPECTED_CHUNK_SIZE * ChunkerBuilder.DEFAULT_MIN_SIZE_FACTOR);
maximalChunkSize = (int) (ChunkerBuilder.DEFAULT_EXPECTED_CHUNK_SIZE * ChunkerBuilder.DEFAULT_MAX_SIZE_FACTOR);
minimalChunkSizeFactor = ChunkerBuilder.DEFAULT_MIN_SIZE_FACTOR;
maximalChunkSizeFactor = ChunkerBuilder.DEFAULT_MAX_SIZE_FACTOR;
hashTableOption = HashTableOption.NLFIEDLER_RUST;
normalizationLevel = 1;
maskOption = MaskOption.NLFIEDLER_RUST;
Expand Down Expand Up @@ -373,32 +374,26 @@ public ChunkerBuilder setMaskSmall(final long maskSmall) {
}

/**
* Sets the maximal size of chunks, in bytes.
* Sets the factor to apply to the expected chunk size to receive the maximal chunk size.
*
* @param maximalChunkSize The maximal size of chunks, in bytes. Must be positive.
* @param maximalChunkSizeFactor The factor to apply
*
* @return This builder instance
*/
public ChunkerBuilder setMaximalChunkSize(final int maximalChunkSize) {
if (maximalChunkSize < 0) {
throw new IllegalArgumentException("Maximal chunk size must be positive, was: " + maximalChunkSize);
}
this.maximalChunkSize = maximalChunkSize;
/**
 * Sets the factor to apply to the expected chunk size to receive the maximal chunk size.
 *
 * @param maximalChunkSizeFactor The factor to apply. Must not be negative.
 *
 * @return This builder instance
 *
 * @throws IllegalArgumentException If the given factor is negative
 */
public ChunkerBuilder setMaximalChunkSizeFactor(final double maximalChunkSizeFactor) {
    // Bugfix: the previous byte-based setter rejected negative values; the
    // factor-based replacement dropped that validation, silently allowing a
    // negative maximal chunk size. Restore the argument check.
    if (maximalChunkSizeFactor < 0) {
        throw new IllegalArgumentException(
            "Maximal chunk size factor must be positive, was: " + maximalChunkSizeFactor);
    }
    this.maximalChunkSizeFactor = maximalChunkSizeFactor;
    return this;
}

/**
* Sets the minimal size of chunks, in bytes.
* Sets the factor to apply to the expected chunk size to receive the minimal chunk size.
*
* @param minimalChunkSize The minimal size of chunks, in bytes. Must be positive.
* @param minimalChunkSizeFactor The factor to apply
*
* @return This builder instance
*/
public ChunkerBuilder setMinimalChunkSize(final int minimalChunkSize) {
if (minimalChunkSize < 0) {
throw new IllegalArgumentException("Minimal chunk size must be positive, was: " + minimalChunkSize);
}
this.minimalChunkSize = minimalChunkSize;
/**
 * Sets the factor to apply to the expected chunk size to receive the minimal chunk size.
 *
 * @param minimalChunkSizeFactor The factor to apply. Must not be negative.
 *
 * @return This builder instance
 *
 * @throws IllegalArgumentException If the given factor is negative
 */
public ChunkerBuilder setMinimalChunkSizeFactor(final double minimalChunkSizeFactor) {
    // Bugfix: the previous byte-based setter rejected negative values; the
    // factor-based replacement dropped that validation, silently allowing a
    // negative minimal chunk size. Restore the argument check.
    if (minimalChunkSizeFactor < 0) {
        throw new IllegalArgumentException(
            "Minimal chunk size factor must be positive, was: " + minimalChunkSizeFactor);
    }
    this.minimalChunkSizeFactor = minimalChunkSizeFactor;
    return this;
}

Expand Down

0 comments on commit 49167a1

Please sign in to comment.