From a405de3fa98a5355c2608ee466f1d5f01d89765c Mon Sep 17 00:00:00 2001 From: Quentin Kaiser Date: Fri, 15 Dec 2023 19:56:32 +0100 Subject: [PATCH] feat(handler): add multi-part gzip handler. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's possible to create multi-part gzip with 'split', which will create multiple gzip compressed files with an 'aa', 'ab', 'ac', .. suffix. We match on '.gz.aa' in a directory, get all the files with the same name but different suffix, order them and feed them to 7z. This is very close to what we were already doing with multi-part 7zip archives. Co-authored-by: Krisztián Fekete <1246751+e3krisztian@users.noreply.github.com> --- .../__input__/multi-volume-digit-hash.tar | 3 + .../gzip/__input__/multi-volume-digit.tar | 3 + .../multi-volume-split-then-gzip.tar | 3 + .../gzip/__input__/multi-volume.tar | 3 + .../multi-part-file.gz.01 | 3 + .../multi-part-file.gz.02 | 3 + .../multi-part-file.gz.md5 | 3 + .../multi-part-file.gz_extract/one.txt | 3 + .../multi-part-file.gz.01 | 3 + .../multi-part-file.gz.02 | 3 + .../multi-part-file.gz_extract/one.txt | 3 + .../one.txt.gz.aa | 3 + .../one.txt.gz.aa_extract/one.txt.aa | 3 + .../one.txt.gz.ab | 3 + .../one.txt.gz.ab_extract/one.txt.ab | 3 + .../one.txt.gz.ac | 3 + .../one.txt.gz.ac_extract/one.txt.ac | 3 + .../multi-part-file.gz.aa | 3 + .../multi-part-file.gz.ab | 3 + .../multi-part-file.gz_extract/one.txt | 3 + unblob/handlers/__init__.py | 5 +- unblob/handlers/compression/gzip.py | 71 ++++++++++++++++++- 22 files changed, 132 insertions(+), 4 deletions(-) create mode 100644 tests/integration/compression/gzip/__input__/multi-volume-digit-hash.tar create mode 100644 tests/integration/compression/gzip/__input__/multi-volume-digit.tar create mode 100644 tests/integration/compression/gzip/__input__/multi-volume-split-then-gzip.tar create mode 100644 tests/integration/compression/gzip/__input__/multi-volume.tar create mode 100644 
tests/integration/compression/gzip/__output__/multi-volume-digit-hash.tar_extract/multi-part-file.gz.01 create mode 100644 tests/integration/compression/gzip/__output__/multi-volume-digit-hash.tar_extract/multi-part-file.gz.02 create mode 100644 tests/integration/compression/gzip/__output__/multi-volume-digit-hash.tar_extract/multi-part-file.gz.md5 create mode 100644 tests/integration/compression/gzip/__output__/multi-volume-digit-hash.tar_extract/multi-part-file.gz_extract/one.txt create mode 100644 tests/integration/compression/gzip/__output__/multi-volume-digit.tar_extract/multi-part-file.gz.01 create mode 100644 tests/integration/compression/gzip/__output__/multi-volume-digit.tar_extract/multi-part-file.gz.02 create mode 100644 tests/integration/compression/gzip/__output__/multi-volume-digit.tar_extract/multi-part-file.gz_extract/one.txt create mode 100644 tests/integration/compression/gzip/__output__/multi-volume-split-then-gzip.tar_extract/one.txt.gz.aa create mode 100644 tests/integration/compression/gzip/__output__/multi-volume-split-then-gzip.tar_extract/one.txt.gz.aa_extract/one.txt.aa create mode 100644 tests/integration/compression/gzip/__output__/multi-volume-split-then-gzip.tar_extract/one.txt.gz.ab create mode 100644 tests/integration/compression/gzip/__output__/multi-volume-split-then-gzip.tar_extract/one.txt.gz.ab_extract/one.txt.ab create mode 100644 tests/integration/compression/gzip/__output__/multi-volume-split-then-gzip.tar_extract/one.txt.gz.ac create mode 100644 tests/integration/compression/gzip/__output__/multi-volume-split-then-gzip.tar_extract/one.txt.gz.ac_extract/one.txt.ac create mode 100644 tests/integration/compression/gzip/__output__/multi-volume.tar_extract/multi-part-file.gz.aa create mode 100644 tests/integration/compression/gzip/__output__/multi-volume.tar_extract/multi-part-file.gz.ab create mode 100644 tests/integration/compression/gzip/__output__/multi-volume.tar_extract/multi-part-file.gz_extract/one.txt diff --git 
a/tests/integration/compression/gzip/__input__/multi-volume-digit-hash.tar b/tests/integration/compression/gzip/__input__/multi-volume-digit-hash.tar new file mode 100644 index 0000000000..7c42d73af9 --- /dev/null +++ b/tests/integration/compression/gzip/__input__/multi-volume-digit-hash.tar @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a10c7419ea665deb88d4f7bd0cc7685e36e071562d8ce047feb91a9d993c9c87 +size 10240 diff --git a/tests/integration/compression/gzip/__input__/multi-volume-digit.tar b/tests/integration/compression/gzip/__input__/multi-volume-digit.tar new file mode 100644 index 0000000000..8e06622d05 --- /dev/null +++ b/tests/integration/compression/gzip/__input__/multi-volume-digit.tar @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5b916eb8cddce9d9f8e9b8ac9d3ea48b4fd73249dc86bf71ea59f83f6dda931 +size 10240 diff --git a/tests/integration/compression/gzip/__input__/multi-volume-split-then-gzip.tar b/tests/integration/compression/gzip/__input__/multi-volume-split-then-gzip.tar new file mode 100644 index 0000000000..9aa07bf7fd --- /dev/null +++ b/tests/integration/compression/gzip/__input__/multi-volume-split-then-gzip.tar @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85481ea8c4c48236a25304d337d0b3f10900aac26194d15bba913bec90d18c3a +size 10240 diff --git a/tests/integration/compression/gzip/__input__/multi-volume.tar b/tests/integration/compression/gzip/__input__/multi-volume.tar new file mode 100644 index 0000000000..2f252aa65f --- /dev/null +++ b/tests/integration/compression/gzip/__input__/multi-volume.tar @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d605c55b80ae5c57f6a8268d17a1d41d816dfeda7829ba8a488d62199e7c891 +size 10240 diff --git a/tests/integration/compression/gzip/__output__/multi-volume-digit-hash.tar_extract/multi-part-file.gz.01 b/tests/integration/compression/gzip/__output__/multi-volume-digit-hash.tar_extract/multi-part-file.gz.01 new file 
mode 100644 index 0000000000..e94a5d1aab --- /dev/null +++ b/tests/integration/compression/gzip/__output__/multi-volume-digit-hash.tar_extract/multi-part-file.gz.01 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:727525bd9f74dfc7046a7767011b2952bee7031638b1d2fa01c7830beec8f200 +size 50 diff --git a/tests/integration/compression/gzip/__output__/multi-volume-digit-hash.tar_extract/multi-part-file.gz.02 b/tests/integration/compression/gzip/__output__/multi-volume-digit-hash.tar_extract/multi-part-file.gz.02 new file mode 100644 index 0000000000..c1dbca68e6 --- /dev/null +++ b/tests/integration/compression/gzip/__output__/multi-volume-digit-hash.tar_extract/multi-part-file.gz.02 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bec36068029bb2e2158c0030e062b5350ea23e9acd8ae3f45256057b481401b +size 10 diff --git a/tests/integration/compression/gzip/__output__/multi-volume-digit-hash.tar_extract/multi-part-file.gz.md5 b/tests/integration/compression/gzip/__output__/multi-volume-digit-hash.tar_extract/multi-part-file.gz.md5 new file mode 100644 index 0000000000..b6b445168d --- /dev/null +++ b/tests/integration/compression/gzip/__output__/multi-volume-digit-hash.tar_extract/multi-part-file.gz.md5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c235d418bc607a12389cb89381279d33e56126847512cc05611333d653464345 +size 33 diff --git a/tests/integration/compression/gzip/__output__/multi-volume-digit-hash.tar_extract/multi-part-file.gz_extract/one.txt b/tests/integration/compression/gzip/__output__/multi-volume-digit-hash.tar_extract/multi-part-file.gz_extract/one.txt new file mode 100644 index 0000000000..7d58ca049a --- /dev/null +++ b/tests/integration/compression/gzip/__output__/multi-volume-digit-hash.tar_extract/multi-part-file.gz_extract/one.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76ff0f745425182c35616510e1ed2781339350102bcd1f0167842248e0649400 +size 47 diff --git 
a/tests/integration/compression/gzip/__output__/multi-volume-digit.tar_extract/multi-part-file.gz.01 b/tests/integration/compression/gzip/__output__/multi-volume-digit.tar_extract/multi-part-file.gz.01 new file mode 100644 index 0000000000..e94a5d1aab --- /dev/null +++ b/tests/integration/compression/gzip/__output__/multi-volume-digit.tar_extract/multi-part-file.gz.01 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:727525bd9f74dfc7046a7767011b2952bee7031638b1d2fa01c7830beec8f200 +size 50 diff --git a/tests/integration/compression/gzip/__output__/multi-volume-digit.tar_extract/multi-part-file.gz.02 b/tests/integration/compression/gzip/__output__/multi-volume-digit.tar_extract/multi-part-file.gz.02 new file mode 100644 index 0000000000..c1dbca68e6 --- /dev/null +++ b/tests/integration/compression/gzip/__output__/multi-volume-digit.tar_extract/multi-part-file.gz.02 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bec36068029bb2e2158c0030e062b5350ea23e9acd8ae3f45256057b481401b +size 10 diff --git a/tests/integration/compression/gzip/__output__/multi-volume-digit.tar_extract/multi-part-file.gz_extract/one.txt b/tests/integration/compression/gzip/__output__/multi-volume-digit.tar_extract/multi-part-file.gz_extract/one.txt new file mode 100644 index 0000000000..7d58ca049a --- /dev/null +++ b/tests/integration/compression/gzip/__output__/multi-volume-digit.tar_extract/multi-part-file.gz_extract/one.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76ff0f745425182c35616510e1ed2781339350102bcd1f0167842248e0649400 +size 47 diff --git a/tests/integration/compression/gzip/__output__/multi-volume-split-then-gzip.tar_extract/one.txt.gz.aa b/tests/integration/compression/gzip/__output__/multi-volume-split-then-gzip.tar_extract/one.txt.gz.aa new file mode 100644 index 0000000000..a2cb62308d --- /dev/null +++ b/tests/integration/compression/gzip/__output__/multi-volume-split-then-gzip.tar_extract/one.txt.gz.aa 
@@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7decfa608a7decfc2ab1600ee52707caff556bd911dd6d8613baa43ac38b618 +size 49 diff --git a/tests/integration/compression/gzip/__output__/multi-volume-split-then-gzip.tar_extract/one.txt.gz.aa_extract/one.txt.aa b/tests/integration/compression/gzip/__output__/multi-volume-split-then-gzip.tar_extract/one.txt.gz.aa_extract/one.txt.aa new file mode 100644 index 0000000000..7d58ca049a --- /dev/null +++ b/tests/integration/compression/gzip/__output__/multi-volume-split-then-gzip.tar_extract/one.txt.gz.aa_extract/one.txt.aa @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76ff0f745425182c35616510e1ed2781339350102bcd1f0167842248e0649400 +size 47 diff --git a/tests/integration/compression/gzip/__output__/multi-volume-split-then-gzip.tar_extract/one.txt.gz.ab b/tests/integration/compression/gzip/__output__/multi-volume-split-then-gzip.tar_extract/one.txt.gz.ab new file mode 100644 index 0000000000..8bfc9e3637 --- /dev/null +++ b/tests/integration/compression/gzip/__output__/multi-volume-split-then-gzip.tar_extract/one.txt.gz.ab @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b2dadf5e5ed42b7e146a8eb24f3cba3593b579198a4d06e343c87c31454181e +size 49 diff --git a/tests/integration/compression/gzip/__output__/multi-volume-split-then-gzip.tar_extract/one.txt.gz.ab_extract/one.txt.ab b/tests/integration/compression/gzip/__output__/multi-volume-split-then-gzip.tar_extract/one.txt.gz.ab_extract/one.txt.ab new file mode 100644 index 0000000000..ae4693fb04 --- /dev/null +++ b/tests/integration/compression/gzip/__output__/multi-volume-split-then-gzip.tar_extract/one.txt.gz.ab_extract/one.txt.ab @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1aaf5e2eb21df15ec18a16bd9de00a78dfa7aa3afa9863b485928a5aadb39437 +size 20 diff --git a/tests/integration/compression/gzip/__output__/multi-volume-split-then-gzip.tar_extract/one.txt.gz.ac 
b/tests/integration/compression/gzip/__output__/multi-volume-split-then-gzip.tar_extract/one.txt.gz.ac new file mode 100644 index 0000000000..bfdbe9bff4 --- /dev/null +++ b/tests/integration/compression/gzip/__output__/multi-volume-split-then-gzip.tar_extract/one.txt.gz.ac @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97aa0d8b4b0aedef85fb1c6b902da2ea3dcf57ea94045e396b6dac68d796ffe8 +size 38 diff --git a/tests/integration/compression/gzip/__output__/multi-volume-split-then-gzip.tar_extract/one.txt.gz.ac_extract/one.txt.ac b/tests/integration/compression/gzip/__output__/multi-volume-split-then-gzip.tar_extract/one.txt.gz.ac_extract/one.txt.ac new file mode 100644 index 0000000000..96c3141afc --- /dev/null +++ b/tests/integration/compression/gzip/__output__/multi-volume-split-then-gzip.tar_extract/one.txt.gz.ac_extract/one.txt.ac @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5369cf22db555832933c052a3cb015042c939449738838f7aca9e6bde314a1fe +size 7 diff --git a/tests/integration/compression/gzip/__output__/multi-volume.tar_extract/multi-part-file.gz.aa b/tests/integration/compression/gzip/__output__/multi-volume.tar_extract/multi-part-file.gz.aa new file mode 100644 index 0000000000..e94a5d1aab --- /dev/null +++ b/tests/integration/compression/gzip/__output__/multi-volume.tar_extract/multi-part-file.gz.aa @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:727525bd9f74dfc7046a7767011b2952bee7031638b1d2fa01c7830beec8f200 +size 50 diff --git a/tests/integration/compression/gzip/__output__/multi-volume.tar_extract/multi-part-file.gz.ab b/tests/integration/compression/gzip/__output__/multi-volume.tar_extract/multi-part-file.gz.ab new file mode 100644 index 0000000000..c1dbca68e6 --- /dev/null +++ b/tests/integration/compression/gzip/__output__/multi-volume.tar_extract/multi-part-file.gz.ab @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:4bec36068029bb2e2158c0030e062b5350ea23e9acd8ae3f45256057b481401b +size 10 diff --git a/tests/integration/compression/gzip/__output__/multi-volume.tar_extract/multi-part-file.gz_extract/one.txt b/tests/integration/compression/gzip/__output__/multi-volume.tar_extract/multi-part-file.gz_extract/one.txt new file mode 100644 index 0000000000..7d58ca049a --- /dev/null +++ b/tests/integration/compression/gzip/__output__/multi-volume.tar_extract/multi-part-file.gz_extract/one.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76ff0f745425182c35616510e1ed2781339350102bcd1f0167842248e0649400 +size 47 diff --git a/unblob/handlers/__init__.py b/unblob/handlers/__init__.py index 6b60d73293..df12eb0df7 100644 --- a/unblob/handlers/__init__.py +++ b/unblob/handlers/__init__.py @@ -101,4 +101,7 @@ engenius.EngeniusHandler, ) -BUILTIN_DIR_HANDLERS: DirectoryHandlers = (sevenzip.MultiVolumeSevenZipHandler,) +BUILTIN_DIR_HANDLERS: DirectoryHandlers = ( + sevenzip.MultiVolumeSevenZipHandler, + gzip.MultiVolumeGzipHandler, +) diff --git a/unblob/handlers/compression/gzip.py b/unblob/handlers/compression/gzip.py index f0ec499bf9..06cdf6f93d 100644 --- a/unblob/handlers/compression/gzip.py +++ b/unblob/handlers/compression/gzip.py @@ -24,10 +24,21 @@ from structlog import get_logger from unblob.extractors import Command +from unblob.extractors.command import MultiFileCommand from unblob.models import Extractor from ...file_utils import InvalidInputFormat -from ...models import File, Handler, HexString, ValidChunk +from ...models import ( + DirectoryExtractor, + DirectoryHandler, + ExtractResult, + File, + Glob, + Handler, + HexString, + MultiFile, + ValidChunk, +) from ._gzip_reader import SingleMemberGzipReader logger = get_logger() @@ -71,10 +82,22 @@ class GZIPExtractor(Extractor): def get_dependencies(self) -> List[str]: return ["7z"] - def extract(self, inpath: Path, outdir: Path): + def extract(self, inpath: Path, outdir: Path) -> 
Optional[ExtractResult]: name = get_gzip_embedded_name(inpath) or "gzip.uncompressed" extractor = Command("7z", "x", "-y", "{inpath}", "-so", stdout=name) - extractor.extract(inpath, outdir) + return extractor.extract(inpath, outdir) + + +class MultiGZIPExtractor(DirectoryExtractor): + def get_dependencies(self) -> List[str]: + return ["7z"] + + def extract(self, paths: List[Path], outdir: Path) -> Optional[ExtractResult]: + name = get_gzip_embedded_name(paths[0]) or "gzip.uncompressed" + extractor = MultiFileCommand( + "7z", "x", "-p", "-y", "{inpath}", "-so", stdout=name + ) + return extractor.extract(paths, outdir) class GZIPHandler(Handler): @@ -124,3 +147,45 @@ def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk] start_offset=start_offset, end_offset=file.tell(), ) + + +class MultiVolumeGzipHandler(DirectoryHandler): + NAME = "multi-gzip" + EXTRACTOR = MultiGZIPExtractor() + + PATTERN = Glob("*.gz.*") + + def is_valid_gzip(self, path: Path) -> bool: + with File.from_path(path) as f: + try: + fp = SingleMemberGzipReader(f) + if not fp.read_header(): + return False + except gzip.BadGzipFile: + return False + return True + + def calculate_multifile(self, file: Path) -> Optional[MultiFile]: + paths = sorted(file.parent.glob(f"{file.stem}.*")) + + # we 'discard' paths that are not the first in the ordered list, + # otherwise we will end up with colliding reports, one for every + # path in the list. + if file != paths[0]: + return None + + valid_gzips = sum([self.is_valid_gzip(path) for path in paths]) + + # the presence of multiple valid gzips with the same stem would mean each volume + # is independently compressed. + if valid_gzips == 1: + files_size = sum(path.stat().st_size for path in paths) + logger.debug( + "Multi-volume files", paths=paths, files_size=files_size, _verbosity=2 + ) + + return MultiFile( + name=paths[0].stem, + paths=paths, + ) + return None