From 59e1c99d43e089bac140a235673ee060057265ec Mon Sep 17 00:00:00 2001 From: Quentin Kaiser Date: Fri, 15 Dec 2023 19:56:32 +0100 Subject: [PATCH] feat(handler): add multi-part gzip handler. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's possible to create multi-part gzip with 'split', which will create multiple gzip compressed files with an 'aa', 'ab', 'ac', .. suffix. We match on '.gz.aa' in a directory, get all the files with the same name but different suffix, order them and feed them to 7z. This is very close to what we were already doing with multi-part 7zip archives. Co-authored-by: Krisztián Fekete <1246751+e3krisztian@users.noreply.github.com> --- .../gzip/__input__/multi-volume.tar | 3 + .../multi-part-file.gz.aa | 3 + .../multi-part-file.gz.ab | 3 + .../multi-part-file.gz_extract/one.txt | 3 + unblob/handlers/__init__.py | 5 +- unblob/handlers/compression/gzip.py | 59 ++++++++++++++++++- 6 files changed, 72 insertions(+), 4 deletions(-) create mode 100644 tests/integration/compression/gzip/__input__/multi-volume.tar create mode 100644 tests/integration/compression/gzip/__output__/multi-volume.tar_extract/multi-part-file.gz.aa create mode 100644 tests/integration/compression/gzip/__output__/multi-volume.tar_extract/multi-part-file.gz.ab create mode 100644 tests/integration/compression/gzip/__output__/multi-volume.tar_extract/multi-part-file.gz_extract/one.txt diff --git a/tests/integration/compression/gzip/__input__/multi-volume.tar b/tests/integration/compression/gzip/__input__/multi-volume.tar new file mode 100644 index 0000000000..2f252aa65f --- /dev/null +++ b/tests/integration/compression/gzip/__input__/multi-volume.tar @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d605c55b80ae5c57f6a8268d17a1d41d816dfeda7829ba8a488d62199e7c891 +size 10240 diff --git a/tests/integration/compression/gzip/__output__/multi-volume.tar_extract/multi-part-file.gz.aa
b/tests/integration/compression/gzip/__output__/multi-volume.tar_extract/multi-part-file.gz.aa new file mode 100644 index 0000000000..e94a5d1aab --- /dev/null +++ b/tests/integration/compression/gzip/__output__/multi-volume.tar_extract/multi-part-file.gz.aa @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:727525bd9f74dfc7046a7767011b2952bee7031638b1d2fa01c7830beec8f200 +size 50 diff --git a/tests/integration/compression/gzip/__output__/multi-volume.tar_extract/multi-part-file.gz.ab b/tests/integration/compression/gzip/__output__/multi-volume.tar_extract/multi-part-file.gz.ab new file mode 100644 index 0000000000..c1dbca68e6 --- /dev/null +++ b/tests/integration/compression/gzip/__output__/multi-volume.tar_extract/multi-part-file.gz.ab @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bec36068029bb2e2158c0030e062b5350ea23e9acd8ae3f45256057b481401b +size 10 diff --git a/tests/integration/compression/gzip/__output__/multi-volume.tar_extract/multi-part-file.gz_extract/one.txt b/tests/integration/compression/gzip/__output__/multi-volume.tar_extract/multi-part-file.gz_extract/one.txt new file mode 100644 index 0000000000..7d58ca049a --- /dev/null +++ b/tests/integration/compression/gzip/__output__/multi-volume.tar_extract/multi-part-file.gz_extract/one.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76ff0f745425182c35616510e1ed2781339350102bcd1f0167842248e0649400 +size 47 diff --git a/unblob/handlers/__init__.py b/unblob/handlers/__init__.py index 6b60d73293..df12eb0df7 100644 --- a/unblob/handlers/__init__.py +++ b/unblob/handlers/__init__.py @@ -101,4 +101,7 @@ engenius.EngeniusHandler, ) -BUILTIN_DIR_HANDLERS: DirectoryHandlers = (sevenzip.MultiVolumeSevenZipHandler,) +BUILTIN_DIR_HANDLERS: DirectoryHandlers = ( + sevenzip.MultiVolumeSevenZipHandler, + gzip.MultiVolumeGzipHandler, +) diff --git a/unblob/handlers/compression/gzip.py b/unblob/handlers/compression/gzip.py index 
f0ec499bf9..290d51ace6 100644 --- a/unblob/handlers/compression/gzip.py +++ b/unblob/handlers/compression/gzip.py @@ -24,10 +24,21 @@ from structlog import get_logger from unblob.extractors import Command +from unblob.extractors.command import MultiFileCommand from unblob.models import Extractor from ...file_utils import InvalidInputFormat -from ...models import File, Handler, HexString, ValidChunk +from ...models import ( + DirectoryExtractor, + DirectoryHandler, + ExtractResult, + File, + Glob, + Handler, + HexString, + MultiFile, + ValidChunk, +) from ._gzip_reader import SingleMemberGzipReader logger = get_logger() @@ -71,10 +82,22 @@ class GZIPExtractor(Extractor): def get_dependencies(self) -> List[str]: return ["7z"] - def extract(self, inpath: Path, outdir: Path): + def extract(self, inpath: Path, outdir: Path) -> Optional[ExtractResult]: name = get_gzip_embedded_name(inpath) or "gzip.uncompressed" extractor = Command("7z", "x", "-y", "{inpath}", "-so", stdout=name) - extractor.extract(inpath, outdir) + return extractor.extract(inpath, outdir) + + +class MultiGZIPExtractor(DirectoryExtractor): + def get_dependencies(self) -> List[str]: + return ["7z"] + + def extract(self, paths: List[Path], outdir: Path) -> Optional[ExtractResult]: + name = get_gzip_embedded_name(paths[0]) or "gzip.uncompressed" + extractor = MultiFileCommand( + "7z", "x", "-p", "-y", "{inpath}", "-so", stdout=name + ) + return extractor.extract(paths, outdir) class GZIPHandler(Handler): @@ -124,3 +147,33 @@ def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk] start_offset=start_offset, end_offset=file.tell(), ) + + +class MultiVolumeGzipHandler(DirectoryHandler): + NAME = "multi-gzip" + EXTRACTOR = MultiGZIPExtractor() + + PATTERN = Glob("*.gz.*") + + def calculate_multifile(self, file: Path) -> Optional[MultiFile]: + with File.from_path(file) as f: + # check that it's actually gzip + fp = SingleMemberGzipReader(f) + + try: + if not fp.read_header(): + return 
None + except gzip.BadGzipFile: + return None + + paths = sorted(file.parent.glob(f"{file.stem}.*")) + + files_size = sum(path.stat().st_size for path in paths) + logger.debug( + "Multi-volume files", paths=paths, files_size=files_size, _verbosity=2 + ) + + return MultiFile( + name=file.stem, + paths=paths, + )