Skip to content

Commit

Permalink
feat(handler): add multi-part gzip handler.
Browse files Browse the repository at this point in the history
It's possible to create a multi-part gzip archive with 'split', which produces
multiple gzip-compressed files with an 'aa', 'ab', 'ac', ... suffix.

We match on '.gz.aa' in a directory, collect all the files with the same name
but a different suffix, order them, and feed them to 7z.

This is very close to what we were already doing with multi-part 7zip
archives.

Co-authored-by: Krisztián Fekete <[email protected]>
  • Loading branch information
qkaiser and e3krisztian committed Jan 2, 2024
1 parent 2b48fc2 commit a29cedd
Show file tree
Hide file tree
Showing 6 changed files with 72 additions and 4 deletions.
3 changes: 3 additions & 0 deletions tests/integration/compression/gzip/__input__/multi-volume.tar
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
5 changes: 4 additions & 1 deletion unblob/handlers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,4 +101,7 @@
engenius.EngeniusHandler,
)

# Tuple of built-in directory handlers. In this diff hunk the one-line
# assignment is the pre-commit form; the multi-line assignment that follows
# replaces it and adds gzip.MultiVolumeGzipHandler after the 7-zip handler.
BUILTIN_DIR_HANDLERS: DirectoryHandlers = (sevenzip.MultiVolumeSevenZipHandler,)
BUILTIN_DIR_HANDLERS: DirectoryHandlers = (
sevenzip.MultiVolumeSevenZipHandler,
gzip.MultiVolumeGzipHandler,
)
59 changes: 56 additions & 3 deletions unblob/handlers/compression/gzip.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,21 @@
from structlog import get_logger

from unblob.extractors import Command
from unblob.extractors.command import MultiFileCommand
from unblob.models import Extractor

from ...file_utils import InvalidInputFormat
from ...models import File, Handler, HexString, ValidChunk
from ...models import (
DirectoryExtractor,
DirectoryHandler,
ExtractResult,
File,
Glob,
Handler,
HexString,
MultiFile,
ValidChunk,
)
from ._gzip_reader import SingleMemberGzipReader

logger = get_logger()
Expand Down Expand Up @@ -71,10 +82,22 @@ class GZIPExtractor(Extractor):
# Lists the external commands this extractor relies on: only "7z", the same
# binary that extract() invokes.
def get_dependencies(self) -> List[str]:
return ["7z"]

# Runs `7z x -y {inpath} -so` through Command, passing stdout=<name>, where
# the name comes from get_gzip_embedded_name(inpath) and falls back to
# "gzip.uncompressed" when that call returns a falsy value.
# NOTE(review): this is a diff hunk. In each adjacent pair (the two `def`
# lines, and the last two statements) the first line is the removed version
# and the second is its replacement. The replacement annotates the return
# type and returns the result of extractor.extract() instead of dropping it.
def extract(self, inpath: Path, outdir: Path):
def extract(self, inpath: Path, outdir: Path) -> Optional[ExtractResult]:
name = get_gzip_embedded_name(inpath) or "gzip.uncompressed"
extractor = Command("7z", "x", "-y", "{inpath}", "-so", stdout=name)
extractor.extract(inpath, outdir)
return extractor.extract(inpath, outdir)


class MultiGZIPExtractor(DirectoryExtractor):
    """Directory extractor that hands a list of gzip part files to 7z."""

    def get_dependencies(self) -> List[str]:
        """Return the external commands this extractor needs (only ``7z``)."""
        required_tools = ["7z"]
        return required_tools

    def extract(self, paths: List[Path], outdir: Path) -> Optional[ExtractResult]:
        """Run 7z over *paths* and return whatever the command wrapper returns.

        The output name is looked up from the first path only, falling back
        to ``gzip.uncompressed`` when no embedded name is found.
        """
        first_part = paths[0]
        output_name = get_gzip_embedded_name(first_part) or "gzip.uncompressed"
        seven_zip_args = ("7z", "x", "-p", "-y", "{inpath}", "-so")
        command = MultiFileCommand(*seven_zip_args, stdout=output_name)
        return command.extract(paths, outdir)


class GZIPHandler(Handler):
Expand Down Expand Up @@ -124,3 +147,33 @@ def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk]
start_offset=start_offset,
end_offset=file.tell(),
)


class MultiVolumeGzipHandler(DirectoryHandler):
    """Directory handler for a gzip stream split into several part files.

    Candidate files are selected with the ``*.gz.*`` glob (for example
    ``archive.gz.aa``, ``archive.gz.ab``, ...). All siblings sharing the same
    stem are gathered, sorted by path, and exposed as a single ``MultiFile``.
    """

    NAME = "multi-gzip"
    EXTRACTOR = MultiGZIPExtractor()

    PATTERN = Glob("*.gz.*")

    def calculate_multifile(self, file: Path) -> Optional[MultiFile]:
        """Build the ``MultiFile`` that *file* belongs to.

        Returns ``None`` when *file* is not the first part of its set (in
        sorted order), or when it does not start with a valid gzip header.
        Otherwise returns a ``MultiFile`` named after ``file.stem`` that lists
        every sibling part in sorted order.
        """
        # Local import: keeps this block self-contained without touching the
        # file-level import section.
        from glob import escape

        # Escape the stem so that glob metacharacters appearing in the file
        # name ('[', '*', '?') are matched literally rather than as wildcards.
        paths = sorted(file.parent.glob(f"{escape(file.stem)}.*"))

        # PATTERN matches every part of the set, not just the first one.
        # Only answer for the first part so the same set of paths is not
        # reported once per part.
        if not paths or file != paths[0]:
            return None

        with File.from_path(file) as f:
            # check that it's actually gzip
            fp = SingleMemberGzipReader(f)

            try:
                if not fp.read_header():
                    return None
            except gzip.BadGzipFile:
                return None

        files_size = sum(path.stat().st_size for path in paths)
        logger.debug(
            "Multi-volume files", paths=paths, files_size=files_size, _verbosity=2
        )

        return MultiFile(
            name=file.stem,
            paths=paths,
        )

0 comments on commit a29cedd

Please sign in to comment.