From 89f76dac547424a2e9a718a81e2a50dfeff37479 Mon Sep 17 00:00:00 2001 From: Quentin Kaiser Date: Fri, 7 Apr 2023 17:37:41 +0200 Subject: [PATCH] first shot at pattern auto-identification --- .../{0-1.unknown => 0-1.padding} | 0 .../{0-17.unknown => 0-17.padding} | 0 ...9-366113.unknown => 366109-366113.padding} | 0 ...9-366113.unknown => 366109-366113.padding} | 0 ...8160-32768.unknown => 28160-32768.padding} | 0 ...8160-32768.unknown => 28160-32768.padding} | 0 ...2097152.unknown => 551424-2097152.padding} | 0 tests/test_report.py | 3 +- unblob/extractor.py | 13 +++++-- unblob/models.py | 22 ++++++++++++ unblob/processing.py | 35 +++++++++++++++++-- unblob/report.py | 10 ++++++ 12 files changed, 77 insertions(+), 6 deletions(-) rename tests/integration/archive/cpio/cpio_portable_ascii/__output__/sample.nofilepad.unaligned.cpio-newc_extract/{0-1.unknown => 0-1.padding} (100%) rename tests/integration/compression/bzip2/__output__/collated.headpad.bzip2_extract/{0-17.unknown => 0-17.padding} (100%) rename tests/integration/executable/elf/elf64/__output__/kernel-initramfs-padding_extract/17-11651465.elf64_extract/initramfs_extract/{366109-366113.unknown => 366109-366113.padding} (100%) rename tests/integration/executable/elf/elf64/__output__/linux-kernel-with-initramfs-without-loadable-modules_extract/15039-1184915.xz_extract/xz.uncompressed_extract/initramfs_extract/{366109-366113.unknown => 366109-366113.padding} (100%) rename tests/integration/filesystem/fat/fat12/__output__/cherry.truncated.fat12_extract/{28160-32768.unknown => 28160-32768.padding} (100%) rename tests/integration/filesystem/fat/fat16/__output__/banana.truncated.fat16_extract/{28160-32768.unknown => 28160-32768.padding} (100%) rename tests/integration/filesystem/fat/fat32/__output__/banana.truncated.fat32_extract/{551424-2097152.unknown => 551424-2097152.padding} (100%) diff --git 
a/tests/integration/archive/cpio/cpio_portable_ascii/__output__/sample.nofilepad.unaligned.cpio-newc_extract/0-1.unknown b/tests/integration/archive/cpio/cpio_portable_ascii/__output__/sample.nofilepad.unaligned.cpio-newc_extract/0-1.padding similarity index 100% rename from tests/integration/archive/cpio/cpio_portable_ascii/__output__/sample.nofilepad.unaligned.cpio-newc_extract/0-1.unknown rename to tests/integration/archive/cpio/cpio_portable_ascii/__output__/sample.nofilepad.unaligned.cpio-newc_extract/0-1.padding diff --git a/tests/integration/compression/bzip2/__output__/collated.headpad.bzip2_extract/0-17.unknown b/tests/integration/compression/bzip2/__output__/collated.headpad.bzip2_extract/0-17.padding similarity index 100% rename from tests/integration/compression/bzip2/__output__/collated.headpad.bzip2_extract/0-17.unknown rename to tests/integration/compression/bzip2/__output__/collated.headpad.bzip2_extract/0-17.padding diff --git a/tests/integration/executable/elf/elf64/__output__/kernel-initramfs-padding_extract/17-11651465.elf64_extract/initramfs_extract/366109-366113.unknown b/tests/integration/executable/elf/elf64/__output__/kernel-initramfs-padding_extract/17-11651465.elf64_extract/initramfs_extract/366109-366113.padding similarity index 100% rename from tests/integration/executable/elf/elf64/__output__/kernel-initramfs-padding_extract/17-11651465.elf64_extract/initramfs_extract/366109-366113.unknown rename to tests/integration/executable/elf/elf64/__output__/kernel-initramfs-padding_extract/17-11651465.elf64_extract/initramfs_extract/366109-366113.padding diff --git a/tests/integration/executable/elf/elf64/__output__/linux-kernel-with-initramfs-without-loadable-modules_extract/15039-1184915.xz_extract/xz.uncompressed_extract/initramfs_extract/366109-366113.unknown 
b/tests/integration/executable/elf/elf64/__output__/linux-kernel-with-initramfs-without-loadable-modules_extract/15039-1184915.xz_extract/xz.uncompressed_extract/initramfs_extract/366109-366113.padding similarity index 100% rename from tests/integration/executable/elf/elf64/__output__/linux-kernel-with-initramfs-without-loadable-modules_extract/15039-1184915.xz_extract/xz.uncompressed_extract/initramfs_extract/366109-366113.unknown rename to tests/integration/executable/elf/elf64/__output__/linux-kernel-with-initramfs-without-loadable-modules_extract/15039-1184915.xz_extract/xz.uncompressed_extract/initramfs_extract/366109-366113.padding diff --git a/tests/integration/filesystem/fat/fat12/__output__/cherry.truncated.fat12_extract/28160-32768.unknown b/tests/integration/filesystem/fat/fat12/__output__/cherry.truncated.fat12_extract/28160-32768.padding similarity index 100% rename from tests/integration/filesystem/fat/fat12/__output__/cherry.truncated.fat12_extract/28160-32768.unknown rename to tests/integration/filesystem/fat/fat12/__output__/cherry.truncated.fat12_extract/28160-32768.padding diff --git a/tests/integration/filesystem/fat/fat16/__output__/banana.truncated.fat16_extract/28160-32768.unknown b/tests/integration/filesystem/fat/fat16/__output__/banana.truncated.fat16_extract/28160-32768.padding similarity index 100% rename from tests/integration/filesystem/fat/fat16/__output__/banana.truncated.fat16_extract/28160-32768.unknown rename to tests/integration/filesystem/fat/fat16/__output__/banana.truncated.fat16_extract/28160-32768.padding diff --git a/tests/integration/filesystem/fat/fat32/__output__/banana.truncated.fat32_extract/551424-2097152.unknown b/tests/integration/filesystem/fat/fat32/__output__/banana.truncated.fat32_extract/551424-2097152.padding similarity index 100% rename from tests/integration/filesystem/fat/fat32/__output__/banana.truncated.fat32_extract/551424-2097152.unknown rename to 
tests/integration/filesystem/fat/fat32/__output__/banana.truncated.fat32_extract/551424-2097152.padding diff --git a/tests/test_report.py b/tests/test_report.py index 3e96d73124..a920b5fdec 100644 --- a/tests/test_report.py +++ b/tests/test_report.py @@ -13,6 +13,7 @@ ChunkReport, FileMagicReport, HashReport, + PaddingChunkReport, StatReport, UnknownChunkReport, ) @@ -133,7 +134,7 @@ def hello_kitty_task_results( size=7, entropy=None, ), - UnknownChunkReport( + PaddingChunkReport( id=ANY, start_offset=263, end_offset=264, diff --git a/unblob/extractor.py b/unblob/extractor.py index f78a9ee8b7..8b4aac0d29 100644 --- a/unblob/extractor.py +++ b/unblob/extractor.py @@ -2,11 +2,12 @@ import errno import os from pathlib import Path +from typing import Union from structlog import get_logger from .file_utils import carve, is_safe_path -from .models import Chunk, File, TaskResult, UnknownChunk, ValidChunk +from .models import Chunk, File, PaddingChunk, TaskResult, UnknownChunk, ValidChunk from .report import MaliciousSymlinkRemoved logger = get_logger() @@ -113,8 +114,14 @@ def _fix_extracted_directory(directory: Path): _fix_extracted_directory(outdir) -def carve_unknown_chunk(extract_dir: Path, file: File, chunk: UnknownChunk) -> Path: - filename = f"{chunk.start_offset}-{chunk.end_offset}.unknown" +def carve_unknown_chunk( + extract_dir: Path, file: File, chunk: Union[UnknownChunk, PaddingChunk] +) -> Path: + extension = "unknown" + if isinstance(chunk, PaddingChunk): + extension = "padding" + + filename = f"{chunk.start_offset}-{chunk.end_offset}.{extension}" carve_path = extract_dir / filename logger.info("Extracting unknown chunk", path=carve_path, chunk=chunk) carve_chunk_to_file(carve_path, file, chunk) diff --git a/unblob/models.py b/unblob/models.py index ecf218c1d9..7264c08290 100644 --- a/unblob/models.py +++ b/unblob/models.py @@ -17,6 +17,7 @@ EntropyReport, ErrorReport, MultiFileReport, + PaddingChunkReport, Report, UnknownChunkReport, ) @@ -147,6 +148,27 @@ 
def as_report(self, entropy: Optional[EntropyReport]) -> UnknownChunkReport: ) +@attr.define(repr=False) +class PaddingChunk(Chunk): + r"""Gaps between valid chunks or otherwise unknown chunks. + + Important for manual analysis, and analytical certainty: for example + entropy, other chunks inside it, metadata, etc. + + These are not extracted, just logged for information purposes and further analysis, + like most common bytes (like \x00 and \xFF), ASCII strings, high entropy, etc. + """ + + def as_report(self, entropy: Optional[EntropyReport]) -> PaddingChunkReport: + return PaddingChunkReport( + id=self.id, + start_offset=self.start_offset, + end_offset=self.end_offset, + size=self.size, + entropy=entropy, + ) + + @attrs.define class MultiFile(Blob): name: str = attr.field(kw_only=True) diff --git a/unblob/processing.py b/unblob/processing.py index 72b500a301..b754e2ec2b 100644 --- a/unblob/processing.py +++ b/unblob/processing.py @@ -2,7 +2,7 @@ import shutil from operator import attrgetter from pathlib import Path -from typing import Iterable, List, Optional, Sequence, Set, Tuple, Type +from typing import Iterable, List, Optional, Sequence, Set, Tuple, Type, Union import attr import magic @@ -24,6 +24,7 @@ ExtractError, File, MultiFile, + PaddingChunk, ProcessResult, Task, TaskResult, @@ -450,6 +451,35 @@ def _iterate_directory(self, extract_dirs, processed_paths): ) +def is_padding(file: File, chunk: UnknownChunk): + return not any( + current_byte != next_byte + for current_byte, next_byte in zip( + file[chunk.start_offset : chunk.end_offset], + file[chunk.start_offset + 1 : chunk.end_offset], + ) + ) + + +def process_patterns( + unknown_chunks: List[UnknownChunk], file: File +) -> List[Union[UnknownChunk, PaddingChunk]]: + processed_chunks = [] + for unknown_chunk in unknown_chunks: + if is_padding(file, unknown_chunk): + processed_chunks.append( + PaddingChunk( + start_offset=unknown_chunk.start_offset, + end_offset=unknown_chunk.end_offset, 
id=unknown_chunk.id, + file=unknown_chunk.file, + ) + ) + else: + processed_chunks.append(unknown_chunk) + return processed_chunks + + class _FileTask: def __init__( self, @@ -487,6 +517,7 @@ def process(self): ) outer_chunks = remove_inner_chunks(all_chunks) unknown_chunks = calculate_unknown_chunks(outer_chunks, self.size) + unknown_chunks = process_patterns(unknown_chunks, file) assign_file_to_chunks(outer_chunks, file=file) assign_file_to_chunks(unknown_chunks, file=file) @@ -503,7 +534,7 @@ def _process_chunks( self, file: File, outer_chunks: List[ValidChunk], - unknown_chunks: List[UnknownChunk], + unknown_chunks: List[Union[UnknownChunk, PaddingChunk]], ): if unknown_chunks: logger.warning("Found unknown Chunks", chunks=unknown_chunks) diff --git a/unblob/report.py b/unblob/report.py index 1a4fc5aef9..dea3f345c6 100644 --- a/unblob/report.py +++ b/unblob/report.py @@ -236,6 +236,16 @@ class UnknownChunkReport(Report): entropy: Optional[EntropyReport] +@final +@attr.define(kw_only=True, frozen=True) +class PaddingChunkReport(Report): + id: str # noqa: A003 + start_offset: int + end_offset: int + size: int + entropy: Optional[EntropyReport] + + @final @attr.define(kw_only=True, frozen=True) class MultiFileReport(Report):