From 6040ac4c048fa5e8f8cac28329821ec85d379977 Mon Sep 17 00:00:00 2001 From: Martin Raspaud Date: Tue, 27 Aug 2024 12:51:53 +0200 Subject: [PATCH 1/2] Enable decoding of tbm dataset name --- pygac/pod_reader.py | 17 +++++++++++------ pygac/tests/test_reader.py | 18 +++++++++--------- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/pygac/pod_reader.py b/pygac/pod_reader.py index ae3f7e3..c272741 100644 --- a/pygac/pod_reader.py +++ b/pygac/pod_reader.py @@ -38,6 +38,7 @@ import datetime import logging + try: from enum import IntFlag except ImportError: @@ -45,13 +46,12 @@ IntFlag = object import numpy as np - -from pyorbital.geoloc_instrument_definitions import avhrr_gac from pyorbital.geoloc import compute_pixels, get_lonlatalt +from pyorbital.geoloc_instrument_definitions import avhrr_gac from pygac.clock_offsets_converter import get_offsets from pygac.correct_tsm_issue import TSM_AFFECTED_INTERVALS_POD, get_tsm_idx -from pygac.reader import Reader, ReaderError, NoTLEData +from pygac.reader import NoTLEData, Reader, ReaderError from pygac.slerp import slerp from pygac.utils import file_opener @@ -321,9 +321,14 @@ def read_header(cls, filename, fileobj=None, header_date="auto"): _tbm_head, = np.frombuffer( fd_.read(tbm_header.itemsize), dtype=tbm_header, count=1) - try: - data_set_name = _tbm_head['data_set_name'].decode() - except UnicodeDecodeError: + for encoding in ("utf-8", "cp500"): + try: + data_set_name = _tbm_head['data_set_name'].decode(encoding) + except ValueError: + continue + else: + break + else: data_set_name = '---' allowed_empty = (42*b'\x00' + b' ') if (cls.data_set_pattern.match(data_set_name) diff --git a/pygac/tests/test_reader.py b/pygac/tests/test_reader.py index aef9dcb..fded545 100644 --- a/pygac/tests/test_reader.py +++ b/pygac/tests/test_reader.py @@ -24,20 +24,20 @@ import os import sys import unittest -import pytest - from unittest import mock + import numpy as np import numpy.testing -from pygac.gac_reader import GACReader, ReaderError -from pygac.lac_reader import LACReader -from pygac.pod_reader import POD_QualityIndicator +import pytest + from pygac.gac_pod import scanline -from pygac.reader import NoTLEData +from pygac.gac_reader import GACReader, ReaderError from pygac.lac_pod import LACPODReader - -from pygac.pod_reader import tbm_header as tbm_header_dtype, header3 from pygac.lac_pod import scanline as lacpod_scanline +from pygac.lac_reader import LACReader +from pygac.pod_reader import POD_QualityIndicator, header3 +from pygac.pod_reader import tbm_header as tbm_header_dtype +from pygac.reader import NoTLEData class TestPath(os.PathLike): @@ -688,7 +688,7 @@ def pod_file_with_tbm_header(tmp_path): number_of_scans = 3 tbm_header = np.zeros(1, dtype=tbm_header_dtype) - tbm_header["data_set_name"] = b"BRN.HRPT.NJ.D00322.S0334.E0319.B3031919.BL " + tbm_header["data_set_name"] = "BRN.HRPT.NJ.D00322.S0334.E0319.B3031919.BL\x80\x80".encode("cp500") tbm_header["select_flag"] = b"S" tbm_header["beginning_latitude"] = b"+77" tbm_header["ending_latitude"] = b"+22" From 6e8a72617320f0fb2623b3dbc6ab8c2b2000d219 Mon Sep 17 00:00:00 2001 From: Martin Raspaud Date: Wed, 28 Aug 2024 11:08:04 +0200 Subject: [PATCH 2/2] Refactor tbm header validation --- pygac/pod_reader.py | 34 ++++++++++++++++++---------------- pygac/reader.py | 28 +++++++++++++++++----------- 2 files changed, 35 insertions(+), 27 deletions(-) diff --git a/pygac/pod_reader.py b/pygac/pod_reader.py index c272741..39fc900 100644 --- a/pygac/pod_reader.py +++ b/pygac/pod_reader.py @@ -51,7 +51,7 @@ from pygac.clock_offsets_converter import get_offsets from pygac.correct_tsm_issue import TSM_AFFECTED_INTERVALS_POD, get_tsm_idx -from pygac.reader import NoTLEData, Reader, ReaderError +from pygac.reader import DecodingError, NoTLEData, Reader, ReaderError from pygac.slerp import slerp from pygac.utils import file_opener @@ -321,24 +321,14 @@ def read_header(cls, filename, fileobj=None, header_date="auto"): _tbm_head, = np.frombuffer( fd_.read(tbm_header.itemsize), dtype=tbm_header, count=1) - for encoding in ("utf-8", "cp500"): - try: - data_set_name = _tbm_head['data_set_name'].decode(encoding) - except ValueError: - continue - else: - break - else: - data_set_name = '---' - allowed_empty = (42*b'\x00' + b' ') - if (cls.data_set_pattern.match(data_set_name) - or (_tbm_head['data_set_name'] == allowed_empty)): - tbm_head = _tbm_head.copy() + try: + tbm_head = cls._validate_tbm_header(_tbm_head) tbm_offset = tbm_header.itemsize - else: - fd_.seek(0) + except DecodingError: tbm_head = None tbm_offset = 0 + + fd_.seek(tbm_offset, 0) header = cls.choose_header_based_on_timestamp(header_date, fd_) fd_.seek(tbm_offset, 0) # need to copy frombuffer to have write access on head @@ -349,6 +339,18 @@ def read_header(cls, filename, fileobj=None, header_date="auto"): cls._validate_header(head) return tbm_head, head + @classmethod + def _validate_tbm_header(cls, potential_tbm_header): + data_set_name = potential_tbm_header['data_set_name'] + allowed_empty = (42*b'\x00' + b' ') + if data_set_name == allowed_empty: + return potential_tbm_header.copy() + + # This will raise a DecodingError if the data_set_name is not valid. + cls._decode_data_set_name(data_set_name) + return potential_tbm_header.copy() + + @classmethod def choose_header_based_on_timestamp(cls, header_date, fd_): """Choose the header dtype based on the timestamp.""" diff --git a/pygac/reader.py b/pygac/reader.py index 28f634c..88e4c87 100644 --- a/pygac/reader.py +++ b/pygac/reader.py @@ -209,16 +209,10 @@ def _correct_data_set_name(cls, header, filename): filename (str): path to file """ filename = str(filename) - for encoding in "utf-8", "cp500": - data_set_name = header['data_set_name'] - try: - data_set_name = cls._decode_data_set_name(data_set_name, encoding) - except DecodingError as err: - LOG.debug(str(err)) - else: - header["data_set_name"] = data_set_name - break - else: + data_set_name = header['data_set_name'] + try: + header["data_set_name"] = cls._decode_data_set_name(data_set_name) + except DecodingError: LOG.debug(f'The data_set_name in header {header["data_set_name"]} does not match.' ' Use filename instead.') match = cls.data_set_pattern.search(filename) @@ -232,7 +226,19 @@ def _correct_data_set_name(cls, header, filename): return header @classmethod - def _decode_data_set_name(cls, data_set_name, encoding): + def _decode_data_set_name(cls, data_set_name): + for encoding in "utf-8", "cp500": + try: + data_set_name = cls._decode_data_set_name_for_encoding(data_set_name, encoding) + except DecodingError as err: + LOG.debug(str(err)) + else: + return data_set_name + else: + raise DecodingError("Could not reliably decode the dataset name.") + + @classmethod + def _decode_data_set_name_for_encoding(cls, data_set_name, encoding): data_set_name = data_set_name.decode(encoding, errors='ignore') if not cls.data_set_pattern.match(data_set_name): raise DecodingError(f'The data_set_name in header {data_set_name} '