From 0f5a0a548f26dae1d0497301f13473d1183a0e7c Mon Sep 17 00:00:00 2001 From: Miguel Delgado Date: Thu, 3 Aug 2023 12:26:02 +0100 Subject: [PATCH 01/16] hard-coded fix --- exifread/classes.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/exifread/classes.py b/exifread/classes.py index 7b0eb97..22c28a5 100644 --- a/exifread/classes.py +++ b/exifread/classes.py @@ -82,6 +82,9 @@ def s2n(self, offset, length: int, signed=False) -> int: fmt = '<' if self.endian == 'I' else '>' # Construct a format string from the requested length and signedness; # raise a ValueError if length is something silly like 3 + # Adding option for BigTiff, which uses long unsigned int + # https://www.awaresystems.be/imaging/tiff/bigtiff.html + # try: fmt += { (1, False): 'B', @@ -90,8 +93,10 @@ def s2n(self, offset, length: int, signed=False) -> int: (2, True): 'h', (4, False): 'I', (4, True): 'i', - (8, False): 'L', - (8, True): 'l', + (4, False): 'L', + (4, True): 'l', + (8, False): 'Q', + (8, True): 'q', }[(length, signed)] except KeyError as err: raise ValueError('unexpected unpacking length: %d' % length) from err @@ -122,15 +127,16 @@ def _first_ifd(self) -> int: def _next_ifd(self, ifd) -> int: """Return the pointer to next IFD.""" - entries = self.s2n(ifd, 2) - next_ifd = self.s2n(ifd + 2 + 12 * entries, 4) + entries = self.s2n(ifd, 8) + next_ifd = self.s2n(ifd + 8 + 20 * entries, 8) if next_ifd == ifd: return 0 return next_ifd def list_ifd(self) -> list: """Return the list of IFDs in the header.""" - i = self._first_ifd() + # i = self._first_ifd() + i = 16 ifds = [] set_ifds = set() while i: @@ -227,7 +233,7 @@ def _process_tag(self, ifd, ifd_name: str, tag_entry, entry, tag: int, tag_name, count = self.s2n(entry + 4, 4) # Adjust for tag id/type/count (2+2+4 bytes) # Now we point at either the data or the 2nd level offset - offset = entry + 8 + offset = entry + 12 # If the value fits in 4 bytes, it is inlined, else we # need to jump ahead again. @@ -297,14 +303,14 @@ def dump_ifd(self, ifd, ifd_name: str, tag_dict=None, relative=0, stop_tag=DEFAU if tag_dict is None: tag_dict = EXIF_TAGS try: - entries = self.s2n(ifd, 2) + entries = self.s2n(ifd, 8) except TypeError: logger.warning('Possibly corrupted IFD: %s', ifd) return for i in range(entries): # entry is index of start of this IFD in the file - entry = ifd + 2 + 12 * i + entry = ifd + 8 + 20 * i tag = self.s2n(entry, 2) # get tag name early to avoid errors, help debug From 0dcd1b696b98aea9702933b80ed5c5339852f12c Mon Sep 17 00:00:00 2001 From: Miguel Delgado Date: Thu, 3 Aug 2023 15:24:41 +0100 Subject: [PATCH 02/16] add tag structure and length --- exifread/classes.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/exifread/classes.py b/exifread/classes.py index 22c28a5..e2f8926 100644 --- a/exifread/classes.py +++ b/exifread/classes.py @@ -58,6 +58,11 @@ class ExifHeader: def __init__(self, file_handle: BinaryIO, endian, offset, fake_exif, strict: bool, debug=False, detailed=True, truncate_tags=True): + file_handle.seek(2) + if file_handle.read(2) == 43: + self.length = 8 + else: + self.length = 4 self.file_handle = file_handle self.endian = endian self.offset = offset @@ -69,6 +74,12 @@ def __init__(self, file_handle: BinaryIO, endian, offset, fake_exif, strict: boo # TODO: get rid of 'Any' type self.tags = {} # type: Dict[str, Any] + def tag_structure(self, entries): + #https://www.awaresystems.be/imaging/tiff/bigtiff.html + if self.length == 8: + return 8+entries*20 + return 2+entries*12 + def s2n(self, offset, length: int, signed=False) -> int: """ Convert slice to integer, based on sign and endian flags. From c88d8ddc85e6fee3e6747fc72fb41042cea8d1a0 Mon Sep 17 00:00:00 2001 From: Miguel Delgado Date: Thu, 3 Aug 2023 15:32:53 +0100 Subject: [PATCH 03/16] parameterized fix --- exifread/classes.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/exifread/classes.py b/exifread/classes.py index e2f8926..9852c92 100644 --- a/exifread/classes.py +++ b/exifread/classes.py @@ -134,20 +134,20 @@ def n2b(self, offset, length) -> bytes: def _first_ifd(self) -> int: """Return first IFD.""" - return self.s2n(4, 4) + return self.s2n(self.length, self.length) def _next_ifd(self, ifd) -> int: """Return the pointer to next IFD.""" - entries = self.s2n(ifd, 8) - next_ifd = self.s2n(ifd + 8 + 20 * entries, 8) + entries = self.s2n(ifd, self.length) + next_ifd = self.s2n(ifd + self.tag_structure(entries), self.length) if next_ifd == ifd: return 0 return next_ifd def list_ifd(self) -> list: """Return the list of IFDs in the header.""" - # i = self._first_ifd() - i = 16 + i = self._first_ifd() + # i = 16 ifds = [] set_ifds = set() while i: @@ -314,14 +314,14 @@ def dump_ifd(self, ifd, ifd_name: str, tag_dict=None, relative=0, stop_tag=DEFAU if tag_dict is None: tag_dict = EXIF_TAGS try: - entries = self.s2n(ifd, 8) + entries = self.s2n(ifd, self.length) except TypeError: logger.warning('Possibly corrupted IFD: %s', ifd) return for i in range(entries): # entry is index of start of this IFD in the file - entry = ifd + 8 + 20 * i + entry = ifd + self.tag_structure(i) tag = self.s2n(entry, 2) # get tag name early to avoid errors, help debug From f6372545ff98d86833a5399532054f5afcd2bc85 Mon Sep 17 00:00:00 2001 From: Miguel Delgado Date: Mon, 21 Aug 2023 19:08:41 +0100 Subject: [PATCH 04/16] over-parameterize --- exifread/classes.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/exifread/classes.py b/exifread/classes.py index 9852c92..7d94fbd 100644 --- a/exifread/classes.py +++ b/exifread/classes.py @@ -58,11 +58,22 @@ class ExifHeader: def __init__(self, file_handle: BinaryIO, endian, offset, fake_exif, strict: bool, debug=False, detailed=True, truncate_tags=True): - file_handle.seek(2) - if file_handle.read(2) == 43: - self.length = 8 - else: - self.length = 4 + """ + based on https://www.awaresystems.be/imaging/tiff/bigtiff.html#structures + """ + _ = file_handle.read(2) # offset is 0 + self.magic_number = file_handle.read(2) # offset is 2 + if self.magic_number == 43: + # bigtiff + self.bytesize_of_offsets = file_handle.read(2) # offset is 4 + if file_handle.read(2) != 0: # offset is 6 + raise ValueError + self.offset_to_first_ifd = file_handle.read(2) # offset is 8 + # self.length = 8 + elif self.magic_number==42: + # regular tiff + self.offset_to_first_ifd = file_handle.read(2) # offset is 4 + # self.length = 4 self.file_handle = file_handle self.endian = endian self.offset = offset @@ -76,7 +87,7 @@ def __init__(self, file_handle: BinaryIO, endian, offset, fake_exif, strict: boo def tag_structure(self, entries): #https://www.awaresystems.be/imaging/tiff/bigtiff.html - if self.length == 8: + if self.magic_number == 43: #big_tiff return 8+entries*20 return 2+entries*12 @@ -137,9 +148,10 @@ def _first_ifd(self) -> int: return self.s2n(self.length, self.length) def _next_ifd(self, ifd) -> int: + """Return the pointer to next IFD.""" - entries = self.s2n(ifd, self.length) - next_ifd = self.s2n(ifd + self.tag_structure(entries), self.length) + entries = self.s2n(offset=ifd, length=self.length) + next_ifd = self.s2n(offset=ifd + self.tag_structure(entries), length=self.length) if next_ifd == ifd: return 0 return next_ifd From 382c1afed1228320d3c810851ea96ff6fd325157 Mon Sep 17 00:00:00 2001 From: Miguel Delgado Date: Mon, 21 Aug 2023 19:31:14 +0100 Subject: [PATCH 05/16] add test --- test.py | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 test.py diff --git a/test.py b/test.py new file mode 100644 index 0000000..1b011eb --- /dev/null +++ b/test.py @@ -0,0 +1,55 @@ +from pathlib import Path +import zipfile +from exifread import process_file + +BIG_TIFF_PACKAGE: str = "compressed_asset.large.zip" +TIFF_PACKAGE: str = "compressed_asset.zip" +PARENT_PATH: str = "/Users/jose.delgado/delivery_data" +path = zipfile.Path(Path(PARENT_PATH).joinpath(TIFF_PACKAGE)) + +for file_name in path.root.namelist(): + if ".tif" not in file_name.lower(): + continue + path.at = file_name + with path.open("rb") as fo: + header = process_file(fo) + break +for key, value in header.items(): + if value.tag == 34737: + print(value.values) +# print(header) + +""" +Struggling with the following byte string: b'E\x00\x00\x7fC\xb0\xd4\x00' + +- `E` is a shift-out character with no practical use https://en.wikipedia.org/wiki/Shift_Out_and_Shift_In_characters +- `\x7f` is a delete charater https://en.wikipedia.org/wiki/Delete_character +- `C` is form feed, or page break https://en.wikipedia.org/wiki/Page_break +- `\x00` is NULL and every ASCII string should end in a NULL + +So the byte string should break into something like: +b'\x00\x00' +b'\xb0\xd4\x00' + +Also from the tiff specification: + +Note on ASCII Keys: + +Special handling is required for +ASCII-valued keys. While it is true that +TIFF 6.0 permits multiple NULL-delimited +strings within a single ASCII tag, the secondary +strings might not appear in the output of naive "tiffdump" +programs. For this reason, the null delimiter of each +ASCII Key value shall be converted to +a "|" (pipe) character before being +installed back into the ASCII holding +tag, so that a dump of the tag will look like this. + +"|" is b'\x7c' in hex + +values.replace(b'b'\x7c',) + + AsciiTag="first_value|second_value|etc...last_value|" + +""" \ No newline at end of file From 043ca360e4fca8ff00d356d6f1a50c31619e3d14 Mon Sep 17 00:00:00 2001 From: Miguel Delgado Date: Mon, 21 Aug 2023 19:31:26 +0100 Subject: [PATCH 06/16] move s2n outside of class --- exifread/classes.py | 92 +++++++++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 45 deletions(-) diff --git a/exifread/classes.py b/exifread/classes.py index 7d94fbd..710636c 100644 --- a/exifread/classes.py +++ b/exifread/classes.py @@ -8,6 +8,46 @@ logger = get_logger() + def s2n(self, file_handle, offset, length: int, signed=False, endian="<") -> int: + """ + Convert slice to integer, based on sign and endian flags. + + Usually this offset is assumed to be relative to the beginning of the + start of the EXIF information. + For some cameras that use relative tags, this offset may be relative + to some other starting point. + """ + # Little-endian if Intel, big-endian if Motorola + fmt = '<' if endian == 'I' else '>' + # Construct a format string from the requested length and signedness; + # raise a ValueError if length is something silly like 3 + # Adding option for BigTiff, which uses long unsigned int + # https://www.awaresystems.be/imaging/tiff/bigtiff.html + # + try: + fmt += { + (1, False): 'B', + (1, True): 'b', + (2, False): 'H', + (2, True): 'h', + (4, False): 'I', + (4, True): 'i', + (4, False): 'L', + (4, True): 'l', + (8, False): 'Q', + (8, True): 'q', + }[(length, signed)] + except KeyError as err: + raise ValueError('unexpected unpacking length: %d' % length) from err + file_handle.seek(self.offset + offset) + buf = file_handle.read(length) + + if buf: + # https://github.com/ianare/exif-py/pull/158 + # had to revert as this certain fields to be empty + # please provide test images + return struct.unpack(fmt, buf)[0] + return 0 class IfdTag: """ @@ -61,21 +101,24 @@ def __init__(self, file_handle: BinaryIO, endian, offset, fake_exif, strict: boo """ based on https://www.awaresystems.be/imaging/tiff/bigtiff.html#structures """ + self.endian = endian + self.endian_fmt = "<" if self.endian == 'I' else ">" + file_handle.seek(0) _ = file_handle.read(2) # offset is 0 - self.magic_number = file_handle.read(2) # offset is 2 + self.magic_number = struct.unpack(f"{self.endian_fmt}h", file_handle.read(2)) # offset is 2 + if self.magic_number == 43: # bigtiff self.bytesize_of_offsets = file_handle.read(2) # offset is 4 if file_handle.read(2) != 0: # offset is 6 raise ValueError self.offset_to_first_ifd = file_handle.read(2) # offset is 8 - # self.length = 8 + self.length = 8 elif self.magic_number==42: # regular tiff self.offset_to_first_ifd = file_handle.read(2) # offset is 4 - # self.length = 4 + self.length = 4 self.file_handle = file_handle - self.endian = endian self.offset = offset self.fake_exif = fake_exif self.strict = strict @@ -91,47 +134,6 @@ def tag_structure(self, entries): return 8+entries*20 return 2+entries*12 - def s2n(self, offset, length: int, signed=False) -> int: - """ - Convert slice to integer, based on sign and endian flags. - - Usually this offset is assumed to be relative to the beginning of the - start of the EXIF information. - For some cameras that use relative tags, this offset may be relative - to some other starting point. - """ - # Little-endian if Intel, big-endian if Motorola - fmt = '<' if self.endian == 'I' else '>' - # Construct a format string from the requested length and signedness; - # raise a ValueError if length is something silly like 3 - # Adding option for BigTiff, which uses long unsigned int - # https://www.awaresystems.be/imaging/tiff/bigtiff.html - # - try: - fmt += { - (1, False): 'B', - (1, True): 'b', - (2, False): 'H', - (2, True): 'h', - (4, False): 'I', - (4, True): 'i', - (4, False): 'L', - (4, True): 'l', - (8, False): 'Q', - (8, True): 'q', - }[(length, signed)] - except KeyError as err: - raise ValueError('unexpected unpacking length: %d' % length) from err - self.file_handle.seek(self.offset + offset) - buf = self.file_handle.read(length) - - if buf: - # https://github.com/ianare/exif-py/pull/158 - # had to revert as this certain fields to be empty - # please provide test images - return struct.unpack(fmt, buf)[0] - return 0 - def n2b(self, offset, length) -> bytes: """Convert offset to bytes.""" s = b'' From 8160fcc396fe8ba56c7da87a3bef71caf4e6634f Mon Sep 17 00:00:00 2001 From: Miguel Delgado Date: Mon, 21 Aug 2023 20:31:09 +0100 Subject: [PATCH 07/16] simplify --- exifread/classes.py | 119 ++++++++++++++++++++++---------------------- 1 file changed, 59 insertions(+), 60 deletions(-) diff --git a/exifread/classes.py b/exifread/classes.py index 710636c..dcac6fe 100644 --- a/exifread/classes.py +++ b/exifread/classes.py @@ -8,46 +8,41 @@ logger = get_logger() - def s2n(self, file_handle, offset, length: int, signed=False, endian="<") -> int: - """ - Convert slice to integer, based on sign and endian flags. - - Usually this offset is assumed to be relative to the beginning of the - start of the EXIF information. - For some cameras that use relative tags, this offset may be relative - to some other starting point. - """ - # Little-endian if Intel, big-endian if Motorola - fmt = '<' if endian == 'I' else '>' - # Construct a format string from the requested length and signedness; - # raise a ValueError if length is something silly like 3 - # Adding option for BigTiff, which uses long unsigned int - # https://www.awaresystems.be/imaging/tiff/bigtiff.html - # - try: - fmt += { - (1, False): 'B', - (1, True): 'b', - (2, False): 'H', - (2, True): 'h', - (4, False): 'I', - (4, True): 'i', - (4, False): 'L', - (4, True): 'l', - (8, False): 'Q', - (8, True): 'q', - }[(length, signed)] - except KeyError as err: - raise ValueError('unexpected unpacking length: %d' % length) from err - file_handle.seek(self.offset + offset) - buf = file_handle.read(length) - - if buf: - # https://github.com/ianare/exif-py/pull/158 - # had to revert as this certain fields to be empty - # please provide test images - return struct.unpack(fmt, buf)[0] - return 0 +def s2n(file_handle, offset, length: int, signed=False, endian="<") -> int: + """ + Convert slice to integer, based on sign and endian flags. + """ + # Little-endian if Intel, big-endian if Motorola + fmt = '<' if endian == 'I' else '>' + # Construct a format string from the requested length and signedness; + # raise a ValueError if length is something silly like 3 + # Adding option for BigTiff, which uses long unsigned int + # https://www.awaresystems.be/imaging/tiff/bigtiff.html + # + try: + fmt += { + (1, False): 'B', + (1, True): 'b', + (2, False): 'H', + (2, True): 'h', + (4, False): 'I', + (4, True): 'i', + (4, False): 'L', + (4, True): 'l', + (8, False): 'Q', + (8, True): 'q', + }[(length, signed)] + except KeyError as err: + raise ValueError('unexpected unpacking length: %d' % length) from err + file_handle.seek(offset) + buf = file_handle.read(length) + + if buf: + # https://github.com/ianare/exif-py/pull/158 + # had to revert as this certain fields to be empty + # please provide test images + return struct.unpack(fmt, buf)[0] + return 0 class IfdTag: """ @@ -103,21 +98,7 @@ def __init__(self, file_handle: BinaryIO, endian, offset, fake_exif, strict: boo """ self.endian = endian self.endian_fmt = "<" if self.endian == 'I' else ">" - file_handle.seek(0) - _ = file_handle.read(2) # offset is 0 - self.magic_number = struct.unpack(f"{self.endian_fmt}h", file_handle.read(2)) # offset is 2 - - if self.magic_number == 43: - # bigtiff - self.bytesize_of_offsets = file_handle.read(2) # offset is 4 - if file_handle.read(2) != 0: # offset is 6 - raise ValueError - self.offset_to_first_ifd = file_handle.read(2) # offset is 8 - self.length = 8 - elif self.magic_number==42: - # regular tiff - self.offset_to_first_ifd = file_handle.read(2) # offset is 4 - self.length = 4 + self.magic_number = s2n(file_handle=file_handle, offset=2, length=1, signed=False, endian=self.endian_fmt) self.file_handle = file_handle self.offset = offset self.fake_exif = fake_exif @@ -132,6 +113,8 @@ def tag_structure(self, entries): #https://www.awaresystems.be/imaging/tiff/bigtiff.html if self.magic_number == 43: #big_tiff return 8+entries*20 + if self.magic_number == 42: #tiff + return 2+entries*12 return 2+entries*12 def n2b(self, offset, length) -> bytes: @@ -146,14 +129,30 @@ def n2b(self, offset, length) -> bytes: return s def _first_ifd(self) -> int: - """Return first IFD.""" - return self.s2n(self.length, self.length) + """Return the pointer to first IFD.""" + # TODO parameterize this + if self.magic_number == 42: + return s2n( + self.file_handle, + offset=4, + length=1, + signed=False, + endian=self.endian) + + if self.magic_number == 43: + return s2n( + self.file_handle, + offset=8, + length=1, + signed=False, + endian=self.endian) def _next_ifd(self, ifd) -> int: """Return the pointer to next IFD.""" - entries = self.s2n(offset=ifd, length=self.length) - next_ifd = self.s2n(offset=ifd + self.tag_structure(entries), length=self.length) + entries = s2n(file_handle=self.file_handle, offset=ifd, length=2, signed=False, endian=self.endian) + next_ifd = s2n(file_handle=self.file_handle, offset=ifd + self.tag_structure(entries), length=4, endian=self.endian) + #bytesize of offsets is 8 in bigtiff and 4 in tiff if next_ifd == ifd: return 0 return next_ifd @@ -328,7 +327,7 @@ def dump_ifd(self, ifd, ifd_name: str, tag_dict=None, relative=0, stop_tag=DEFAU if tag_dict is None: tag_dict = EXIF_TAGS try: - entries = self.s2n(ifd, self.length) + entries = self.s2n(ifd, 2) except TypeError: logger.warning('Possibly corrupted IFD: %s', ifd) return From f2ae730a3114567ba3f65e5bb2da2c5cab90b721 Mon Sep 17 00:00:00 2001 From: Miguel Delgado Date: Mon, 21 Aug 2023 20:44:31 +0100 Subject: [PATCH 08/16] all done up to dump_ifd --- exifread/classes.py | 29 ++++++++++------------------- test.py | 2 +- 2 files changed, 11 insertions(+), 20 deletions(-) diff --git a/exifread/classes.py b/exifread/classes.py index dcac6fe..6bc0eee 100644 --- a/exifread/classes.py +++ b/exifread/classes.py @@ -99,6 +99,7 @@ def __init__(self, file_handle: BinaryIO, endian, offset, fake_exif, strict: boo self.endian = endian self.endian_fmt = "<" if self.endian == 'I' else ">" self.magic_number = s2n(file_handle=file_handle, offset=2, length=1, signed=False, endian=self.endian_fmt) + self.bytesize_of_offset = 8 if self.magic_number==43 else 4 self.file_handle = file_handle self.offset = offset self.fake_exif = fake_exif @@ -130,28 +131,18 @@ def n2b(self, offset, length) -> bytes: def _first_ifd(self) -> int: """Return the pointer to first IFD.""" - # TODO parameterize this - if self.magic_number == 42: - return s2n( - self.file_handle, - offset=4, - length=1, - signed=False, - endian=self.endian) - - if self.magic_number == 43: - return s2n( - self.file_handle, - offset=8, - length=1, - signed=False, - endian=self.endian) + return s2n( + self.file_handle, + offset=self.bytesize_of_offset, + length=1, + signed=False, + endian=self.endian) def _next_ifd(self, ifd) -> int: """Return the pointer to next IFD.""" entries = s2n(file_handle=self.file_handle, offset=ifd, length=2, signed=False, endian=self.endian) - next_ifd = s2n(file_handle=self.file_handle, offset=ifd + self.tag_structure(entries), length=4, endian=self.endian) + next_ifd = s2n(file_handle=self.file_handle, offset=ifd + self.tag_structure(entries), length=self.bytesize_of_offset, endian=self.endian) #bytesize of offsets is 8 in bigtiff and 4 in tiff if next_ifd == ifd: return 0 @@ -327,7 +318,7 @@ def dump_ifd(self, ifd, ifd_name: str, tag_dict=None, relative=0, stop_tag=DEFAU if tag_dict is None: tag_dict = EXIF_TAGS try: - entries = self.s2n(ifd, 2) + entries = s2n(file_handle=self.file_handle, offset=ifd, length=2, signed=False, endian=self.endian) except TypeError: logger.warning('Possibly corrupted IFD: %s', ifd) return @@ -335,7 +326,7 @@ def dump_ifd(self, ifd, ifd_name: str, tag_dict=None, relative=0, stop_tag=DEFAU for i in range(entries): # entry is index of start of this IFD in the file entry = ifd + self.tag_structure(i) - tag = self.s2n(entry, 2) + tag = s2n(file_handle=self.file_handle, offset=ifd, length=2, signed=False, endian=self.endian) # get tag name early to avoid errors, help debug tag_entry = tag_dict.get(tag) diff --git a/test.py b/test.py index 1b011eb..678c90b 100644 --- a/test.py +++ b/test.py @@ -5,7 +5,7 @@ BIG_TIFF_PACKAGE: str = "compressed_asset.large.zip" TIFF_PACKAGE: str = "compressed_asset.zip" PARENT_PATH: str = "/Users/jose.delgado/delivery_data" -path = zipfile.Path(Path(PARENT_PATH).joinpath(TIFF_PACKAGE)) +path = zipfile.Path(Path(PARENT_PATH).joinpath(BIG_TIFF_PACKAGE)) for file_name in path.root.namelist(): if ".tif" not in file_name.lower(): From 872505c0c4981f6903f233f815386112e99393db Mon Sep 17 00:00:00 2001 From: Miguel Delgado Date: Wed, 23 Aug 2023 13:01:23 +0200 Subject: [PATCH 09/16] replace "magic" numbers --- exifread/classes.py | 62 +++++++++++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 22 deletions(-) diff --git a/exifread/classes.py b/exifread/classes.py index 6bc0eee..6eedac7 100644 --- a/exifread/classes.py +++ b/exifread/classes.py @@ -99,7 +99,14 @@ def __init__(self, file_handle: BinaryIO, endian, offset, fake_exif, strict: boo self.endian = endian self.endian_fmt = "<" if self.endian == 'I' else ">" self.magic_number = s2n(file_handle=file_handle, offset=2, length=1, signed=False, endian=self.endian_fmt) - self.bytesize_of_offset = 8 if self.magic_number==43 else 4 + self.number_of_entries_length = 8 if self.magic_number==43 else 2 + self.bytesize_of_offset_value = 8 if self.magic_number==43 else 4 + self.tag_id_length = 2 + self.tag_fieldtype_length = 2 + self.tag_nvalues_length = 8 if self.magic_number==43 else 4 + self.inlining_threshold = 8 if self.magic_number==43 else 4 # In classic TIFF, the tag data was written inside the tag structure, in the IFD, if its size was smaller than or equal to 4 bytes. Otherwise, it's written elsewhere, and pointed to. In BigTIFF, the tag data is written inside the tag structure, in the IFD, if its size is smaller than or equal to 8 bytes. + self.max_tag_length = self.tag_id_length + self.tag_fieldtype_length + self.tag_nvalues_length + self.inlining_threshold # max tag length + self.file_handle = file_handle self.offset = offset self.fake_exif = fake_exif @@ -110,13 +117,9 @@ def __init__(self, file_handle: BinaryIO, endian, offset, fake_exif, strict: boo # TODO: get rid of 'Any' type self.tags = {} # type: Dict[str, Any] - def tag_structure(self, entries): + def offset_to_next_ifd(self, ifd, entries): #https://www.awaresystems.be/imaging/tiff/bigtiff.html - if self.magic_number == 43: #big_tiff - return 8+entries*20 - if self.magic_number == 42: #tiff - return 2+entries*12 - return 2+entries*12 + return ifd + self.number_of_entries_length+entries*self.max_tag_length def n2b(self, offset, length) -> bytes: """Convert offset to bytes.""" @@ -133,7 +136,7 @@ def _first_ifd(self) -> int: """Return the pointer to first IFD.""" return s2n( self.file_handle, - offset=self.bytesize_of_offset, + offset=self.bytesize_of_offset_value, length=1, signed=False, endian=self.endian) @@ -141,8 +144,8 @@ def _first_ifd(self) -> int: def _next_ifd(self, ifd) -> int: """Return the pointer to next IFD.""" - entries = s2n(file_handle=self.file_handle, offset=ifd, length=2, signed=False, endian=self.endian) - next_ifd = s2n(file_handle=self.file_handle, offset=ifd + self.tag_structure(entries), length=self.bytesize_of_offset, endian=self.endian) + entries = s2n(file_handle=self.file_handle, offset=ifd, length=self.number_of_entries_length, signed=False, endian=self.endian) + next_ifd = s2n(file_handle=self.file_handle, offset=self.offset_to_next_ifd(ifd, entries), length=self.bytesize_of_offset_value, endian=self.endian) #bytesize of offsets is 8 in bigtiff and 4 in tiff if next_ifd == ifd: return 0 @@ -236,8 +239,13 @@ def _process_field2(self, ifd_name, tag_name, count, offset): return values def _process_tag(self, ifd, ifd_name: str, tag_entry, entry, tag: int, tag_name, relative, stop_tag) -> None: - field_type = self.s2n(entry + 2, 2) - + field_type = s2n( + file_handle=self.file_handle, + offset=entry + 2, + length=self.tag_fieldtype_length, + signed=False, + endian=self.endian + ) # unknown field type if not 0 < field_type < len(FIELD_TYPES): if not self.strict: @@ -245,28 +253,38 @@ def _process_tag(self, ifd, ifd_name: str, tag_entry, entry, tag: int, tag_name, raise ValueError('Unknown type %d in tag 0x%04X' % (field_type, tag)) type_length = FIELD_TYPES[field_type][0] - count = self.s2n(entry + 4, 4) - # Adjust for tag id/type/count (2+2+4 bytes) + count = s2n( + file_handle=self.file_handle, + offset=entry + self.tag_id_length + self.tag_fieldtype_length, + length=self.number_of_entries_length, + signed=False, + endian=self.endian + ) + # the length of this field (number of values) is the same as the bytesize_of_offset by chance (or by design?) + # Adjust for tag id/type/count (2+2+4 bytes) or in the case of bigtiff (2+2+8) # Now we point at either the data or the 2nd level offset - offset = entry + 12 + offset = entry + self.tag_id_length + self.tag_fieldtype_length + self.tag_nvalues_length # If the value fits in 4 bytes, it is inlined, else we # need to jump ahead again. - if count * type_length > 4: + if count * type_length > self.inlining_threshold: # offset is not the value; it's a pointer to the value # if relative we set things up so s2n will seek to the right # place when it adds self.offset. Note that this 'relative' # is for the Nikon type 3 makernote. Other cameras may use # other relative offsets, which would have to be computed here # slightly differently. + tmp_offset = s2n( + file_handle=self.file_handle, + offset=offset, + length=self.bytesize_of_offset_value, + signed=False) if relative: - tmp_offset = self.s2n(offset, 4) offset = tmp_offset + ifd - 8 if self.fake_exif: offset += 18 else: - offset = self.s2n(offset, 4) - + offset = tmp_offset field_offset = offset values = None if field_type == 2: @@ -318,15 +336,15 @@ def dump_ifd(self, ifd, ifd_name: str, tag_dict=None, relative=0, stop_tag=DEFAU if tag_dict is None: tag_dict = EXIF_TAGS try: - entries = s2n(file_handle=self.file_handle, offset=ifd, length=2, signed=False, endian=self.endian) + entries = s2n(file_handle=self.file_handle, offset=ifd, length=self.number_of_entries_length, signed=False, endian=self.endian) except TypeError: logger.warning('Possibly corrupted IFD: %s', ifd) return for i in range(entries): # entry is index of start of this IFD in the file - entry = ifd + self.tag_structure(i) - tag = s2n(file_handle=self.file_handle, offset=ifd, length=2, signed=False, endian=self.endian) + entry = ifd + self.offset_to_next_ifd(i) + tag = s2n(file_handle=self.file_handle, offset=entry, length=2, signed=False, endian=self.endian) # get tag name early to avoid errors, help debug tag_entry = tag_dict.get(tag) From 4e1c86184e0b3b8ad3556464216973577da7293b Mon Sep 17 00:00:00 2001 From: Miguel Delgado Date: Wed, 23 Aug 2023 14:30:30 +0200 Subject: [PATCH 10/16] parameterized all magic numbers, runs on bigtiff --- exifread/classes.py | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/exifread/classes.py b/exifread/classes.py index 6eedac7..9901943 100644 --- a/exifread/classes.py +++ b/exifread/classes.py @@ -8,7 +8,7 @@ logger = get_logger() -def s2n(file_handle, offset, length: int, signed=False, endian="<") -> int: +def s2n(file_handle, offset, length: int, signed=False, endian="I") -> int: """ Convert slice to integer, based on sign and endian flags. """ @@ -97,8 +97,7 @@ def __init__(self, file_handle: BinaryIO, endian, offset, fake_exif, strict: boo based on https://www.awaresystems.be/imaging/tiff/bigtiff.html#structures """ self.endian = endian - self.endian_fmt = "<" if self.endian == 'I' else ">" - self.magic_number = s2n(file_handle=file_handle, offset=2, length=1, signed=False, endian=self.endian_fmt) + self.magic_number = s2n(file_handle=file_handle, offset=2, length=1, signed=False, endian=self.endian) self.number_of_entries_length = 8 if self.magic_number==43 else 2 self.bytesize_of_offset_value = 8 if self.magic_number==43 else 4 self.tag_id_length = 2 @@ -200,7 +199,13 @@ def _process_field(self, tag_name, count, field_type, type_length, offset): # -1 means corrupted value = -1 else: - value = self.s2n(offset, type_length, signed) + value = s2n( + file_handle=self.file_handle, + offset=offset, + length=type_length, + signed=signed, + endian=self.endian + ) values.append(value) offset = offset + type_length # The test above causes problems with tags that are @@ -241,7 +246,7 @@ def _process_field2(self, ifd_name, tag_name, count, offset): def _process_tag(self, ifd, ifd_name: str, tag_entry, entry, tag: int, tag_name, relative, stop_tag) -> None: field_type = s2n( file_handle=self.file_handle, - offset=entry + 2, + offset=entry + self.tag_id_length, length=self.tag_fieldtype_length, signed=False, endian=self.endian @@ -343,19 +348,19 @@ def dump_ifd(self, ifd, ifd_name: str, tag_dict=None, relative=0, stop_tag=DEFAU for i in range(entries): # entry is index of start of this IFD in the file - entry = ifd + self.offset_to_next_ifd(i) - tag = s2n(file_handle=self.file_handle, offset=entry, length=2, signed=False, endian=self.endian) + entry = self.offset_to_next_ifd(ifd, i) + tag_id = s2n(file_handle=self.file_handle, offset=entry, length=self.tag_id_length, signed=False, endian=self.endian) # get tag name early to avoid errors, help debug - tag_entry = tag_dict.get(tag) + tag_entry = tag_dict.get(tag_id) if tag_entry: tag_name = tag_entry[0] else: - tag_name = 'Tag 0x%04X' % tag + tag_name = 'Tag 0x%04X' % tag_id # ignore certain tags for faster processing - if not (not self.detailed and tag in IGNORE_TAGS): - self._process_tag(ifd, ifd_name, tag_entry, entry, tag, tag_name, relative, stop_tag) + if not (not self.detailed and tag_id in IGNORE_TAGS): + self._process_tag(ifd, ifd_name, tag_entry, entry, tag_id, tag_name, relative, stop_tag) if tag_name == stop_tag: break From 8e49c08c5ad2380220cfcf605f98b2efdba7a31b Mon Sep 17 00:00:00 2001 From: Miguel Delgado Date: Wed, 23 Aug 2023 14:33:45 +0200 Subject: [PATCH 11/16] linting --- exifread/classes.py | 444 ++++++++++++++++++++++++++++---------------- 1 file changed, 287 insertions(+), 157 deletions(-) diff --git a/exifread/classes.py b/exifread/classes.py index 9901943..95c551d 100644 --- a/exifread/classes.py +++ b/exifread/classes.py @@ -4,36 +4,43 @@ from exifread.exif_log import get_logger from exifread.utils import Ratio -from exifread.tags import EXIF_TAGS, DEFAULT_STOP_TAG, FIELD_TYPES, IGNORE_TAGS, makernote +from exifread.tags import ( + EXIF_TAGS, + DEFAULT_STOP_TAG, + FIELD_TYPES, + IGNORE_TAGS, + makernote, +) logger = get_logger() + def s2n(file_handle, offset, length: int, signed=False, endian="I") -> int: """ Convert slice to integer, based on sign and endian flags. """ # Little-endian if Intel, big-endian if Motorola - fmt = '<' if endian == 'I' else '>' + fmt = "<" if endian == "I" else ">" # Construct a format string from the requested length and signedness; # raise a ValueError if length is something silly like 3 # Adding option for BigTiff, which uses long unsigned int # https://www.awaresystems.be/imaging/tiff/bigtiff.html - # + # try: fmt += { - (1, False): 'B', - (1, True): 'b', - (2, False): 'H', - (2, True): 'h', - (4, False): 'I', - (4, True): 'i', - (4, False): 'L', - (4, True): 'l', - (8, False): 'Q', - (8, True): 'q', - }[(length, signed)] + (1, False): "B", + (1, True): "b", + (2, False): "H", + (2, True): "h", + (4, False): "I", + (4, True): "i", + (4, False): "L", + (4, True): "l", + (8, False): "Q", + (8, True): "q", + }[(length, signed)] except KeyError as err: - raise ValueError('unexpected unpacking length: %d' % length) from err + raise ValueError("unexpected unpacking length: %d" % length) from err file_handle.seek(offset) buf = file_handle.read(length) @@ -44,13 +51,21 @@ def s2n(file_handle, offset, length: int, signed=False, endian="I") -> int: return struct.unpack(fmt, buf)[0] return 0 + class IfdTag: """ Eases dealing with tags. """ - def __init__(self, printable: str, tag: int, field_type: int, values, - field_offset: int, field_length: int): + def __init__( + self, + printable: str, + tag: int, + field_type: int, + values, + field_offset: int, + field_length: int, + ): # printable version of data self.printable = printable # tag ID number @@ -70,18 +85,18 @@ def __str__(self) -> str: def __repr__(self) -> str: try: - tag = '(0x%04X) %s=%s @ %d' % ( + tag = "(0x%04X) %s=%s @ %d" % ( self.tag, FIELD_TYPES[self.field_type][2], self.printable, - self.field_offset + self.field_offset, ) except TypeError: - tag = '(%s) %s=%s @ %s' % ( + tag = "(%s) %s=%s @ %s" % ( str(self.tag), FIELD_TYPES[self.field_type][2], self.printable, - str(self.field_offset) + str(self.field_offset), ) return tag @@ -91,20 +106,42 @@ class ExifHeader: Handle an EXIF header. """ - def __init__(self, file_handle: BinaryIO, endian, offset, fake_exif, strict: bool, - debug=False, detailed=True, truncate_tags=True): + def __init__( + self, + file_handle: BinaryIO, + endian, + offset, + fake_exif, + strict: bool, + debug=False, + detailed=True, + truncate_tags=True, + ): """ based on https://www.awaresystems.be/imaging/tiff/bigtiff.html#structures """ self.endian = endian - self.magic_number = s2n(file_handle=file_handle, offset=2, length=1, signed=False, endian=self.endian) - self.number_of_entries_length = 8 if self.magic_number==43 else 2 - self.bytesize_of_offset_value = 8 if self.magic_number==43 else 4 + self.magic_number = s2n( + file_handle=file_handle, + offset=2, + length=1, + signed=False, + endian=self.endian, + ) + self.number_of_entries_length = 8 if self.magic_number == 43 else 2 + self.bytesize_of_offset_value = 8 if self.magic_number == 43 else 4 self.tag_id_length = 2 self.tag_fieldtype_length = 2 - self.tag_nvalues_length = 8 if self.magic_number==43 else 4 - self.inlining_threshold = 8 if self.magic_number==43 else 4 # In classic TIFF, the tag data was written inside the tag structure, in the IFD, if its size was smaller than or equal to 4 bytes. Otherwise, it's written elsewhere, and pointed to. In BigTIFF, the tag data is written inside the tag structure, in the IFD, if its size is smaller than or equal to 8 bytes. - self.max_tag_length = self.tag_id_length + self.tag_fieldtype_length + self.tag_nvalues_length + self.inlining_threshold # max tag length + self.tag_nvalues_length = 8 if self.magic_number == 43 else 4 + self.inlining_threshold = ( + 8 if self.magic_number == 43 else 4 + ) # In classic TIFF, the tag data was written inside the tag structure, in the IFD, if its size was smaller than or equal to 4 bytes. Otherwise, it's written elsewhere, and pointed to. In BigTIFF, the tag data is written inside the tag structure, in the IFD, if its size is smaller than or equal to 8 bytes. + self.max_tag_length = ( + self.tag_id_length + + self.tag_fieldtype_length + + self.tag_nvalues_length + + self.inlining_threshold + ) # max tag length self.file_handle = file_handle self.offset = offset @@ -117,14 +154,14 @@ def __init__(self, file_handle: BinaryIO, endian, offset, fake_exif, strict: boo self.tags = {} # type: Dict[str, Any] def offset_to_next_ifd(self, ifd, entries): - #https://www.awaresystems.be/imaging/tiff/bigtiff.html - return ifd + self.number_of_entries_length+entries*self.max_tag_length + # https://www.awaresystems.be/imaging/tiff/bigtiff.html + return ifd + self.number_of_entries_length + entries * self.max_tag_length def n2b(self, offset, length) -> bytes: """Convert offset to bytes.""" - s = b'' + s = b"" for _ in range(length): - if self.endian == 'I': + if self.endian == "I": s += bytes([offset & 0xFF]) else: s = bytes([offset & 0xFF]) + s @@ -136,16 +173,27 @@ def _first_ifd(self) -> int: return s2n( self.file_handle, offset=self.bytesize_of_offset_value, - length=1, + length=1, signed=False, - endian=self.endian) + endian=self.endian, + ) def _next_ifd(self, ifd) -> int: - """Return the pointer to next IFD.""" - entries = s2n(file_handle=self.file_handle, offset=ifd, length=self.number_of_entries_length, signed=False, endian=self.endian) - next_ifd = s2n(file_handle=self.file_handle, offset=self.offset_to_next_ifd(ifd, entries), length=self.bytesize_of_offset_value, endian=self.endian) - #bytesize of offsets is 8 in bigtiff and 4 in tiff + entries = s2n( + file_handle=self.file_handle, + offset=ifd, + length=self.number_of_entries_length, + signed=False, + endian=self.endian, + ) + next_ifd = s2n( + file_handle=self.file_handle, + offset=self.offset_to_next_ifd(ifd, entries), + length=self.bytesize_of_offset_value, + endian=self.endian, + ) + # bytesize of offsets is 8 in bigtiff and 4 in tiff if next_ifd == ifd: return 0 return next_ifd @@ -158,7 +206,7 @@ def list_ifd(self) -> list: set_ifds = set() while i: if i in set_ifds: - logger.warning('IFD loop detected.') + logger.warning("IFD loop detected.") break set_ifds.add(i) ifds.append(i) @@ -167,7 +215,7 @@ def list_ifd(self) -> list: def _process_field(self, tag_name, count, field_type, type_length, offset): values = [] - signed = (field_type in [6, 8, 9, 10]) + signed = field_type in [6, 8, 9, 10] # XXX investigate # some entries get too big to handle could be malformed # file or problem with self.s2n @@ -176,26 +224,25 @@ def _process_field(self, tag_name, count, field_type, type_length, offset): if field_type in (5, 10): # a ratio value = Ratio( - self.s2n(offset, 4, signed), - self.s2n(offset + 4, 4, signed) + self.s2n(offset, 4, signed), self.s2n(offset + 4, 4, signed) ) elif field_type in (11, 12): # a float or double - unpack_format = '' - if self.endian == 'I': - unpack_format += '<' + unpack_format = "" + if self.endian == "I": + unpack_format += "<" else: - unpack_format += '>' + unpack_format += ">" if field_type == 11: - unpack_format += 'f' + unpack_format += "f" else: - unpack_format += 'd' + unpack_format += "d" self.file_handle.seek(self.offset + offset) byte_str = self.file_handle.read(type_length) try: value = struct.unpack(unpack_format, byte_str) except struct.error: - logger.warning('Possibly corrupted field %s', tag_name) + logger.warning("Possibly corrupted field %s", tag_name) # -1 means corrupted value = -1 else: @@ -204,13 +251,13 @@ def _process_field(self, tag_name, count, field_type, type_length, offset): offset=offset, length=type_length, signed=signed, - endian=self.endian - ) + endian=self.endian, + ) values.append(value) offset = offset + type_length # The test above causes problems with tags that are # supposed to have long values! Fix up one important case. - elif tag_name in ('MakerNote', makernote.canon.CAMERA_INFO_TAG_NAME): + elif tag_name in ("MakerNote", makernote.canon.CAMERA_INFO_TAG_NAME): for _ in range(count): value = self.s2n(offset, type_length, signed) values.append(value) @@ -218,7 +265,7 @@ def _process_field(self, tag_name, count, field_type, type_length, offset): return values def _process_field2(self, ifd_name, tag_name, count, offset): - values = '' + values = "" # special case: null-terminated ASCII string # XXX investigate # sometimes gets too big to fit in int value @@ -229,33 +276,49 @@ def _process_field2(self, ifd_name, tag_name, count, offset): values = self.file_handle.read(count) # Drop any garbage after a null. - values = values.split(b'\x00', 1)[0] + values = values.split(b"\x00", 1)[0] if isinstance(values, bytes): try: - values = values.decode('utf-8') + values = values.decode("utf-8") except UnicodeDecodeError: - logger.warning('Possibly corrupted field %s in %s IFD', tag_name, ifd_name) + logger.warning( + "Possibly corrupted field %s in %s IFD", tag_name, ifd_name + ) except OverflowError: - logger.warning('OverflowError at position: %s, length: %s', file_position, count) - values = '' + logger.warning( + "OverflowError at position: %s, length: %s", file_position, count + ) + values = "" except MemoryError: - logger.warning('MemoryError at position: %s, length: %s', file_position, count) - values = '' + logger.warning( + "MemoryError at position: %s, length: %s", file_position, count + ) + values = "" return values - def _process_tag(self, ifd, ifd_name: str, tag_entry, entry, tag: int, tag_name, relative, stop_tag) -> None: + def _process_tag( + self, + ifd, + ifd_name: str, + tag_entry, + entry, + tag: int, + tag_name, + relative, + stop_tag, + ) -> None: field_type = s2n( file_handle=self.file_handle, offset=entry + self.tag_id_length, length=self.tag_fieldtype_length, signed=False, - endian=self.endian - ) + endian=self.endian, + ) # unknown field type if not 0 < field_type < len(FIELD_TYPES): if not self.strict: return - raise ValueError('Unknown type %d in tag 0x%04X' % (field_type, tag)) + raise ValueError("Unknown type %d in tag 0x%04X" % (field_type, tag)) type_length = FIELD_TYPES[field_type][0] count = s2n( @@ -263,12 +326,17 @@ def _process_tag(self, ifd, ifd_name: str, tag_entry, entry, tag: int, tag_name, offset=entry + self.tag_id_length + self.tag_fieldtype_length, length=self.number_of_entries_length, signed=False, - endian=self.endian - ) - # the length of this field (number of values) is the same as the bytesize_of_offset by chance (or by design?) + endian=self.endian, + ) + # the length of this field (number of values) is the same as the bytesize_of_offset by chance (or by design?) # Adjust for tag id/type/count (2+2+4 bytes) or in the case of bigtiff (2+2+8) # Now we point at either the data or the 2nd level offset - offset = entry + self.tag_id_length + self.tag_fieldtype_length + self.tag_nvalues_length + offset = ( + entry + + self.tag_id_length + + self.tag_fieldtype_length + + self.tag_nvalues_length + ) # If the value fits in 4 bytes, it is inlined, else we # need to jump ahead again. @@ -280,10 +348,11 @@ def _process_tag(self, ifd, ifd_name: str, tag_entry, entry, tag: int, tag_name, # other relative offsets, which would have to be computed here # slightly differently. tmp_offset = s2n( - file_handle=self.file_handle, - offset=offset, - length=self.bytesize_of_offset_value, - signed=False) + file_handle=self.file_handle, + offset=offset, + length=self.bytesize_of_offset_value, + signed=False, + ) if relative: offset = tmp_offset + ifd - 8 if self.fake_exif: @@ -295,14 +364,16 @@ def _process_tag(self, ifd, ifd_name: str, tag_entry, entry, tag: int, tag_name, if field_type == 2: values = self._process_field2(ifd_name, tag_name, count, offset) else: - values = self._process_field(tag_name, count, field_type, type_length, offset) + values = self._process_field( + tag_name, count, field_type, type_length, offset + ) # now 'values' is either a string or an array # TODO: use only one type if count == 1 and field_type != 2: printable = str(values[0]) elif count > 50 and len(values) > 20 and not isinstance(values, str): if self.truncate_tags: - printable = str(values[0:20])[0:-1] + ', ... ]' + printable = str(values[0:20])[0:-1] + ", ... ]" else: printable = str(values[0:-1]) else: @@ -317,23 +388,30 @@ def _process_tag(self, ifd, ifd_name: str, tag_entry, entry, tag: int, tag_name, elif isinstance(tag_entry[1], tuple): ifd_info = tag_entry[1] try: - logger.debug('%s SubIFD at offset %d:', ifd_info[0], values[0]) - self.dump_ifd(values[0], ifd_info[0], tag_dict=ifd_info[1], stop_tag=stop_tag) + logger.debug("%s SubIFD at offset %d:", ifd_info[0], values[0]) + self.dump_ifd( + values[0], + ifd_info[0], + tag_dict=ifd_info[1], + stop_tag=stop_tag, + ) except IndexError: - logger.warning('No values found for %s SubIFD', ifd_info[0]) + logger.warning("No values found for %s SubIFD", ifd_info[0]) else: - printable = '' + printable = "" for val in values: # use lookup table for this tag printable += tag_entry[1].get(val, repr(val)) - self.tags[ifd_name + ' ' + tag_name] = IfdTag( + self.tags[ifd_name + " " + tag_name] = IfdTag( printable, tag, field_type, values, field_offset, count * type_length ) - tag_value = repr(self.tags[ifd_name + ' ' + tag_name]) - logger.debug(' %s: %s', tag_name, tag_value) + tag_value = repr(self.tags[ifd_name + " " + tag_name]) + logger.debug(" %s: %s", tag_name, tag_value) - def dump_ifd(self, ifd, ifd_name: str, tag_dict=None, relative=0, stop_tag=DEFAULT_STOP_TAG) -> None: + def dump_ifd( + self, ifd, ifd_name: str, tag_dict=None, relative=0, stop_tag=DEFAULT_STOP_TAG + ) -> None: """ Return a list of entries in the given IFD. """ @@ -341,26 +419,47 @@ def dump_ifd(self, ifd, ifd_name: str, tag_dict=None, relative=0, stop_tag=DEFAU if tag_dict is None: tag_dict = EXIF_TAGS try: - entries = s2n(file_handle=self.file_handle, offset=ifd, length=self.number_of_entries_length, signed=False, endian=self.endian) + entries = s2n( + file_handle=self.file_handle, + offset=ifd, + length=self.number_of_entries_length, + signed=False, + endian=self.endian, + ) except TypeError: - logger.warning('Possibly corrupted IFD: %s', ifd) + logger.warning("Possibly corrupted IFD: %s", ifd) return for i in range(entries): # entry is index of start of this IFD in the file entry = self.offset_to_next_ifd(ifd, i) - tag_id = s2n(file_handle=self.file_handle, offset=entry, length=self.tag_id_length, signed=False, endian=self.endian) + tag_id = s2n( + file_handle=self.file_handle, + offset=entry, + length=self.tag_id_length, + signed=False, + endian=self.endian, + ) # get tag name early to avoid errors, help debug tag_entry = tag_dict.get(tag_id) if tag_entry: tag_name = tag_entry[0] else: - tag_name = 'Tag 0x%04X' % tag_id + tag_name = "Tag 0x%04X" % tag_id # ignore certain tags for faster processing if not (not self.detailed and tag_id in IGNORE_TAGS): - self._process_tag(ifd, ifd_name, tag_entry, entry, tag_id, tag_name, relative, stop_tag) + self._process_tag( + ifd, + ifd_name, + tag_entry, + entry, + tag_id, + tag_name, + relative, + stop_tag, + ) if tag_name == stop_tag: break @@ -372,19 +471,19 @@ def extract_tiff_thumbnail(self, thumb_ifd: int) -> None: Take advantage of the pre-existing layout in the thumbnail IFD as much as possible """ - thumb = self.tags.get('Thumbnail Compression') - if not thumb or thumb.printable != 'Uncompressed TIFF': + thumb = self.tags.get("Thumbnail Compression") + if not thumb or thumb.printable != "Uncompressed TIFF": return entries = self.s2n(thumb_ifd, 2) # this is header plus offset to IFD ... - if self.endian == 'M': - tiff = b'MM\x00*\x00\x00\x00\x08' + if self.endian == "M": + tiff = b"MM\x00*\x00\x00\x00\x08" else: - tiff = b'II*\x00\x08\x00\x00\x00' + tiff = b"II*\x00\x08\x00\x00\x00" # ... plus thumbnail IFD data plus a null "next IFD" pointer self.file_handle.seek(self.offset + thumb_ifd) - tiff += self.file_handle.read(entries * 12 + 2) + b'\x00\x00\x00\x00' + tiff += self.file_handle.read(entries * 12 + 2) + b"\x00\x00\x00\x00" # fix up large value offset pointers into data area for i in range(entries): @@ -405,7 +504,7 @@ def extract_tiff_thumbnail(self, thumb_ifd: int) -> None: # update offset pointer (nasty "strings are immutable" crap) # should be able to say "tiff[ptr:ptr+4]=newoff" newoff = len(tiff) - tiff = tiff[:ptr] + self.n2b(newoff, 4) + tiff[ptr + 4:] + tiff = tiff[:ptr] + self.n2b(newoff, 4) + tiff[ptr + 4 :] # remember strip offsets location if tag == 0x0111: strip_off = newoff @@ -415,18 +514,18 @@ def extract_tiff_thumbnail(self, thumb_ifd: int) -> None: tiff += self.file_handle.read(count * type_length) # add pixel strips and update strip offset info - old_offsets = self.tags['Thumbnail StripOffsets'].values - old_counts = self.tags['Thumbnail StripByteCounts'].values + old_offsets = self.tags["Thumbnail StripOffsets"].values + old_counts = self.tags["Thumbnail StripByteCounts"].values for i, old_offset in enumerate(old_offsets): # update offset pointer (more nasty "strings are immutable" crap) offset = self.n2b(len(tiff), strip_len) - tiff = tiff[:strip_off] + offset + tiff[strip_off + strip_len:] + tiff = tiff[:strip_off] + offset + tiff[strip_off + strip_len :] strip_off += strip_len # add pixel strip to end self.file_handle.seek(self.offset + old_offset) tiff += self.file_handle.read(old_counts[i]) - self.tags['TIFFThumbnail'] = tiff + self.tags["TIFFThumbnail"] = tiff def extract_jpeg_thumbnail(self) -> None: """ @@ -434,19 +533,21 @@ def extract_jpeg_thumbnail(self) -> None: (Thankfully the JPEG data is stored as a unit.) """ - thumb_offset = self.tags.get('Thumbnail JPEGInterchangeFormat') + thumb_offset = self.tags.get("Thumbnail JPEGInterchangeFormat") if thumb_offset: self.file_handle.seek(self.offset + thumb_offset.values[0]) - size = self.tags['Thumbnail JPEGInterchangeFormatLength'].values[0] - self.tags['JPEGThumbnail'] = self.file_handle.read(size) + size = self.tags["Thumbnail JPEGInterchangeFormatLength"].values[0] + self.tags["JPEGThumbnail"] = self.file_handle.read(size) # Sometimes in a TIFF file, a JPEG thumbnail is hidden in the MakerNote # since it's not allowed in a uncompressed TIFF IFD - if 'JPEGThumbnail' not in self.tags: - thumb_offset = self.tags.get('MakerNote JPEGThumbnail') + if "JPEGThumbnail" not in self.tags: + thumb_offset = self.tags.get("MakerNote JPEGThumbnail") if thumb_offset: self.file_handle.seek(self.offset + thumb_offset.values[0]) - self.tags['JPEGThumbnail'] = self.file_handle.read(thumb_offset.field_length) + self.tags["JPEGThumbnail"] = self.file_handle.read( + thumb_offset.field_length + ) def decode_maker_note(self) -> None: """ @@ -472,109 +573,130 @@ def decode_maker_note(self) -> None: TODO: look into splitting this up """ - note = self.tags['EXIF MakerNote'] + note = self.tags["EXIF MakerNote"] # Some apps use MakerNote tags but do not use a format for which we # have a description, so just do a raw dump for these. - make = self.tags['Image Make'].printable + make = self.tags["Image Make"].printable # Nikon # The maker note usually starts with the word Nikon, followed by the # type of the makernote (1 or 2, as a short). If the word Nikon is # not at the start of the makernote, it's probably type 2, since some # cameras work that way. - if 'NIKON' in make: + if "NIKON" in make: if note.values[0:7] == [78, 105, 107, 111, 110, 0, 1]: - logger.debug('Looks like a type 1 Nikon MakerNote.') - self.dump_ifd(note.field_offset + 8, 'MakerNote', - tag_dict=makernote.nikon.TAGS_OLD) + logger.debug("Looks like a type 1 Nikon MakerNote.") + self.dump_ifd( + note.field_offset + 8, + "MakerNote", + tag_dict=makernote.nikon.TAGS_OLD, + ) elif note.values[0:7] == [78, 105, 107, 111, 110, 0, 2]: - logger.debug('Looks like a labeled type 2 Nikon MakerNote') + logger.debug("Looks like a labeled type 2 Nikon MakerNote") if note.values[12:14] != [0, 42] and note.values[12:14] != [42, 0]: - raise ValueError('Missing marker tag 42 in MakerNote.') + raise ValueError("Missing marker tag 42 in MakerNote.") # skip the Makernote label and the TIFF header - self.dump_ifd(note.field_offset + 10 + 8, 'MakerNote', - tag_dict=makernote.nikon.TAGS_NEW, relative=1) + self.dump_ifd( + note.field_offset + 10 + 8, + "MakerNote", + tag_dict=makernote.nikon.TAGS_NEW, + relative=1, + ) else: # E99x or D1 - logger.debug('Looks like an unlabeled type 2 Nikon MakerNote') - self.dump_ifd(note.field_offset, 'MakerNote', - tag_dict=makernote.nikon.TAGS_NEW) + logger.debug("Looks like an unlabeled type 2 Nikon MakerNote") + self.dump_ifd( + note.field_offset, "MakerNote", tag_dict=makernote.nikon.TAGS_NEW + ) return # Olympus - if make.startswith('OLYMPUS'): - self.dump_ifd(note.field_offset + 8, 'MakerNote', tag_dict=makernote.olympus.TAGS) + if make.startswith("OLYMPUS"): + self.dump_ifd( + note.field_offset + 8, "MakerNote", tag_dict=makernote.olympus.TAGS + ) # TODO - #for i in (('MakerNote Tag 0x2020', makernote.OLYMPUS_TAG_0x2020),): + # for i in (('MakerNote Tag 0x2020', makernote.OLYMPUS_TAG_0x2020),): # self.decode_olympus_tag(self.tags[i[0]].values, i[1]) - #return + # return # Casio - if 'CASIO' in make or 'Casio' in make: - self.dump_ifd(note.field_offset, 'MakerNote', - tag_dict=makernote.casio.TAGS) + if "CASIO" in make or "Casio" in make: + self.dump_ifd(note.field_offset, "MakerNote", tag_dict=makernote.casio.TAGS) return # Fujifilm - if make == 'FUJIFILM': + if make == "FUJIFILM": # bug: everything else is "Motorola" endian, but the MakerNote # is "Intel" endian endian = self.endian - self.endian = 'I' + self.endian = "I" # bug: IFD offsets are from beginning of MakerNote, not # beginning of file header offset = self.offset self.offset += note.field_offset # process note with bogus values (note is actually at offset 12) - self.dump_ifd(12, 'MakerNote', tag_dict=makernote.fujifilm.TAGS) + self.dump_ifd(12, "MakerNote", tag_dict=makernote.fujifilm.TAGS) # reset to correct values self.endian = endian self.offset = offset return # Apple - if make == 'Apple' and note.values[0:10] == [65, 112, 112, 108, 101, 32, 105, 79, 83, 0]: + if make == "Apple" and note.values[0:10] == [ + 65, + 112, + 112, + 108, + 101, + 32, + 105, + 79, + 83, + 0, + ]: offset = self.offset self.offset += note.field_offset + 14 - self.dump_ifd(0, 'MakerNote', tag_dict=makernote.apple.TAGS) + self.dump_ifd(0, "MakerNote", tag_dict=makernote.apple.TAGS) self.offset = offset return - if make == 'DJI': + if make == "DJI": endian = self.endian - self.endian = 'I' + self.endian = "I" offset = self.offset self.offset += note.field_offset - self.dump_ifd(0, 'MakerNote', tag_dict=makernote.dji.TAGS) + self.dump_ifd(0, "MakerNote", tag_dict=makernote.dji.TAGS) self.offset = offset self.endian = endian return # Canon - if make == 'Canon': - self.dump_ifd(note.field_offset, 'MakerNote', - tag_dict=makernote.canon.TAGS) - - for i in (('MakerNote Tag 0x0001', makernote.canon.CAMERA_SETTINGS), - ('MakerNote Tag 0x0002', makernote.canon.FOCAL_LENGTH), - ('MakerNote Tag 0x0004', makernote.canon.SHOT_INFO), - ('MakerNote Tag 0x0026', makernote.canon.AF_INFO_2), - ('MakerNote Tag 0x0093', makernote.canon.FILE_INFO)): + if make == "Canon": + self.dump_ifd(note.field_offset, "MakerNote", tag_dict=makernote.canon.TAGS) + + for i in ( + ("MakerNote Tag 0x0001", makernote.canon.CAMERA_SETTINGS), + ("MakerNote Tag 0x0002", makernote.canon.FOCAL_LENGTH), + ("MakerNote Tag 0x0004", makernote.canon.SHOT_INFO), + ("MakerNote Tag 0x0026", makernote.canon.AF_INFO_2), + ("MakerNote Tag 0x0093", makernote.canon.FILE_INFO), + ): if i[0] in self.tags: - logger.debug('Canon %s', i[0]) + logger.debug("Canon %s", i[0]) self._canon_decode_tag(self.tags[i[0]].values, i[1]) del self.tags[i[0]] if makernote.canon.CAMERA_INFO_TAG_NAME in self.tags: tag = self.tags[makernote.canon.CAMERA_INFO_TAG_NAME] - logger.debug('Canon CameraInfo') + logger.debug("Canon CameraInfo") self._canon_decode_camera_info(tag) del self.tags[makernote.canon.CAMERA_INFO_TAG_NAME] return -# TODO Decode Olympus MakerNote tag based on offset within tag. -# def _olympus_decode_tag(self, value, mn_tags): -# pass + # TODO Decode Olympus MakerNote tag based on offset within tag. + # def _olympus_decode_tag(self, value, mn_tags): + # pass def _canon_decode_tag(self, value, mn_tags): """ @@ -583,10 +705,10 @@ def _canon_decode_tag(self, value, mn_tags): See http://www.burren.cx/david/canon.html by David Burren """ for i in range(1, len(value)): - tag = mn_tags.get(i, ('Unknown', )) + tag = mn_tags.get(i, ("Unknown",)) name = tag[0] if len(tag) > 1: - val = tag[1].get(value[i], 'Unknown') + val = tag[1].get(value[i], "Unknown") else: val = value[i] try: @@ -596,19 +718,19 @@ def _canon_decode_tag(self, value, mn_tags): # It's not a real IFD Tag but we fake one to make everybody happy. # This will have a "proprietary" type - self.tags['MakerNote ' + name] = IfdTag(str(val), 0, 0, val, 0, 0) + self.tags["MakerNote " + name] = IfdTag(str(val), 0, 0, val, 0, 0) def _canon_decode_camera_info(self, camera_info_tag): """ Decode the variable length encoded camera info section. """ - model = self.tags.get('Image Model', None) + model = self.tags.get("Image Model", None) if not model: return model = str(model.values) camera_info_tags = {} - for (model_name_re, tag_desc) in makernote.canon.CAMERA_INFO_MODEL_MAP.items(): + for model_name_re, tag_desc in makernote.canon.CAMERA_INFO_MODEL_MAP.items(): if re.search(model_name_re, model): camera_info_tags = tag_desc break @@ -619,7 +741,9 @@ def _canon_decode_camera_info(self, camera_info_tag): # Unknown) if camera_info_tag.field_type not in (1, 7): return - camera_info = struct.pack('<%dB' % len(camera_info_tag.values), *camera_info_tag.values) + camera_info = struct.pack( + "<%dB" % len(camera_info_tag.values), *camera_info_tag.values + ) # Look for each data value and decode it appropriately. for offset, tag in camera_info_tags.items(): @@ -627,7 +751,7 @@ def _canon_decode_camera_info(self, camera_info_tag): tag_size = struct.calcsize(tag_format) if len(camera_info) < offset + tag_size: continue - packed_tag_value = camera_info[offset:offset + tag_size] + packed_tag_value = camera_info[offset : offset + tag_size] tag_value = struct.unpack(tag_format, packed_tag_value)[0] tag_name = tag[0] @@ -638,7 +762,9 @@ def _canon_decode_camera_info(self, camera_info_tag): tag_value = tag[2].get(tag_value, tag_value) logger.debug(" %s %s", tag_name, tag_value) - self.tags['MakerNote ' + tag_name] = IfdTag(str(tag_value), 0, 0, tag_value, 0, 0) + self.tags["MakerNote " + tag_name] = IfdTag( + str(tag_value), 0, 0, tag_value, 0, 0 + ) def parse_xmp(self, xmp_bytes: bytes): """Adobe's Extensible Metadata Platform, just dump the pretty XML.""" @@ -655,10 +781,14 @@ def parse_xmp(self, xmp_bytes: bytes): pretty = xml.dom.minidom.parseString(xmp_string).toprettyxml() except xml.parsers.expat.ExpatError: logger.warning("XMP: XML is not well formed") - self.tags['Image ApplicationNotes'] = IfdTag(xmp_string, 0, 1, xmp_bytes, 0, 0) + self.tags["Image ApplicationNotes"] = IfdTag( + xmp_string, 0, 1, xmp_bytes, 0, 0 + ) return cleaned = [] for line in pretty.splitlines(): if line.strip(): cleaned.append(line) - self.tags['Image ApplicationNotes'] = IfdTag('\n'.join(cleaned), 0, 1, xmp_bytes, 0, 0) + self.tags["Image ApplicationNotes"] = IfdTag( + "\n".join(cleaned), 0, 1, xmp_bytes, 0, 0 + ) From af354ff8e0a58b2f0406b6417aec9970dcf55ba0 Mon Sep 17 00:00:00 2001 From: Miguel Delgado Date: Wed, 23 Aug 2023 16:00:51 +0200 Subject: [PATCH 12/16] everything parameterized --- exifread/classes.py | 75 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 65 insertions(+), 10 deletions(-) diff --git a/exifread/classes.py b/exifread/classes.py index 95c551d..4d9001a 100644 --- a/exifread/classes.py +++ b/exifread/classes.py @@ -133,9 +133,14 @@ def __init__( self.tag_id_length = 2 self.tag_fieldtype_length = 2 self.tag_nvalues_length = 8 if self.magic_number == 43 else 4 - self.inlining_threshold = ( - 8 if self.magic_number == 43 else 4 - ) # In classic TIFF, the tag data was written inside the tag structure, in the IFD, if its size was smaller than or equal to 4 bytes. Otherwise, it's written elsewhere, and pointed to. In BigTIFF, the tag data is written inside the tag structure, in the IFD, if its size is smaller than or equal to 8 bytes. + # In classic TIFF, + # the tag data was written inside the tag structure, + # in the IFD, if its size was smaller than or equal + # to 4 bytes. Otherwise, it's written elsewhere, + # and pointed to. In BigTIFF, the tag data is written + # inside the tag structure, in the IFD, if its + # size is smaller than or equal to 8 bytes: + self.inlining_threshold = 8 if self.magic_number == 43 else 4 self.max_tag_length = ( self.tag_id_length + self.tag_fieldtype_length @@ -224,7 +229,20 @@ def _process_field(self, tag_name, count, field_type, type_length, offset): if field_type in (5, 10): # a ratio value = Ratio( - self.s2n(offset, 4, signed), self.s2n(offset + 4, 4, signed) + s2n( + file_handle=self.file_handle, + offset=offset, + length=4, + signed=signed, + endian=self.endian, + ), + s2n( + file_handle=self.file_handle, + offset=offset + 4, + length=4, + signed=signed, + endian=self.endian, + ), ) elif field_type in (11, 12): # a float or double @@ -259,7 +277,13 @@ def _process_field(self, tag_name, count, field_type, type_length, offset): # supposed to have long values! Fix up one important case. elif tag_name in ("MakerNote", makernote.canon.CAMERA_INFO_TAG_NAME): for _ in range(count): - value = self.s2n(offset, type_length, signed) + value = s2n( + file_handle=self.file_handle, + offset=offset, + length=type_length, + signed=signed, + endian=self.endian, + ) values.append(value) offset = offset + type_length return values @@ -475,7 +499,14 @@ def extract_tiff_thumbnail(self, thumb_ifd: int) -> None: if not thumb or thumb.printable != "Uncompressed TIFF": return - entries = self.s2n(thumb_ifd, 2) + entries = s2n( + file_handle=self.file_handle, + offset=thumb_ifd, + length=2, + signed=False, + endian=self.endian, + ) + # this is header plus offset to IFD ... if self.endian == "M": tiff = b"MM\x00*\x00\x00\x00\x08" @@ -488,11 +519,35 @@ def extract_tiff_thumbnail(self, thumb_ifd: int) -> None: # fix up large value offset pointers into data area for i in range(entries): entry = thumb_ifd + 2 + 12 * i - tag = self.s2n(entry, 2) - field_type = self.s2n(entry + 2, 2) + tag = s2n( + file_handle=self.file_handle, + offset=entry, + length=2, + signed=False, + endian=self.endian, + ) + field_type = s2n( + file_handle=self.file_handle, + offset=entry+2, + length=2, + signed=False, + endian=self.endian, + ) type_length = FIELD_TYPES[field_type][0] - count = self.s2n(entry + 4, 4) - old_offset = self.s2n(entry + 8, 4) + count = s2n( + file_handle=self.file_handle, + offset=entry+4, + length=4, + signed=False, + endian=self.endian, + ) + old_offset = s2n( + file_handle=self.file_handle, + offset=entry+8, + length=4, + signed=False, + endian=self.endian, + ) # start of the 4-byte pointer area in entry ptr = i * 12 + 18 # remember strip offsets location From 9d7c3535a54d1e6d854d00d7335245c513000a5c Mon Sep 17 00:00:00 2001 From: Miguel Delgado Date: Thu, 24 Aug 2023 10:28:36 +0200 Subject: [PATCH 13/16] splits into ifd and tag classes --- exifread/classes.py | 136 +++++++++++++++++++++++++++----------------- 1 file changed, 84 insertions(+), 52 deletions(-) diff --git a/exifread/classes.py b/exifread/classes.py index 4d9001a..4292c2c 100644 --- a/exifread/classes.py +++ b/exifread/classes.py @@ -1,6 +1,7 @@ import re import struct from typing import BinaryIO, Dict, Any +from dataclasses import dataclass from exifread.exif_log import get_logger from exifread.utils import Ratio @@ -101,6 +102,56 @@ def __repr__(self) -> str: return tag +class IfdAttrs: + def __init__(self, magic_number: int) -> None: + self.magic_number: int = magic_number + + @property + def offset_to_ifd_length(self) -> int: + return 8 if self.magic_number == 43 else 4 + + @property + def ntag_length(self) -> int: + return 8 if self.magic_number == 43 else 2 + + +class TagAttrs: + """ + Tag attributes as described in https://www.awaresystems.be/imaging/tiff/bigtiff.html + Tiff and BigTiff differ mainly in length of fields. + """ + + def __init__(self, magic_number: int) -> None: + self.magic_number: int = magic_number + self.id_length: int = 2 + self.fieldtype_length: int = 2 + + @property + def nvalues_length(self) -> int: + return 8 if self.magic_number == 43 else 4 + + @property + def inlining_threshold(self) -> int: + """ + In classic TIFF, the tag data was + written inside the tag structure, + in the IFD, if its size was smaller than or equal + to 4 bytes. Otherwise, it's written elsewhere, + and pointed to. In BigTIFF, the tag data is written + inside the tag structure, in the IFD, if its + size is smaller than or equal to 8 bytes:""" + return 8 if self.magic_number == 43 else 4 + + @property + def total_length(self) -> int: + return ( + self.id_length + + self.fieldtype_length + + self.nvalues_length + + self.inlining_threshold + ) + + class ExifHeader: """ Handle an EXIF header. @@ -128,26 +179,6 @@ def __init__( signed=False, endian=self.endian, ) - self.number_of_entries_length = 8 if self.magic_number == 43 else 2 - self.bytesize_of_offset_value = 8 if self.magic_number == 43 else 4 - self.tag_id_length = 2 - self.tag_fieldtype_length = 2 - self.tag_nvalues_length = 8 if self.magic_number == 43 else 4 - # In classic TIFF, - # the tag data was written inside the tag structure, - # in the IFD, if its size was smaller than or equal - # to 4 bytes. Otherwise, it's written elsewhere, - # and pointed to. In BigTIFF, the tag data is written - # inside the tag structure, in the IFD, if its - # size is smaller than or equal to 8 bytes: - self.inlining_threshold = 8 if self.magic_number == 43 else 4 - self.max_tag_length = ( - self.tag_id_length - + self.tag_fieldtype_length - + self.tag_nvalues_length - + self.inlining_threshold - ) # max tag length - self.file_handle = file_handle self.offset = offset self.fake_exif = fake_exif @@ -157,10 +188,11 @@ def __init__( self.truncate_tags = truncate_tags # TODO: get rid of 'Any' type self.tags = {} # type: Dict[str, Any] + self.ifd_attrs = IfdAttrs(self.magic_number) + self.tag_attrs = TagAttrs(self.magic_number) def offset_to_next_ifd(self, ifd, entries): - # https://www.awaresystems.be/imaging/tiff/bigtiff.html - return ifd + self.number_of_entries_length + entries * self.max_tag_length + return ifd + self.ifd_attrs.ntag_length + entries * self.tag_attrs.total_length def n2b(self, offset, length) -> bytes: """Convert offset to bytes.""" @@ -177,7 +209,7 @@ def _first_ifd(self) -> int: """Return the pointer to first IFD.""" return s2n( self.file_handle, - offset=self.bytesize_of_offset_value, + offset=self.ifd_attrs.offset_to_ifd_length, length=1, signed=False, endian=self.endian, @@ -188,14 +220,14 @@ def _next_ifd(self, ifd) -> int: entries = s2n( file_handle=self.file_handle, offset=ifd, - length=self.number_of_entries_length, + length=self.ifd_attrs.ntag_length, signed=False, endian=self.endian, ) next_ifd = s2n( file_handle=self.file_handle, offset=self.offset_to_next_ifd(ifd, entries), - length=self.bytesize_of_offset_value, + length=self.ifd_attrs.offset_to_ifd_length, endian=self.endian, ) # bytesize of offsets is 8 in bigtiff and 4 in tiff @@ -278,12 +310,12 @@ def _process_field(self, tag_name, count, field_type, type_length, offset): elif tag_name in ("MakerNote", makernote.canon.CAMERA_INFO_TAG_NAME): for _ in range(count): value = s2n( - file_handle=self.file_handle, - offset=offset, - length=type_length, - signed=signed, - endian=self.endian, - ) + file_handle=self.file_handle, + offset=offset, + length=type_length, + signed=signed, + endian=self.endian, + ) values.append(value) offset = offset + type_length return values @@ -333,8 +365,8 @@ def _process_tag( ) -> None: field_type = s2n( file_handle=self.file_handle, - offset=entry + self.tag_id_length, - length=self.tag_fieldtype_length, + offset=entry + self.tag_attrs.id_length, + length=self.tag_attrs.fieldtype_length, signed=False, endian=self.endian, ) @@ -347,8 +379,8 @@ def _process_tag( type_length = FIELD_TYPES[field_type][0] count = s2n( file_handle=self.file_handle, - offset=entry + self.tag_id_length + self.tag_fieldtype_length, - length=self.number_of_entries_length, + offset=entry + self.tag_attrs.id_length + self.tag_attrs.fieldtype_length, + length=self.ifd_attrs.ntag_length, signed=False, endian=self.endian, ) @@ -357,14 +389,14 @@ def _process_tag( # Now we point at either the data or the 2nd level offset offset = ( entry - + self.tag_id_length - + self.tag_fieldtype_length - + self.tag_nvalues_length + + self.tag_attrs.id_length + + self.tag_attrs.fieldtype_length + + self.tag_attrs.nvalues_length ) - # If the value fits in 4 bytes, it is inlined, else we + # If the value fits in 4 bytes (8 bytes in the case of bigtiff), it is inlined, else we # need to jump ahead again. - if count * type_length > self.inlining_threshold: + if count * type_length > self.tag_attrs.inlining_threshold: # offset is not the value; it's a pointer to the value # if relative we set things up so s2n will seek to the right # place when it adds self.offset. Note that this 'relative' @@ -374,7 +406,7 @@ def _process_tag( tmp_offset = s2n( file_handle=self.file_handle, offset=offset, - length=self.bytesize_of_offset_value, + length=self.ifd_attrs.offset_to_ifd_length, signed=False, ) if relative: @@ -446,7 +478,7 @@ def dump_ifd( entries = s2n( file_handle=self.file_handle, offset=ifd, - length=self.number_of_entries_length, + length=self.ifd_attrs.ntag_length, signed=False, endian=self.endian, ) @@ -460,7 +492,7 @@ def dump_ifd( tag_id = s2n( file_handle=self.file_handle, offset=entry, - length=self.tag_id_length, + length=self.tag_attrs.id_length, signed=False, endian=self.endian, ) @@ -500,12 +532,12 @@ def extract_tiff_thumbnail(self, thumb_ifd: int) -> None: return entries = s2n( - file_handle=self.file_handle, - offset=thumb_ifd, - length=2, - signed=False, - endian=self.endian, - ) + file_handle=self.file_handle, + offset=thumb_ifd, + length=2, + signed=False, + endian=self.endian, + ) # this is header plus offset to IFD ... if self.endian == "M": @@ -528,7 +560,7 @@ def extract_tiff_thumbnail(self, thumb_ifd: int) -> None: ) field_type = s2n( file_handle=self.file_handle, - offset=entry+2, + offset=entry + 2, length=2, signed=False, endian=self.endian, @@ -536,14 +568,14 @@ def extract_tiff_thumbnail(self, thumb_ifd: int) -> None: type_length = FIELD_TYPES[field_type][0] count = s2n( file_handle=self.file_handle, - offset=entry+4, + offset=entry + 4, length=4, signed=False, endian=self.endian, ) old_offset = s2n( file_handle=self.file_handle, - offset=entry+8, + offset=entry + 8, length=4, signed=False, endian=self.endian, From fe531be3f67f80ace3875a5a4d37633ff2c41467 Mon Sep 17 00:00:00 2001 From: Miguel Delgado Date: Thu, 24 Aug 2023 10:33:05 +0200 Subject: [PATCH 14/16] additional field --- exifread/classes.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/exifread/classes.py b/exifread/classes.py index 4292c2c..2530989 100644 --- a/exifread/classes.py +++ b/exifread/classes.py @@ -106,6 +106,10 @@ class IfdAttrs: def __init__(self, magic_number: int) -> None: self.magic_number: int = magic_number + @property + def offset_to_first_ifd(self) -> int: + return 8 if self.magic_number == 43 else 4 + @property def offset_to_ifd_length(self) -> int: return 8 if self.magic_number == 43 else 4 @@ -209,8 +213,8 @@ def _first_ifd(self) -> int: """Return the pointer to first IFD.""" return s2n( self.file_handle, - offset=self.ifd_attrs.offset_to_ifd_length, - length=1, + offset=self.ifd_attrs.offset_to_first_ifd, + length=self.ifd_attrs.offset_to_ifd_length, signed=False, endian=self.endian, ) From a3ae3e0899af6cb74d75b9010718b523a7d9f117 Mon Sep 17 00:00:00 2001 From: Miguel Delgado Date: Thu, 24 Aug 2023 11:03:21 +0200 Subject: [PATCH 15/16] add additional attributes as cached properties --- exifread/classes.py | 66 ++++++++++++++++++++++++++------------------- 1 file changed, 39 insertions(+), 27 deletions(-) diff --git a/exifread/classes.py b/exifread/classes.py index 2530989..001bdfe 100644 --- a/exifread/classes.py +++ b/exifread/classes.py @@ -1,7 +1,7 @@ import re import struct from typing import BinaryIO, Dict, Any -from dataclasses import dataclass +from functools import cached_property from exifread.exif_log import get_logger from exifread.utils import Ratio @@ -124,7 +124,7 @@ class TagAttrs: Tag attributes as described in https://www.awaresystems.be/imaging/tiff/bigtiff.html Tiff and BigTiff differ mainly in length of fields. """ - + def __init__(self, magic_number: int) -> None: self.magic_number: int = magic_number self.id_length: int = 2 @@ -176,13 +176,6 @@ def __init__( based on https://www.awaresystems.be/imaging/tiff/bigtiff.html#structures """ self.endian = endian - self.magic_number = s2n( - file_handle=file_handle, - offset=2, - length=1, - signed=False, - endian=self.endian, - ) self.file_handle = file_handle self.offset = offset self.fake_exif = fake_exif @@ -192,8 +185,24 @@ def __init__( self.truncate_tags = truncate_tags # TODO: get rid of 'Any' type self.tags = {} # type: Dict[str, Any] - self.ifd_attrs = IfdAttrs(self.magic_number) - self.tag_attrs = TagAttrs(self.magic_number) + + @cached_property + def magic_number(self): + return s2n( + file_handle=self.file_handle, + offset=self.offset + 2, + length=1, + signed=False, + endian=self.endian, + ) + + @cached_property + def ifd_attrs(self): + return IfdAttrs(self.magic_number) + + @cached_property + def tag_attrs(self): + return TagAttrs(self.magic_number) def offset_to_next_ifd(self, ifd, entries): return ifd + self.ifd_attrs.ntag_length + entries * self.tag_attrs.total_length @@ -213,7 +222,7 @@ def _first_ifd(self) -> int: """Return the pointer to first IFD.""" return s2n( self.file_handle, - offset=self.ifd_attrs.offset_to_first_ifd, + offset=self.offset + self.ifd_attrs.offset_to_first_ifd, length=self.ifd_attrs.offset_to_ifd_length, signed=False, endian=self.endian, @@ -223,14 +232,14 @@ def _next_ifd(self, ifd) -> int: """Return the pointer to next IFD.""" entries = s2n( file_handle=self.file_handle, - offset=ifd, + offset=self.offset + ifd, length=self.ifd_attrs.ntag_length, signed=False, endian=self.endian, ) next_ifd = s2n( file_handle=self.file_handle, - offset=self.offset_to_next_ifd(ifd, entries), + offset=self.offset + self.offset_to_next_ifd(ifd, entries), length=self.ifd_attrs.offset_to_ifd_length, endian=self.endian, ) @@ -267,14 +276,14 @@ def _process_field(self, tag_name, count, field_type, type_length, offset): value = Ratio( s2n( file_handle=self.file_handle, - offset=offset, + offset=self.offset + offset, length=4, signed=signed, endian=self.endian, ), s2n( file_handle=self.file_handle, - offset=offset + 4, + offset=self.offset + offset + 4, length=4, signed=signed, endian=self.endian, @@ -302,7 +311,7 @@ def _process_field(self, tag_name, count, field_type, type_length, offset): else: value = s2n( file_handle=self.file_handle, - offset=offset, + offset=self.offset + offset, length=type_length, signed=signed, endian=self.endian, @@ -315,7 +324,7 @@ def _process_field(self, tag_name, count, field_type, type_length, offset): for _ in range(count): value = s2n( file_handle=self.file_handle, - offset=offset, + offset=self.offset + offset, length=type_length, signed=signed, endian=self.endian, @@ -369,7 +378,7 @@ def _process_tag( ) -> None: field_type = s2n( file_handle=self.file_handle, - offset=entry + self.tag_attrs.id_length, + offset=self.offset + entry + self.tag_attrs.id_length, length=self.tag_attrs.fieldtype_length, signed=False, endian=self.endian, @@ -383,7 +392,10 @@ def _process_tag( type_length = FIELD_TYPES[field_type][0] count = s2n( file_handle=self.file_handle, - offset=entry + self.tag_attrs.id_length + self.tag_attrs.fieldtype_length, + offset=self.offset + + entry + + self.tag_attrs.id_length + + self.tag_attrs.fieldtype_length, length=self.ifd_attrs.ntag_length, signed=False, endian=self.endian, @@ -409,7 +421,7 @@ def _process_tag( # slightly differently. tmp_offset = s2n( file_handle=self.file_handle, - offset=offset, + offset=self.offset + offset, length=self.ifd_attrs.offset_to_ifd_length, signed=False, ) @@ -481,7 +493,7 @@ def dump_ifd( try: entries = s2n( file_handle=self.file_handle, - offset=ifd, + offset=self.offset + ifd, length=self.ifd_attrs.ntag_length, signed=False, endian=self.endian, @@ -495,7 +507,7 @@ def dump_ifd( entry = self.offset_to_next_ifd(ifd, i) tag_id = s2n( file_handle=self.file_handle, - offset=entry, + offset=self.offset + entry, length=self.tag_attrs.id_length, signed=False, endian=self.endian, @@ -537,7 +549,7 @@ def extract_tiff_thumbnail(self, thumb_ifd: int) -> None: entries = s2n( file_handle=self.file_handle, - offset=thumb_ifd, + offset=self.offset + thumb_ifd, length=2, signed=False, endian=self.endian, @@ -557,7 +569,7 @@ def extract_tiff_thumbnail(self, thumb_ifd: int) -> None: entry = thumb_ifd + 2 + 12 * i tag = s2n( file_handle=self.file_handle, - offset=entry, + offset=self.offset + entry, length=2, signed=False, endian=self.endian, @@ -572,14 +584,14 @@ def extract_tiff_thumbnail(self, thumb_ifd: int) -> None: type_length = FIELD_TYPES[field_type][0] count = s2n( file_handle=self.file_handle, - offset=entry + 4, + offset=self.offset + entry + 4, length=4, signed=False, endian=self.endian, ) old_offset = s2n( file_handle=self.file_handle, - offset=entry + 8, + offset=self.offset + entry + 8, length=4, signed=False, endian=self.endian, From 04c8be1897ef65a7dcee95c81fc5c74860156645 Mon Sep 17 00:00:00 2001 From: Miguel Delgado Date: Mon, 18 Sep 2023 15:18:50 +0100 Subject: [PATCH 16/16] remove file added by accident --- test.py | 55 ------------------------------------------------------- 1 file changed, 55 deletions(-) delete mode 100644 test.py diff --git a/test.py b/test.py deleted file mode 100644 index 678c90b..0000000 --- a/test.py +++ /dev/null @@ -1,55 +0,0 @@ -from pathlib import Path -import zipfile -from exifread import process_file - -BIG_TIFF_PACKAGE: str = "compressed_asset.large.zip" -TIFF_PACKAGE: str = "compressed_asset.zip" -PARENT_PATH: str = "/Users/jose.delgado/delivery_data" -path = zipfile.Path(Path(PARENT_PATH).joinpath(BIG_TIFF_PACKAGE)) - -for file_name in path.root.namelist(): - if ".tif" not in file_name.lower(): - continue - path.at = file_name - with path.open("rb") as fo: - header = process_file(fo) - break -for key, value in header.items(): - if value.tag == 34737: - print(value.values) -# print(header) - -""" -Struggling with the following byte string: b'E\x00\x00\x7fC\xb0\xd4\x00' - -- `E` is a shift-out character with no practical use https://en.wikipedia.org/wiki/Shift_Out_and_Shift_In_characters -- `\x7f` is a delete charater https://en.wikipedia.org/wiki/Delete_character -- `C` is form feed, or page break https://en.wikipedia.org/wiki/Page_break -- `\x00` is NULL and every ASCII string should end in a NULL - -So the byte string should break into something like: -b'\x00\x00' -b'\xb0\xd4\x00' - -Also from the tiff specification: - -Note on ASCII Keys: - -Special handling is required for -ASCII-valued keys. While it is true that -TIFF 6.0 permits multiple NULL-delimited -strings within a single ASCII tag, the secondary -strings might not appear in the output of naive "tiffdump" -programs. For this reason, the null delimiter of each -ASCII Key value shall be converted to -a "|" (pipe) character before being -installed back into the ASCII holding -tag, so that a dump of the tag will look like this. - -"|" is b'\x7c' in hex - -values.replace(b'b'\x7c',) - - AsciiTag="first_value|second_value|etc...last_value|" - -""" \ No newline at end of file