From f484b17a7f28fa8b03e133e27ab8fc8c6b9a6e8c Mon Sep 17 00:00:00 2001 From: David Stokes Date: Wed, 20 Mar 2024 11:30:12 -0400 Subject: [PATCH] Added caching, etc. --- idelib/dataset.py | 6 +++++ idelib/userdata.py | 61 +++++++++++++++++++++++++++++++++++++--------- 2 files changed, 55 insertions(+), 12 deletions(-) diff --git a/idelib/dataset.py b/idelib/dataset.py index 521e2a1..176f13e 100644 --- a/idelib/dataset.py +++ b/idelib/dataset.py @@ -49,6 +49,7 @@ from collections.abc import Iterable, Sequence from datetime import datetime from threading import Lock +from typing import Any, Dict, Optional import warnings import os.path @@ -260,6 +261,11 @@ def __init__(self, stream, name=None, quiet=True, attributes=None): self.loading = True self.filename = getattr(stream, "name", None) + # For keeping user-defined data + self._userdata: Optional[Dict[str, Any]] = None + self._userdataOffset: Optional[int] = None + self._filesize: Optional[int] = None + self._channelDataLock = Lock() # Subsets: used when importing multiple files into the same dataset. diff --git a/idelib/userdata.py b/idelib/userdata.py index d45c949..5d2c666 100644 --- a/idelib/userdata.py +++ b/idelib/userdata.py @@ -22,13 +22,21 @@ # #=============================================================================== -def getUserDataPos(dataset: Dataset) -> Tuple[bool, int, int]: +def getUserDataPos(dataset: Dataset, + refresh: bool = False) -> Tuple[bool, int, int]: """ Get the offset of the start of the user data. :param dataset: The `Dataset` in which to locate the user data. + :param refresh: If `True`, ignore any cached values and re-read + from the file. :return: A tuple containing a bool (wheter or not data exists), the offset of the user data, and the total length of the file. + Offset and filesize will typically be the same if there is no + user data. 
""" + if not refresh and dataset._userdataOffset and dataset._filesize: + return bool(dataset._userdata), dataset._userdataOffset, dataset._filesize + doc = dataset.ebmldoc fs = doc.stream hasdata = False @@ -37,10 +45,13 @@ def getUserDataPos(dataset: Dataset) -> Tuple[bool, int, int]: filesize = fs.seek(0, os.SEEK_END) offset = filesize + # The UserDataOffset is a known, fixed size example = doc.schema['UserDataOffset'].encode(1, length=8, lengthSize=8) header = example[:-8] try: + # UserDataOffset *should* be right at the end of the file, but + # don't assume so. Start some bytes back and find the header. pos = offset - int(len(example) * 1.5) fs.seek(pos, os.SEEK_SET) chunk = fs.read() @@ -57,6 +68,8 @@ def getUserDataPos(dataset: Dataset) -> Tuple[bool, int, int]: finally: fs.seek(oldpos, os.SEEK_SET) + dataset._userdataOffset = offset + dataset._filesize = filesize return hasdata, offset, filesize @@ -64,27 +77,36 @@ def getUserDataPos(dataset: Dataset) -> Tuple[bool, int, int]: # #=============================================================================== -def readUserData(dataset: Dataset) -> Union[Dict[str, Any], None]: +def readUserData(dataset: Dataset, + refresh: bool = False) -> Union[Dict[str, Any], None]: """ Read application-specific user data from the end of an IDE file. :param dataset: The `Dataset` from which to read the user data. + :param refresh:: If `True`, ignore any cached values and re-read + from the file. :return: A dictionary of user data, or `None` if no user data could be read from the file (e.g., none exists). 
""" + if not refresh and dataset._userdataOffset and dataset._filesize: + return dataset._userdata + doc = dataset.ebmldoc fs = doc.stream oldpos = fs.tell() - hasdata, offset, filesize = getUserDataPos(dataset) + hasdata, offset, filesize = getUserDataPos(dataset, refresh=refresh) if not hasdata: logger.debug('No user data found') + dataset._userdata = None return None try: fs.seek(offset, os.SEEK_SET) data, _next = doc.parseElement(fs) - return data.dump() + dump = data.dump() + dataset._userdata = dump + return dump finally: fs.seek(oldpos, os.SEEK_SET) @@ -95,7 +117,8 @@ def readUserData(dataset: Dataset) -> Union[Dict[str, Any], None]: #=============================================================================== def writeUserData(dataset: Dataset, - userdata: Dict[str, Any]): + userdata: Dict[str, Any], + refresh: bool = False): """ Write user data to the end of an IDE file. :param dataset: The `Dataset` from which to read the user data. @@ -103,23 +126,36 @@ def writeUserData(dataset: Dataset, existing user data. Note that the file will not get smaller if the user data is removed (or the new user data is smaller); it is just overwritten with null data (an EBML `Void` element). + :param refresh: If `True`, ignore any cached values and find the + position in the file to which to write. 
""" schema = dataset.ebmldoc.schema fs = dataset.ebmldoc.stream oldpos = fs.tell() try: - _hasdata, offset, filesize = getUserDataPos(dataset) - - dataBin = schema.encodes({'UserData': userdata}) - offsetBin = schema['UserDataOffset'].encode(offset, length=8, lengthSize=8) - newsize = (len(offsetBin) + len(dataBin) + offset + MIN_VOID_SIZE) - voidBin = schema['Void'].encode(None, length=max(0, filesize - newsize), - lengthSize=8) + hasdata, offset, filesize = getUserDataPos(dataset, refresh=refresh) + + if userdata: + # User data consists of a `UserData` element, a `Void`, and `UserDataOffset` + dataBin = schema.encodes({'UserData': userdata or {}}) + offsetBin = schema['UserDataOffset'].encode(offset, length=8, lengthSize=8) + newsize = (len(offsetBin) + len(dataBin) + offset + MIN_VOID_SIZE) + voidBin = schema['Void'].encode(None, length=max(0, filesize - newsize), + lengthSize=8) + else: + # No new userdata, just write 'Void' over any existing userdata + dataset._userdata = userdata + if not hasdata: + return + newsize = filesize + dataBin = offsetBin = b'' + voidBin = schema['Void'].encode(None, length=max(0, filesize - MIN_VOID_SIZE)) userblob = dataBin + voidBin + offsetBin if '+' not in fs.mode and 'w' not in fs.mode: + # File/stream is read-only; attempt to create a new file stream. if not getattr(fs, 'name', None): logger.debug(f'(userdata) Dataset stream read only (mode {fs.mode!r}) ' 'and has no name, not writing user data') @@ -134,6 +170,7 @@ def writeUserData(dataset: Dataset, fs.seek(offset, os.SEEK_SET) fs.write(userblob) + dataset._userdata = userdata logger.debug(f'(userdata) Wrote {len(userblob)} bytes to {dataset} ' f'(file was {filesize}, now {newsize})')