Skip to content

Commit

Permalink
Added caching, etc.
Browse files Browse the repository at this point in the history
  • Loading branch information
StokesMIDE committed Mar 20, 2024
1 parent 4792951 commit f484b17
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 12 deletions.
6 changes: 6 additions & 0 deletions idelib/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
from collections.abc import Iterable, Sequence
from datetime import datetime
from threading import Lock
from typing import Any, Dict, Optional
import warnings

import os.path
Expand Down Expand Up @@ -260,6 +261,11 @@ def __init__(self, stream, name=None, quiet=True, attributes=None):
self.loading = True
self.filename = getattr(stream, "name", None)

# For keeping user-defined data
self._userdata: Optional[Dict[str, Any]] = None
self._userdataOffset: Optional[int] = None
self._filesize: Optional[int] = None

self._channelDataLock = Lock()

# Subsets: used when importing multiple files into the same dataset.
Expand Down
61 changes: 49 additions & 12 deletions idelib/userdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,21 @@
#
#===============================================================================

def getUserDataPos(dataset: Dataset) -> Tuple[bool, int, int]:
def getUserDataPos(dataset: Dataset,
refresh: bool = False) -> Tuple[bool, int, int]:
""" Get the offset of the start of the user data.
:param dataset: The `Dataset` in which to locate the user data.
:param refresh: If `True`, ignore any cached values and re-read
from the file.
:return: A tuple containing a bool (whether or not data exists),
the offset of the user data, and the total length of the file.
Offset and filesize will typically be the same if there is no
user data.
"""
if not refresh and dataset._userdataOffset and dataset._filesize:
return bool(dataset._userdata), dataset._userdataOffset, dataset._filesize

doc = dataset.ebmldoc
fs = doc.stream
hasdata = False
Expand All @@ -37,10 +45,13 @@ def getUserDataPos(dataset: Dataset) -> Tuple[bool, int, int]:
filesize = fs.seek(0, os.SEEK_END)
offset = filesize

# The UserDataOffset is a known, fixed size
example = doc.schema['UserDataOffset'].encode(1, length=8, lengthSize=8)
header = example[:-8]

try:
# UserDataOffset *should* be right at the end of the file, but
# don't assume so. Start some bytes back and find the header.
pos = offset - int(len(example) * 1.5)
fs.seek(pos, os.SEEK_SET)
chunk = fs.read()
Expand All @@ -57,34 +68,45 @@ def getUserDataPos(dataset: Dataset) -> Tuple[bool, int, int]:
finally:
fs.seek(oldpos, os.SEEK_SET)

dataset._userdataOffset = offset
dataset._filesize = filesize
return hasdata, offset, filesize


#===============================================================================
#
#===============================================================================

def readUserData(dataset: Dataset) -> Union[Dict[str, Any], None]:
def readUserData(dataset: Dataset,
refresh: bool = False) -> Union[Dict[str, Any], None]:
""" Read application-specific user data from the end of an IDE file.
:param dataset: The `Dataset` from which to read the user data.
:param refresh: If `True`, ignore any cached values and re-read
from the file.
:return: A dictionary of user data, or `None` if no user data
could be read from the file (e.g., none exists).
"""
if not refresh and dataset._userdataOffset and dataset._filesize:
return dataset._userdata

doc = dataset.ebmldoc
fs = doc.stream
oldpos = fs.tell()

hasdata, offset, filesize = getUserDataPos(dataset)
hasdata, offset, filesize = getUserDataPos(dataset, refresh=refresh)

if not hasdata:
logger.debug('No user data found')
dataset._userdata = None
return None

try:
fs.seek(offset, os.SEEK_SET)
data, _next = doc.parseElement(fs)
return data.dump()
dump = data.dump()
dataset._userdata = dump
return dump

finally:
fs.seek(oldpos, os.SEEK_SET)
Expand All @@ -95,31 +117,45 @@ def readUserData(dataset: Dataset) -> Union[Dict[str, Any], None]:
#===============================================================================

def writeUserData(dataset: Dataset,
userdata: Dict[str, Any]):
userdata: Dict[str, Any],
refresh: bool = False):
""" Write user data to the end of an IDE file.
:param dataset: The `Dataset` from which to read the user data.
:param userdata: A dictionary of user data, or `None` to remove
existing user data. Note that the file will not get smaller if
the user data is removed (or the new user data is smaller);
it is just overwritten with null data (an EBML `Void` element).
:param refresh: If `True`, ignore any cached values and find the
position in the file to which to write.
"""
schema = dataset.ebmldoc.schema
fs = dataset.ebmldoc.stream
oldpos = fs.tell()

try:
_hasdata, offset, filesize = getUserDataPos(dataset)

dataBin = schema.encodes({'UserData': userdata})
offsetBin = schema['UserDataOffset'].encode(offset, length=8, lengthSize=8)
newsize = (len(offsetBin) + len(dataBin) + offset + MIN_VOID_SIZE)
voidBin = schema['Void'].encode(None, length=max(0, filesize - newsize),
lengthSize=8)
hasdata, offset, filesize = getUserDataPos(dataset, refresh=refresh)

if userdata:
# User data consists of a `UserData` element, a `Void`, and `UserDataOffset`
dataBin = schema.encodes({'UserData': userdata or {}})
offsetBin = schema['UserDataOffset'].encode(offset, length=8, lengthSize=8)
newsize = (len(offsetBin) + len(dataBin) + offset + MIN_VOID_SIZE)
voidBin = schema['Void'].encode(None, length=max(0, filesize - newsize),
lengthSize=8)
else:
# No new userdata, just write 'Void' over any existing userdata
dataset._userdata = userdata
if not hasdata:
return
newsize = filesize
dataBin = offsetBin = b''
voidBin = schema['Void'].encode(None, length=max(0, filesize - MIN_VOID_SIZE))

userblob = dataBin + voidBin + offsetBin

if '+' not in fs.mode and 'w' not in fs.mode:
# File/stream is read-only; attempt to create a new file stream.
if not getattr(fs, 'name', None):
logger.debug(f'(userdata) Dataset stream read only (mode {fs.mode!r}) '
'and has no name, not writing user data')
Expand All @@ -134,6 +170,7 @@ def writeUserData(dataset: Dataset,
fs.seek(offset, os.SEEK_SET)
fs.write(userblob)

dataset._userdata = userdata
logger.debug(f'(userdata) Wrote {len(userblob)} bytes to {dataset} '
f'(file was {filesize}, now {newsize})')

Expand Down

0 comments on commit f484b17

Please sign in to comment.