# pycse/hashcache.py
#
# Post-image of commit f327a0fef65d95e5bb4741413febccb521687670
# (John Kitchin, Tue, 18 Jun 2024 14:21:56 -0400):
# "save old hashcache to v1, and update to class decorators".
"""hashcache - a class decorator for persistent, file/hash-based cache

I found some features of joblib were unsuitable for how I want to use a cache.

1. The "file" Python thinks the function is in is used to save the results in
joblib, which leads to repeated runs if you run the same code in Python,
notebook or stdin, and means the cache is not portable to other machines, and
maybe not even in time since temp directories and kernel parameters are
involved. I could not figure out how to change those in joblib.

2. joblib uses the function source code in the hash, so inconsequential changes
like whitespace, docstrings and comments change the hash.

This library aims to provide a simpler version of what I wish joblib did for me.

Results are cached based on a hash of the function name, argnames, bytecode, arg
values and kwarg values. I use joblib.hash for this. This means any two
functions with the same name and bytecode, even if they are defined in
different modules, will cache to the same result.

The cache location is set as a class attribute:

    HashCache.cache = './cache'

Three backends are provided:

    HashCache - stores joblib.dump pickle strings in files named by hash

    SqlCache - stores orjson serialized data in a sqlite3 database by hash key

    JsonCache - stores orjson serialized data in json files, compatible with maggma


This is still alpha, proof of concept code. Test it a lot for your use case. The
API is not stable, and subject to change.


Pros:

1. File-based cache which means many functions can run in parallel reading and
writing, and you are limited only by file io speeds, and disk space.

2. semi-portability. The cache could be synced across machines, and caches
can be merged with little risk of conflict.

3. No server is required. Everything is done at the OS level.

4. Extendability. You can subclass HashCache and override load_data and
dump_data to define your own way of loading and dumping data.

Cons:

1. hashes are fragile and not robust. They are fragile with respect to any
changes in how byte-code is made, or via mutable arguments, etc. The hashes are
not robust to system level changes like library versions, or global variables.
The only advantage of hashes is you can compute them.

2. File-based cache which means if you generate thousands of files, it can be
slow to delete them. Although it should be fast to access the results since you
access them directly by path, it will not be fast to iterate over all the
results, e.g. if you want to implement some kind of search or reporting.

3. No server. You have to roll your own update strategy if you run things on
multiple machines that should all cache to a common location.

Changelog
---------

[2023-09-23 Sat] Changed hash signature (breaking change). It is too difficult
to figure out how to capture global state, and the use of internal variable
names is not consistent with using the bytecode to be insensitive to
unimportant variable name changes.

Pulled out some functions for loading and dumping data. This is a precursor to
enabling other backends like lmdb or sqlite instead of files. You can then
simply provide new functions for this.

[2024-06-18 Tue] Changed from function to class decorator (breaking change).

"""

import functools
import inspect
import os
import pprint
import socket
import sqlite3
import time
from pathlib import Path

import joblib
import orjson


def hashcache(*args, **kwargs):
    """Raises an exception if the old hashcache decorator is used.

    The arguments are accepted only so that both ``@hashcache`` and
    ``@hashcache(...)`` reach the error below; they are otherwise ignored.
    """
    raise Exception(
        "The hashcache function decorator is deprecated." " Please use the class decorator instead."
    )


def _current_user():
    """Return os.getlogin(), or the USER environment variable if that raises OSError."""
    try:
        return os.getlogin()
    except OSError:
        return os.environ.get("USER")


def _dump_record(kwargs, hsh, t0):
    """Build the metadata record that the ``dump`` static methods store for KWARGS."""
    return {
        "func": "dump",
        "kwargs": kwargs,
        "hash": hsh,
        "saved-at": t0,
        "saved-at-human": time.asctime(time.localtime(t0)),
        "cwd": os.getcwd(),
        "hostname": socket.getfqdn(),
        "user": _current_user(),
    }


class HashCache:
    """Class decorator to cache using hashes and pickle (via joblib).

    Data is stored in directories named by the hash. Use it as::

        @HashCache
        def f(x): ...

    Subclasses change the storage backend by overriding load_data and
    dump_data.
    """

    # cache is the name of the directory to store results in
    cache = "cache"
    version = "0.1.0"
    # When True, records are pretty-printed as they are read or written.
    verbose = False

    def __init__(self, function):
        """Wrap FUNCTION so that calling this object consults the cache first."""
        # Copy __name__, __doc__, etc. so the decorated object still looks like
        # the function it wraps. This runs before self.function is assigned
        # because update_wrapper also merges function.__dict__ into ours.
        functools.update_wrapper(self, function)
        self.function = function

    def get_standardized_args(self, args, kwargs):
        """Returns a standardized dictionary of kwargs for func(args, kwargs)

        This dictionary includes default values, even if they were not called.

        """
        sig = inspect.signature(self.function)
        standardized_args = sig.bind(*args, **kwargs)
        standardized_args.apply_defaults()
        return standardized_args.arguments

    def get_hash(self, args, kwargs):
        """Get a hash for running FUNC(ARGS, KWARGS).

        This is the most critical feature of hashcache as it provides a key to store
        and look up results later. You should think carefully before changing this
        function, it breaks past caches.

        FUNC should be as pure as reasonable. This hash is insensitive to global
        variables.

        The hash is on the function name, bytecode, and a standardized kwargs
        including defaults. We use bytecode because it is insensitive to things
        like whitespace, comments, docstrings, and variable name changes that
        don't affect results. It is assumed that two functions with the same
        name and bytecode will evaluate to the same result. However, this makes
        the hash fragile to changes in Python version that affect bytecode.

        NOTE(review): only ``co_code`` is hashed, not ``co_consts``; presumably
        an edit that changes nothing but a literal constant leaves the hash
        unchanged - confirm before relying on the cache across such edits.

        """
        return joblib.hash(
            [
                self.function.__code__.co_name,  # This is the function name
                self.function.__code__.co_code,  # this is the function bytecode
                self.get_standardized_args(args, kwargs),  # The args used, including defaults
            ],
            hash_name="sha1",
        )

    def get_hashpath(self, hsh):
        """Return path to file for HSH: <cache>/<first two chars of HSH>/<HSH>."""
        cache = Path(self.cache)
        return cache / hsh[0:2] / hsh

    def load_data(self, hsh):
        """Load data for HSH.

        HSH is a string for the hash associated with the data you want.

        Returns success, data. If it succeeds, success will be True and data is
        the stored "output". If the data does not exist yet, success will be
        False, and data will be None.

        """
        hshpath = self.get_hashpath(hsh)
        if not os.path.exists(hshpath):
            return False, None

        # NOTE: joblib.load unpickles the file, so the cache directory must
        # only contain files you trust.
        data = joblib.load(hshpath)
        if self.verbose:
            pprint.PrettyPrinter(indent=4).pprint(data)
        return True, data["output"]

    def dump_data(self, hsh, data):
        """Dump DATA into HSH and return what joblib.dump returns."""
        hshpath = self.get_hashpath(hsh)
        os.makedirs(hshpath.parent, exist_ok=True)

        files = joblib.dump(data, hshpath)

        if self.verbose:
            pp = pprint.PrettyPrinter(indent=4)
            print(f"wrote {hshpath}")
            pp.pprint(data)

        return files

    def __call__(self, *args, **kwargs):
        """This is the decorator code that runs around self.function."""

        hsh = self.get_hash(args, kwargs)

        # Try getting the data first
        success, data = self.load_data(hsh)

        if success:
            return data

        # we did not succeed, so we run the function, and cache it
        # We store some metadata for future analysis.
        t0 = time.time()
        value = self.function(*args, **kwargs)
        tf = time.time()

        # functions with mutable arguments can change the arguments, which
        # is a problem here. We just warn the user. Nothing else makes
        # sense, the mutability may be intentional.
        if not hsh == self.get_hash(args, kwargs):
            print("WARNING something mutated, future" " calls will not use the cache.")

        data = {
            "output": value,
            "hash": hsh,
            "func": self.function.__code__.co_name,  # This is the function name
            "module": self.function.__module__,
            "args": args,
            "kwargs": kwargs,
            "standardized-kwargs": self.get_standardized_args(args, kwargs),
            "version": self.version,
            "cwd": os.getcwd(),
            "hostname": socket.getfqdn(),
            "user": _current_user(),
            "run-at": t0,
            "run-at-human": time.asctime(time.localtime(t0)),
            "elapsed_time": tf - t0,
        }

        self.dump_data(hsh, data)
        return value

    @staticmethod
    def dump(**kwargs):
        """Dump KWARGS to the cache.

        Returns a hash string for future lookup.

        """
        t0 = time.time()
        hsh = joblib.hash(kwargs)
        data = _dump_record(kwargs, hsh, t0)

        # The wrapped function is never called; the instance is only needed
        # for its path and dump helpers.
        hc = HashCache(lambda x: x)
        hc.dump_data(hsh, data)
        return hsh

    @staticmethod
    def load(hsh):
        """Load saved variables from HSH, or return None if HSH is not cached."""
        hc = HashCache(lambda x: x)

        hshpath = hc.get_hashpath(hsh)
        if os.path.exists(hshpath):
            return joblib.load(hshpath)["kwargs"]
        return None


class SqlCache(HashCache):
    """Class decorator to cache using orjson and sqlite.

    Data is stored in a sqlite database as json, in a table
    ``cache(hash TEXT unique, value TEXT)``.

    """

    # Here cache is the sqlite database file, not a directory.
    cache = "cache.sqlite"

    def __init__(self, function):
        """Wrap FUNCTION and open (creating if needed) the sqlite cache table."""
        super().__init__(function)

        self.con = sqlite3.connect(self.cache)
        self.con.execute("CREATE TABLE if not exists cache(hash TEXT unique, value TEXT)")

    def _fetch_record(self, hsh):
        """Return the decoded record stored under HSH, or None if there is none."""
        with self.con:
            cur = self.con.execute("SELECT value FROM cache WHERE hash = ?", (hsh,))
            # fetchone() gives a 1-tuple for a match and None when no row
            # matches, so it must be checked before it is unpacked.
            row = cur.fetchone()
        if row is None:
            return None
        return orjson.loads(row[0])

    def dump_data(self, hsh, data):
        """Dump DATA into HSH.

        DATA must be serializable to json. Raises sqlite3.IntegrityError if
        HSH is already in the table (the hash column is unique).

        """
        value = orjson.dumps(data, option=orjson.OPT_SERIALIZE_NUMPY)
        with self.con:
            self.con.execute("INSERT INTO cache(hash, value) VALUES(?, ?)", (hsh, value))

    def load_data(self, hsh):
        """Load data for HSH.

        HSH is a string for the hash associated with the data you want.

        Returns success, data. If it succeeds, success will be True and data is
        the stored "output". If the data does not exist yet, success will be
        False, and data will be None.

        """
        record = self._fetch_record(hsh)
        if record is None:
            return False, None

        if self.verbose:
            pprint.PrettyPrinter(indent=4).pprint(record)
        # Return only the function's value, as the other backends do, so that
        # a cache hit gives the caller the same thing as a fresh call
        # (after a json round trip).
        return True, record["output"]

    @staticmethod
    def search(query, *args):
        """Run a sql QUERY with args.

        args are substituted in ? placeholders in the query.

        This is just a light wrapper on con.execute. It opens a new connection
        to SqlCache.cache and returns the cursor; the connection is not closed
        here because the returned cursor still needs it.

        """
        con = sqlite3.connect(SqlCache.cache)
        cur = con.execute(query, args)
        return cur

    @staticmethod
    def dump(**kwargs):
        """Dump KWARGS to the cache.

        Returns a hash string for future lookup. Dumping the same KWARGS twice
        is not an error; the existing row is kept.
        """
        t0 = time.time()
        hsh = joblib.hash(kwargs)
        data = _dump_record(kwargs, hsh, t0)

        hc = SqlCache(lambda x: x)
        try:
            hc.dump_data(hsh, data)
        except sqlite3.IntegrityError:
            # The hash is already stored.
            pass
        return hsh

    @staticmethod
    def load(hsh):
        """Load data from HSH, or return None if HSH is not cached."""
        record = SqlCache(lambda x: x)._fetch_record(hsh)
        if record is None:
            return None
        return record["kwargs"]


class JsonCache(HashCache):
    """Json-based cache.

    Each record is written with orjson to <cache>/<hash[0:2]>/<hash>.json.

    This is compatible with maggma.
    """

    def __init__(self, function):
        """Wrap FUNCTION and make sure <cache>/Filestore.json exists."""
        super().__init__(function)

        filestore = Path(self.cache) / "Filestore.json"
        if not os.path.exists(filestore):
            os.makedirs(self.cache, exist_ok=True)
            # Created as an empty json list the first time the cache is used.
            with open(filestore, "wb") as f:
                f.write(orjson.dumps([]))

    def dump_data(self, hsh, data):
        """Dump DATA into HSH. DATA must be serializable to json."""
        hshpath = self.get_hashpath(hsh).with_suffix(".json")
        os.makedirs(hshpath.parent, exist_ok=True)

        with open(hshpath, "wb") as f:
            # Same serialization option as SqlCache so numpy results can be
            # cached by both json backends.
            f.write(orjson.dumps(data, option=orjson.OPT_SERIALIZE_NUMPY))

    def load_data(self, hsh):
        """Load data for HSH.

        Returns success, data: (True, stored "output") if the json file for
        HSH exists, otherwise (False, None).
        """
        hshpath = self.get_hashpath(hsh).with_suffix(".json")
        if not os.path.exists(hshpath):
            return False, None

        with open(hshpath, "rb") as f:
            data = orjson.loads(f.read())

        if self.verbose:
            pprint.PrettyPrinter(indent=4).pprint(data)
        return True, data["output"]

    @staticmethod
    def dump(**kwargs):
        """Dump KWARGS to the cache.

        Returns a hash string for future lookup.
        """
        t0 = time.time()
        hsh = joblib.hash(kwargs)
        data = _dump_record(kwargs, hsh, t0)

        hc = JsonCache(lambda x: x)
        hc.dump_data(hsh, data)
        return hsh

    @staticmethod
    def load(hsh):
        """Load data from HSH, or return None if HSH is not cached."""

        hc = JsonCache(lambda x: x)
        hshpath = hc.get_hashpath(hsh).with_suffix(".json")
        if os.path.exists(hshpath):
            with open(hshpath, "rb") as f:
                return orjson.loads(f.read())["kwargs"]
        return None
# pycse/hashcache_v1.py
#
# Archived copy of the function-decorator implementation, added as a new file
# by the commit "save old hashcache to v1, and update to class decorators".
"""hashcache - a decorator for persistent, file/hash-based cache

I found some features of joblib were unsuitable for how I want to use a cache.

1. The "file" Python thinks the function is in is used to save the results in
joblib, which leads to repeated runs if you run the same code in Python,
notebook or stdin, and means the cache is not portable to other machines, and
maybe not even in time since temp directories and kernel parameters are
involved. I could not figure out how to change those in joblib.

2. joblib uses the function source code in the hash, so inconsequential changes
like whitespace, docstrings and comments change the hash.

This library aims to provide a simpler version of what I wish joblib did for me.

Results are cached based on a hash of the function name, argnames, bytecode, arg
values and kwarg values. I use joblib.hash for this. This means any two
functions with the same name and bytecode, even if they are defined in
different modules, will cache to the same result.

The cache location is set as a function attribute:

    hashcache.cache = './cache'


This is alpha, proof of concept code. Test it a lot for your use case. The API
is not stable, and subject to change.

Some things to do:

1. the function attributes are kind of weird, maybe these should be decorator
arguments.

Pros:

1. File-based cache which means many functions can run in parallel reading and
writing, and you are limited only by file io speeds, and disk space.

2. semi-portability. The cache could be synced across machines, and caches
can be merged with little risk of conflict.

3. No server is required. Everything is done at the OS level.

4. Extendability. You can define your own functions for loading and dumping
data.

Cons:

1. hashes are fragile and not robust. They are fragile with respect to any
changes in how byte-code is made, or via mutable arguments, etc. The hashes are
not robust to system level changes like library versions, or global variables.
The only advantage of hashes is you can compute them.

2. File-based cache which means if you generate thousands of files, it can be
slow to delete them. Although it should be fast to access the results since you
access them directly by path, it will not be fast to iterate over all the
results, e.g. if you want to implement some kind of search or reporting.

3. No server. You have to roll your own update strategy if you run things on
multiple machines that should all cache to a common location.

Changelog
---------

[2023-09-23 Sat] Changed hash signature (breaking change). It is too difficult
to figure out how to capture global state, and the use of internal variable
names is not consistent with using the bytecode to be insensitive to
unimportant variable name changes.

Pulled out some functions for loading and dumping data. This is a precursor to
enabling other backends like lmdb or sqlite instead of files. You can then
simply provide new functions for this.

"""

import functools
import inspect
import joblib
import os
from pathlib import Path
import pprint
import time


def get_standardized_args(func, args, kwargs):
    """Return the arguments of func(*args, **kwargs) keyed by parameter name.

    Defaults are filled in for parameters the call did not supply.

    """
    bound = inspect.signature(func).bind(*args, **kwargs)
    bound.apply_defaults()
    return bound.arguments


def get_hash(func, args, kwargs):
    """Return the cache key for running FUNC(ARGS, KWARGS).

    The key is a sha1 joblib.hash over the function name, the function
    bytecode and the standardized arguments (defaults included). Changing
    what goes into this key invalidates every existing cache entry, so think
    carefully before doing so.

    Bytecode is used because it is insensitive to whitespace, comments,
    docstrings and variable renames. Global variables are not part of the
    key, so FUNC should be as pure as reasonable.

    """
    code = func.__code__
    key_parts = [
        code.co_name,  # the function name
        code.co_code,  # the function bytecode
        get_standardized_args(func, args, kwargs),  # the args used, with defaults
    ]
    return joblib.hash(key_parts, hash_name="sha1")


def get_hashpath(hsh):
    """Return the file path for HSH: <hashcache.cache>/<HSH[0:2]>/<HSH>."""
    return Path(hashcache.cache) / hsh[0:2] / hsh


def load_data(hsh, verbose=False):
    """Load the cached output for the hash string HSH.

    Returns (True, output) when a file for HSH exists, otherwise
    (False, None). With VERBOSE the whole stored record is pretty-printed.

    """
    target = get_hashpath(hsh)
    if not os.path.exists(target):
        return False, None

    record = joblib.load(target)
    if verbose:
        printer = pprint.PrettyPrinter(indent=4)
        printer.pprint(record)
    return True, record["output"]


def dump_data(hsh, data, verbose):
    """Write DATA to the file for HSH and return the result of joblib.dump."""
    target = get_hashpath(hsh)
    os.makedirs(target.parent, exist_ok=True)

    written = joblib.dump(data, target)

    if verbose:
        printer = pprint.PrettyPrinter(indent=4)
        print(f"wrote {target}")
        printer.pprint(data)

    return written


def hashcache(fn=None, *, verbose=False, loader=load_data, dumper=dump_data):
    """Cache results by hash of the function, arguments and kwargs.

    Works both as ``@hashcache`` and as ``@hashcache(verbose=True)``.
    LOADER and DUMPER are called as loader(hsh, verbose) and
    dumper(hsh, data, verbose).

    Set hashcache.cache to the directory you want the cache saved in.
    Default = cache
    """

    def wrapper(func, *args, **kwargs):
        key = get_hash(func, args, kwargs)

        # A cache hit short-circuits the call entirely.
        found, cached = loader(key, verbose)
        if found:
            return cached

        # Cache miss: run the function and time it for the stored metadata.
        started = time.time()
        value = func(*args, **kwargs)
        finished = time.time()

        # A function may mutate its arguments, which changes the hash. That
        # may be intentional, so the user is only warned.
        if key != get_hash(func, args, kwargs):
            print("WARNING something mutated, future" " calls will not use the cache.")

        # os.getlogin() can raise OSError; fall back to the environment.
        try:
            user = os.getlogin()
        except OSError:
            user = os.environ.get("USER")

        record = {
            "output": value,
            "hash": key,
            "func": func.__code__.co_name,  # the function name
            "module": func.__module__,
            "args": args,
            "kwargs": kwargs,
            "standardized-kwargs": get_standardized_args(func, args, kwargs),
            "version": hashcache.version,
            # Open question from the author: could cwd leak sensitive
            # information from the path, and should other info such as the
            # hostname be stored too?
            "cwd": os.getcwd(),
            "user": user,
            "run-at": started,
            "run-at-human": time.asctime(time.localtime(started)),
            "elapsed_time": finished - started,
        }

        dumper(key, record, verbose)
        return value

    # Bare form, @hashcache: fn is the decorated function itself.
    if fn is not None:
        return functools.partial(wrapper, fn)

    # Called form, @hashcache(...): hand back the real decorator.
    def decorator(func):
        bound_wrapper = functools.partial(wrapper, func)
        return functools.update_wrapper(bound_wrapper, func)

    return decorator


hashcache.cache = "cache"
hashcache.version = "0.0.3"