From 2c6f027d3574e598044378eff0d393974238b82d Mon Sep 17 00:00:00 2001 From: elnuno Date: Wed, 22 Mar 2017 18:49:23 -0300 Subject: [PATCH 1/7] Allow sorting query parameters. Gives a nice increase in cache hits for naive apps that send unordered queries. --- cachecontrol/adapter.py | 6 +++++- cachecontrol/caches/file_cache.py | 4 ++-- cachecontrol/controller.py | 25 +++++++++++++++++-------- cachecontrol/wrapper.py | 6 ++++-- 4 files changed, 28 insertions(+), 13 deletions(-) diff --git a/cachecontrol/adapter.py b/cachecontrol/adapter.py index 195d49d9..dd4287b4 100644 --- a/cachecontrol/adapter.py +++ b/cachecontrol/adapter.py @@ -17,17 +17,20 @@ def __init__(self, cache=None, serializer=None, heuristic=None, cacheable_methods=None, + sort_query=False, *args, **kw): super(CacheControlAdapter, self).__init__(*args, **kw) self.cache = cache or DictCache() self.heuristic = heuristic self.cacheable_methods = cacheable_methods or ('GET',) + self.sort_query = sort_query controller_factory = controller_class or CacheController self.controller = controller_factory( self.cache, cache_etags=cache_etags, serializer=serializer, + sort_query=sort_query ) def send(self, request, cacheable_methods=None, **kw): @@ -117,7 +120,8 @@ def _update_chunk_length(self): # See if we should invalidate the cache. if request.method in self.invalidating_methods and resp.ok: - cache_url = self.controller.cache_url(request.url) + cache_url = self.controller.cache_url(request.url, + sort_query=self.sort_query) self.cache.delete(cache_url) # Give the request a from_cache attr to let people use it diff --git a/cachecontrol/caches/file_cache.py b/cachecontrol/caches/file_cache.py index be9b94dd..456c4565 100644 --- a/cachecontrol/caches/file_cache.py +++ b/cachecontrol/caches/file_cache.py @@ -124,10 +124,10 @@ def delete(self, key): pass -def url_to_file_path(url, filecache): +def url_to_file_path(url, filecache, sort_query=False): """Return the file cache path based on the URL. This does not ensure the file exists! """ - key = CacheController.cache_url(url) + key = CacheController.cache_url(url, sort_query=sort_query) return filecache._fn(key) diff --git a/cachecontrol/controller.py b/cachecontrol/controller.py index 1e1f9b6e..19cf03d5 100644 --- a/cachecontrol/controller.py +++ b/cachecontrol/controller.py @@ -31,14 +31,15 @@ class CacheController(object): """An interface to see if request should cached or not. """ def __init__(self, cache=None, cache_etags=True, serializer=None, - status_codes=None): + status_codes=None, sort_query=False): self.cache = cache or DictCache() self.cache_etags = cache_etags self.serializer = serializer or Serializer() self.cacheable_status_codes = status_codes or (200, 203, 300, 301) + self.sort_query = sort_query @classmethod - def _urlnorm(cls, uri): + def _urlnorm(cls, uri, sort_query=False): """Normalize the URL to create a safe key for the cache""" (scheme, authority, path, query, fragment) = parse_uri(uri) if not scheme or not authority: @@ -50,6 +51,14 @@ def _urlnorm(cls, uri): if not path: path = "/" + # Sorting the query might induce behavior changes in the response. + # Use with care. However, assuming param randomization, a query with + # four params has a 96% chance of missing the cache on the second + # request if a response has already been recorded. The chance of a + # hit grows to 50% after a dozen requests. + if query and sort_query: + query = '&'.join(sorted(query.split('&'))) + # Could do syntax based normalization of the URI before # computing the digest. See Section 6.2.2 of Std 66. request_uri = query and "?".join([path, query]) or path @@ -58,8 +67,8 @@ def _urlnorm(cls, uri): return defrag_uri @classmethod - def cache_url(cls, uri): - return cls._urlnorm(uri) + def cache_url(cls, uri, sort_query=False): + return cls._urlnorm(uri, sort_query=sort_query) def parse_cache_control(self, headers): """ @@ -90,7 +99,7 @@ def cached_request(self, request): Return a cached response if it exists in the cache, otherwise return False. """ - cache_url = self.cache_url(request.url) + cache_url = self.cache_url(request.url, sort_query=self.sort_query) logger.debug('Looking up "%s" in the cache', cache_url) cc = self.parse_cache_control(request.headers) @@ -207,7 +216,7 @@ def cached_request(self, request): return False def conditional_headers(self, request): - cache_url = self.cache_url(request.url) + cache_url = self.cache_url(request.url, sort_query=self.sort_query) resp = self.serializer.loads(request, self.cache.get(cache_url)) new_headers = {} @@ -255,7 +264,7 @@ def cache_response(self, request, response, body=None, cc_req = self.parse_cache_control(request.headers) cc = self.parse_cache_control(response_headers) - cache_url = self.cache_url(request.url) + cache_url = self.cache_url(request.url, sort_query=self.sort_query) logger.debug('Updating cache with response from "%s"', cache_url) # Delete it from the cache if we happen to have it stored there @@ -317,7 +326,7 @@ def update_cached_response(self, request, response): This should only ever be called when we've sent an ETag and gotten a 304 as the response. """ - cache_url = self.cache_url(request.url) + cache_url = self.cache_url(request.url, sort_query=self.sort_query) cached_response = self.serializer.loads( request, diff --git a/cachecontrol/wrapper.py b/cachecontrol/wrapper.py index b50a6e27..4b707afe 100644 --- a/cachecontrol/wrapper.py +++ b/cachecontrol/wrapper.py @@ -9,7 +9,8 @@ def CacheControl(sess, heuristic=None, controller_class=None, adapter_class=None, - cacheable_methods=None): + cacheable_methods=None, + sort_query=False): cache = cache or DictCache() adapter_class = adapter_class or CacheControlAdapter @@ -19,7 +20,8 @@ def CacheControl(sess, serializer=serializer, heuristic=heuristic, controller_class=controller_class, - cacheable_methods=cacheable_methods + cacheable_methods=cacheable_methods, + sort_query=sort_query ) sess.mount('http://', adapter) sess.mount('https://', adapter) From 43bd91ef7db9f387eb3b5642171f5a654d1d9881 Mon Sep 17 00:00:00 2001 From: elnuno Date: Wed, 22 Mar 2017 18:51:17 -0300 Subject: [PATCH 2/7] Some tests. Probably worth pruning. --- tests/test_adapter.py | 7 ++++ tests/test_cache_control.py | 29 ++++++++++++++++ tests/test_storage_filecache.py | 59 ++++++++++++++++++++++++++++++++- 3 files changed, 94 insertions(+), 1 deletion(-) diff --git a/tests/test_adapter.py b/tests/test_adapter.py index 556c6d3d..bd55f14a 100644 --- a/tests/test_adapter.py +++ b/tests/test_adapter.py @@ -54,3 +54,10 @@ def test_close(self): sess.close() assert cache.close.called + +def test_cache_control_sets_sort_query(): + for bool_ in (True, False): + sess = CacheControl(Session(), mock.Mock(spec=DictCache), + sort_query=bool_) + assert sess.adapters['http://'].sort_query == bool_ + assert sess.adapters['http://'].controller.sort_query == bool_ diff --git a/tests/test_cache_control.py b/tests/test_cache_control.py index 77b8f90d..7254458f 100644 --- a/tests/test_cache_control.py +++ b/tests/test_cache_control.py @@ -4,6 +4,8 @@ import pytest from mock import ANY, Mock import time +from random import shuffle +import string from cachecontrol import CacheController from cachecontrol.cache import DictCache @@ -225,3 +227,30 @@ def test_cached_request_with_bad_max_age_headers_not_returned(self): self.c.cache = DictCache({self.url: resp}) assert not self.req({}) + +def test_cache_url_sorting(): + letter_n_numbers = list(enumerate(string.ascii_lowercase[3:], start=4)) + suff = '&' + '&'.join('%s=%s' % (k, v) for v, k in letter_n_numbers) + + def get_param(url): + """Mock losing order when processing params""" + shuffle(letter_n_numbers) + params = {k: v for v, k in letter_n_numbers} + url = url.replace(suff, '') + query = '&' + '&'.join('%s=%s' % item for item in params.items()) + return url + query + + no_query = 'http://example.com' + unsorted_query = 'http://example.com?b=2&c=3&a=1' + suff + sorted_query = 'http://example.com?a=1&b=2&c=3' + suff + + cache_url = CacheController.cache_url + assert cache_url(no_query, sort_query=True) == cache_url(no_query) + assert cache_url(unsorted_query) != cache_url(sorted_query) + assert cache_url(unsorted_query, True) == cache_url(sorted_query) + randomized = get_param(unsorted_query) + assert randomized != unsorted_query + assert cache_url(randomized) != cache_url(sorted_query) + assert cache_url(randomized, True) == cache_url(sorted_query) + randomized_again = get_param(unsorted_query) + assert cache_url(randomized, True) == cache_url(randomized_again, True) diff --git a/tests/test_storage_filecache.py b/tests/test_storage_filecache.py index f075a079..f13df00a 100644 --- a/tests/test_storage_filecache.py +++ b/tests/test_storage_filecache.py @@ -4,12 +4,13 @@ import os import string -from random import randint, sample +from random import randint, sample, shuffle import pytest import requests from cachecontrol import CacheControl from cachecontrol.caches import FileCache +from cachecontrol.caches.file_cache import url_to_file_path from lockfile import LockFile from lockfile.mkdirlockfile import MkdirLockFile @@ -110,3 +111,59 @@ def test_lock_class(self, tmpdir): lock_class = object() cache = FileCache(str(tmpdir), lock_class=lock_class) assert cache.lock_class is lock_class + + def test_url_to_file_path(self, tmpdir): + cache = FileCache(str(tmpdir)) + # We'll add a long sorted suffix so that unsorted queries have a low + # collision probability (about 3.8e-23 for each sorted/unsorted + # comparison). + letter_n_numbers = list(enumerate(string.ascii_lowercase[3:], start=4)) + suff = '&' + '&'.join('%s=%s' % (k, v) for v, k in letter_n_numbers) + + def get_param(url): + """Mock losing order when processing params""" + shuffle(letter_n_numbers) + params = {k: v for v, k in letter_n_numbers} + url = url.replace(suff, '') + query = '&' + '&'.join('%s=%s' % item for item in params.items()) + return url + query + + urls = { + 'no_query': 'http://example.com', + 'unsorted_query': 'http://example.com?b=2&c=3&a=1' + suff, + 'sorted_query': 'http://example.com?a=1&b=2&c=3' + suff, + 'unsorted_empty_value': 'http://example.com?b=&c=&a=1' + suff, + 'sorted_empty_value': 'http://example.com?a=1&b=&c=3' + suff, + 'unsorted_repeated_key': 'http://example.com?b=2&c=3&b=0' + '&c=5&a=1' + suff, + 'sorted_repeated_key': 'http://example.com?a=1&b=0&b=2&c=3&' + 'c=5' + suff} + + unoquery = url_to_file_path(urls['no_query'], cache, sort_query=False) + snoquery = url_to_file_path(urls['no_query'], cache, sort_query=True) + assert unoquery == snoquery + urls.pop('no_query') + + sortedpaths = {urlname: url_to_file_path(urlvalue, cache, True) for + urlname, urlvalue in urls.items()} + + for key, value in urls.items(): + if key.startswith('sorted'): + assert url_to_file_path(value, cache, True) == sortedpaths[key] + + unsortedpaths = {urlname: url_to_file_path(urlvalue, cache, False) for + urlname, urlvalue in urls.items()} + + for key, url in urls.items(): + if key.startswith('unsorted'): + assert sortedpaths[key] != unsortedpaths[key] + else: + assert sortedpaths[key] == unsortedpaths[key] + + # Randomize and sort params + sort_param = url_to_file_path(get_param(url), cache, True) + assert sort_param == sortedpaths[key] + + # Randomize but don't sort params + unsort_param = url_to_file_path(get_param(url), cache, False) + assert unsort_param != unsortedpaths[key] From cf1debbe12f8d1c0e10a0c4bc487e0569efdb918 Mon Sep 17 00:00:00 2001 From: elnuno Date: Wed, 22 Mar 2017 18:51:49 -0300 Subject: [PATCH 3/7] Add a section about query sorting to the tips doc. --- docs/tips.rst | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/docs/tips.rst b/docs/tips.rst index b603300f..3a7d4e38 100644 --- a/docs/tips.rst +++ b/docs/tips.rst @@ -53,6 +53,17 @@ If you are caching requests that use a large number of query string parameters, consider sorting them to ensure that the request is properly cached. +.. note:: + + Assuming random parameter order, a request with four parameters + has a >95% chance of missing the cache on the second time, because + it grows with the number of permutations (1/24 for four + parameters, 1/125 for five, etc.). However, this problem doesn't + arise while the order stays the same. So if the same dict is used + as params many times it'll likely hit the cache. FileCache suffers + more from this when used on different Python runs, as this makes + order more random. + Requests supports passing both dictionaries and lists of tuples as the param argument in a request. For example: :: @@ -60,3 +71,24 @@ param argument in a request. For example: :: By ordering your params, you can be sure the cache key will be consistent across requests and you are caching effectively. + +For convenience, the sorting of query parameters can be done in +cachecontrol itself. For example: :: + + # Unsorted + sess = cachecontrol.CacheControl(requests.Session(), sort_query=False, + heuristic=ExpiresAfter(days=1)) + params = [('a', 'b'), ('c', 'd')] + resp = sess.get('http://www.reddit.com', params=params) + print(resp.from_cache) # False (first fetch) + resp = sess.get('http://www.reddit.com', params=reversed(params)) + print(resp.from_cache) # False (!) + + sess = cachecontrol.CacheControl(requests.Session(), sort_query=True, + heuristic=ExpiresAfter(days=1)) + params = [('a', 'b'), ('c', 'd')] + resp = sess.get('http://www.reddit.com', params=params) + print(resp.from_cache) # False (first fetch) + resp = sess.get('http://www.reddit.com', params=reversed(params)) + print(resp.from_cache) # True :) + From 60f00bdf088101e17820095941a9728e39778c36 Mon Sep 17 00:00:00 2001 From: elnuno Date: Tue, 11 Apr 2017 15:13:30 -0300 Subject: [PATCH 4/7] Trivial fix and a test (for old requests versions) for #137. --- cachecontrol/adapter.py | 2 +- tests/test_regressions.py | 17 ++++++++++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/cachecontrol/adapter.py b/cachecontrol/adapter.py index dd4287b4..c6d38100 100644 --- a/cachecontrol/adapter.py +++ b/cachecontrol/adapter.py @@ -105,7 +105,7 @@ def build_response(self, request, response, from_cache=False, response, ) ) - if response.chunked: + if hasattr(response, 'chunked') and response.chunked: super_update_chunk_length = response._update_chunk_length def _update_chunk_length(self): diff --git a/tests/test_regressions.py b/tests/test_regressions.py index 33e0edf3..9efd9ced 100644 --- a/tests/test_regressions.py +++ b/tests/test_regressions.py @@ -2,7 +2,7 @@ import pytest -from cachecontrol import CacheControl +from cachecontrol import CacheControl, CacheControlAdapter from cachecontrol.caches import FileCache from cachecontrol.filewrapper import CallbackFileWrapper from requests import Session @@ -29,3 +29,18 @@ def test_getattr_during_gc(): vars(s).clear() # gc does this. with pytest.raises(AttributeError): s.x + + +def test_handle_no_chunked_attr(): + + class NoChunked(CacheControlAdapter): + def build_response(self, request, response, from_cache=False, + cacheable_methods=None): + if hasattr(response, 'chunked'): + pytest.skip('Requests is new enough, test makes no sense.') + # delattr(response, 'chunked') + return super().build_response(request, response, from_cache, + cacheable_methods) + sess = Session() + sess.mount('http://', NoChunked()) + sess.get('http://httpbin.org/cache/60') From 8cf339e98339643a78e48194d2e4d7a290ed63c3 Mon Sep 17 00:00:00 2001 From: elnuno Date: Tue, 11 Apr 2017 15:16:18 -0300 Subject: [PATCH 5/7] Revert "Allow sorting query parameters. Gives a nice increase in cache hits for naive apps that send unordered queries." This reverts commit 2c6f027d3574e598044378eff0d393974238b82d. --- cachecontrol/adapter.py | 6 +----- cachecontrol/caches/file_cache.py | 4 ++-- cachecontrol/controller.py | 25 ++++++++----------------- cachecontrol/wrapper.py | 6 ++---- 4 files changed, 13 insertions(+), 28 deletions(-) diff --git a/cachecontrol/adapter.py b/cachecontrol/adapter.py index c6d38100..defa1147 100644 --- a/cachecontrol/adapter.py +++ b/cachecontrol/adapter.py @@ -17,20 +17,17 @@ def __init__(self, cache=None, serializer=None, heuristic=None, cacheable_methods=None, - sort_query=False, *args, **kw): super(CacheControlAdapter, self).__init__(*args, **kw) self.cache = cache or DictCache() self.heuristic = heuristic self.cacheable_methods = cacheable_methods or ('GET',) - self.sort_query = sort_query controller_factory = controller_class or CacheController self.controller = controller_factory( self.cache, cache_etags=cache_etags, serializer=serializer, - sort_query=sort_query ) def send(self, request, cacheable_methods=None, **kw): @@ -120,8 +117,7 @@ def _update_chunk_length(self): # See if we should invalidate the cache. if request.method in self.invalidating_methods and resp.ok: - cache_url = self.controller.cache_url(request.url, - sort_query=self.sort_query) + cache_url = self.controller.cache_url(request.url) self.cache.delete(cache_url) # Give the request a from_cache attr to let people use it diff --git a/cachecontrol/caches/file_cache.py b/cachecontrol/caches/file_cache.py index 456c4565..be9b94dd 100644 --- a/cachecontrol/caches/file_cache.py +++ b/cachecontrol/caches/file_cache.py @@ -124,10 +124,10 @@ def delete(self, key): pass -def url_to_file_path(url, filecache, sort_query=False): +def url_to_file_path(url, filecache): """Return the file cache path based on the URL. This does not ensure the file exists! """ - key = CacheController.cache_url(url, sort_query=sort_query) + key = CacheController.cache_url(url) return filecache._fn(key) diff --git a/cachecontrol/controller.py b/cachecontrol/controller.py index 19cf03d5..1e1f9b6e 100644 --- a/cachecontrol/controller.py +++ b/cachecontrol/controller.py @@ -31,15 +31,14 @@ class CacheController(object): """An interface to see if request should cached or not. """ def __init__(self, cache=None, cache_etags=True, serializer=None, - status_codes=None, sort_query=False): + status_codes=None): self.cache = cache or DictCache() self.cache_etags = cache_etags self.serializer = serializer or Serializer() self.cacheable_status_codes = status_codes or (200, 203, 300, 301) - self.sort_query = sort_query @classmethod - def _urlnorm(cls, uri, sort_query=False): + def _urlnorm(cls, uri): """Normalize the URL to create a safe key for the cache""" (scheme, authority, path, query, fragment) = parse_uri(uri) if not scheme or not authority: @@ -51,14 +50,6 @@ def _urlnorm(cls, uri, sort_query=False): if not path: path = "/" - # Sorting the query might induce behavior changes in the response. - # Use with care. However, assuming param randomization, a query with - # four params has a 96% chance of missing the cache on the second - # request if a response has already been recorded. The chance of a - # hit grows to 50% after a dozen requests. - if query and sort_query: - query = '&'.join(sorted(query.split('&'))) - # Could do syntax based normalization of the URI before # computing the digest. See Section 6.2.2 of Std 66. request_uri = query and "?".join([path, query]) or path @@ -67,8 +58,8 @@ def _urlnorm(cls, uri, sort_query=False): return defrag_uri @classmethod - def cache_url(cls, uri, sort_query=False): - return cls._urlnorm(uri, sort_query=sort_query) + def cache_url(cls, uri): + return cls._urlnorm(uri) def parse_cache_control(self, headers): """ @@ -99,7 +90,7 @@ def cached_request(self, request): Return a cached response if it exists in the cache, otherwise return False. """ - cache_url = self.cache_url(request.url, sort_query=self.sort_query) + cache_url = self.cache_url(request.url) logger.debug('Looking up "%s" in the cache', cache_url) cc = self.parse_cache_control(request.headers) @@ -216,7 +207,7 @@ def cached_request(self, request): return False def conditional_headers(self, request): - cache_url = self.cache_url(request.url, sort_query=self.sort_query) + cache_url = self.cache_url(request.url) resp = self.serializer.loads(request, self.cache.get(cache_url)) new_headers = {} @@ -264,7 +255,7 @@ def cache_response(self, request, response, body=None, cc_req = self.parse_cache_control(request.headers) cc = self.parse_cache_control(response_headers) - cache_url = self.cache_url(request.url, sort_query=self.sort_query) + cache_url = self.cache_url(request.url) logger.debug('Updating cache with response from "%s"', cache_url) # Delete it from the cache if we happen to have it stored there @@ -326,7 +317,7 @@ def update_cached_response(self, request, response): This should only ever be called when we've sent an ETag and gotten a 304 as the response. """ - cache_url = self.cache_url(request.url, sort_query=self.sort_query) + cache_url = self.cache_url(request.url) cached_response = self.serializer.loads( request, diff --git a/cachecontrol/wrapper.py b/cachecontrol/wrapper.py index 4b707afe..b50a6e27 100644 --- a/cachecontrol/wrapper.py +++ b/cachecontrol/wrapper.py @@ -9,8 +9,7 @@ def CacheControl(sess, heuristic=None, controller_class=None, adapter_class=None, - cacheable_methods=None, - sort_query=False): + cacheable_methods=None): cache = cache or DictCache() adapter_class = adapter_class or CacheControlAdapter @@ -20,8 +19,7 @@ def CacheControl(sess, serializer=serializer, heuristic=heuristic, controller_class=controller_class, - cacheable_methods=cacheable_methods, - sort_query=sort_query + cacheable_methods=cacheable_methods ) sess.mount('http://', adapter) sess.mount('https://', adapter) From 4e76780b2d3843a33c05b303cc96a5491d1297db Mon Sep 17 00:00:00 2001 From: elnuno Date: Tue, 11 Apr 2017 15:17:39 -0300 Subject: [PATCH 6/7] Revert "Some tests. Probably worth pruning." This reverts commit 43bd91ef7db9f387eb3b5642171f5a654d1d9881. --- tests/test_adapter.py | 7 ---- tests/test_cache_control.py | 29 ---------------- tests/test_storage_filecache.py | 59 +-------------------------------- 3 files changed, 1 insertion(+), 94 deletions(-) diff --git a/tests/test_adapter.py b/tests/test_adapter.py index bd55f14a..556c6d3d 100644 --- a/tests/test_adapter.py +++ b/tests/test_adapter.py @@ -54,10 +54,3 @@ def test_close(self): sess.close() assert cache.close.called - -def test_cache_control_sets_sort_query(): - for bool_ in (True, False): - sess = CacheControl(Session(), mock.Mock(spec=DictCache), - sort_query=bool_) - assert sess.adapters['http://'].sort_query == bool_ - assert sess.adapters['http://'].controller.sort_query == bool_ diff --git a/tests/test_cache_control.py b/tests/test_cache_control.py index 7254458f..77b8f90d 100644 --- a/tests/test_cache_control.py +++ b/tests/test_cache_control.py @@ -4,8 +4,6 @@ import pytest from mock import ANY, Mock import time -from random import shuffle -import string from cachecontrol import CacheController from cachecontrol.cache import DictCache @@ -227,30 +225,3 @@ def test_cached_request_with_bad_max_age_headers_not_returned(self): self.c.cache = DictCache({self.url: resp}) assert not self.req({}) - -def test_cache_url_sorting(): - letter_n_numbers = list(enumerate(string.ascii_lowercase[3:], start=4)) - suff = '&' + '&'.join('%s=%s' % (k, v) for v, k in letter_n_numbers) - - def get_param(url): - """Mock losing order when processing params""" - shuffle(letter_n_numbers) - params = {k: v for v, k in letter_n_numbers} - url = url.replace(suff, '') - query = '&' + '&'.join('%s=%s' % item for item in params.items()) - return url + query - - no_query = 'http://example.com' - unsorted_query = 'http://example.com?b=2&c=3&a=1' + suff - sorted_query = 'http://example.com?a=1&b=2&c=3' + suff - - cache_url = CacheController.cache_url - assert cache_url(no_query, sort_query=True) == cache_url(no_query) - assert cache_url(unsorted_query) != cache_url(sorted_query) - assert cache_url(unsorted_query, True) == cache_url(sorted_query) - randomized = get_param(unsorted_query) - assert randomized != unsorted_query - assert cache_url(randomized) != cache_url(sorted_query) - assert cache_url(randomized, True) == cache_url(sorted_query) - randomized_again = get_param(unsorted_query) - assert cache_url(randomized, True) == cache_url(randomized_again, True) diff --git a/tests/test_storage_filecache.py b/tests/test_storage_filecache.py index f13df00a..f075a079 100644 --- a/tests/test_storage_filecache.py +++ b/tests/test_storage_filecache.py @@ -4,13 +4,12 @@ import os import string -from random import randint, sample, shuffle +from random import randint, sample import pytest import requests from cachecontrol import CacheControl from cachecontrol.caches import FileCache -from cachecontrol.caches.file_cache import url_to_file_path from lockfile import LockFile from lockfile.mkdirlockfile import MkdirLockFile @@ -111,59 +110,3 @@ def test_lock_class(self, tmpdir): lock_class = object() cache = FileCache(str(tmpdir), lock_class=lock_class) assert cache.lock_class is lock_class - - def test_url_to_file_path(self, tmpdir): - cache = FileCache(str(tmpdir)) - # We'll add a long sorted suffix so that unsorted queries have a low - # collision probability (about 3.8e-23 for each sorted/unsorted - # comparison). - letter_n_numbers = list(enumerate(string.ascii_lowercase[3:], start=4)) - suff = '&' + '&'.join('%s=%s' % (k, v) for v, k in letter_n_numbers) - - def get_param(url): - """Mock losing order when processing params""" - shuffle(letter_n_numbers) - params = {k: v for v, k in letter_n_numbers} - url = url.replace(suff, '') - query = '&' + '&'.join('%s=%s' % item for item in params.items()) - return url + query - - urls = { - 'no_query': 'http://example.com', - 'unsorted_query': 'http://example.com?b=2&c=3&a=1' + suff, - 'sorted_query': 'http://example.com?a=1&b=2&c=3' + suff, - 'unsorted_empty_value': 'http://example.com?b=&c=&a=1' + suff, - 'sorted_empty_value': 'http://example.com?a=1&b=&c=3' + suff, - 'unsorted_repeated_key': 'http://example.com?b=2&c=3&b=0' - '&c=5&a=1' + suff, - 'sorted_repeated_key': 'http://example.com?a=1&b=0&b=2&c=3&' - 'c=5' + suff} - - unoquery = url_to_file_path(urls['no_query'], cache, sort_query=False) - snoquery = url_to_file_path(urls['no_query'], cache, sort_query=True) - assert unoquery == snoquery - urls.pop('no_query') - - sortedpaths = {urlname: url_to_file_path(urlvalue, cache, True) for - urlname, urlvalue in urls.items()} - - for key, value in urls.items(): - if key.startswith('sorted'): - assert url_to_file_path(value, cache, True) == sortedpaths[key] - - unsortedpaths = {urlname: url_to_file_path(urlvalue, cache, False) for - urlname, urlvalue in urls.items()} - - for key, url in urls.items(): - if key.startswith('unsorted'): - assert sortedpaths[key] != unsortedpaths[key] - else: - assert sortedpaths[key] == unsortedpaths[key] - - # Randomize and sort params - sort_param = url_to_file_path(get_param(url), cache, True) - assert sort_param == sortedpaths[key] - - # Randomize but don't sort params - unsort_param = url_to_file_path(get_param(url), cache, False) - assert unsort_param != unsortedpaths[key] From cc4a5b89a035b28d8c6fe74c39d050a4be7220af Mon Sep 17 00:00:00 2001 From: elnuno Date: Tue, 11 Apr 2017 15:17:47 -0300 Subject: [PATCH 7/7] Revert "Add a section about query sorting to the tips doc." This reverts commit cf1debbe12f8d1c0e10a0c4bc487e0569efdb918. --- docs/tips.rst | 32 -------------------------------- 1 file changed, 32 deletions(-) diff --git a/docs/tips.rst b/docs/tips.rst index 3a7d4e38..b603300f 100644 --- a/docs/tips.rst +++ b/docs/tips.rst @@ -53,17 +53,6 @@ If you are caching requests that use a large number of query string parameters, consider sorting them to ensure that the request is properly cached. -.. note:: - - Assuming random parameter order, a request with four parameters - has a >95% chance of missing the cache on the second time, because - it grows with the number of permutations (1/24 for four - parameters, 1/125 for five, etc.). However, this problem doesn't - arise while the order stays the same. So if the same dict is used - as params many times it'll likely hit the cache. FileCache suffers - more from this when used on different Python runs, as this makes - order more random. - Requests supports passing both dictionaries and lists of tuples as the param argument in a request. For example: :: @@ -71,24 +60,3 @@ param argument in a request. For example: :: By ordering your params, you can be sure the cache key will be consistent across requests and you are caching effectively. - -For convenience, the sorting of query parameters can be done in -cachecontrol itself. For example: :: - - # Unsorted - sess = cachecontrol.CacheControl(requests.Session(), sort_query=False, - heuristic=ExpiresAfter(days=1)) - params = [('a', 'b'), ('c', 'd')] - resp = sess.get('http://www.reddit.com', params=params) - print(resp.from_cache) # False (first fetch) - resp = sess.get('http://www.reddit.com', params=reversed(params)) - print(resp.from_cache) # False (!) - - sess = cachecontrol.CacheControl(requests.Session(), sort_query=True, - heuristic=ExpiresAfter(days=1)) - params = [('a', 'b'), ('c', 'd')] - resp = sess.get('http://www.reddit.com', params=params) - print(resp.from_cache) # False (first fetch) - resp = sess.get('http://www.reddit.com', params=reversed(params)) - print(resp.from_cache) # True :) -