From 28d0e55cac3cdac00f731c82ae55ece7ba0f9125 Mon Sep 17 00:00:00 2001 From: Andrew Davison Date: Thu, 4 Feb 2021 15:35:25 +0100 Subject: [PATCH 1/4] better management of downloaded files cache --- api/neoview/views.py | 53 +++++++++++++++++------------ api/neural_activity_app/settings.py | 1 + 2 files changed, 33 insertions(+), 21 deletions(-) diff --git a/api/neoview/views.py b/api/neoview/views.py index af52bbc..61e7cf9 100644 --- a/api/neoview/views.py +++ b/api/neoview/views.py @@ -1,29 +1,23 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals import os.path +import hashlib +import logging +from time import sleep +from urllib.request import urlopen, urlretrieve, HTTPError +from urllib.parse import urlparse, urlunparse + from django.http import JsonResponse from django.utils.datastructures import MultiValueDictKeyError +from django.conf import settings from rest_framework.views import APIView +from rest_framework import status + from neo.io import get_io import neo -from rest_framework import status -from os.path import basename -try: - from urllib import urlretrieve, HTTPError -except ImportError: - from urllib.request import urlretrieve, HTTPError -try: - from urllib.request import urlopen -except ImportError: - from urllib2 import urlopen -try: - unicode -except NameError: - unicode = str -# import logging -from time import sleep -# logger = logging.getLogger(__name__) + +logger = logging.getLogger(__name__) def custom_get_io(filename): @@ -37,13 +31,30 @@ def custom_get_io(filename): return io +def _get_cache_path(url): + """ + For caching, we store files in a flat directory structure, where the directory name is + based on the URL, but files in the same directory on the original server end up in the + same directory in our cache. + """ + url_parts = urlparse(url) + base_url = urlunparse((url_parts.scheme, url_parts.netloc, os.path.dirname(url_parts.path), "", "", "")) + dir_name = hashlib.sha1(base_url).encode('utf-8').hexdigest() + return os.path.join(settings.get("DOWNLOADED_FILE_CACHE_DIR", ""), + dir_name, + os.path.basename(url_parts.path)) + + def _get_file_from_url(request): url = request.GET.get('url') + # we first open the url to resolve any redirects response = urlopen(url) - filename = basename(response.url) + resolved_url = response.geturl() + + filename = _get_cache_path(resolved_url) if not os.path.isfile(filename): - urlretrieve(url, filename) + urlretrieve(resolved_url, filename) # todo: wrap previous line in try..except so we can return a 404 if the file is not found # or a 500 if the local disk is full @@ -51,7 +62,7 @@ def _get_file_from_url(request): name, ext = os.path.splitext(filename) if ext[1:] in neo.io.AsciiSignalIO.extensions: # ext has a leading '.' metadata_filename = filename.replace(ext, "_about.json") - metadata_url = url.replace(ext, "_about.json") + metadata_url = resolved_url.replace(ext, "_about.json") try: urlretrieve(metadata_url, metadata_filename) except HTTPError: @@ -61,7 +72,7 @@ def _get_file_from_url(request): def _handle_dict(ob): - return {k: unicode(v) for k, v in ob.items()} + return {k: str(v) for k, v in ob.items()} class Block(APIView): diff --git a/api/neural_activity_app/settings.py b/api/neural_activity_app/settings.py index 81ee70c..bafaa3b 100644 --- a/api/neural_activity_app/settings.py +++ b/api/neural_activity_app/settings.py @@ -159,3 +159,4 @@ }, } +DOWNLOADED_FILE_CACHE_DIR = os.path.join(BASE_DIR, "download_cache") From 06d958538fc43171d2bf6fada5be62df4da4094d Mon Sep 17 00:00:00 2001 From: Andrew Davison Date: Thu, 4 Feb 2021 15:43:45 +0100 Subject: [PATCH 2/4] typo --- api/neoview/views.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/neoview/views.py b/api/neoview/views.py index 61e7cf9..e46eb90 100644 --- a/api/neoview/views.py +++ b/api/neoview/views.py @@ -39,7 +39,7 @@ def _get_cache_path(url): """ url_parts = urlparse(url) base_url = urlunparse((url_parts.scheme, url_parts.netloc, os.path.dirname(url_parts.path), "", "", "")) - dir_name = hashlib.sha1(base_url).encode('utf-8').hexdigest() + dir_name = hashlib.sha1(base_url.encode('utf-8')).hexdigest() return os.path.join(settings.get("DOWNLOADED_FILE_CACHE_DIR", ""), dir_name, os.path.basename(url_parts.path)) From a64d1fb953f02fca51123d9886076552bbcaf20c Mon Sep 17 00:00:00 2001 From: Andrew Davison Date: Thu, 4 Feb 2021 15:46:10 +0100 Subject: [PATCH 3/4] forgot how Django settings.py works --- api/neoview/views.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/neoview/views.py b/api/neoview/views.py index e46eb90..00efd82 100644 --- a/api/neoview/views.py +++ b/api/neoview/views.py @@ -40,7 +40,7 @@ def _get_cache_path(url): url_parts = urlparse(url) base_url = urlunparse((url_parts.scheme, url_parts.netloc, os.path.dirname(url_parts.path), "", "", "")) dir_name = hashlib.sha1(base_url.encode('utf-8')).hexdigest() - return os.path.join(settings.get("DOWNLOADED_FILE_CACHE_DIR", ""), + return os.path.join(getattr(settings, "DOWNLOADED_FILE_CACHE_DIR", ""), dir_name, os.path.basename(url_parts.path)) From 2bcf8f16a2f65842fe42f5d1a254588f4a0d1f1c Mon Sep 17 00:00:00 2001 From: Andrew Davison Date: Thu, 4 Feb 2021 16:02:12 +0100 Subject: [PATCH 4/4] need to create the directory --- api/neoview/views.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/api/neoview/views.py b/api/neoview/views.py index 00efd82..35dc45d 100644 --- a/api/neoview/views.py +++ b/api/neoview/views.py @@ -40,9 +40,10 @@ def _get_cache_path(url): url_parts = urlparse(url) base_url = urlunparse((url_parts.scheme, url_parts.netloc, os.path.dirname(url_parts.path), "", "", "")) dir_name = hashlib.sha1(base_url.encode('utf-8')).hexdigest() - return os.path.join(getattr(settings, "DOWNLOADED_FILE_CACHE_DIR", ""), - dir_name, - os.path.basename(url_parts.path)) + dir_path = os.path.join(getattr(settings, "DOWNLOADED_FILE_CACHE_DIR", ""), + dir_name) + os.makedirs(dir_path, exist_ok=True) + return os.path.join(dir_path, os.path.basename(url_parts.path)) def _get_file_from_url(request):