Skip to content

Commit

Permalink
Merge pull request #178 from scitran/upload-unification
Browse files Browse the repository at this point in the history
Retrofit of upload logic
  • Loading branch information
kofalt committed Mar 4, 2016
2 parents 1e0c5a5 + 4fdbb93 commit 4402735
Show file tree
Hide file tree
Showing 10 changed files with 627 additions and 58 deletions.
1 change: 1 addition & 0 deletions api/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ def _format(route):
webapp2.Route(_format(r'/api/<cont_name:{cont_name_re}>/<cid:{cid_re}>/<list_name:tags>'), listhandler.ListHandler, methods=['POST'], name='tags_post'),
webapp2.Route(_format(r'/api/<cont_name:{cont_name_re}>/<cid:{cid_re}>/<list_name:tags>/<value:{tag_re}>'), listhandler.ListHandler, name='tags'),

webapp2.Route(_format(r'/api/<cont_name:{cont_name_re}>/<cid:{cid_re}>/packfile'), listhandler.FileListHandler, name='packfile', handler_method='packfile', methods=['POST']),
webapp2.Route(_format(r'/api/<cont_name:{cont_name_re}>/<cid:{cid_re}>/<list_name:files>'), listhandler.FileListHandler, name='files_post', methods=['POST']),
webapp2.Route(_format(r'/api/<cont_name:{cont_name_re}>/<cid:{cid_re}>/<list_name:files>/<name:{filename_re}>'), listhandler.FileListHandler, name='files'),

Expand Down
3 changes: 3 additions & 0 deletions api/dao/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
class APIStorageException(Exception):
    """Base exception raised when a DAO/storage operation fails."""

def noop(*args, **kwargs):
    """Do-nothing placeholder callable; accepts anything and returns None."""
    return None
28 changes: 28 additions & 0 deletions api/dao/reaperutil.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,34 @@ def add_file(self, fileinfo):
return_document=pymongo.collection.ReturnDocument.AFTER
)

# TODO: already in code elsewhere? Location?
def get_container(cont_name, _id):
    """Fetch one container document by id.

    `cont_name` is the singular noun (pluralized here to form the collection
    name); `_id` is a string, converted to a bson ObjectId before querying.
    Returns the matching document, or None if no such container exists.
    """
    collection = cont_name + 's'
    oid = bson.ObjectId(_id)
    return config.db[collection].find_one({'_id': oid})

def upsert_fileinfo(cont_name, _id, fileinfo):
    """Insert or update a file entry on a container document.

    If a file with the same name already exists on the container, update it;
    otherwise add it, stamping `created` from the supplied `modified` time
    (mutates `fileinfo` in that case).

    TODO: make all functions take singular nouns
    TODO: make all functions consume strings
    """
    collection = cont_name + 's'
    oid = bson.ObjectId(_id)

    # OPPORTUNITY: could potentially be atomic if we pass a closure to
    # perform the modification.
    existing = config.db[collection].find_one({
        '_id': oid,
        'files.name': fileinfo['name'],
    })

    if existing is not None:
        return update_fileinfo(collection, oid, fileinfo)

    # First sighting of this file on the container: creation == modification.
    fileinfo['created'] = fileinfo['modified']
    return add_fileinfo(collection, oid, fileinfo)

def update_fileinfo(cont_name, _id, fileinfo):
update_set = {'files.$.modified': datetime.datetime.utcnow()}
# in this method, we are overriding an existing file.
Expand Down
75 changes: 73 additions & 2 deletions api/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,23 +9,62 @@

from . import util
from . import config
from . import tempdir as tempfile

log = config.log

DEFAULT_HASH_ALG='sha384'

def move_file(path, target_path):
    """Move the file at `path` to `target_path`, creating parent directories.

    The original `exists()`-then-`makedirs()` pair was racy: two concurrent
    uploads targeting the same new directory could both see it missing and
    one would crash in makedirs. EAFP instead: attempt the mkdir and ignore
    the failure only if the directory does in fact exist afterwards.
    """
    target_dir = os.path.dirname(target_path)
    if target_dir:  # empty dirname means target is relative to cwd; nothing to create
        try:
            os.makedirs(target_dir)
        except OSError:
            # Lost the creation race (directory now exists) is fine;
            # any other failure is a real error and must propagate.
            if not os.path.isdir(target_dir):
                raise
    shutil.move(path, target_path)

def move_form_file_field_into_cas(file_field):
    """
    Move a (downloaded, tempdir-stored) form file field into the CAS.

    The field must be an augmented file field carrying `hash` and `path`
    attributes; see upload.process_upload() for details.
    """
    if not (file_field.hash and file_field.path):
        raise Exception("Field is not a file field with hash and path")

    data_root = config.get_item('persistent', 'data_path')
    cas_relative = util.path_from_hash(file_field.hash)
    move_file(file_field.path, os.path.join(data_root, cas_relative))

def hash_file_formatted(path, hash_alg=None):
    """
    Return the scitran-formatted hash of the file at `path`.

    `hash_alg` defaults to DEFAULT_HASH_ALG when not given.
    REVIEW: if there's an intelligent io-copy in the python stdlib, I missed
    it. The read buffer size here is arbitrary :/
    """
    algorithm = hash_alg if hash_alg else DEFAULT_HASH_ALG
    hasher = hashlib.new(algorithm)

    chunk_size = 65536

    with open(path, 'rb') as fd:
        # Two-arg iter(): keep reading fixed-size chunks until b'' (EOF).
        for chunk in iter(lambda: fd.read(chunk_size), b''):
            hasher.update(chunk)

    return util.format_hash(algorithm, hasher.hexdigest())


class FileStoreException(Exception):
    """Raised when a file upload/storage operation cannot be completed."""

class HashingFile(file):
def __init__(self, file_path, hash_alg):
    # Open file_path for binary writing via the Python 2 `file` builtin base
    # class; every write() through this object also feeds the hasher.
    super(HashingFile, self).__init__(file_path, "wb")
    # Incremental hasher, updated by write(); `hash_alg` is an algorithm
    # name accepted by hashlib.new (e.g. 'sha384').
    self.hash_alg = hashlib.new(hash_alg)
    # Keep the algorithm name for formatted-hash output later.
    self.hash_name = hash_alg

def write(self, data):
self.hash_alg.update(data)
Expand All @@ -34,8 +73,39 @@ def write(self, data):
def get_hash(self):
    """Return the raw hex digest of all data written through this file so far."""
    return self.hash_alg.hexdigest()

def get_formatted_hash(self):
    """Return the digest formatted by util.format_hash with this file's algorithm name."""
    return util.format_hash(self.hash_name, self.get_hash())

# Lightweight pairing of a processed file's info with the path it was stored at.
ParsedFile = collections.namedtuple('ParsedFile', ['info', 'path'])

def process_form(request, hash_alg=None):
    """
    Some workarounds to make webapp2 process forms in an intelligent way,
    hashing any uploaded files as they are received.

    Could subsume getHashingFieldStorage.
    This is a bit arcane, and deals with webapp2 / uwsgi / python
    complexities. Ask a team member, sorry!

    `hash_alg` names the hashlib algorithm to use; defaults to
    DEFAULT_HASH_ALG.

    Returns the processed form, and the tempdir it was stored in.
    Keep tempdir in scope until you don't need it anymore; it will be
    deleted on GC.
    """

    hash_alg = hash_alg or DEFAULT_HASH_ALG

    # Store form file fields in a tempdir
    tempdir = tempfile.TemporaryDirectory(prefix='.tmp', dir=config.get_item('persistent', 'data_path'))

    # Deep voodoo; docs?
    env = request.environ.copy()
    env.setdefault('CONTENT_LENGTH', '0')
    env['QUERY_STRING'] = ''

    # Wall-clock warning: despite its name, getHashingFieldStorage will
    # actually process the entire form to disk. This involves receiving the
    # entire upload stream and storing any files in the tempdir.
    #
    # BUG FIX: pass the caller's hash_alg through; previously this
    # hard-coded DEFAULT_HASH_ALG, silently ignoring the parameter.
    form = getHashingFieldStorage(tempdir.name, hash_alg)(
        fp=request.body_file, environ=env, keep_blank_values=True
    )

    return (form, tempdir)

def getHashingFieldStorage(upload_dir, hash_alg):
class HashingFieldStorage(cgi.FieldStorage):
Expand Down Expand Up @@ -74,7 +144,7 @@ class FileStore(object):
The operations could be safely interleaved with other actions like permission checks or database updates.
"""

def __init__(self, request, dest_path, filename=None, hash_alg='sha384'):
def __init__(self, request, dest_path, filename=None, hash_alg=DEFAULT_HASH_ALG):
self.body = request.body_file
self.environ = request.environ.copy()
self.environ.setdefault('CONTENT_LENGTH', '0')
Expand Down Expand Up @@ -136,11 +206,12 @@ def identical(hash_0, path_0, hash_1, path_1):
else:
return hash_0 == hash_1

# TODO: Hopefully deprecated by unification branch
class MultiFileStore(object):
"""This class provides and interface for file uploads.
"""

def __init__(self, request, dest_path, filename=None, hash_alg='sha384'):
def __init__(self, request, dest_path, filename=None, hash_alg=DEFAULT_HASH_ALG):
self.body = request.body_file
self.environ = request.environ.copy()
self.environ.setdefault('CONTENT_LENGTH', '0')
Expand Down
86 changes: 35 additions & 51 deletions api/handlers/listhandler.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@
import datetime

from .. import base
from .. import util
from .. import config
from .. import files
from .. import rules
from .. import config
from .. import validators
from .. import tempdir as tempfile
from .. import upload
from .. import util
from .. import validators
from ..auth import listauth, always_ok
from ..dao import noop
from ..dao import liststorage
from ..dao import APIStorageException

Expand Down Expand Up @@ -353,54 +355,36 @@ def delete(self, cont_name, list_name, **kwargs):
return result

def post(self, cont_name, list_name, **kwargs):
force = self.is_true('force')
_id = kwargs.pop('cid')
container, permchecker, storage, mongo_validator, payload_validator, keycheck = self._initialize_request(cont_name, list_name, _id)

result = None
with tempfile.TemporaryDirectory(prefix='.tmp', dir=config.get_item('persistent', 'data_path')) as tempdir_path:
file_store = files.FileStore(self.request, tempdir_path, filename=kwargs.get('name'))
payload = file_store.payload
file_datetime = datetime.datetime.utcnow()
file_properties = {
'name': file_store.filename,
'size': file_store.size,
'hash': file_store.hash,
'created': file_datetime,
'modified': file_datetime,
}
if file_store.metadata:
file_properties['metadata'] = file_store.metadata
if file_store.metadata.get('mimetype'):
file_properties['mimetype'] = file_store.metadata['mimetype']
if file_store.tags:
file_properties['tags'] = file_store.tags
dest_path = os.path.join(config.get_item('persistent', 'data_path'), util.path_from_hash(file_properties['hash']))
query_params = None
if not force:
method = 'POST'
# Authorize
container, permchecker, storage, mongo_validator, payload_validator, keycheck = self._initialize_request(cont_name + 's', list_name, _id)
permchecker(noop)('POST', _id=_id)

return upload.process_upload(self.request, upload.Strategy.targeted, cont_name, _id)

def packfile(self, cont_name, **kwargs):
_id = kwargs.pop('cid')

if cont_name != 'projects':
raise Exception('Packfiles can only be targeted at projects')

# Authorize: confirm project exists
project = config.db['projects'].find_one({ '_id': bson.ObjectId(_id)})

print project

if project is None:
raise Exception('Project ' + _id + ' does not exist')

# Authorize: confirm user has admin/write perms
if not self.superuser_request:
perms = project.get('permissions', [])

for p in perms:
if p['_id'] == self.uid and p['access'] in ('rw', 'admin'):
break
else:
filename = file_store.filename
filepath = file_store.path
for f in container.get('files', []):
if f['name'] == filename:
if file_store.identical(filepath, f['hash']):
log.debug('Dropping %s (identical)' % filename)
os.remove(filepath)
return {'modified': 0}
else:
log.debug('Replacing %s' % filename)
method = 'PUT'
query_params = {'name':filename}
break
else:
method = 'POST'
file_store.move_file(dest_path)
payload_validator(payload, method)
payload.update(file_properties)
payload['mimetype'] = payload.get('mimetype') or util.guess_mimetype(file_store.filename)
result = keycheck(mongo_validator(permchecker(storage.exec_op)))(method, _id=_id, query_params=query_params, payload=payload)
if not result or result.modified_count != 1:
self.abort(404, 'Element not added in list {} of container {} {}'.format(storage.list_name, storage.cont_name, _id))
rules.create_jobs(config.db, container, cont_name[:-1], payload)
return {'modified': result.modified_count}
raise Exception('Not authorized')

return upload.process_upload(self.request, upload.Strategy.packfile)
Loading

0 comments on commit 4402735

Please sign in to comment.