From 45afa8c90b5a9832e02d0b81632d3277adced343 Mon Sep 17 00:00:00 2001 From: Evgheni C Date: Mon, 8 May 2017 18:47:06 -0700 Subject: [PATCH] Don't store signature in ElasticSearch index * Sort/cutoff by elasticsearch relevance score instead --- image_match/elasticsearch_driver.py | 16 +++++------ image_match/signature_database_base.py | 40 +++++++++++++++++++++++--- tests/test_elasticsearch_driver.py | 15 ++++------ 3 files changed, 50 insertions(+), 21 deletions(-) diff --git a/image_match/elasticsearch_driver.py b/image_match/elasticsearch_driver.py index 1f105cc..a3bcd68 100644 --- a/image_match/elasticsearch_driver.py +++ b/image_match/elasticsearch_driver.py @@ -49,6 +49,7 @@ def __init__(self, es, index='images', doc_type='image', timeout='10s', size=100 def search_single_record(self, rec, pre_filter=None): path = rec.pop('path') signature = rec.pop('signature') + if 'metadata' in rec: rec.pop('metadata') @@ -70,27 +71,26 @@ def search_single_record(self, rec, pre_filter=None): size=self.size, timeout=self.timeout)['hits']['hits'] - sigs = np.array([x['_source']['signature'] for x in res]) - - if sigs.size == 0: + if len(res) == 0: return [] - dists = normalized_distance(sigs, np.array(signature)) - formatted_res = [{'id': x['_id'], 'score': x['_score'], 'metadata': x['_source'].get('metadata'), 'path': x['_source'].get('url', x['_source'].get('path'))} for x in res] - for i, row in enumerate(formatted_res): - row['dist'] = dists[i] - formatted_res = filter(lambda y: y['dist'] < self.distance_cutoff, formatted_res) + formatted_res = filter(lambda y: y['score'] > self.score_cutoff, formatted_res) return formatted_res def insert_single_record(self, rec, refresh_after=False): rec['timestamp'] = datetime.now() + + # Don't store signature in index + if 'signature' in rec: + rec.pop('signature') + self.es.index(index=self.index, doc_type=self.doc_type, body=rec, refresh=refresh_after) def delete_duplicates(self, path): diff --git 
a/image_match/signature_database_base.py b/image_match/signature_database_base.py index 42eadd0..ca9f2f8 100644 --- a/image_match/signature_database_base.py +++ b/image_match/signature_database_base.py @@ -117,7 +117,7 @@ def insert_single_record(self, rec): raise NotImplementedError def __init__(self, k=16, N=63, n_grid=9, - crop_percentile=(5, 95), distance_cutoff=0.45, + crop_percentile=(5, 95), distance_cutoff=0.45, score_cutoff=9.0, *signature_args, **signature_kwargs): """Set up storage scheme for images @@ -159,6 +159,8 @@ def __init__(self, k=16, N=63, n_grid=9, considering how much variance to keep in the image (default (5, 95)) distance_cutoff (Optional [float]): maximum image signature distance to be considered a match (default 0.45) + score_cutoff (Optional [float]): minimum ElasticSearch relevance score to + be considered a match (default 9.0) *signature_args: Variable length argument list to pass to ImageSignature **signature_kwargs: Arbitrary keyword arguments to pass to ImageSignature @@ -175,7 +177,7 @@ def __init__(self, k=16, N=63, n_grid=9, self.N = N self.n_grid = n_grid - # Check float input + # Check float input for distance cutoff if type(distance_cutoff) is not float: raise TypeError('distance_cutoff should be a float') if distance_cutoff < 0.: @@ -183,6 +185,14 @@ def __init__(self, k=16, N=63, n_grid=9, self.distance_cutoff = distance_cutoff + # Check float input for elasticsearch score cutoff + if type(score_cutoff) is not float: + raise TypeError('score_cutoff should be a float') + if score_cutoff < 0.: + raise ValueError('score_cutoff should be > 0 (got %r)' % score_cutoff) + + self.score_cutoff = score_cutoff + self.crop_percentile = crop_percentile self.gis = ImageSignature(n=n_grid, crop_percentiles=crop_percentile, *signature_args, **signature_kwargs) @@ -222,7 +232,7 @@ def search_image(self, path, all_orientations=False, bytestream=False, pre_filte pre_filter (Optional[dict]): filters list before applying the matching algorithm 
(default None) Returns: - a formatted list of dicts representing unique matches, sorted by dist + a formatted list of dicts representing unique matches, sorted by dist or score (in case of using ElasticSearch) For example, if three matches are found: @@ -238,6 +248,19 @@ def search_image(self, path, all_orientations=False, bytestream=False, pre_filte 'path': u'https://c2.staticflickr.com/8/7158/6814444991_08d82de57e_z.jpg'} ] + Here is an ElasticSearch example: + + [ + {'score': 35.0, + 'id': u'AVM37nMg0osmmAxpPvx6', + 'path': u'https://upload.wikimedia.org/wikipedia/commons/thumb/e/ec/Mona_Lisa,_by_Leonardo_da_Vinci,_from_C2RMF_retouched.jpg/687px-Mona_Lisa,_by_Leonardo_da_Vinci,_from_C2RMF_retouched.jpg'}, + {'score': 10.0, + 'id': u'AVM37p530osmmAxpPvx9', + 'path': u'https://c2.staticflickr.com/8/7158/6814444991_08d82de57e_z.jpg'}, + {'score': 4.0, + 'id': u'AVM37oZq0osmmAxpPvx7', + 'path': u'https://pixabay.com/static/uploads/photo/2012/11/28/08/56/mona-lisa-67506_960_720.jpg'} + ] """ img = self.gis.preprocess_image(path, bytestream) @@ -277,12 +300,21 @@ def search_image(self, path, all_orientations=False, bytestream=False, pre_filte ids = set() unique = [] + hasScore = False for item in result: + if 'score' in item: + hasScore = True + if item['id'] not in ids: unique.append(item) ids.add(item['id']) - r = sorted(unique, key=itemgetter('dist')) + # If data comes from ElasticSearch - sort by score, otherwise - default to sorting by dist + if hasScore: + r = sorted(unique, key=itemgetter('score'), reverse=True) + else: + r = sorted(unique, key=itemgetter('dist')) + return r diff --git a/tests/test_elasticsearch_driver.py b/tests/test_elasticsearch_driver.py index e55fe5b..62647be 100644 --- a/tests/test_elasticsearch_driver.py +++ b/tests/test_elasticsearch_driver.py @@ -17,15 +17,15 @@ DOC_TYPE = 'image' MAPPINGS = { "mappings": { - DOC_TYPE: { + DOC_TYPE: { "dynamic": True, - "properties": { - "metadata": { + "properties": { + "metadata": { "type": "object", 
"dynamic": True, - "properties": { + "properties": { "tenant_id": { "type": "keyword" } - } + } } } } @@ -122,7 +122,6 @@ def test_lookup_from_url(ses): assert len(r) == 1 assert r[0]['path'] == 'test1.jpg' assert 'score' in r[0] - assert 'dist' in r[0] assert 'id' in r[0] @@ -132,7 +131,6 @@ def test_lookup_from_file(ses): assert len(r) == 1 assert r[0]['path'] == 'test1.jpg' assert 'score' in r[0] - assert 'dist' in r[0] assert 'id' in r[0] def test_lookup_from_bytestream(ses): @@ -142,7 +140,6 @@ def test_lookup_from_bytestream(ses): assert len(r) == 1 assert r[0]['path'] == 'test1.jpg' assert 'score' in r[0] - assert 'dist' in r[0] assert 'id' in r[0] def test_lookup_with_cutoff(ses): @@ -196,7 +193,7 @@ def test_lookup_with_filter_by_metadata(ses): r = ses.search_image('test1.jpg', pre_filter={"term": {"metadata.tenant_id": "bar-3"}}) assert len(r) == 0 - + def test_all_orientations(ses): im = Image.open('test1.jpg')