Don't store signature in ElasticSearch index

* Sort results and apply the match cutoff by ElasticSearch relevance score instead of signature distance
rhsimplex · May 15, 2017 · 45afa8c · 45afa8c
1 parent d6fe49c
commit 45afa8c
Show file tree

Hide file tree

Showing 3 changed files with 50 additions and 21 deletions.
diff --git a/image_match/elasticsearch_driver.py b/image_match/elasticsearch_driver.py
@@ -49,6 +49,7 @@ def __init__(self, es, index='images', doc_type='image', timeout='10s', size=100
     def search_single_record(self, rec, pre_filter=None):
         path = rec.pop('path')
         signature = rec.pop('signature')
+
         if 'metadata' in rec:
             rec.pop('metadata')
 
@@ -70,27 +71,26 @@ def search_single_record(self, rec, pre_filter=None):
                               size=self.size,
                               timeout=self.timeout)['hits']['hits']
 
-        sigs = np.array([x['_source']['signature'] for x in res])
-
-        if sigs.size == 0:
+        if len(res) == 0:
             return []
 
-        dists = normalized_distance(sigs, np.array(signature))
-
         formatted_res = [{'id': x['_id'],
                           'score': x['_score'],
                           'metadata': x['_source'].get('metadata'),
                           'path': x['_source'].get('url', x['_source'].get('path'))}
                          for x in res]
 
-        for i, row in enumerate(formatted_res):
-            row['dist'] = dists[i]
-        formatted_res = filter(lambda y: y['dist'] < self.distance_cutoff, formatted_res)
+        formatted_res = filter(lambda y: y['score'] > self.score_cutoff, formatted_res)
 
         return formatted_res
 
     def insert_single_record(self, rec, refresh_after=False):
         rec['timestamp'] = datetime.now()
+
+        # Don't store signature in index
+        if 'signature' in rec:
+            rec.pop('signature')
+
         self.es.index(index=self.index, doc_type=self.doc_type, body=rec, refresh=refresh_after)
 
     def delete_duplicates(self, path):

diff --git a/image_match/signature_database_base.py b/image_match/signature_database_base.py
@@ -117,7 +117,7 @@ def insert_single_record(self, rec):
         raise NotImplementedError
 
     def __init__(self, k=16, N=63, n_grid=9,
-                 crop_percentile=(5, 95), distance_cutoff=0.45,
+                 crop_percentile=(5, 95), distance_cutoff=0.45, score_cutoff=9.0,
                  *signature_args, **signature_kwargs):
         """Set up storage scheme for images
 
@@ -159,6 +159,8 @@ def __init__(self, k=16, N=63, n_grid=9,
                 considering how much variance to keep in the image (default (5, 95))
             distance_cutoff (Optional [float]): maximum image signature distance to
                 be considered a match (default 0.45)
+            score_cutoff (Optional [float]): ElasticSearch relevance score a result must
+                exceed to be considered a match (default 9.0)
             *signature_args: Variable length argument list to pass to ImageSignature
             **signature_kwargs: Arbitrary keyword arguments to pass to ImageSignature
 
@@ -175,14 +177,22 @@ def __init__(self, k=16, N=63, n_grid=9,
         self.N = N
         self.n_grid = n_grid
 
-        # Check float input
+        # Check float input for distance cutoff
         if type(distance_cutoff) is not float:
             raise TypeError('distance_cutoff should be a float')
         if distance_cutoff < 0.:
             raise ValueError('distance_cutoff should be > 0 (got %r)' % distance_cutoff)
 
         self.distance_cutoff = distance_cutoff
 
+        # Check float input for elasticsearch score cutoff
+        if type(score_cutoff) is not float:
+            raise TypeError('score_cutoff should be a float')
+        if score_cutoff < 0.:
+            raise ValueError('score_cutoff should be > 0 (got %r)' % score_cutoff)
+
+        self.score_cutoff = score_cutoff
+
         self.crop_percentile = crop_percentile
 
         self.gis = ImageSignature(n=n_grid, crop_percentiles=crop_percentile, *signature_args, **signature_kwargs)
@@ -222,7 +232,7 @@ def search_image(self, path, all_orientations=False, bytestream=False, pre_filte
             pre_filter (Optional[dict]): filters list before applying the matching algorithm
                 (default None)
         Returns:
-            a formatted list of dicts representing unique matches, sorted by dist
+            a formatted list of dicts representing unique matches, sorted by ascending dist, or by descending score when the results come from ElasticSearch
 
             For example, if three matches are found:
 
@@ -238,6 +248,19 @@ def search_image(self, path, all_orientations=False, bytestream=False, pre_filte
               'path': u'https://c2.staticflickr.com/8/7158/6814444991_08d82de57e_z.jpg'}
             ]
 
+            Here is an ElasticSearch example:
+
+            [
+             {'score': 35.0,
+              'id': u'AVM37nMg0osmmAxpPvx6',
+              'path': u'https://upload.wikimedia.org/wikipedia/commons/thumb/e/ec/Mona_Lisa,_by_Leonardo_da_Vinci,_from_C2RMF_retouched.jpg/687px-Mona_Lisa,_by_Leonardo_da_Vinci,_from_C2RMF_retouched.jpg'},
+             {'score': 10.0,
+              'id': u'AVM37p530osmmAxpPvx9',
+              'path': u'https://c2.staticflickr.com/8/7158/6814444991_08d82de57e_z.jpg'},
+             {'score': 4.0,
+              'id': u'AVM37oZq0osmmAxpPvx7',
+              'path': u'https://pixabay.com/static/uploads/photo/2012/11/28/08/56/mona-lisa-67506_960_720.jpg'}
+            ]
         """
         img = self.gis.preprocess_image(path, bytestream)
 
@@ -277,12 +300,21 @@ def search_image(self, path, all_orientations=False, bytestream=False, pre_filte
 
         ids = set()
         unique = []
+        hasScore = False
         for item in result:
+            if 'score' in item:
+                hasScore = True
+
             if item['id'] not in ids:
                 unique.append(item)
                 ids.add(item['id'])
 
-        r = sorted(unique, key=itemgetter('dist'))
+        # If data comes from ElasticSearch - sort by score, otherwise - default to sorting by dist
+        if hasScore:
+            r = sorted(unique, key=itemgetter('score'), reverse=True)
+        else:
+            r = sorted(unique, key=itemgetter('dist'))
+
         return r
 
 

diff --git a/tests/test_elasticsearch_driver.py b/tests/test_elasticsearch_driver.py
@@ -17,15 +17,15 @@
 DOC_TYPE = 'image'
 MAPPINGS = {
   "mappings": {
-    DOC_TYPE: { 
+    DOC_TYPE: {
       "dynamic": True,
-      "properties": { 
-        "metadata": { 
+      "properties": {
+        "metadata": {
             "type": "object",
             "dynamic": True,
-            "properties": { 
+            "properties": {
                 "tenant_id": { "type": "keyword" }
-            } 
+            }
         }
       }
     }
@@ -122,7 +122,6 @@ def test_lookup_from_url(ses):
     assert len(r) == 1
     assert r[0]['path'] == 'test1.jpg'
     assert 'score' in r[0]
-    assert 'dist' in r[0]
     assert 'id' in r[0]
 
 
@@ -132,7 +131,6 @@ def test_lookup_from_file(ses):
     assert len(r) == 1
     assert r[0]['path'] == 'test1.jpg'
     assert 'score' in r[0]
-    assert 'dist' in r[0]
     assert 'id' in r[0]
 
 def test_lookup_from_bytestream(ses):
@@ -142,7 +140,6 @@ def test_lookup_from_bytestream(ses):
     assert len(r) == 1
     assert r[0]['path'] == 'test1.jpg'
     assert 'score' in r[0]
-    assert 'dist' in r[0]
     assert 'id' in r[0]
 
 def test_lookup_with_cutoff(ses):
@@ -196,7 +193,7 @@ def test_lookup_with_filter_by_metadata(ses):
 
     r = ses.search_image('test1.jpg', pre_filter={"term": {"metadata.tenant_id": "bar-3"}})
     assert len(r) == 0
-    
+
 
 def test_all_orientations(ses):
     im = Image.open('test1.jpg')