Skip to content

Commit

Permalink
Don't store signature in ElasticSearch index
Browse files Browse the repository at this point in the history
* Sort/cutoff by elasticsearch relevance score instead
  • Loading branch information
Evgheni C committed May 15, 2017
1 parent d6fe49c commit 45afa8c
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 21 deletions.
16 changes: 8 additions & 8 deletions image_match/elasticsearch_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ def __init__(self, es, index='images', doc_type='image', timeout='10s', size=100
def search_single_record(self, rec, pre_filter=None):
path = rec.pop('path')
signature = rec.pop('signature')

if 'metadata' in rec:
rec.pop('metadata')

Expand All @@ -70,27 +71,26 @@ def search_single_record(self, rec, pre_filter=None):
size=self.size,
timeout=self.timeout)['hits']['hits']

sigs = np.array([x['_source']['signature'] for x in res])

if sigs.size == 0:
if len(res) == 0:
return []

dists = normalized_distance(sigs, np.array(signature))

formatted_res = [{'id': x['_id'],
'score': x['_score'],
'metadata': x['_source'].get('metadata'),
'path': x['_source'].get('url', x['_source'].get('path'))}
for x in res]

for i, row in enumerate(formatted_res):
row['dist'] = dists[i]
formatted_res = filter(lambda y: y['dist'] < self.distance_cutoff, formatted_res)
formatted_res = filter(lambda y: y['score'] > self.score_cutoff, formatted_res)

return formatted_res

def insert_single_record(self, rec, refresh_after=False):
    """Index one image record in ElasticSearch, leaving out its signature.

    Args:
        rec (dict): the record to index. It is stamped in place with a
            'timestamp' key; a 'signature' key, if present, is excluded
            from the indexed document but left untouched on ``rec``
        refresh_after (Optional [bool]): forwarded as ``refresh`` to the
            ElasticSearch ``index`` call (default False)
    """
    rec['timestamp'] = datetime.now()

    # Don't store signature in index. Build the document from a filtered
    # copy so the caller's record keeps its signature instead of having it
    # silently removed as a side effect of inserting.
    body = {key: value for key, value in rec.items() if key != 'signature'}

    self.es.index(index=self.index, doc_type=self.doc_type, body=body, refresh=refresh_after)

def delete_duplicates(self, path):
Expand Down
40 changes: 36 additions & 4 deletions image_match/signature_database_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ def insert_single_record(self, rec):
raise NotImplementedError

def __init__(self, k=16, N=63, n_grid=9,
crop_percentile=(5, 95), distance_cutoff=0.45,
crop_percentile=(5, 95), distance_cutoff=0.45, score_cutoff=9.0,
*signature_args, **signature_kwargs):
"""Set up storage scheme for images
Expand Down Expand Up @@ -159,6 +159,8 @@ def __init__(self, k=16, N=63, n_grid=9,
considering how much variance to keep in the image (default (5, 95))
distance_cutoff (Optional [float]): maximum image signature distance to
be considered a match (default 0.45)
score_cutoff (Optional [float]): minimum ElasticSearch relevance score to
be considered a match (default 9.0)
*signature_args: Variable length argument list to pass to ImageSignature
**signature_kwargs: Arbitrary keyword arguments to pass to ImageSignature
Expand All @@ -175,14 +177,22 @@ def __init__(self, k=16, N=63, n_grid=9,
self.N = N
self.n_grid = n_grid

# Check float input
# Check float input for distance cutoff
if type(distance_cutoff) is not float:
raise TypeError('distance_cutoff should be a float')
if distance_cutoff < 0.:
raise ValueError('distance_cutoff should be > 0 (got %r)' % distance_cutoff)

self.distance_cutoff = distance_cutoff

# Check float input for elasticsearch score cutoff
if type(score_cutoff) is not float:
raise TypeError('score_cutoff should be a float')
if score_cutoff < 0.:
raise ValueError('score_cutoff should be > 0 (got %r)' % score_cutoff)

self.score_cutoff = score_cutoff

self.crop_percentile = crop_percentile

self.gis = ImageSignature(n=n_grid, crop_percentiles=crop_percentile, *signature_args, **signature_kwargs)
Expand Down Expand Up @@ -222,7 +232,7 @@ def search_image(self, path, all_orientations=False, bytestream=False, pre_filte
pre_filter (Optional[dict]): filters list before applying the matching algorithm
(default None)
Returns:
a formatted list of dicts representing unique matches, sorted by dist
a formatted list of dicts representing unique matches, sorted by ascending dist, or by descending score when the results come from ElasticSearch
For example, if three matches are found:
Expand All @@ -238,6 +248,19 @@ def search_image(self, path, all_orientations=False, bytestream=False, pre_filte
'path': u'https://c2.staticflickr.com/8/7158/6814444991_08d82de57e_z.jpg'}
]
Here is an ElasticSearch example, sorted by descending score (the 4.0 hit is only returned when score_cutoff is set below 4.0):
[
{'score': 35.0,
'id': u'AVM37nMg0osmmAxpPvx6',
'path': u'https://upload.wikimedia.org/wikipedia/commons/thumb/e/ec/Mona_Lisa,_by_Leonardo_da_Vinci,_from_C2RMF_retouched.jpg/687px-Mona_Lisa,_by_Leonardo_da_Vinci,_from_C2RMF_retouched.jpg'},
{'score': 10.0,
'id': u'AVM37p530osmmAxpPvx9',
'path': u'https://c2.staticflickr.com/8/7158/6814444991_08d82de57e_z.jpg'},
{'score': 4.0,
'id': u'AVM37oZq0osmmAxpPvx7',
'path': u'https://pixabay.com/static/uploads/photo/2012/11/28/08/56/mona-lisa-67506_960_720.jpg'}
]
"""
img = self.gis.preprocess_image(path, bytestream)

Expand Down Expand Up @@ -277,12 +300,21 @@ def search_image(self, path, all_orientations=False, bytestream=False, pre_filte

ids = set()
unique = []
hasScore = False
for item in result:
if 'score' in item:
hasScore = True

if item['id'] not in ids:
unique.append(item)
ids.add(item['id'])

r = sorted(unique, key=itemgetter('dist'))
# If data comes from ElasticSearch - sort by score, otherwise - default to sorting by dist
if hasScore:
r = sorted(unique, key=itemgetter('score'), reverse=True)
else:
r = sorted(unique, key=itemgetter('dist'))

return r


Expand Down
15 changes: 6 additions & 9 deletions tests/test_elasticsearch_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,15 @@
DOC_TYPE = 'image'
MAPPINGS = {
"mappings": {
DOC_TYPE: {
DOC_TYPE: {
"dynamic": True,
"properties": {
"metadata": {
"properties": {
"metadata": {
"type": "object",
"dynamic": True,
"properties": {
"properties": {
"tenant_id": { "type": "keyword" }
}
}
}
}
}
Expand Down Expand Up @@ -122,7 +122,6 @@ def test_lookup_from_url(ses):
assert len(r) == 1
assert r[0]['path'] == 'test1.jpg'
assert 'score' in r[0]
assert 'dist' in r[0]
assert 'id' in r[0]


Expand All @@ -132,7 +131,6 @@ def test_lookup_from_file(ses):
assert len(r) == 1
assert r[0]['path'] == 'test1.jpg'
assert 'score' in r[0]
assert 'dist' in r[0]
assert 'id' in r[0]

def test_lookup_from_bytestream(ses):
Expand All @@ -142,7 +140,6 @@ def test_lookup_from_bytestream(ses):
assert len(r) == 1
assert r[0]['path'] == 'test1.jpg'
assert 'score' in r[0]
assert 'dist' in r[0]
assert 'id' in r[0]

def test_lookup_with_cutoff(ses):
Expand Down Expand Up @@ -196,7 +193,7 @@ def test_lookup_with_filter_by_metadata(ses):

r = ses.search_image('test1.jpg', pre_filter={"term": {"metadata.tenant_id": "bar-3"}})
assert len(r) == 0


def test_all_orientations(ses):
im = Image.open('test1.jpg')
Expand Down

0 comments on commit 45afa8c

Please sign in to comment.