Skip to content

Commit

Permalink
Upgrade to Elasticsearch 6.x (#14206)
Browse files Browse the repository at this point in the history
* Upgrade to Elasticsearch 6.x

Because ES 6.x removed mapping types, move update_count and download_count into
separate ES indexes. This requires a full reindex, including stats, but we need
to create an entirely new cluster anyway.

ES 5.x compatibility is kept for now.
  • Loading branch information
diox authored May 28, 2020
1 parent c15389f commit ec0d8f9
Show file tree
Hide file tree
Showing 20 changed files with 970 additions and 663 deletions.
54 changes: 37 additions & 17 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,27 @@ dist: xenial
python:
- 3.6

addons:
apt:
packages: &global_deps
- cmake
- swig
- elasticsearch
- gettext
- librsvg2-bin
- pngcrush
- uuid
- libgit2-dev

jobs:
fast_finish: true
include:
- { env: TOXENV=codestyle }
- { env: TOXENV=docs }
- { env: TOXENV=assets }
- { env: TOXENV=es }
- { env: TOXENV=addons-versions-files-ratings }
- { env: TOXENV=es ES_VERSION=5.x }
- { env: TOXENV=es ES_VERSION=6.x }
- { env: TOXENV=devhub }
- { env: TOXENV=reviewers-and-zadmin }
- { env: TOXENV=amo-lib-locales-and-signing }
Expand All @@ -28,31 +41,34 @@ cache:
- node_modules
- $HOME/.gimme

addons:
apt:
sources:
- elasticsearch-5.x
packages:
- cmake
- swig
- elasticsearch
- gettext
- librsvg2-bin
- pngcrush
- uuid
- libgit2-dev

services:
- mysql
- memcached
- elasticsearch
- redis

before_install:
- mysql -e 'create database olympia;'
- export GOPATH=$HOME/go
- export PATH=$HOME/usr/local/go/bin:$GOPATH/bin:$PATH
# Unfortunately we require the most recent libmagic-dev version to make use of more recent
# JSON detection features which we require for the code-manager related APIs
- echo "deb http://archive.ubuntu.com/ubuntu eoan main" | sudo tee -a /etc/apt/sources.list
- sudo apt-get update -qq
- sudo apt-get install -t eoan libmagic-dev libmagic1 libmagic-mgc
- echo "text/markdown md markdown" | sudo tee -a /etc/mime.types
- sudo cp ./docker/etc/mime.types /etc/mime.types
- |
if [ $TOXENV == "es" ]; then
if [ $ES_VERSION == "6.x" ]; then
curl -s -O https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-6.8.8.deb
sudo dpkg -i --force-confnew elasticsearch-6.8.8.deb
sudo sed -i.old 's/-Xms1g/-Xms128m/' /etc/elasticsearch/jvm.options
sudo sed -i.old 's/-Xmx1g/-Xmx128m/' /etc/elasticsearch/jvm.options
echo -e '-XX:+DisableExplicitGC\n-Djdk.io.permissionsUseCanonicalPath=true\n-Dlog4j.skipJansi=true\n-server\n' | sudo tee -a /etc/elasticsearch/jvm.options
sudo chown -R elasticsearch:elasticsearch /etc/default/elasticsearch
fi
sudo systemctl start elasticsearch
fi
install:
- nvm current
Expand All @@ -65,7 +81,11 @@ before_script:
- mysql --version
- node --version
- java -version
- curl -v http://localhost:9200/
- |
if [ $TOXENV == "es" ]; then
sleep 10;
curl -v http://localhost:9200/;
fi
- sudo touch /addons-server-docker-container

script:
Expand Down
2 changes: 1 addition & 1 deletion conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def unpin_db(request):
request.addfinalizer(pinning.unpin_this_thread)


@pytest.fixture(autouse=True)
@pytest.fixture(autouse=True, scope='class')
def mock_elasticsearch():
"""Mock ElasticSearch in tests by default.
Expand Down
2 changes: 1 addition & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ services:
- MYSQL_DATABASE=olympia

elasticsearch:
image: docker.elastic.co/elasticsearch/elasticsearch:5.4.1
image: docker.elastic.co/elasticsearch/elasticsearch:6.8.8
environment:
# Disable all xpack related features to avoid unrelated logging
# in docker logs. https://github.com/mozilla/addons-server/issues/8887
Expand Down
247 changes: 124 additions & 123 deletions src/olympia/addons/indexers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import copy

from django.conf import settings

import olympia.core.logger
from olympia import amo
from olympia.amo.indexers import BaseSearchIndexer
Expand Down Expand Up @@ -36,11 +38,100 @@ class AddonIndexer(BaseSearchIndexer):
'summary_l10n_*',
)

# Per-index settings (custom analyzers, tokenizer, normalizer and token
# filters) passed to ES when the addons index is created.
index_settings = {
    'analysis': {
        'analyzer': {
            'standard_with_word_split': {
                # This analyzer tries to split the text into words by using
                # various methods. It also lowercases them and makes sure
                # each token is only returned once.
                # Only use for short things with extremely meaningful
                # content like add-on name - it makes too many
                # modifications to be useful for things like descriptions,
                # for instance.
                'tokenizer': 'standard',
                'filter': [
                    'standard', 'custom_word_delimiter', 'lowercase',
                    'stop', 'custom_dictionary_decompounder', 'unique',
                ]
            },
            'trigram': {
                # Analyzer that splits the text into trigrams.
                'tokenizer': 'ngram_tokenizer',
                'filter': [
                    'lowercase',
                ]
            },
        },
        'tokenizer': {
            'ngram_tokenizer': {
                'type': 'ngram',
                'min_gram': 3,
                'max_gram': 3,
                'token_chars': ['letter', 'digit']
            }
        },
        'normalizer': {
            'lowercase_keyword_normalizer': {
                # By default keywords are indexed 'as-is', but for exact
                # name matches we need to lowercase them before indexing,
                # so this normalizer does that for us.
                'type': 'custom',
                'filter': ['lowercase'],
            },
        },
        'filter': {
            'custom_word_delimiter': {
                # This filter is useful for add-on names that have multiple
                # words stuck together in a way that is easy to
                # recognize, like FooBar, which should be indexed as FooBar
                # and Foo Bar. (preserve_original: True makes us index both
                # the original and the split version.)
                'type': 'word_delimiter',
                'preserve_original': True
            },
            'custom_dictionary_decompounder': {
                # This filter is also useful for add-on names that have
                # multiple words stuck together, but without a pattern
                # that we can automatically recognize. To deal with those,
                # we use a small dictionary of common words. It allows us
                # to index 'awesometabpassword' as 'awesome tab password',
                # helping users looking for 'tab password' find that addon.
                'type': 'dictionary_decompounder',
                'word_list': [
                    'all', 'auto', 'ball', 'bar', 'block', 'blog',
                    'bookmark', 'browser', 'bug', 'button', 'cat', 'chat',
                    'click', 'clip', 'close', 'color', 'context', 'cookie',
                    'cool', 'css', 'delete', 'dictionary', 'down',
                    'download', 'easy', 'edit', 'fill', 'fire', 'firefox',
                    'fix', 'flag', 'flash', 'fly', 'forecast', 'fox',
                    'foxy', 'google', 'grab', 'grease', 'html', 'http',
                    'image', 'input', 'inspect', 'inspector', 'iris', 'js',
                    'key', 'keys', 'lang', 'link', 'mail', 'manager',
                    'map', 'mega', 'menu', 'menus', 'monkey', 'name',
                    'net', 'new', 'open', 'password', 'persona', 'privacy',
                    'query', 'screen', 'scroll', 'search', 'secure',
                    'select', 'smart', 'spring', 'status', 'style',
                    'super', 'sync', 'tab', 'text', 'think', 'this',
                    'time', 'title', 'translate', 'tree', 'undo', 'upload',
                    'url', 'user', 'video', 'window', 'with', 'word',
                    'zilla',
                ]
            },
        }
    }
}

@classmethod
def get_model(cls):
    """Return the Django model whose instances this indexer indexes."""
    # Local import — presumably avoids a circular import between the
    # indexers and models modules; verify against module layout.
    from olympia.addons.models import Addon
    return Addon

@classmethod
def get_index_alias(cls):
    """Return the index alias name for addons.

    Reads the 'default' entry from settings.ES_INDEXES; returns None if
    that key is absent (dict.get with no default).
    """
    return settings.ES_INDEXES.get('default')

@classmethod
def get_mapping(cls):
doc_name = cls.get_doctype_name()
Expand Down Expand Up @@ -244,7 +335,7 @@ def extract_version(cls, obj, version_obj):
if version_obj.license:
data['license'] = {
'id': version_obj.license.id,
'builtin': version_obj.license.builtin,
'builtin': bool(version_obj.license.builtin),
'url': version_obj.license.url,
}
attach_trans_dict(License, [version_obj.license])
Expand Down Expand Up @@ -363,129 +454,39 @@ def extract_document(cls, obj):

return data


# addons index settings.
INDEX_SETTINGS = {
'analysis': {
'analyzer': {
'standard_with_word_split': {
# This analyzer tries to split the text into words by using
# various methods. It also lowercases them and make sure each
# token is only returned once.
# Only use for short things with extremely meaningful content
# like add-on name - it makes too many modifications to be
# useful for things like descriptions, for instance.
'tokenizer': 'standard',
'filter': [
'standard', 'custom_word_delimiter', 'lowercase', 'stop',
'custom_dictionary_decompounder', 'unique',
]
},
'trigram': {
# Analyzer that splits the text into trigrams.
'tokenizer': 'ngram_tokenizer',
'filter': [
'lowercase',
]
},
},
'tokenizer': {
'ngram_tokenizer': {
'type': 'ngram',
'min_gram': 3,
'max_gram': 3,
'token_chars': ['letter', 'digit']
}
},
'normalizer': {
'lowercase_keyword_normalizer': {
# By default keywords are indexed 'as-is', but for exact name
# matches we need to lowercase them before indexing, so this
# normalizer does that for us.
'type': 'custom',
'filter': ['lowercase'],
},
},
'filter': {
'custom_word_delimiter': {
# This filter is useful for add-on names that have multiple
# words sticked together in a way that is easy to recognize,
# like FooBar, which should be indexed as FooBar and Foo Bar.
# (preserve_original: True makes us index both the original
# and the split version.)
'type': 'word_delimiter',
'preserve_original': True
@classmethod
def create_new_index(cls, index_name):
"""
Create a new index for addons in ES.
Intended to be used by reindexation (and tests), generally a bad idea
to call manually.
"""
index_settings = copy.deepcopy(cls.index_settings)

config = {
'mappings': {
cls.get_doctype_name(): cls.get_mapping(),
},
'custom_dictionary_decompounder': {
# This filter is also useful for add-on names that have
# multiple words sticked together, but without a pattern that
# we can automatically recognize. To deal with those, we use
# a small dictionary of common words. It allows us to index
# 'awesometabpassword' as 'awesome tab password', helping
# users looking for 'tab password' find that add-on.
'type': 'dictionary_decompounder',
'word_list': [
'all', 'auto', 'ball', 'bar', 'block', 'blog', 'bookmark',
'browser', 'bug', 'button', 'cat', 'chat', 'click', 'clip',
'close', 'color', 'context', 'cookie', 'cool', 'css',
'delete', 'dictionary', 'down', 'download', 'easy', 'edit',
'fill', 'fire', 'firefox', 'fix', 'flag', 'flash', 'fly',
'forecast', 'fox', 'foxy', 'google', 'grab', 'grease',
'html', 'http', 'image', 'input', 'inspect', 'inspector',
'iris', 'js', 'key', 'keys', 'lang', 'link', 'mail',
'manager', 'map', 'mega', 'menu', 'menus', 'monkey',
'name', 'net', 'new', 'open', 'password', 'persona',
'privacy', 'query', 'screen', 'scroll', 'search', 'secure',
'select', 'smart', 'spring', 'status', 'style', 'super',
'sync', 'tab', 'text', 'think', 'this', 'time', 'title',
'translate', 'tree', 'undo', 'upload', 'url', 'user',
'video', 'window', 'with', 'word', 'zilla',
]
'settings': {
# create_index will add its own index settings like number of
# shards and replicas.
'index': index_settings
},
}
}
}


def create_new_index(index_name=None):
"""
Create a new index for addons in ES.
Intended to be used by reindexation (and tests), generally a bad idea to
call manually.
"""
if index_name is None:
index_name = AddonIndexer.get_index_alias()
create_index(index_name, config)

index_settings = copy.deepcopy(INDEX_SETTINGS)

config = {
'mappings': get_mappings(),
'settings': {
# create_index will add its own index settings like number of
# shards and replicas.
'index': index_settings
},
}
create_index(index_name, config)


def get_mappings():
"""
Return a dict with all addons-related ES mappings.
"""
indexers = (AddonIndexer,)
return {idxr.get_doctype_name(): idxr.get_mapping() for idxr in indexers}


def reindex_tasks_group(index_name):
    """
    Return the group of tasks to execute for a full reindex of addons on the
    index called `index_name` (which is not an alias but the real index name).

    NOTE(review): `index_name` is never referenced in the body below —
    presumably `index_addons` resolves the target index on its own; confirm
    before relying on the parameter.
    """
    # Local imports — presumably to avoid circular imports; verify.
    from olympia.addons.models import Addon
    from olympia.addons.tasks import index_addons

    # All addon ids from the unfiltered manager, ordered by id so the
    # chunking below is deterministic.
    ids = Addon.unfiltered.values_list('id', flat=True).order_by('id')
    # Each task signature covers at most 150 addons.
    chunk_size = 150
    return create_chunked_tasks_signatures(index_addons, list(ids), chunk_size)
@classmethod
def reindex_tasks_group(cls, index_name):
    """
    Return the group of tasks to execute for a full reindex of addons on
    the index called `index_name` (which is not an alias but the real
    index name).

    NOTE(review): `index_name` is never referenced in the body below —
    presumably `index_addons` resolves the target index on its own;
    confirm before relying on the parameter.
    """
    # Local import — presumably to avoid a circular import; verify.
    from olympia.addons.tasks import index_addons

    # All ids of the indexer's model from the unfiltered manager, ordered
    # by id so the chunking below is deterministic.
    ids = cls.get_model().unfiltered.values_list(
        'id', flat=True).order_by('id')
    # Each task signature covers at most 150 addons.
    chunk_size = 150
    return create_chunked_tasks_signatures(
        index_addons, list(ids), chunk_size)
Loading

0 comments on commit ec0d8f9

Please sign in to comment.