Skip to content

Commit

Permalink
Upgrade to Elasticsearch 6.x (#14206)
Browse files Browse the repository at this point in the history
* Upgrade to Elasticsearch 6.x

Because ES 6.x removed mapping types, move update_count and download_count into
separate ES indexes. This requires a full reindex, including stats, but we need
to create an entirely new cluster anyway.

ES 5.x compatibility is kept for now.
  • Loading branch information
diox authored May 28, 2020
1 parent c15389f commit ec0d8f9
Show file tree
Hide file tree
Showing 20 changed files with 970 additions and 663 deletions.
54 changes: 37 additions & 17 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,27 @@ dist: xenial
python:
- 3.6

addons:
apt:
packages: &global_deps
- cmake
- swig
- elasticsearch
- gettext
- librsvg2-bin
- pngcrush
- uuid
- libgit2-dev

jobs:
fast_finish: true
include:
- { env: TOXENV=codestyle }
- { env: TOXENV=docs }
- { env: TOXENV=assets }
- { env: TOXENV=es }
- { env: TOXENV=addons-versions-files-ratings }
- { env: TOXENV=es ES_VERSION=5.x }
- { env: TOXENV=es ES_VERSION=6.x }
- { env: TOXENV=devhub }
- { env: TOXENV=reviewers-and-zadmin }
- { env: TOXENV=amo-lib-locales-and-signing }
Expand All @@ -28,31 +41,34 @@ cache:
- node_modules
- $HOME/.gimme

addons:
apt:
sources:
- elasticsearch-5.x
packages:
- cmake
- swig
- elasticsearch
- gettext
- librsvg2-bin
- pngcrush
- uuid
- libgit2-dev

services:
- mysql
- memcached
- elasticsearch
- redis

before_install:
- mysql -e 'create database olympia;'
- export GOPATH=$HOME/go
- export PATH=$HOME/usr/local/go/bin:$GOPATH/bin:$PATH
# Unfortunately we require the most recent libmagic-dev version to make use of more recent
# JSON detection features which we require for the code-manager related APIs
- echo "deb http://archive.ubuntu.com/ubuntu eoan main" | sudo tee -a /etc/apt/sources.list
- sudo apt-get update -qq
- sudo apt-get install -t eoan libmagic-dev libmagic1 libmagic-mgc
- echo "text/markdown md markdown" | sudo tee -a /etc/mime.types
- sudo cp ./docker/etc/mime.types /etc/mime.types
- |
if [ $TOXENV == "es" ]; then
if [ $ES_VERSION == "6.x" ]; then
curl -s -O https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-6.8.8.deb
sudo dpkg -i --force-confnew elasticsearch-6.8.8.deb
sudo sed -i.old 's/-Xms1g/-Xms128m/' /etc/elasticsearch/jvm.options
sudo sed -i.old 's/-Xmx1g/-Xmx128m/' /etc/elasticsearch/jvm.options
echo -e '-XX:+DisableExplicitGC\n-Djdk.io.permissionsUseCanonicalPath=true\n-Dlog4j.skipJansi=true\n-server\n' | sudo tee -a /etc/elasticsearch/jvm.options
sudo chown -R elasticsearch:elasticsearch /etc/default/elasticsearch
fi
sudo systemctl start elasticsearch
fi
install:
- nvm current
Expand All @@ -65,7 +81,11 @@ before_script:
- mysql --version
- node --version
- java -version
- curl -v http://localhost:9200/
- |
if [ $TOXENV == "es" ]; then
sleep 10;
curl -v http://localhost:9200/;
fi
- sudo touch /addons-server-docker-container

script:
Expand Down
2 changes: 1 addition & 1 deletion conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def unpin_db(request):
request.addfinalizer(pinning.unpin_this_thread)


@pytest.fixture(autouse=True)
@pytest.fixture(autouse=True, scope='class')
def mock_elasticsearch():
"""Mock ElasticSearch in tests by default.
Expand Down
2 changes: 1 addition & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ services:
- MYSQL_DATABASE=olympia

elasticsearch:
image: docker.elastic.co/elasticsearch/elasticsearch:5.4.1
image: docker.elastic.co/elasticsearch/elasticsearch:6.8.8
environment:
# Disable all xpack related features to avoid unrelated logging
# in docker logs. https://github.com/mozilla/addons-server/issues/8887
Expand Down
247 changes: 124 additions & 123 deletions src/olympia/addons/indexers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import copy

from django.conf import settings

import olympia.core.logger
from olympia import amo
from olympia.amo.indexers import BaseSearchIndexer
Expand Down Expand Up @@ -36,11 +38,100 @@ class AddonIndexer(BaseSearchIndexer):
'summary_l10n_*',
)

# Per-index settings (custom analyzers, tokenizer, normalizer and token
# filters) passed to ES when the addons index is created.
index_settings = {
    'analysis': {
        'analyzer': {
            'standard_with_word_split': {
                # This analyzer tries to split the text into words by using
                # various methods. It also lowercases them and makes sure
                # each token is only returned once.
                # Only use for short things with extremely meaningful
                # content like add-on name - it makes too many
                # modifications to be useful for things like descriptions,
                # for instance.
                'tokenizer': 'standard',
                'filter': [
                    'standard', 'custom_word_delimiter', 'lowercase',
                    'stop', 'custom_dictionary_decompounder', 'unique',
                ]
            },
            'trigram': {
                # Analyzer that splits the text into trigrams.
                'tokenizer': 'ngram_tokenizer',
                'filter': [
                    'lowercase',
                ]
            },
        },
        'tokenizer': {
            'ngram_tokenizer': {
                'type': 'ngram',
                'min_gram': 3,
                'max_gram': 3,
                'token_chars': ['letter', 'digit']
            }
        },
        'normalizer': {
            'lowercase_keyword_normalizer': {
                # By default keywords are indexed 'as-is', but for exact
                # name matches we need to lowercase them before indexing,
                # so this normalizer does that for us.
                'type': 'custom',
                'filter': ['lowercase'],
            },
        },
        'filter': {
            'custom_word_delimiter': {
                # This filter is useful for add-on names that have multiple
                # words stuck together in a way that is easy to
                # recognize, like FooBar, which should be indexed as FooBar
                # and Foo Bar. (preserve_original: True makes us index both
                # the original and the split version.)
                'type': 'word_delimiter',
                'preserve_original': True
            },
            'custom_dictionary_decompounder': {
                # This filter is also useful for add-on names that have
                # multiple words stuck together, but without a pattern
                # that we can automatically recognize. To deal with those,
                # we use a small dictionary of common words. It allows us
                # to index 'awesometabpassword' as 'awesome tab password',
                # helping users looking for 'tab password' find that addon.
                'type': 'dictionary_decompounder',
                'word_list': [
                    'all', 'auto', 'ball', 'bar', 'block', 'blog',
                    'bookmark', 'browser', 'bug', 'button', 'cat', 'chat',
                    'click', 'clip', 'close', 'color', 'context', 'cookie',
                    'cool', 'css', 'delete', 'dictionary', 'down',
                    'download', 'easy', 'edit', 'fill', 'fire', 'firefox',
                    'fix', 'flag', 'flash', 'fly', 'forecast', 'fox',
                    'foxy', 'google', 'grab', 'grease', 'html', 'http',
                    'image', 'input', 'inspect', 'inspector', 'iris', 'js',
                    'key', 'keys', 'lang', 'link', 'mail', 'manager',
                    'map', 'mega', 'menu', 'menus', 'monkey', 'name',
                    'net', 'new', 'open', 'password', 'persona', 'privacy',
                    'query', 'screen', 'scroll', 'search', 'secure',
                    'select', 'smart', 'spring', 'status', 'style',
                    'super', 'sync', 'tab', 'text', 'think', 'this',
                    'time', 'title', 'translate', 'tree', 'undo', 'upload',
                    'url', 'user', 'video', 'window', 'with', 'word',
                    'zilla',
                ]
            },
        }
    }
}

@classmethod
def get_model(cls):
    """Return the Django model whose instances this indexer indexes."""
    # Local import — presumably avoids a circular import between the
    # indexers and models modules; verify against module layout.
    from olympia.addons.models import Addon
    return Addon

@classmethod
def get_index_alias(cls):
    """Return the index alias name for addons.

    Reads the 'default' entry from settings.ES_INDEXES; returns None if
    that key is absent (dict.get with no default).
    """
    return settings.ES_INDEXES.get('default')

@classmethod
def get_mapping(cls):
doc_name = cls.get_doctype_name()
Expand Down Expand Up @@ -244,7 +335,7 @@ def extract_version(cls, obj, version_obj):
if version_obj.license:
data['license'] = {
'id': version_obj.license.id,
'builtin': version_obj.license.builtin,
'builtin': bool(version_obj.license.builtin),
'url': version_obj.license.url,
}
attach_trans_dict(License, [version_obj.license])
Expand Down Expand Up @@ -363,129 +454,39 @@ def extract_document(cls, obj):

return data


# addons index settings.
INDEX_SETTINGS = {
'analysis': {
'analyzer': {
'standard_with_word_split': {
# This analyzer tries to split the text into words by using
# various methods. It also lowercases them and make sure each
# token is only returned once.
# Only use for short things with extremely meaningful content
# like add-on name - it makes too many modifications to be
# useful for things like descriptions, for instance.
'tokenizer': 'standard',
'filter': [
'standard', 'custom_word_delimiter', 'lowercase', 'stop',
'custom_dictionary_decompounder', 'unique',
]
},
'trigram': {
# Analyzer that splits the text into trigrams.
'tokenizer': 'ngram_tokenizer',
'filter': [
'lowercase',
]
},
},
'tokenizer': {
'ngram_tokenizer': {
'type': 'ngram',
'min_gram': 3,
'max_gram': 3,
'token_chars': ['letter', 'digit']
}
},
'normalizer': {
'lowercase_keyword_normalizer': {
# By default keywords are indexed 'as-is', but for exact name
# matches we need to lowercase them before indexing, so this
# normalizer does that for us.
'type': 'custom',
'filter': ['lowercase'],
},
},
'filter': {
'custom_word_delimiter': {
# This filter is useful for add-on names that have multiple
# words sticked together in a way that is easy to recognize,
# like FooBar, which should be indexed as FooBar and Foo Bar.
# (preserve_original: True makes us index both the original
# and the split version.)
'type': 'word_delimiter',
'preserve_original': True
@classmethod
def create_new_index(cls, index_name):
"""
Create a new index for addons in ES.
Intended to be used by reindexation (and tests), generally a bad idea
to call manually.
"""
index_settings = copy.deepcopy(cls.index_settings)

config = {
'mappings': {
cls.get_doctype_name(): cls.get_mapping(),
},
'custom_dictionary_decompounder': {
# This filter is also useful for add-on names that have
# multiple words sticked together, but without a pattern that
# we can automatically recognize. To deal with those, we use
# a small dictionary of common words. It allows us to index
# 'awesometabpassword' as 'awesome tab password', helping
# users looking for 'tab password' find that add-on.
'type': 'dictionary_decompounder',
'word_list': [
'all', 'auto', 'ball', 'bar', 'block', 'blog', 'bookmark',
'browser', 'bug', 'button', 'cat', 'chat', 'click', 'clip',
'close', 'color', 'context', 'cookie', 'cool', 'css',
'delete', 'dictionary', 'down', 'download', 'easy', 'edit',
'fill', 'fire', 'firefox', 'fix', 'flag', 'flash', 'fly',
'forecast', 'fox', 'foxy', 'google', 'grab', 'grease',
'html', 'http', 'image', 'input', 'inspect', 'inspector',
'iris', 'js', 'key', 'keys', 'lang', 'link', 'mail',
'manager', 'map', 'mega', 'menu', 'menus', 'monkey',
'name', 'net', 'new', 'open', 'password', 'persona',
'privacy', 'query', 'screen', 'scroll', 'search', 'secure',
'select', 'smart', 'spring', 'status', 'style', 'super',
'sync', 'tab', 'text', 'think', 'this', 'time', 'title',
'translate', 'tree', 'undo', 'upload', 'url', 'user',
'video', 'window', 'with', 'word', 'zilla',
]
'settings': {
# create_index will add its own index settings like number of
# shards and replicas.
'index': index_settings
},
}
}
}


def create_new_index(index_name=None):
"""
Create a new index for addons in ES.
Intended to be used by reindexation (and tests), generally a bad idea to
call manually.
"""
if index_name is None:
index_name = AddonIndexer.get_index_alias()
create_index(index_name, config)

index_settings = copy.deepcopy(INDEX_SETTINGS)

config = {
'mappings': get_mappings(),
'settings': {
# create_index will add its own index settings like number of
# shards and replicas.
'index': index_settings
},
}
create_index(index_name, config)


def get_mappings():
"""
Return a dict with all addons-related ES mappings.
"""
indexers = (AddonIndexer,)
return {idxr.get_doctype_name(): idxr.get_mapping() for idxr in indexers}


def reindex_tasks_group(index_name):
    """
    Return the group of tasks to execute for a full reindex of addons on the
    index called `index_name` (which is not an alias but the real index name).

    NOTE(review): `index_name` is never referenced in the body below —
    presumably `index_addons` resolves the target index on its own; confirm
    before relying on the parameter.
    """
    # Local imports — presumably to avoid circular imports; verify.
    from olympia.addons.models import Addon
    from olympia.addons.tasks import index_addons

    # All addon ids from the unfiltered manager, ordered by id so the
    # chunking below is deterministic.
    ids = Addon.unfiltered.values_list('id', flat=True).order_by('id')
    # Each task signature covers at most 150 addons.
    chunk_size = 150
    return create_chunked_tasks_signatures(index_addons, list(ids), chunk_size)
@classmethod
def reindex_tasks_group(cls, index_name):
    """
    Return the group of tasks to execute for a full reindex of addons on
    the index called `index_name` (which is not an alias but the real
    index name).

    NOTE(review): `index_name` is never referenced in the body below —
    presumably `index_addons` resolves the target index on its own;
    confirm before relying on the parameter.
    """
    # Local import — presumably to avoid a circular import; verify.
    from olympia.addons.tasks import index_addons

    # All ids of the indexer's model from the unfiltered manager, ordered
    # by id so the chunking below is deterministic.
    ids = cls.get_model().unfiltered.values_list(
        'id', flat=True).order_by('id')
    # Each task signature covers at most 150 addons.
    chunk_size = 150
    return create_chunked_tasks_signatures(
        index_addons, list(ids), chunk_size)
Loading

0 comments on commit ec0d8f9

Please sign in to comment.