diff --git a/docs/basics/0002-installation.md b/docs/basics/0002-installation.md index f4de2d1d0b..73fcfbcdfb 100644 --- a/docs/basics/0002-installation.md +++ b/docs/basics/0002-installation.md @@ -78,7 +78,7 @@ admin@control.example.com:~$ commcare-cloud -h usage: commcare-cloud [-h] [--control] {64-test,development,echis,icds,icds-new,pna,production,softlayer,staging,swiss} - {bootstrap-users,ansible-playbook,django-manage,aps,tmux,ap,validate-environment-settings,restart-elasticsearch,deploy-stack,service,update-supervisor-confs,update-users,ping,migrate_couchdb,lookup,run-module,update-config,mosh,after-reboot,ssh,downtime,fab,update-local-known-hosts,migrate-couchdb,run-shell-command} + {bootstrap-users,ansible-playbook,django-manage,aps,tmux,ap,validate-environment-settings,deploy-stack,service,update-supervisor-confs,update-users,ping,migrate_couchdb,lookup,run-module,update-config,mosh,after-reboot,ssh,downtime,fab,update-local-known-hosts,migrate-couchdb,run-shell-command} ... ``` diff --git a/docs/changelog/index.md b/docs/changelog/index.md deleted file mode 100644 index c0893c1b17..0000000000 --- a/docs/changelog/index.md +++ /dev/null @@ -1,27 +0,0 @@ -# About this changelog - -The following changes to `commcare-cloud` that require your attention, -newest first. - -Changes that will require an action from anyone choosing -to update on or after the date listed will be maked "_action required_". -Those which are worth taking a look at but which may or may not require -an action on your part will be marked "_action optional_". - - -## Changelog - -### **2018-07-25** [Update Supervisor](0003-update-supervisor.md) -Ubuntu 14.04 `apt-get install supervisor` installs supervisor 3.0b. -We occasionally have issues that could be related to supervisor, -such as processes not stopping correctly. -To rule it out as a possible cause, -we decided it was better to be on a later version of supervisor, -and one that's not in beta. 
- -### **2018-07-13** [Update supervisor service definitions](0002-supervisor-service-definitions.md) (_action required_) -There are several CommCare specific processes that are defined in supervisor -configuration files. This change decouples the process definitions from code. - -### **2018-06-11** [Added encrypted temporary directory](0001-add-encrypted-tmp.md) (_action required_) -Some of the CommCare processes make use of temporary files to store client data (such as data exports) so in order to keep that data protected we have modified the setup to use an encrypted temporary directory. diff --git a/docs/commcare-cloud/commands/index.md b/docs/commcare-cloud/commands/index.md index 6aa5b9ea5f..be7f339986 100644 --- a/docs/commcare-cloud/commands/index.md +++ b/docs/commcare-cloud/commands/index.md @@ -10,7 +10,7 @@ All `commcare-cloud` commands take the following form: ``` commcare-cloud [--control] - {bootstrap-users,ansible-playbook,django-manage,aps,tmux,ap,validate-environment-settings,restart-elasticsearch,deploy-stack,service,update-supervisor-confs,update-users,ping,migrate_couchdb,lookup,run-module,update-config,copy-files,mosh,after-reboot,ssh,downtime,fab,update-local-known-hosts,list-dbs,migrate-couchdb,run-shell-command} + {bootstrap-users,ansible-playbook,django-manage,aps,tmux,ap,validate-environment-settings,deploy-stack,service,update-supervisor-confs,update-users,ping,migrate_couchdb,lookup,run-module,update-config,copy-files,mosh,after-reboot,ssh,downtime,fab,update-local-known-hosts,list-dbs,migrate-couchdb,run-shell-command} ... ``` @@ -716,29 +716,6 @@ for more detail in what can go here. authenticate using the pem file (or prompt for root password if there is no pem file) -#### `restart-elasticsearch` - -Do a rolling restart of elasticsearch. - -``` -commcare-cloud restart-elasticsearch [--use-factory-auth] -``` - -**This command is deprecated.** Use - -``` -commcare-cloud service elasticsearch restart -``` - -instead. 
- -##### Optional Arguments - -###### `--use-factory-auth` - -authenticate using the pem file (or prompt for root password if there is no pem file) - - #### `bootstrap-users` Add users to a set of new machines as root. @@ -865,8 +842,8 @@ Manage services. ``` commcare-cloud service [--only PROCESS_PATTERN] - {celery,commcare,couchdb,couchdb2,elasticsearch,formplayer,kafka,nginx,pillowtop,postgresql,rabbitmq,redis,riakcs,touchforms,webworker} - [{celery,commcare,couchdb,couchdb2,elasticsearch,formplayer,kafka,nginx,pillowtop,postgresql,rabbitmq,redis,riakcs,touchforms,webworker} ...] + {celery,commcare,couchdb,couchdb2,elasticsearch,elasticsearch-classic,formplayer,kafka,nginx,pillowtop,postgresql,rabbitmq,redis,riakcs,touchforms,webworker} + [{celery,commcare,couchdb,couchdb2,elasticsearch,elasticsearch-classic,formplayer,kafka,nginx,pillowtop,postgresql,rabbitmq,redis,riakcs,touchforms,webworker} ...] {start,stop,restart,status,help} ``` @@ -888,7 +865,7 @@ service and the `pgbouncer` service. We'll call the actual services ##### Positional Arguments -###### `{celery,commcare,couchdb,couchdb2,elasticsearch,formplayer,kafka,nginx,pillowtop,postgresql,rabbitmq,redis,riakcs,touchforms,webworker}` +###### `{celery,commcare,couchdb,couchdb2,elasticsearch,elasticsearch-classic,formplayer,kafka,nginx,pillowtop,postgresql,rabbitmq,redis,riakcs,touchforms,webworker}` The name of the service group(s) to apply the action to. There is a preset list of service groups that are supported. 
diff --git a/src/commcare_cloud/ansible/roles/elasticsearch/tasks/rolling_restart.yml b/src/commcare_cloud/ansible/roles/elasticsearch/tasks/rolling_restart.yml index ebf7a6bfd3..500f9a58e0 100644 --- a/src/commcare_cloud/ansible/roles/elasticsearch/tasks/rolling_restart.yml +++ b/src/commcare_cloud/ansible/roles/elasticsearch/tasks/rolling_restart.yml @@ -6,19 +6,35 @@ retries: 20 delay: 3 changed_when: result.stdout.find('"acknowledged":true') != -1 + tags: action_stop - name: stop node become: true service: name=elasticsearch state=stopped + tags: action_stop - name: wait for a few seconds for ES to stop pause: seconds=10 + tags: action_stop + +- name: get es instances to kill + shell: "ps aux | pgrep -f 'elasticsearc[h]'" + register: es_pids + failed_when: es_pids.rc != 0 and es_pids.rc != 1 + tags: action_stop + +- name: kill elasticsearch instances + shell: "pkill -f 'elasticsearc[h]'" + when: es_pids.rc == 0 + tags: action_stop - name: start node become: true service: name=elasticsearch state=started + tags: action_start - debug: msg="Sometimes we try to start the node too soon. 
If hung start node manually" + tags: action_start - name: wait for node to restart shell: "curl -I -s -m 2 http://{{es_host}}:9200 | head -n 1" @@ -26,6 +42,7 @@ until: result.stdout == "HTTP/1.1 200 OK" retries: 200 delay: 3 + tags: action_start - name: enable cluster routing shell: "curl -XPUT {{es_host}}:9200/_cluster/settings -d '{\"transient\" : {\"cluster.routing.allocation.enable\" : \"all\" }}'" @@ -34,6 +51,7 @@ retries: 20 delay: 3 changed_when: result.stdout.find('"acknowledged":true') != -1 + tags: action_start - name: wait for cluster to stabilize shell: "curl -s -m 2 {{es_host}}:9200/_cat/health | cut -d ' ' -f 4" @@ -41,3 +59,4 @@ until: result.stdout.find("green") != -1 retries: 200 delay: 30 + tags: action_start diff --git a/src/commcare_cloud/commands/ansible/ansible_playbook.py b/src/commcare_cloud/commands/ansible/ansible_playbook.py index 2e5a881049..aeea99e863 100644 --- a/src/commcare_cloud/commands/ansible/ansible_playbook.py +++ b/src/commcare_cloud/commands/ansible/ansible_playbook.py @@ -210,32 +210,6 @@ def run(self, args, unknown_args): return AnsiblePlaybook(self.parser).run(args, unknown_args, always_skip_check=True) -class RestartElasticsearch(_AnsiblePlaybookAlias): - command = 'restart-elasticsearch' - help = """ - Do a rolling restart of elasticsearch. - - **This command is deprecated.** Use - - ``` - commcare-cloud service elasticsearch restart - ``` - - instead. 
- """ - - def run(self, args, unknown_args): - args.playbook = 'es_rolling_restart.yml' - if not ask('Have you stopped all the elastic pillows?', strict=True, quiet=args.quiet): - return 0 # exit code - puts(colored.yellow( - "This will cause downtime on the order of seconds to minutes,\n" - "except in a few cases where an index is replicated across multiple nodes.")) - if not ask('Do a rolling restart of the ES cluster?', strict=True, quiet=args.quiet): - return 0 # exit code - return AnsiblePlaybook(self.parser).run(args, unknown_args) - - class BootstrapUsers(_AnsiblePlaybookAlias): command = 'bootstrap-users' help = """ diff --git a/src/commcare_cloud/commands/ansible/service.py b/src/commcare_cloud/commands/ansible/service.py index a6704cf57b..9545bd4de1 100644 --- a/src/commcare_cloud/commands/ansible/service.py +++ b/src/commcare_cloud/commands/ansible/service.py @@ -2,6 +2,7 @@ from abc import ABCMeta, abstractmethod, abstractproperty from collections import defaultdict, OrderedDict from itertools import groupby +import sys import attr import six @@ -14,6 +15,7 @@ get_celery_workers, get_pillowtop_processes ) +from commcare_cloud.cli_utils import ask from commcare_cloud.commands.ansible.run_module import run_ansible_module from commcare_cloud.commands.command_base import CommandBase, Argument from commcare_cloud.environment.main import get_environment @@ -274,10 +276,53 @@ class Nginx(AnsibleService): inventory_groups = ['proxy'] -class Elasticsearch(AnsibleService): +class ElasticsearchClassic(AnsibleService): + name = 'elasticsearch-classic' + service_name = 'elasticsearch' + inventory_groups = ['elasticsearch'] + + +class Elasticsearch(ServiceBase): name = 'elasticsearch' + service_name = 'elasticsearch' inventory_groups = ['elasticsearch'] + def execute_action(self, action, host_pattern=None, process_pattern=None): + if action == 'status': + return ElasticsearchClassic(self.environment, self.ansible_context).execute_action(action, host_pattern, 
process_pattern) + else: + if not ask( + "This function does more than stop and start the elasticsearch service. " + "For that, use elasticsearch-classic." + "\nStop will: stop pillows, stop es, and kill -9 if any processes still exist " + "after a period of time. " + "\nStart will start pillows and start elasticsearch " + "\nRestart is a stop followed by a start.\n Continue?", strict=False): + return 0 # exit code + if action == 'stop' or action == 'restart': + self._act_on_pillows(action='stop') + self._run_rolling_restart_yml(tags='action_stop') + + if action == 'start' or action == 'restart': + self._run_rolling_restart_yml(tags='action_start') + self._act_on_pillows(action='start') + + def _act_on_pillows(self, action): + # Used to stop or start pillows + service = Pillowtop(self.environment, AnsibleContext(None)) + exit_code = service.run(action=action) + if not exit_code == 0: + print("ERROR while trying to {} pillows. Exiting.".format(action)) + sys.exit(1) + + def _run_rolling_restart_yml(self, tags): + from commcare_cloud.commands.ansible.ansible_playbook import run_ansible_playbook + run_ansible_playbook(environment=self.environment, + playbook='es_rolling_restart.yml', + ansible_context=AnsibleContext(args=None), + unknown_args=['--tags={}'.format(tags)], + skip_check=True) + class Couchdb(AnsibleService): name = 'couchdb' @@ -491,6 +536,7 @@ def get_processes_by_host(all_hosts, process_descriptors, process_pattern=None): Couchdb2, RabbitMq, Elasticsearch, + ElasticsearchClassic, Redis, Riakcs, Kafka, diff --git a/src/commcare_cloud/commcare_cloud.py b/src/commcare_cloud/commcare_cloud.py index 4779ea4f57..834f7f3fde 100644 --- a/src/commcare_cloud/commcare_cloud.py +++ b/src/commcare_cloud/commcare_cloud.py @@ -19,7 +19,7 @@ from .commands.ansible.ansible_playbook import ( AnsiblePlaybook, - UpdateConfig, AfterReboot, RestartElasticsearch, BootstrapUsers, DeployStack, + UpdateConfig, AfterReboot, BootstrapUsers, DeployStack, UpdateUsers, 
UpdateSupervisorConfs, UpdateLocalKnownHosts, ) from commcare_cloud.commands.ansible.service import Service @@ -53,7 +53,6 @@ DeployStack, UpdateConfig, AfterReboot, - RestartElasticsearch, BootstrapUsers, UpdateUsers, UpdateSupervisorConfs,