-
Notifications
You must be signed in to change notification settings - Fork 8
/
ansible-autoremediation.yml
262 lines (262 loc) · 14.7 KB
/
ansible-autoremediation.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
# auto-remediation and self-healing with Ansible Automation Platform (Tower)
---
# Play: react to a Dynatrace problem notification.
#   state == "OPEN"     -> look up a remediation action attached to the events
#                          of the root cause entity and launch it on Ansible
#                          Tower, commenting on the problem along the way.
#   state == "RESOLVED" -> if an earlier run of this handler triggered the
#                          remediation, comment on the problem, emit a metric
#                          and optionally send an email.
#
# Every variable that is read but never set below (state, pid, dynatrace_*,
# ansible_tower_*, tower_job_*, metrics, metrics_group, smtp_*, send_email,
# email_receiver, problem*) has to be supplied by the caller.
#
# The Dynatrace API token is sent in the Authorization header instead of an
# `Api-Token` query parameter: the uri module echoes the requested URL in its
# result, so a token in the query string leaks into registered variables and
# job output.
- name: "ansible auto-remediation handler for dynatrace"
  hosts: localhost
  tasks:
    # Problem State OPEN
    - name: "problem state is OPEN"
      # A block-level `when` is applied to every task nested in the block, so
      # the inner tasks do not repeat the state check.
      when: state == "OPEN"
      block:
        # Before auto-remediation
        - name: "update problem with comment before auto-remediation"
          uri:
            url: "https://{{ dynatrace_api_url }}/api/v1/problem/details/{{ pid }}/comments"
            method: POST
            headers:
              Authorization: "Api-Token {{ dynatrace_token }}"
            body_format: json
            body:
              comment: "Problem notification received by Ansible Tower. Attempting to perform auto-remediation."
              user: "Job {{ tower_job_id }}"
              context: "Ansible Tower"
            status_code: 200

        - name: "metrics - attempted remediations (ansible.remediations.attempted)"
          when:
            - metrics | default(false) | bool
            - metrics_group is defined
          uri:
            url: http://localhost:14499/metrics/ingest
            method: POST
            headers:
              content-type: "text/plain; charset=utf-8"
            body: 'ansible.{{ metrics_group }}.remediations.attempted,id="{{ tower_job_id }}",template="{{ tower_job_template_name }}",type="{{ tower_job_launch_type }}" 1'
            status_code: 202
          # Metric ingestion is best effort; a failure must not stop the play.
          ignore_errors: true

        # Determine the remediation action if one exists
        # Determine root cause if it exists
        - name: "retrieve problem details from dynatrace api"
          uri:
            url: "https://{{ dynatrace_api_url }}/api/v1/problem/details/{{ pid }}"
            headers:
              Authorization: "Api-Token {{ dynatrace_token }}"
            status_code: 200
          register: dynatrace_problem_details_response

        - name: "determine if there is a root cause identified"
          set_fact:
            dynatrace_rootcause_detected: "{{ dynatrace_problem_details_response.json.result.hasRootCause | bool }}"

        # Without dynatrace_offset no root cause entity is recorded and every
        # later lookup is skipped.
        # NOTE(review): the offset is multiplied by 60000, which looks like a
        # minutes-to-milliseconds conversion against an epoch-millisecond
        # startTime - confirm against the caller.
        - name: "identify the root cause entity"
          when:
            - dynatrace_rootcause_detected | bool
            - dynatrace_offset is defined
            - item.isRootCause | default(false) | bool
          set_fact:
            dynatrace_rootcause_entity: "{{ item.entityId }}"
            dynatrace_rootcause_status: "{{ item.status }}"
            # Cast both operands: a value that arrives as a string would
            # otherwise be repeated by `*` instead of multiplied.
            dynatrace_rootcause_startTime: "{{ (item.startTime | int) - ((dynatrace_offset | int) * 60000) }}"
          with_items: "{{ dynatrace_problem_details_response.json.result.rankedEvents }}"

        - name: "output the root cause identity"
          when: dynatrace_rootcause_entity is defined
          debug:
            var: dynatrace_rootcause_entity

        # Check for root cause remediation action from deployment events
        - name: "check for root cause remediation from deployment events"
          when:
            - dynatrace_rootcause_entity is defined
            - dynatrace_remediation_action is not defined
          block:
            - name: "retrieve root cause entity events from dynatrace api"
              uri:
                url: "https://{{ dynatrace_api_url }}/api/v1/events?from={{ dynatrace_rootcause_startTime }}&eventType=CUSTOM_DEPLOYMENT&entityId={{ dynatrace_rootcause_entity }}"
                headers:
                  Authorization: "Api-Token {{ dynatrace_token }}"
                status_code: 200
              register: dynatrace_events_deployments_response

            # `register` defines the variable even for a skipped task, so the
            # guard checks for the parsed response body instead.
            - name: "determine if there are any deployment events"
              when: dynatrace_events_deployments_response.json is defined
              set_fact:
                dynatrace_events_deployments_detected: "{{ dynatrace_events_deployments_response.json.totalEventCount | int }}"

            - name: "retrieve event remediation action"
              when:
                # The count is stored through a template and therefore a
                # string; compare it numerically.
                - dynatrace_events_deployments_detected | default(0) | int > 0
                # Events that carry no remediation action are skipped.
                - item.remediationAction is defined
              set_fact:
                dynatrace_remediation_action: "{{ item.remediationAction }}"
                dynatrace_remediation_name: "{{ item.deploymentName }}"
                dynatrace_remediation_type: "Deployment Event"
              with_items: "{{ dynatrace_events_deployments_response.json.events | default([]) }}"

        # Check for root cause remediation action from configuration events
        - name: "check for root cause remediation action from configuration events"
          when:
            - dynatrace_rootcause_entity is defined
            - dynatrace_remediation_action is not defined
          block:
            - name: "retrieve root cause entity events from dynatrace api"
              uri:
                url: "https://{{ dynatrace_api_url }}/api/v1/events?from={{ dynatrace_rootcause_startTime }}&eventType=CUSTOM_CONFIGURATION&entityId={{ dynatrace_rootcause_entity }}"
                headers:
                  Authorization: "Api-Token {{ dynatrace_token }}"
                status_code: 200
              register: dynatrace_events_configurations_response

            - name: "determine if there are any configuration events"
              when: dynatrace_events_configurations_response.json is defined
              set_fact:
                dynatrace_events_configurations_detected: "{{ dynatrace_events_configurations_response.json.totalEventCount | int }}"

            - name: "retrieve event remediation action"
              when:
                - dynatrace_events_configurations_detected | default(0) | int > 0
                # Events that carry no Remediation property are skipped.
                - item.customProperties.Remediation is defined
              set_fact:
                dynatrace_remediation_action: "{{ item.customProperties.Remediation }}"
                dynatrace_remediation_name: "{{ item.annotationDescription }}"
                dynatrace_remediation_type: "Configuration Change"
              with_items: "{{ dynatrace_events_configurations_response.json.events | default([]) }}"

        - name: "trigger auto-remediation action tasks"
          when: dynatrace_remediation_action is defined
          block:
            - name: "output the remediation action"
              debug:
                var: dynatrace_remediation_action

            # Update problem card with auto-remediation action.
            # The RESOLVED branch searches the problem comments for the text
            # 'problem remediation action for', so keep this wording in sync.
            - name: "update problem with comment after finding remediation action"
              when: dynatrace_remediation_name is defined
              uri:
                url: "https://{{ dynatrace_api_url }}/api/v1/problem/details/{{ pid }}/comments"
                method: POST
                headers:
                  Authorization: "Api-Token {{ dynatrace_token }}"
                body_format: json
                body:
                  comment: "Determined '{{ dynatrace_remediation_type }}' problem remediation action for '{{ dynatrace_remediation_name }}'."
                  user: "Job {{ tower_job_id }}"
                  context: "Ansible Tower"
                status_code: 200

            # Trigger Ansible auto-remediation action
            # NOTE(review): certificate validation is disabled for both Tower
            # calls below; enable it if Tower presents a trusted certificate.
            - name: "call ansible remediation action url"
              when: "'/launch/' in dynatrace_remediation_action"
              uri:
                url: "https://{{ ansible_tower_host }}{{ dynatrace_remediation_action }}"
                method: POST
                user: "{{ ansible_tower_user }}"
                password: "{{ ansible_tower_password }}"
                force_basic_auth: true
                validate_certs: false
                status_code: 201
              register: ansible_remediation_response

            # Runs only when the launch really returned a job URL; a skipped
            # launch still registers its variable, so `is defined` on the
            # variable itself would not guard anything.
            - name: "retrieve ansible job status"
              when: ansible_remediation_response.json.url is defined
              uri:
                url: "https://{{ ansible_tower_host }}{{ ansible_remediation_response.json.url }}"
                method: GET
                user: "{{ ansible_tower_user }}"
                password: "{{ ansible_tower_password }}"
                force_basic_auth: true
                validate_certs: false
                status_code: 200
              register: ansible_remediation_job_response
              # Keep polling while the job is queued as well as while it runs;
              # with retries/delay below that is at most five one-minute waits.
              until: "ansible_remediation_job_response.json.status not in ['new', 'pending', 'waiting', 'running']"
              retries: 5
              delay: 60

            - name: "validate ansible job status is successful"
              when:
                - ansible_remediation_job_response.json is defined
                - ansible_remediation_job_response.json.status == 'successful'
              set_fact:
                ansible_remediation_successful: true

            # After auto-remediation is triggered.
            # The RESOLVED branch searches the problem comments for the text
            # 'Successfully triggered auto-remediation.', so keep it in sync.
            - name: "update problem with comment after remediation"
              when:
                # The fact only exists after a successful job; default it so
                # an unsuccessful job skips this task instead of failing it.
                - ansible_remediation_successful | default(false) | bool
                - ansible_remediation_response.json.job is defined
              uri:
                url: "https://{{ dynatrace_api_url }}/api/v1/problem/details/{{ pid }}/comments"
                method: POST
                headers:
                  Authorization: "Api-Token {{ dynatrace_token }}"
                body_format: json
                body:
                  comment: "Successfully triggered auto-remediation. See Ansible Tower Job: '{{ ansible_remediation_response.json.job }}'"
                  user: "Job {{ tower_job_id }}"
                  context: "Ansible Tower"
                status_code: 200

        # When a remediation action is not found
        - name: "no remediation action found"
          when: dynatrace_remediation_action is not defined
          block:
            - name: "update problem with comment indicating remediation not found"
              uri:
                url: "https://{{ dynatrace_api_url }}/api/v1/problem/details/{{ pid }}/comments"
                method: POST
                headers:
                  Authorization: "Api-Token {{ dynatrace_token }}"
                body_format: json
                body:
                  comment: "Unable to trigger auto-remediation. A remediation action was not found for this problem!"
                  user: "Job {{ tower_job_id }}"
                  context: "Ansible Tower"
                status_code: 200

            - name: "metrics - failed remediations (ansible.remediations.failed)"
              when:
                - metrics | default(false) | bool
                - metrics_group is defined
              uri:
                url: http://localhost:14499/metrics/ingest
                method: POST
                headers:
                  content-type: "text/plain; charset=utf-8"
                body: 'ansible.{{ metrics_group }}.remediations.failed,id="{{ tower_job_id }}",template="{{ tower_job_template_name }}",type="{{ tower_job_launch_type }}" 1'
                status_code: 202
              ignore_errors: true

    # Problem State RESOLVED
    - name: "problem state is RESOLVED"
      when: state == "RESOLVED"
      block:
        - name: "retrieve problem comments from dynatrace api"
          uri:
            url: "https://{{ dynatrace_api_url }}/api/v1/problem/details/{{ pid }}/comments"
            headers:
              Authorization: "Api-Token {{ dynatrace_token }}"
            status_code: 200
          register: dynatrace_problem_comments_response

        # The marker text is the comment posted by the OPEN branch after a
        # successful remediation job.
        - name: "determine if problem was auto-remediated by ansible tower"
          when: "'Successfully triggered auto-remediation.' in item.content"
          set_fact:
            dynatrace_remediation_triggered: true
          with_items: "{{ dynatrace_problem_comments_response.json.comments }}"

        # dynatrace_remediation_triggered stays undefined when no marker
        # comment was found, hence the default(false) in the tasks below.
        - name: "update problem with comment after resolved notification"
          when: dynatrace_remediation_triggered | default(false) | bool
          uri:
            url: "https://{{ dynatrace_api_url }}/api/v1/problem/details/{{ pid }}/comments"
            method: POST
            headers:
              Authorization: "Api-Token {{ dynatrace_token }}"
            body_format: json
            body:
              comment: "Problem state changed to RESOLVED. Ansible Tower self-healing successful. *drops mic*"
              user: "Job {{ tower_job_id }}"
              context: "Ansible Tower"
            status_code: 200

        # Only problems this handler remediated count as successful
        # remediations.
        - name: "metrics - successful remediations (ansible.remediations.successful)"
          when:
            - dynatrace_remediation_triggered | default(false) | bool
            - metrics | default(false) | bool
            - metrics_group is defined
          uri:
            url: http://localhost:14499/metrics/ingest
            method: POST
            headers:
              content-type: "text/plain; charset=utf-8"
            body: 'ansible.{{ metrics_group }}.remediations.successful,id="{{ tower_job_id }}",template="{{ tower_job_template_name }}",type="{{ tower_job_launch_type }}" 1'
            status_code: 202
          ignore_errors: true

        - name: "retrieve remediation action details from problem comments"
          when:
            - dynatrace_remediation_triggered | default(false) | bool
            - "'problem remediation action for' in item.content"
          set_fact:
            dynatrace_remediation_comment: "{{ item.content }}"
          with_items: "{{ dynatrace_problem_comments_response.json.comments }}"

        - name: "send email notification with remediation success"
          when:
            - dynatrace_remediation_triggered | default(false) | bool
            # Email is opt-in; an unset send_email means "do not send".
            - send_email | default(false) | bool
          mail:
            subtype: html
            host: "{{ smtp_host }}"
            port: "{{ smtp_port }}"
            username: "{{ smtp_user }}"
            password: "{{ smtp_password }}"
            to: "{{ email_receiver }}"
            from: "{{ smtp_from }}"
            subject: "PROBLEM {{ problemId }} {{ problemTitle }} | Dynatrace and Ansible have automatically resolved a problem"
            body: "<div><img src=\"https://tpc-easytravel-demo.s3.amazonaws.com/dynatrace_logo.png\"><img src=\"https://tpc-easytravel-demo.s3.amazonaws.com/ansible_redhat_logo.png\"></div><div><h2>Dynatrace automatically detected a '<span style=\"color:red\">{{ problemSeverity }}</span>' severity problem which impacts '<span style=\"color:red\">{{ problemImpact }}</span>' type entities.</h2><hr><h3>The problem was <span style=\"color:green\">resolved automatically</span>. Dynatrace, with Ansible, '{{ dynatrace_remediation_comment }}'.</p>View the problem details in Dynatrace here '{{ problemUrl }}'.</h3></div><div><img src=\"https://tpc-easytravel-demo.s3.amazonaws.com/davis_photo.png\"></div>"