From b658e721b5fd8e1cef40ffd52ed61d2b74e83faa Mon Sep 17 00:00:00 2001
From: 5hubh49
Date: Thu, 27 Jun 2024 08:39:05 +0530
Subject: [PATCH 1/3] add storage class analyzer
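
The storageClass analyzer is part of troubleshoot.sh's built-in analyzer
set: with storageClassName unset, as used here, it passes only when some
StorageClass in the cluster is marked as the default. For comparison, a
minimal sketch of the same analyzer pinned to a specific class (the class
name "local" is an illustrative assumption, not part of this change):

    - storageClass:
        checkName: Check for the "local" storage class
        storageClassName: "local"
        outcomes:
          - fail:
              message: The "local" storage class was not found
          - pass:
              message: The "local" storage class is present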
---
 in-cluster/default.yaml     | 431 +----------------------------------
 in-cluster/default.yaml.bak | 454 ++++++++++++++++++++++++++++++++++++
 2 files changed, 459 insertions(+), 426 deletions(-)
 create mode 100644 in-cluster/default.yaml.bak

diff --git a/in-cluster/default.yaml b/in-cluster/default.yaml
index 9f0f998..5914f21 100644
--- a/in-cluster/default.yaml
+++ b/in-cluster/default.yaml
@@ -3,373 +3,8 @@ kind: SupportBundle
 metadata:
   name: default
 spec:
-  uri: https://raw.githubusercontent.com/replicatedhq/troubleshoot-specs/main/in-cluster/default.yaml
-  collectors:
-    - runPod:
-        name: ekco-resources
-        namespace: kurl
-        podSpec:
-          containers:
-            - name: inspect-ekco-pods
-              image: adamancini/netshoot
-              command: ["sh", "-c", "--"]
-              args:
-                [
-                  "kubectl get pod -n kurl --selector app=ekc-operator --field-selector status.phase=Running -o json | jq -r .items[]",
-                ]
-          restartPolicy: Never
-          dnsPolicy: ClusterFirst
-          serviceAccount: ekco
-    - copyFromHost:
-        collectorName: "copy apiserver audit logs"
-        image: alpine
-        hostPath: "/var/log/apiserver/"
-        name: "logs"
-        extractArchive: true
-    - copyFromHost:
-        collectorName: "copy kURL logs"
-        image: alpine
-        hostPath: "/var/log/kurl/"
-        name: "logs"
-        extractArchive: true
-    - clusterInfo: {}
-    - clusterResources: {}
-    - ceph: {}
-    - longhorn: {}
-    - exec:
-        args:
-          - "http://localhost:3030/goroutines"
-        collectorName: kotsadm-goroutines
-        command:
-          - curl
-        containerName: kotsadm
-        name: kots/admin_console
-        selector:
-          - app=kotsadm
-        timeout: 10s
-    - exec:
-        args:
-          - "http://localhost:3030/goroutines"
-        collectorName: kotsadm-operator-goroutines
-        command:
-          - curl
-        containerName: kotsadm-operator
-        name: kots/admin_console
-        selector:
-          - app=kotsadm-operator
-        timeout: 10s
-    - logs:
-        collectorName: velero-logs
-        namespace: velero
-        name: kots/velero
-    - logs:
-        collectorName: kurl-control-plane
-        name: kots/kurl/control-plane
-        selector:
-          - tier=control-plane
-    - logs:
-        collectorName: kotsadm-postgres-db
-        name: kots/admin_console
-        selector:
-          - app=kotsadm-postgres
-    - logs:
-        collectorName: kotsadm-api
-        name: kots/admin_console
-        selector:
-          - app=kotsadm-api
-    - logs:
-        collectorName: kotsadm-operator
-        name: kots/admin_console
-        selector:
-          - app=kotsadm-operator
-    - logs:
-        collectorName: kotsadm
-        name: kots/admin_console
-        selector:
-          - app=kotsadm
-    - logs:
-        collectorName: kurl-proxy-kotsadm
-        name: kots/admin_console
-        selector:
-          - app=kurl-proxy-kotsadm
-    - logs:
-        collectorName: kotsadm-dex
-        name: kots/admin_console
-        selector:
-          - app=kotsadm-dex
-    - logs:
-        collectorName: kotsadm-fs-minio
-        name: kots/admin_console
-        selector:
-          - app=kotsadm-fs-minio
-    - logs:
-        collectorName: kotsadm-s3-ops
-        name: kots/admin_console
-        selector:
-          - app=kotsadm-s3-ops
-    - logs:
-        collectorName: registry
-        name: kots/kurl
-        selector:
-          - app=registry
-        namespace: kurl
-    - logs:
-        collectorName: ekc-operator
-        name: kots/kurl
-        selector:
-          - app=ekc-operator
-        namespace: kurl
-    - secret:
-        collectorName: kotsadm-replicated-registry
-        name: kotsadm-replicated-registry # NOTE: this will not live under the kots/ directory like other collectors
-        includeValue: false
-        key: .dockerconfigjson
-    - logs:
-        collectorName: rook-ceph-logs
-        namespace: rook-ceph
-        name: kots/rook
-    - logs:
-        collectorName: coredns-logs
-        namespace: kube-system
-        name: logs/coredns
-        selector:
-          - k8s-app=kube-dns
-    - exec:
-        collectorName: weave-status
-        command:
-          - /home/weave/weave
-        args:
-          - --local
-          - status
-        containerName: weave
-        exclude: ""
-        name: kots/kurl/weave
-        namespace: kube-system
-        selector:
-          - name=weave-net
-        timeout: 10s
-    - exec:
-        collectorName: weave-report
-        command:
-          - /home/weave/weave
-        args:
-          - --local
-          - report
-        containerName: weave
-        exclude: ""
-        name: kots/kurl/weave
-        namespace: kube-system
-        selector:
-          - name=weave-net
-        timeout: 10s
-    - logs:
-        collectorName: rqlite-logs
-        name: kots/rqlite/logs
-        selector:
-          - app=kotsadm-rqlite
-    - logs:
-        collectorName: weave-net
-        selector:
-          - name=weave-net
-        namespace: kube-system
-        name: kots/kurl/weave
-    - logs:
-        collectorName: minio
-        selector:
-          - app=minio
-        namespace: minio
-        name: kots/kurl/minio
-    - logs:
-        collectorName: ha-minio
-        selector:
-          - app=ha-minio
-        namespace: minio
-        name: kots/kurl/ha-minio
-    - exec:
-        args:
-          - "http://goldpinger.kurl.svc.cluster.local:80/check_all"
-        collectorName: goldpinger-statistics
-        command:
-          - curl
-        containerName: kotsadm
-        name: kots/goldpinger
-        selector:
-          - app=kotsadm
-        timeout: 10s
-    - copyFromHost:
-        collectorName: kurl-host-preflights
-        name: kots/kurl/host-preflights
-        hostPath: /var/lib/kurl/host-preflights
-        extractArchive: true
-        image: alpine
-        imagePullPolicy: IfNotPresent
-        timeout: 1m
-    - configMap:
-        collectorName: coredns
-        name: coredns
-        namespace: kube-system
-        includeAllData: true
-    - configMap:
-        collectorName: kube-proxy
-        name: kube-proxy
-        namespace: kube-system
-        includeAllData: true
-    - configMap:
-        collectorName: kubeadm-config
-        name: kubeadm-config
-        namespace: kube-system
-        includeAllData: true
-    - configMap:
-        collectorName: kubelet-config
-        name: kubelet-config
-        namespace: kube-system
-        includeAllData: true
-    - configMap:
-        collectorName: kurl-config
-        name: kurl-config
-        namespace: kube-system
-        includeAllData: true
-    - configMap:
-        collectorName: weave-net
-        name: weave-net
-        namespace: kube-system
-        includeAllData: true
-    - configMap:
-        collectorName: ekco-config
-        name: ekco-config
-        namespace: kurl
-        includeAllData: true
-    - configMap:
-        collectorName: kurl-current-config
-        name: kurl-current-config # NOTE: this will not live under the kots/ directory like other collectors
-        namespace: kurl
-        includeAllData: true
-    - configMap:
-        collectorName: kurl-last-config
-        name: kurl-last-config # NOTE: this will not live under the kots/ directory like other collectors
-        namespace: kurl
-        includeAllData: true
-    - logs:
-        collectorName: projectcontour-logs
-        namespace: projectcontour
-        name: projectcontour/logs
-    - runPod:
-        name: rqlite-status
-        namespace: default
-        podSpec:
-          containers:
-            - name: rqlite-status
-              image: busybox:1
-              command: ["wget"]
-              args: ["-q", "-T", "5", "http://kotsadm-rqlite:4001/status?pretty", "-O-"]
-    - runPod:
-        name: rqlite-nodes
-        namespace: default
-        podSpec:
-          containers:
-            - name: rqlite-nodes
-              image: busybox:1
-              command: ["wget"]
-              args: ["-q", "-T", "5", "http://kotsadm-rqlite:4001/nodes?pretty&ver=2", "-O-"]
-    - http:
-        collectorName: replicated.app-health-check
-        get:
-          url: https://replicated.app/healthz
+  # uri: https://raw.githubusercontent.com/replicatedhq/troubleshoot-specs/main/in-cluster/default.yaml
   analyzers:
-    - deploymentStatus:
-        checkName: Check EKCO is operational
-        name: ekc-operator
-        namespace: kurl
-        outcomes:
-          - fail:
-              when: absent
-              message: EKCO is not installed - please add the EKCO component to your kURL spec and re-run the installer script
-          - fail:
-              when: "< 1"
-              message: EKCO does not have any Ready pods
-          - pass:
-              message: EKCO is installed and running
-    - textAnalyze:
-        checkName: Check installed EKCO version for critical fixes
-        fileName: cluster-resources/deployments/kurl.json
-        regexGroups: '"image": "replicated/ekco:v(?P<Major>\d+)\.(?P<Minor>\d+)\.(?P<Patch>\d+)"'
-        outcomes:
-          - warn:
-              when: "Minor < 4"
-              message: A critical update for cluster certificate rotation has been released in EKCO 0.4.0. Please upgrade to the latest available version.
-          - warn:
-              when: "Minor < 19"
-              message: A critical fix for registry certificate rotation has been released in EKCO 0.19.3. Please upgrade to the latest available version.
-          - pass:
-              when: "Minor > 20"
-              message: EKCO version is recent
-    - containerRuntime:
-        outcomes:
-          - fail:
-              when: "== gvisor"
-              message: The Admin Console does not support using the gvisor runtime
-          - pass:
-              message: A supported container runtime is present on all nodes
-    - cephStatus: {}
-    - longhorn: {}
-    - clusterPodStatuses:
-        outcomes:
-          - fail:
-              when: "!= Healthy"
-              message: "Status: {{ .Status.Reason }}"
-    - statefulsetStatus: {}
-    - deploymentStatus: {}
-    - jobStatus: {}
-    - replicasetStatus: {}
-    - weaveReport:
-        reportFileGlob: kots/kurl/weave/kube-system/*/weave-report-stdout.txt
-    - textAnalyze:
-        checkName: Weave Status
-        exclude: ""
-        ignoreIfNoFiles: true
-        fileName: kots/kurl/weave/kube-system/weave-net-*/weave-status-stdout.txt
-        outcomes:
-          - fail:
-              message: Weave is not ready
-          - pass:
-              message: Weave is ready
-        regex: 'Status: ready'
-    - textAnalyze:
-        checkName: Weave Report
-        exclude: ""
-        ignoreIfNoFiles: true
-        fileName: kots/kurl/weave/kube-system/weave-net-*/weave-report-stdout.txt
-        outcomes:
-          - fail:
-              message: Weave is not ready
-          - pass:
-              message: Weave is ready
-        regex: '"Ready": true'
-    - textAnalyze:
-        checkName: Weave IP Allocation
-        exclude: ""
-        ignoreIfNoFiles: true
-        regex: 'IP allocation was seeded by different peers'
-        fileName: kots/kurl/weave/weave-net-*/weave.log
-        outcomes:
-          - fail:
-              when: "true"
-              message: IP Allocation issues detected. Please run `rm /var/lib/weave/weave-netdata.db && reboot` on each node to resolve this.
-          - pass:
-              when: "false"
-              message: Weave is ready, there are no IP allocation issues.
-    - textAnalyze:
-        checkName: Inter-pod Networking
-        exclude: ""
-        ignoreIfNoFiles: true
-        fileName: kots/goldpinger/*/kotsadm-*/goldpinger-statistics-stdout.txt
-        outcomes:
-          - fail:
-              when: "OK = false"
-              message: Some nodes have pod communication issues
-          - pass:
-              message: Goldpinger can communicate properly
-        regexGroups: '"OK": ?(?P<OK>\w+)'
     - nodeResources:
         checkName: Node status check
         outcomes:
@@ -389,66 +24,10 @@ spec:
           - fail:
               when: "!= Healthy" # Catch all unhealthy pods. A pod is considered healthy if it has a status of Completed, or Running and all of its containers are ready.
               message: A Contour pod, {{ .Name }}, is unhealthy with a status of {{ .Status.Reason }}. Restarting the pod may fix the issue.
-    - textAnalyze:
-        checkName: longhorn multipath conflict
-        ignoreIfNoFiles: true
-        fileName: longhorn/longhorn-system/logs/longhorn-csi-plugin-*/longhorn-csi-plugin.log
-        outcomes:
-          - fail:
-              when: "true"
-              uri: "https://longhorn.io/kb/troubleshooting-volume-with-multipath/"
-              message: "Longhorn volumes may be in use by system multipath."
-          - pass:
-              when: "false"
-              message: "No block-device conflicts detected"
-        regex: '.*is apparently in use by the system;.*'
-    - textAnalyze:
-        checkName: Minio disk full
-        fileName: cluster-resources/pods/logs/kurl/registry-*/registry.log
-        ignoreIfNoFiles: true
-        regex: '.*XMinioStorageFull: Storage backend has reached its minimum free disk threshold.*'
-        outcomes:
-          - fail:
-              when: "true"
-              message: "Minio Disk Full"
-          - pass:
-              when: "false"
-              message: "Minio Disk Ok"
-    - textAnalyze:
-        checkName: Known issue with Rook < 1.4
-        exclude: ""
-        ignoreIfNoFiles: true
-        fileName: /ceph/status.json
-        regex: '\"ceph_release\": \"nautilus\"|\"status\": \"HEALTH_WARN\"'
-        outcomes:
-          - fail:
-              when: "true"
-              message: "If you have been removing and adding nodes, then you might want to ensure that you are not facing the scenario described in the community topic: https://community.replicated.com/t/1099"
-          - pass:
-              when: "false"
-              message: "You are not using a Rook version < 1.4 and/or your Ceph status is OK"
-    - textAnalyze:
-        checkName: Rook rbd filesystem consistency
-        fileName: /kots/rook/rook-ceph-agent-*.log
-        ignoreIfNoFiles: true
-        regex: 'UNEXPECTED INCONSISTENCY; RUN fsck MANUALLY.'
-        outcomes:
-          - fail:
-              when: "true"
-              message: "One or more rook rbd(s) were detected to have filesystem inconsistencies and require manual intervention"
-          - pass:
-              when: "false"
-              message: "Rook filesystem consistency ok"
-    - jsonCompare:
-        checkName: https://replicated.app host health check
-        fileName: replicated.app-health-check.json
-        path: "response.status"
-        value: "200"
+    - storageClass:
+        checkName: Check for default storage class
         outcomes:
           - fail:
-              when: "false"
-              message: https://replicated.app is unhealthy. License and software update checks from replicated will fail. If this is a locked-down environment, please check your proxy settings.
-              uri: https://kurl.sh/docs/install-with-kurl/proxy-installs
+              message: No default storage class found
           - pass:
-              when: "true"
-              message: https://replicated.app host is healthy
+              message: Default storage class found
diff --git a/in-cluster/default.yaml.bak b/in-cluster/default.yaml.bak
new file mode 100644
index 0000000..9f0f998
--- /dev/null
+++ b/in-cluster/default.yaml.bak
@@ -0,0 +1,454 @@
+apiVersion: troubleshoot.sh/v1beta2
+kind: SupportBundle
+metadata:
+  name: default
+spec:
+  uri: https://raw.githubusercontent.com/replicatedhq/troubleshoot-specs/main/in-cluster/default.yaml
+  collectors:
+    - runPod:
+        name: ekco-resources
+        namespace: kurl
+        podSpec:
+          containers:
+            - name: inspect-ekco-pods
+              image: adamancini/netshoot
+              command: ["sh", "-c", "--"]
+              args:
+                [
+                  "kubectl get pod -n kurl --selector app=ekc-operator --field-selector status.phase=Running -o json | jq -r .items[]",
+                ]
+          restartPolicy: Never
+          dnsPolicy: ClusterFirst
+          serviceAccount: ekco
+    - copyFromHost:
+        collectorName: "copy apiserver audit logs"
+        image: alpine
+        hostPath: "/var/log/apiserver/"
+        name: "logs"
+        extractArchive: true
+    - copyFromHost:
+        collectorName: "copy kURL logs"
+        image: alpine
+        hostPath: "/var/log/kurl/"
+        name: "logs"
+        extractArchive: true
+    - clusterInfo: {}
+    - clusterResources: {}
+    - ceph: {}
+    - longhorn: {}
+    - exec:
+        args:
+          - "http://localhost:3030/goroutines"
+        collectorName: kotsadm-goroutines
+        command:
+          - curl
+        containerName: kotsadm
+        name: kots/admin_console
+        selector:
+          - app=kotsadm
+        timeout: 10s
+    - exec:
+        args:
+          - "http://localhost:3030/goroutines"
+        collectorName: kotsadm-operator-goroutines
+        command:
+          - curl
+        containerName: kotsadm-operator
+        name: kots/admin_console
+        selector:
+          - app=kotsadm-operator
+        timeout: 10s
+    - logs:
+        collectorName: velero-logs
+        namespace: velero
+        name: kots/velero
+    - logs:
+        collectorName: kurl-control-plane
+        name: kots/kurl/control-plane
+        selector:
+          - tier=control-plane
+    - logs:
+        collectorName: kotsadm-postgres-db
+        name: kots/admin_console
+        selector:
+          - app=kotsadm-postgres
+    - logs:
+        collectorName: kotsadm-api
+        name: kots/admin_console
+        selector:
+          - app=kotsadm-api
+    - logs:
+        collectorName: kotsadm-operator
+        name: kots/admin_console
+        selector:
+          - app=kotsadm-operator
+    - logs:
+        collectorName: kotsadm
+        name: kots/admin_console
+        selector:
+          - app=kotsadm
+    - logs:
+        collectorName: kurl-proxy-kotsadm
+        name: kots/admin_console
+        selector:
+          - app=kurl-proxy-kotsadm
+    - logs:
+        collectorName: kotsadm-dex
+        name: kots/admin_console
+        selector:
+          - app=kotsadm-dex
+    - logs:
+        collectorName: kotsadm-fs-minio
+        name: kots/admin_console
+        selector:
+          - app=kotsadm-fs-minio
+    - logs:
+        collectorName: kotsadm-s3-ops
+        name: kots/admin_console
+        selector:
+          - app=kotsadm-s3-ops
+    - logs:
+        collectorName: registry
+        name: kots/kurl
+        selector:
+          - app=registry
+        namespace: kurl
+    - logs:
+        collectorName: ekc-operator
+        name: kots/kurl
+        selector:
+          - app=ekc-operator
+        namespace: kurl
+    - secret:
+        collectorName: kotsadm-replicated-registry
+        name: kotsadm-replicated-registry # NOTE: this will not live under the kots/ directory like other collectors
+        includeValue: false
+        key: .dockerconfigjson
+    - logs:
+        collectorName: rook-ceph-logs
+        namespace: rook-ceph
+        name: kots/rook
+    - logs:
+        collectorName: coredns-logs
+        namespace: kube-system
+        name: logs/coredns
+        selector:
+          - k8s-app=kube-dns
+    - exec:
+        collectorName: weave-status
+        command:
+          - /home/weave/weave
+        args:
+          - --local
+          - status
+        containerName: weave
+        exclude: ""
+        name: kots/kurl/weave
+        namespace: kube-system
+        selector:
+          - name=weave-net
+        timeout: 10s
+    - exec:
+        collectorName: weave-report
+        command:
+          - /home/weave/weave
+        args:
+          - --local
+          - report
+        containerName: weave
+        exclude: ""
+        name: kots/kurl/weave
+        namespace: kube-system
+        selector:
+          - name=weave-net
+        timeout: 10s
+    - logs:
+        collectorName: rqlite-logs
+        name: kots/rqlite/logs
+        selector:
+          - app=kotsadm-rqlite
+    - logs:
+        collectorName: weave-net
+        selector:
+          - name=weave-net
+        namespace: kube-system
+        name: kots/kurl/weave
+    - logs:
+        collectorName: minio
+        selector:
+          - app=minio
+        namespace: minio
+        name: kots/kurl/minio
+    - logs:
+        collectorName: ha-minio
+        selector:
+          - app=ha-minio
+        namespace: minio
+        name: kots/kurl/ha-minio
+    - exec:
+        args:
+          - "http://goldpinger.kurl.svc.cluster.local:80/check_all"
+        collectorName: goldpinger-statistics
+        command:
+          - curl
+        containerName: kotsadm
+        name: kots/goldpinger
+        selector:
+          - app=kotsadm
+        timeout: 10s
+    - copyFromHost:
+        collectorName: kurl-host-preflights
+        name: kots/kurl/host-preflights
+        hostPath: /var/lib/kurl/host-preflights
+        extractArchive: true
+        image: alpine
+        imagePullPolicy: IfNotPresent
+        timeout: 1m
+    - configMap:
+        collectorName: coredns
+        name: coredns
+        namespace: kube-system
+        includeAllData: true
+    - configMap:
+        collectorName: kube-proxy
+        name: kube-proxy
+        namespace: kube-system
+        includeAllData: true
+    - configMap:
+        collectorName: kubeadm-config
+        name: kubeadm-config
+        namespace: kube-system
+        includeAllData: true
+    - configMap:
+        collectorName: kubelet-config
+        name: kubelet-config
+        namespace: kube-system
+        includeAllData: true
+    - configMap:
+        collectorName: kurl-config
+        name: kurl-config
+        namespace: kube-system
+        includeAllData: true
+    - configMap:
+        collectorName: weave-net
+        name: weave-net
+        namespace: kube-system
+        includeAllData: true
+    - configMap:
+        collectorName: ekco-config
+        name: ekco-config
+        namespace: kurl
+        includeAllData: true
+    - configMap:
+        collectorName: kurl-current-config
+        name: kurl-current-config # NOTE: this will not live under the kots/ directory like other collectors
+        namespace: kurl
+        includeAllData: true
+    - configMap:
+        collectorName: kurl-last-config
+        name: kurl-last-config # NOTE: this will not live under the kots/ directory like other collectors
+        namespace: kurl
+        includeAllData: true
+    - logs:
+        collectorName: projectcontour-logs
+        namespace: projectcontour
+        name: projectcontour/logs
+    - runPod:
+        name: rqlite-status
+        namespace: default
+        podSpec:
+          containers:
+            - name: rqlite-status
+              image: busybox:1
+              command: ["wget"]
+              args: ["-q", "-T", "5", "http://kotsadm-rqlite:4001/status?pretty", "-O-"]
+    - runPod:
+        name: rqlite-nodes
+        namespace: default
+        podSpec:
+          containers:
+            - name: rqlite-nodes
+              image: busybox:1
+              command: ["wget"]
+              args: ["-q", "-T", "5", "http://kotsadm-rqlite:4001/nodes?pretty&ver=2", "-O-"]
+    - http:
+        collectorName: replicated.app-health-check
+        get:
+          url: https://replicated.app/healthz
+  analyzers:
+    - deploymentStatus:
+        checkName: Check EKCO is operational
+        name: ekc-operator
+        namespace: kurl
+        outcomes:
+          - fail:
+              when: absent
+              message: EKCO is not installed - please add the EKCO component to your kURL spec and re-run the installer script
+          - fail:
+              when: "< 1"
+              message: EKCO does not have any Ready pods
+          - pass:
+              message: EKCO is installed and running
+    - textAnalyze:
+        checkName: Check installed EKCO version for critical fixes
+        fileName: cluster-resources/deployments/kurl.json
+        regexGroups: '"image": "replicated/ekco:v(?P<Major>\d+)\.(?P<Minor>\d+)\.(?P<Patch>\d+)"'
+        outcomes:
+          - warn:
+              when: "Minor < 4"
+              message: A critical update for cluster certificate rotation has been released in EKCO 0.4.0. Please upgrade to the latest available version.
+          - warn:
+              when: "Minor < 19"
+              message: A critical fix for registry certificate rotation has been released in EKCO 0.19.3. Please upgrade to the latest available version.
+          - pass:
+              when: "Minor > 20"
+              message: EKCO version is recent
+    - containerRuntime:
+        outcomes:
+          - fail:
+              when: "== gvisor"
+              message: The Admin Console does not support using the gvisor runtime
+          - pass:
+              message: A supported container runtime is present on all nodes
+    - cephStatus: {}
+    - longhorn: {}
+    - clusterPodStatuses:
+        outcomes:
+          - fail:
+              when: "!= Healthy"
+              message: "Status: {{ .Status.Reason }}"
+    - statefulsetStatus: {}
+    - deploymentStatus: {}
+    - jobStatus: {}
+    - replicasetStatus: {}
+    - weaveReport:
+        reportFileGlob: kots/kurl/weave/kube-system/*/weave-report-stdout.txt
+    - textAnalyze:
+        checkName: Weave Status
+        exclude: ""
+        ignoreIfNoFiles: true
+        fileName: kots/kurl/weave/kube-system/weave-net-*/weave-status-stdout.txt
+        outcomes:
+          - fail:
+              message: Weave is not ready
+          - pass:
+              message: Weave is ready
+        regex: 'Status: ready'
+    - textAnalyze:
+        checkName: Weave Report
+        exclude: ""
+        ignoreIfNoFiles: true
+        fileName: kots/kurl/weave/kube-system/weave-net-*/weave-report-stdout.txt
+        outcomes:
+          - fail:
+              message: Weave is not ready
+          - pass:
+              message: Weave is ready
+        regex: '"Ready": true'
+    - textAnalyze:
+        checkName: Weave IP Allocation
+        exclude: ""
+        ignoreIfNoFiles: true
+        regex: 'IP allocation was seeded by different peers'
+        fileName: kots/kurl/weave/weave-net-*/weave.log
+        outcomes:
+          - fail:
+              when: "true"
+              message: IP Allocation issues detected. Please run `rm /var/lib/weave/weave-netdata.db && reboot` on each node to resolve this.
+          - pass:
+              when: "false"
+              message: Weave is ready, there are no IP allocation issues.
+    - textAnalyze:
+        checkName: Inter-pod Networking
+        exclude: ""
+        ignoreIfNoFiles: true
+        fileName: kots/goldpinger/*/kotsadm-*/goldpinger-statistics-stdout.txt
+        outcomes:
+          - fail:
+              when: "OK = false"
+              message: Some nodes have pod communication issues
+          - pass:
+              message: Goldpinger can communicate properly
+        regexGroups: '"OK": ?(?P<OK>\w+)'
+    - nodeResources:
+        checkName: Node status check
+        outcomes:
+          - fail:
+              when: "nodeCondition(Ready) == False"
+              message: "Not all nodes are online."
+          - fail:
+              when: "nodeCondition(Ready) == Unknown"
+              message: "Not all nodes are online."
+          - pass:
+              message: "All nodes are online."
+    - clusterPodStatuses:
+        checkName: contour pods unhealthy
+        namespaces:
+          - projectcontour
+        outcomes:
+          - fail:
+              when: "!= Healthy" # Catch all unhealthy pods. A pod is considered healthy if it has a status of Completed, or Running and all of its containers are ready.
+              message: A Contour pod, {{ .Name }}, is unhealthy with a status of {{ .Status.Reason }}. Restarting the pod may fix the issue.
+    - textAnalyze:
+        checkName: longhorn multipath conflict
+        ignoreIfNoFiles: true
+        fileName: longhorn/longhorn-system/logs/longhorn-csi-plugin-*/longhorn-csi-plugin.log
+        outcomes:
+          - fail:
+              when: "true"
+              uri: "https://longhorn.io/kb/troubleshooting-volume-with-multipath/"
+              message: "Longhorn volumes may be in use by system multipath."
+          - pass:
+              when: "false"
+              message: "No block-device conflicts detected"
+        regex: '.*is apparently in use by the system;.*'
+    - textAnalyze:
+        checkName: Minio disk full
+        fileName: cluster-resources/pods/logs/kurl/registry-*/registry.log
+        ignoreIfNoFiles: true
+        regex: '.*XMinioStorageFull: Storage backend has reached its minimum free disk threshold.*'
+        outcomes:
+          - fail:
+              when: "true"
+              message: "Minio Disk Full"
+          - pass:
+              when: "false"
+              message: "Minio Disk Ok"
+    - textAnalyze:
+        checkName: Known issue with Rook < 1.4
+        exclude: ""
+        ignoreIfNoFiles: true
+        fileName: /ceph/status.json
+        regex: '\"ceph_release\": \"nautilus\"|\"status\": \"HEALTH_WARN\"'
+        outcomes:
+          - fail:
+              when: "true"
+              message: "If you have been removing and adding nodes, then you might want to ensure that you are not facing the scenario described in the community topic: https://community.replicated.com/t/1099"
+          - pass:
+              when: "false"
+              message: "You are not using a Rook version < 1.4 and/or your Ceph status is OK"
+    - textAnalyze:
+        checkName: Rook rbd filesystem consistency
+        fileName: /kots/rook/rook-ceph-agent-*.log
+        ignoreIfNoFiles: true
+        regex: 'UNEXPECTED INCONSISTENCY; RUN fsck MANUALLY.'
+        outcomes:
+          - fail:
+              when: "true"
+              message: "One or more rook rbd(s) were detected to have filesystem inconsistencies and require manual intervention"
+          - pass:
+              when: "false"
+              message: "Rook filesystem consistency ok"
+    - jsonCompare:
+        checkName: https://replicated.app host health check
+        fileName: replicated.app-health-check.json
+        path: "response.status"
+        value: "200"
+        outcomes:
+          - fail:
+              when: "false"
+              message: https://replicated.app is unhealthy. License and software update checks from replicated will fail. If this is a locked-down environment, please check your proxy settings.
+              uri: https://kurl.sh/docs/install-with-kurl/proxy-installs
+          - pass:
+              when: "true"
+              message: https://replicated.app host is healthy

From 4bed36e7431d590e60f71e5a3fa164c8431b9b90 Mon Sep 17 00:00:00 2001
From: 5hubh49
Date: Fri, 28 Jun 2024 14:01:32 +0530
Subject: [PATCH 2/3] added default storage analyzer
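
A cluster marks a StorageClass as its default with the standard
storageclass.kubernetes.io/is-default-class annotation, which is what the
analyzer's pass outcome keys on. A minimal sketch of an object the check
would accept (the name and provisioner are illustrative assumptions, not
part of this change):

    apiVersion: storage.k8s.io/v1
    kind: StorageClass
    metadata:
      name: local
      annotations:
        storageclass.kubernetes.io/is-default-class: "true"
    provisioner: rancher.io/local-path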
---
 in-cluster/default.yaml.bak | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/in-cluster/default.yaml.bak b/in-cluster/default.yaml.bak
index 9f0f998..a98cc8c 100644
--- a/in-cluster/default.yaml.bak
+++ b/in-cluster/default.yaml.bak
@@ -3,7 +3,7 @@ kind: SupportBundle
 metadata:
   name: default
 spec:
-  uri: https://raw.githubusercontent.com/replicatedhq/troubleshoot-specs/main/in-cluster/default.yaml
+  # uri: https://raw.githubusercontent.com/replicatedhq/troubleshoot-specs/main/in-cluster/default.yaml
   collectors:
     - runPod:
         name: ekco-resources
         namespace: kurl
@@ -452,3 +452,10 @@
           - pass:
               when: "true"
               message: https://replicated.app host is healthy
+    - storageClass:
+        checkName: Check for default storage class
+        outcomes:
+          - fail:
+              message: No default storage class found
+          - pass:
+              message: Default storage class found

From fc597497dde66d42d0e087395abb63ac9b182230 Mon Sep 17 00:00:00 2001
From: 5hubh49
Date: Fri, 28 Jun 2024 14:04:03 +0530
Subject: [PATCH 3/3] uri uncommented
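
Restoring spec.uri matters beyond tidiness: for SupportBundle specs,
spec.uri points at the canonical copy of the spec, and support-bundle can
fetch and prefer that copy when it is reachable, so the new analyzer needs
to land in the published default.yaml for clusters to pick it up. As a
sketch, the relevant lines at the top of the spec after this patch:

    spec:
      uri: https://raw.githubusercontent.com/replicatedhq/troubleshoot-specs/main/in-cluster/default.yaml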
---
 in-cluster/default.yaml     | 430 ++++++++++++++++++++++++++++++++++-
 in-cluster/default.yaml.bak | 461 ------------------------------------
 2 files changed, 429 insertions(+), 462 deletions(-)
 delete mode 100644 in-cluster/default.yaml.bak

diff --git a/in-cluster/default.yaml b/in-cluster/default.yaml
index 5914f21..19af5bb 100644
--- a/in-cluster/default.yaml
+++ b/in-cluster/default.yaml
@@ -3,8 +3,373 @@ kind: SupportBundle
 metadata:
   name: default
 spec:
-  # uri: https://raw.githubusercontent.com/replicatedhq/troubleshoot-specs/main/in-cluster/default.yaml
+  uri: https://raw.githubusercontent.com/replicatedhq/troubleshoot-specs/main/in-cluster/default.yaml
+  collectors:
+    - runPod:
+        name: ekco-resources
+        namespace: kurl
+        podSpec:
+          containers:
+            - name: inspect-ekco-pods
+              image: adamancini/netshoot
+              command: ["sh", "-c", "--"]
+              args:
+                [
+                  "kubectl get pod -n kurl --selector app=ekc-operator --field-selector status.phase=Running -o json | jq -r .items[]",
+                ]
+          restartPolicy: Never
+          dnsPolicy: ClusterFirst
+          serviceAccount: ekco
+    - copyFromHost:
+        collectorName: "copy apiserver audit logs"
+        image: alpine
+        hostPath: "/var/log/apiserver/"
+        name: "logs"
+        extractArchive: true
+    - copyFromHost:
+        collectorName: "copy kURL logs"
+        image: alpine
+        hostPath: "/var/log/kurl/"
+        name: "logs"
+        extractArchive: true
+    - clusterInfo: {}
+    - clusterResources: {}
+    - ceph: {}
+    - longhorn: {}
+    - exec:
+        args:
+          - "http://localhost:3030/goroutines"
+        collectorName: kotsadm-goroutines
+        command:
+          - curl
+        containerName: kotsadm
+        name: kots/admin_console
+        selector:
+          - app=kotsadm
+        timeout: 10s
+    - exec:
+        args:
+          - "http://localhost:3030/goroutines"
+        collectorName: kotsadm-operator-goroutines
+        command:
+          - curl
+        containerName: kotsadm-operator
+        name: kots/admin_console
+        selector:
+          - app=kotsadm-operator
+        timeout: 10s
+    - logs:
+        collectorName: velero-logs
+        namespace: velero
+        name: kots/velero
+    - logs:
+        collectorName: kurl-control-plane
+        name: kots/kurl/control-plane
+        selector:
+          - tier=control-plane
+    - logs:
+        collectorName: kotsadm-postgres-db
+        name: kots/admin_console
+        selector:
+          - app=kotsadm-postgres
+    - logs:
+        collectorName: kotsadm-api
+        name: kots/admin_console
+        selector:
+          - app=kotsadm-api
+    - logs:
+        collectorName: kotsadm-operator
+        name: kots/admin_console
+        selector:
+          - app=kotsadm-operator
+    - logs:
+        collectorName: kotsadm
+        name: kots/admin_console
+        selector:
+          - app=kotsadm
+    - logs:
+        collectorName: kurl-proxy-kotsadm
+        name: kots/admin_console
+        selector:
+          - app=kurl-proxy-kotsadm
+    - logs:
+        collectorName: kotsadm-dex
+        name: kots/admin_console
+        selector:
+          - app=kotsadm-dex
+    - logs:
+        collectorName: kotsadm-fs-minio
+        name: kots/admin_console
+        selector:
+          - app=kotsadm-fs-minio
+    - logs:
+        collectorName: kotsadm-s3-ops
+        name: kots/admin_console
+        selector:
+          - app=kotsadm-s3-ops
+    - logs:
+        collectorName: registry
+        name: kots/kurl
+        selector:
+          - app=registry
+        namespace: kurl
+    - logs:
+        collectorName: ekc-operator
+        name: kots/kurl
+        selector:
+          - app=ekc-operator
+        namespace: kurl
+    - secret:
+        collectorName: kotsadm-replicated-registry
+        name: kotsadm-replicated-registry # NOTE: this will not live under the kots/ directory like other collectors
+        includeValue: false
+        key: .dockerconfigjson
+    - logs:
+        collectorName: rook-ceph-logs
+        namespace: rook-ceph
+        name: kots/rook
+    - logs:
+        collectorName: coredns-logs
+        namespace: kube-system
+        name: logs/coredns
+        selector:
+          - k8s-app=kube-dns
+    - exec:
+        collectorName: weave-status
+        command:
+          - /home/weave/weave
+        args:
+          - --local
+          - status
+        containerName: weave
+        exclude: ""
+        name: kots/kurl/weave
+        namespace: kube-system
+        selector:
+          - name=weave-net
+        timeout: 10s
+    - exec:
+        collectorName: weave-report
+        command:
+          - /home/weave/weave
+        args:
+          - --local
+          - report
+        containerName: weave
+        exclude: ""
+        name: kots/kurl/weave
+        namespace: kube-system
+        selector:
+          - name=weave-net
+        timeout: 10s
+    - logs:
+        collectorName: rqlite-logs
+        name: kots/rqlite/logs
+        selector:
+          - app=kotsadm-rqlite
+    - logs:
+        collectorName: weave-net
+        selector:
+          - name=weave-net
+        namespace: kube-system
+        name: kots/kurl/weave
+    - logs:
+        collectorName: minio
+        selector:
+          - app=minio
+        namespace: minio
+        name: kots/kurl/minio
+    - logs:
+        collectorName: ha-minio
+        selector:
+          - app=ha-minio
+        namespace: minio
+        name: kots/kurl/ha-minio
+    - exec:
+        args:
+          - "http://goldpinger.kurl.svc.cluster.local:80/check_all"
+        collectorName: goldpinger-statistics
+        command:
+          - curl
+        containerName: kotsadm
+        name: kots/goldpinger
+        selector:
+          - app=kotsadm
+        timeout: 10s
+    - copyFromHost:
+        collectorName: kurl-host-preflights
+        name: kots/kurl/host-preflights
+        hostPath: /var/lib/kurl/host-preflights
+        extractArchive: true
+        image: alpine
+        imagePullPolicy: IfNotPresent
+        timeout: 1m
+    - configMap:
+        collectorName: coredns
+        name: coredns
+        namespace: kube-system
+        includeAllData: true
+    - configMap:
+        collectorName: kube-proxy
+        name: kube-proxy
+        namespace: kube-system
+        includeAllData: true
+    - configMap:
+        collectorName: kubeadm-config
+        name: kubeadm-config
+        namespace: kube-system
+        includeAllData: true
+    - configMap:
+        collectorName: kubelet-config
+        name: kubelet-config
+        namespace: kube-system
+        includeAllData: true
+    - configMap:
+        collectorName: kurl-config
+        name: kurl-config
+        namespace: kube-system
+        includeAllData: true
+    - configMap:
+        collectorName: weave-net
+        name: weave-net
+        namespace: kube-system
+        includeAllData: true
+    - configMap:
+        collectorName: ekco-config
+        name: ekco-config
+        namespace: kurl
+        includeAllData: true
+    - configMap:
+        collectorName: kurl-current-config
+        name: kurl-current-config # NOTE: this will not live under the kots/ directory like other collectors
+        namespace: kurl
+        includeAllData: true
+    - configMap:
+        collectorName: kurl-last-config
+        name: kurl-last-config # NOTE: this will not live under the kots/ directory like other collectors
+        namespace: kurl
+        includeAllData: true
+    - logs:
+        collectorName: projectcontour-logs
+        namespace: projectcontour
+        name: projectcontour/logs
+    - runPod:
+        name: rqlite-status
+        namespace: default
+        podSpec:
+          containers:
+            - name: rqlite-status
+              image: busybox:1
+              command: ["wget"]
+              args: ["-q", "-T", "5", "http://kotsadm-rqlite:4001/status?pretty", "-O-"]
+    - runPod:
+        name: rqlite-nodes
+        namespace: default
+        podSpec:
+          containers:
+            - name: rqlite-nodes
+              image: busybox:1
+              command: ["wget"]
+              args: ["-q", "-T", "5", "http://kotsadm-rqlite:4001/nodes?pretty&ver=2", "-O-"]
+    - http:
+        collectorName: replicated.app-health-check
+        get:
+          url: https://replicated.app/healthz
   analyzers:
+    - deploymentStatus:
+        checkName: Check EKCO is operational
+        name: ekc-operator
+        namespace: kurl
+        outcomes:
+          - fail:
+              when: absent
+              message: EKCO is not installed - please add the EKCO component to your kURL spec and re-run the installer script
+          - fail:
+              when: "< 1"
+              message: EKCO does not have any Ready pods
+          - pass:
+              message: EKCO is installed and running
+    - textAnalyze:
+        checkName: Check installed EKCO version for critical fixes
+        fileName: cluster-resources/deployments/kurl.json
+        regexGroups: '"image": "replicated/ekco:v(?P<Major>\d+)\.(?P<Minor>\d+)\.(?P<Patch>\d+)"'
+        outcomes:
+          - warn:
+              when: "Minor < 4"
+              message: A critical update for cluster certificate rotation has been released in EKCO 0.4.0. Please upgrade to the latest available version.
+          - warn:
+              when: "Minor < 19"
+              message: A critical fix for registry certificate rotation has been released in EKCO 0.19.3. Please upgrade to the latest available version.
+          - pass:
+              when: "Minor > 20"
+              message: EKCO version is recent
+    - containerRuntime:
+        outcomes:
+          - fail:
+              when: "== gvisor"
+              message: The Admin Console does not support using the gvisor runtime
+          - pass:
+              message: A supported container runtime is present on all nodes
+    - cephStatus: {}
+    - longhorn: {}
+    - clusterPodStatuses:
+        outcomes:
+          - fail:
+              when: "!= Healthy"
+              message: "Status: {{ .Status.Reason }}"
+    - statefulsetStatus: {}
+    - deploymentStatus: {}
+    - jobStatus: {}
+    - replicasetStatus: {}
+    - weaveReport:
+        reportFileGlob: kots/kurl/weave/kube-system/*/weave-report-stdout.txt
+    - textAnalyze:
+        checkName: Weave Status
+        exclude: ""
+        ignoreIfNoFiles: true
+        fileName: kots/kurl/weave/kube-system/weave-net-*/weave-status-stdout.txt
+        outcomes:
+          - fail:
+              message: Weave is not ready
+          - pass:
+              message: Weave is ready
+        regex: 'Status: ready'
+    - textAnalyze:
+        checkName: Weave Report
+        exclude: ""
+        ignoreIfNoFiles: true
+        fileName: kots/kurl/weave/kube-system/weave-net-*/weave-report-stdout.txt
+        outcomes:
+          - fail:
+              message: Weave is not ready
+          - pass:
+              message: Weave is ready
+        regex: '"Ready": true'
+    - textAnalyze:
+        checkName: Weave IP Allocation
+        exclude: ""
+        ignoreIfNoFiles: true
+        regex: 'IP allocation was seeded by different peers'
+        fileName: kots/kurl/weave/weave-net-*/weave.log
+        outcomes:
+          - fail:
+              when: "true"
+              message: IP Allocation issues detected. Please run `rm /var/lib/weave/weave-netdata.db && reboot` on each node to resolve this.
+          - pass:
+              when: "false"
+              message: Weave is ready, there are no IP allocation issues.
+    - textAnalyze:
+        checkName: Inter-pod Networking
+        exclude: ""
+        ignoreIfNoFiles: true
+        fileName: kots/goldpinger/*/kotsadm-*/goldpinger-statistics-stdout.txt
+        outcomes:
+          - fail:
+              when: "OK = false"
+              message: Some nodes have pod communication issues
+          - pass:
+              message: Goldpinger can communicate properly
+        regexGroups: '"OK": ?(?P<OK>\w+)'
     - nodeResources:
         checkName: Node status check
         outcomes:
@@ -24,6 +389,69 @@
           - fail:
               when: "!= Healthy" # Catch all unhealthy pods. A pod is considered healthy if it has a status of Completed, or Running and all of its containers are ready.
               message: A Contour pod, {{ .Name }}, is unhealthy with a status of {{ .Status.Reason }}. Restarting the pod may fix the issue.
+    - textAnalyze:
+        checkName: longhorn multipath conflict
+        ignoreIfNoFiles: true
+        fileName: longhorn/longhorn-system/logs/longhorn-csi-plugin-*/longhorn-csi-plugin.log
+        outcomes:
+          - fail:
+              when: "true"
+              uri: "https://longhorn.io/kb/troubleshooting-volume-with-multipath/"
+              message: "Longhorn volumes may be in use by system multipath."
+          - pass:
+              when: "false"
+              message: "No block-device conflicts detected"
+        regex: '.*is apparently in use by the system;.*'
+    - textAnalyze:
+        checkName: Minio disk full
+        fileName: cluster-resources/pods/logs/kurl/registry-*/registry.log
+        ignoreIfNoFiles: true
+        regex: '.*XMinioStorageFull: Storage backend has reached its minimum free disk threshold.*'
+        outcomes:
+          - fail:
+              when: "true"
+              message: "Minio Disk Full"
+          - pass:
+              when: "false"
+              message: "Minio Disk Ok"
+    - textAnalyze:
+        checkName: Known issue with Rook < 1.4
+        exclude: ""
+        ignoreIfNoFiles: true
+        fileName: /ceph/status.json
+        regex: '\"ceph_release\": \"nautilus\"|\"status\": \"HEALTH_WARN\"'
+        outcomes:
+          - fail:
+              when: "true"
+              message: "If you have been removing and adding nodes, then you might want to ensure that you are not facing the scenario described in the community topic: https://community.replicated.com/t/1099"
+          - pass:
+              when: "false"
+              message: "You are not using a Rook version < 1.4 and/or your Ceph status is OK"
+    - textAnalyze:
+        checkName: Rook rbd filesystem consistency
+        fileName: /kots/rook/rook-ceph-agent-*.log
+        ignoreIfNoFiles: true
+        regex: 'UNEXPECTED INCONSISTENCY; RUN fsck MANUALLY.'
+        outcomes:
+          - fail:
+              when: "true"
+              message: "One or more rook rbd(s) were detected to have filesystem inconsistencies and require manual intervention"
+          - pass:
+              when: "false"
+              message: "Rook filesystem consistency ok"
+    - jsonCompare:
+        checkName: https://replicated.app host health check
+        fileName: replicated.app-health-check.json
+        path: "response.status"
+        value: "200"
+        outcomes:
+          - fail:
+              when: "false"
+              message: https://replicated.app is unhealthy. License and software update checks from replicated will fail. If this is a locked-down environment, please check your proxy settings.
+              uri: https://kurl.sh/docs/install-with-kurl/proxy-installs
+          - pass:
+              when: "true"
+              message: https://replicated.app host is healthy
     - storageClass:
         checkName: Check for default storage class
         outcomes:
diff --git a/in-cluster/default.yaml.bak b/in-cluster/default.yaml.bak
deleted file mode 100644
index a98cc8c..0000000
--- a/in-cluster/default.yaml.bak
+++ /dev/null
@@ -1,461 +0,0 @@
-apiVersion: troubleshoot.sh/v1beta2
-kind: SupportBundle
-metadata:
-  name: default
-spec:
-  # uri: https://raw.githubusercontent.com/replicatedhq/troubleshoot-specs/main/in-cluster/default.yaml
-  collectors:
-    - runPod:
-        name: ekco-resources
-        namespace: kurl
-        podSpec:
-          containers:
-            - name: inspect-ekco-pods
-              image: adamancini/netshoot
-              command: ["sh", "-c", "--"]
-              args:
-                [
-                  "kubectl get pod -n kurl --selector app=ekc-operator --field-selector status.phase=Running -o json | jq -r .items[]",
-                ]
-          restartPolicy: Never
-          dnsPolicy: ClusterFirst
-          serviceAccount: ekco
-    - copyFromHost:
-        collectorName: "copy apiserver audit logs"
-        image: alpine
-        hostPath: "/var/log/apiserver/"
-        name: "logs"
-        extractArchive: true
-    - copyFromHost:
-        collectorName: "copy kURL logs"
-        image: alpine
-        hostPath: "/var/log/kurl/"
-        name: "logs"
-        extractArchive: true
-    - clusterInfo: {}
-    - clusterResources: {}
-    - ceph: {}
-    - longhorn: {}
-    - exec:
-        args:
-          - "http://localhost:3030/goroutines"
-        collectorName: kotsadm-goroutines
-        command:
-          - curl
-        containerName: kotsadm
-        name: kots/admin_console
-        selector:
-          - app=kotsadm
-        timeout: 10s
-    - exec:
-        args:
-          - "http://localhost:3030/goroutines"
-        collectorName: kotsadm-operator-goroutines
-        command:
-          - curl
-        containerName: kotsadm-operator
-        name: kots/admin_console
-        selector:
-          - app=kotsadm-operator
-        timeout: 10s
-    - logs:
-        collectorName: velero-logs
-        namespace: velero
-        name: kots/velero
-    - logs:
-        collectorName: kurl-control-plane
-        name: kots/kurl/control-plane
-        selector:
-          - tier=control-plane
-    - logs:
-        collectorName: kotsadm-postgres-db
-        name: kots/admin_console
-        selector:
-          - app=kotsadm-postgres
-    - logs:
-        collectorName: kotsadm-api
-        name: kots/admin_console
-        selector:
-          - app=kotsadm-api
-    - logs:
-        collectorName: kotsadm-operator
-        name: kots/admin_console
-        selector:
-          - app=kotsadm-operator
-    - logs:
-        collectorName: kotsadm
-        name: kots/admin_console
-        selector:
-          - app=kotsadm
-    - logs:
-        collectorName: kurl-proxy-kotsadm
-        name: kots/admin_console
-        selector:
-          - app=kurl-proxy-kotsadm
-    - logs:
-        collectorName: kotsadm-dex
-        name: kots/admin_console
-        selector:
-          - app=kotsadm-dex
-    - logs:
-        collectorName: kotsadm-fs-minio
-        name: kots/admin_console
-        selector:
-          - app=kotsadm-fs-minio
-    - logs:
-        collectorName: kotsadm-s3-ops
-        name: kots/admin_console
-        selector:
-          - app=kotsadm-s3-ops
-    - logs:
-        collectorName: registry
-        name: kots/kurl
-        selector:
-          - app=registry
-        namespace: kurl
-    - logs:
-        collectorName: ekc-operator
-        name: kots/kurl
-        selector:
-          - app=ekc-operator
-        namespace: kurl
-    - secret:
-        collectorName: kotsadm-replicated-registry
-        name: kotsadm-replicated-registry # NOTE: this will not live under the kots/ directory like other collectors
-        includeValue: false
-        key: .dockerconfigjson
-    - logs:
-        collectorName: rook-ceph-logs
-        namespace: rook-ceph
-        name: kots/rook
-    - logs:
-        collectorName: coredns-logs
-        namespace: kube-system
-        name: logs/coredns
-        selector:
-          - k8s-app=kube-dns
-    - exec:
-        collectorName: weave-status
-        command:
-          - /home/weave/weave
-        args:
-          - --local
-          - status
-        containerName: weave
-        exclude: ""
-        name: kots/kurl/weave
-        namespace: kube-system
-        selector:
-          - name=weave-net
-        timeout: 10s
-    - exec:
-        collectorName: weave-report
-        command:
-          - /home/weave/weave
-        args:
-          - --local
-          - report
-        containerName: weave
-        exclude: ""
-        name: kots/kurl/weave
-        namespace: kube-system
-        selector:
-          - name=weave-net
-        timeout: 10s
-    - logs:
-        collectorName: rqlite-logs
-        name: kots/rqlite/logs
-        selector:
-          - app=kotsadm-rqlite
-    - logs:
-        collectorName: weave-net
-        selector:
-          - name=weave-net
-        namespace: kube-system
-        name: kots/kurl/weave
-    - logs:
-        collectorName: minio
-        selector:
-          - app=minio
-        namespace: minio
-        name: kots/kurl/minio
-    - logs:
-        collectorName: ha-minio
-        selector:
-          - app=ha-minio
-        namespace: minio
-        name: kots/kurl/ha-minio
-    - exec:
-        args:
-          - "http://goldpinger.kurl.svc.cluster.local:80/check_all"
-        collectorName: goldpinger-statistics
-        command:
-          - curl
-        containerName: kotsadm
-        name: kots/goldpinger
-        selector:
-          - app=kotsadm
-        timeout: 10s
-    - copyFromHost:
-        collectorName: kurl-host-preflights
-        name: kots/kurl/host-preflights
-        hostPath: /var/lib/kurl/host-preflights
-        extractArchive: true
-        image: alpine
-        imagePullPolicy: IfNotPresent
-        timeout: 1m
-    - configMap:
-        collectorName: coredns
-        name: coredns
-        namespace: kube-system
-        includeAllData: true
-    - configMap:
-        collectorName: kube-proxy
-        name: kube-proxy
-        namespace: kube-system
-        includeAllData: true
-    - configMap:
-        collectorName: kubeadm-config
-        name: kubeadm-config
-        namespace: kube-system
-        includeAllData: true
-    - configMap:
-        collectorName: kubelet-config
-        name: kubelet-config
-        namespace: kube-system
-        includeAllData: true
-    - configMap:
-        collectorName: kurl-config
-        name: kurl-config
-        namespace: kube-system
-        includeAllData: true
-    - configMap:
-        collectorName: weave-net
-        name: weave-net
-        namespace: kube-system
-        includeAllData: true
-    - configMap:
-        collectorName: ekco-config
-        name: ekco-config
-        namespace: kurl
-        includeAllData: true
-    - configMap:
-        collectorName: kurl-current-config
-        name: kurl-current-config # NOTE: this will not live under the kots/ directory like other collectors
-        namespace: kurl
-        includeAllData: true
-    - configMap:
-        collectorName: kurl-last-config
-        name: kurl-last-config # NOTE: this will not live under the kots/ directory like other collectors
-        namespace: kurl
-        includeAllData: true
-    - logs:
-        collectorName: projectcontour-logs
-        namespace: projectcontour
-        name: projectcontour/logs
-    - runPod:
-        name: rqlite-status
-        namespace: default
-        podSpec:
-          containers:
-            - name: rqlite-status
-              image: busybox:1
-              command: ["wget"]
-              args: ["-q", "-T", "5", "http://kotsadm-rqlite:4001/status?pretty", "-O-"]
-    - runPod:
-        name: rqlite-nodes
-        namespace: default
-        podSpec:
-          containers:
-            - name: rqlite-nodes
-              image: busybox:1
-              command: ["wget"]
-              args: ["-q", "-T", "5", "http://kotsadm-rqlite:4001/nodes?pretty&ver=2", "-O-"]
-    - http:
-        collectorName: replicated.app-health-check
-        get:
-          url: https://replicated.app/healthz
-  analyzers:
-    - deploymentStatus:
-        checkName: Check EKCO is operational
-        name: ekc-operator
-        namespace: kurl
-        outcomes:
-          - fail:
-              when: absent
-              message: EKCO is not installed - please add the EKCO component to your kURL spec and re-run the installer script
-          - fail:
-              when: "< 1"
-              message: EKCO does not have any Ready pods
-          - pass:
-              message: EKCO is installed and running
-    - textAnalyze:
-        checkName: Check installed EKCO version for critical fixes
-        fileName: cluster-resources/deployments/kurl.json
-        regexGroups: '"image": "replicated/ekco:v(?P<Major>\d+)\.(?P<Minor>\d+)\.(?P<Patch>\d+)"'
-        outcomes:
-          - warn:
-              when: "Minor < 4"
-              message: A critical update for cluster certificate rotation has been released in EKCO 0.4.0. Please upgrade to the latest available version.
-          - warn:
-              when: "Minor < 19"
-              message: A critical fix for registry certificate rotation has been released in EKCO 0.19.3. Please upgrade to the latest available version.
-          - pass:
-              when: "Minor > 20"
-              message: EKCO version is recent
-    - containerRuntime:
-        outcomes:
-          - fail:
-              when: "== gvisor"
-              message: The Admin Console does not support using the gvisor runtime
-          - pass:
-              message: A supported container runtime is present on all nodes
-    - cephStatus: {}
-    - longhorn: {}
-    - clusterPodStatuses:
-        outcomes:
-          - fail:
-              when: "!= Healthy"
-              message: "Status: {{ .Status.Reason }}"
-    - statefulsetStatus: {}
-    - deploymentStatus: {}
-    - jobStatus: {}
-    - replicasetStatus: {}
-    - weaveReport:
-        reportFileGlob: kots/kurl/weave/kube-system/*/weave-report-stdout.txt
-    - textAnalyze:
-        checkName: Weave Status
-        exclude: ""
-        ignoreIfNoFiles: true
-        fileName: kots/kurl/weave/kube-system/weave-net-*/weave-status-stdout.txt
-        outcomes:
-          - fail:
-              message: Weave is not ready
-          - pass:
-              message: Weave is ready
-        regex: 'Status: ready'
-    - textAnalyze:
-        checkName: Weave Report
-        exclude: ""
-        ignoreIfNoFiles: true
-        fileName: kots/kurl/weave/kube-system/weave-net-*/weave-report-stdout.txt
-        outcomes:
-          - fail:
-              message: Weave is not ready
-          - pass:
-              message: Weave is ready
-        regex: '"Ready": true'
-    - textAnalyze:
-        checkName: Weave IP Allocation
-        exclude: ""
-        ignoreIfNoFiles: true
-        regex: 'IP allocation was seeded by different peers'
-        fileName: kots/kurl/weave/weave-net-*/weave.log
-        outcomes:
-          - fail:
-              when: "true"
-              message: IP Allocation issues detected. Please run `rm /var/lib/weave/weave-netdata.db && reboot` on each node to resolve this.
-          - pass:
-              when: "false"
-              message: Weave is ready, there are no IP allocation issues.
-    - textAnalyze:
-        checkName: Inter-pod Networking
-        exclude: ""
-        ignoreIfNoFiles: true
-        fileName: kots/goldpinger/*/kotsadm-*/goldpinger-statistics-stdout.txt
-        outcomes:
-          - fail:
-              when: "OK = false"
-              message: Some nodes have pod communication issues
-          - pass:
-              message: Goldpinger can communicate properly
-        regexGroups: '"OK": ?(?P<OK>\w+)'
-    - nodeResources:
-        checkName: Node status check
-        outcomes:
-          - fail:
-              when: "nodeCondition(Ready) == False"
-              message: "Not all nodes are online."
-          - fail:
-              when: "nodeCondition(Ready) == Unknown"
-              message: "Not all nodes are online."
-          - pass:
-              message: "All nodes are online."
-    - clusterPodStatuses:
-        checkName: contour pods unhealthy
-        namespaces:
-          - projectcontour
-        outcomes:
-          - fail:
-              when: "!= Healthy" # Catch all unhealthy pods. A pod is considered healthy if it has a status of Completed, or Running and all of its containers are ready.
-              message: A Contour pod, {{ .Name }}, is unhealthy with a status of {{ .Status.Reason }}. Restarting the pod may fix the issue.
-    - textAnalyze:
-        checkName: longhorn multipath conflict
-        ignoreIfNoFiles: true
-        fileName: longhorn/longhorn-system/logs/longhorn-csi-plugin-*/longhorn-csi-plugin.log
-        outcomes:
-          - fail:
-              when: "true"
-              uri: "https://longhorn.io/kb/troubleshooting-volume-with-multipath/"
-              message: "Longhorn volumes may be in use by system multipath."
-          - pass:
-              when: "false"
-              message: "No block-device conflicts detected"
-        regex: '.*is apparently in use by the system;.*'
-    - textAnalyze:
-        checkName: Minio disk full
-        fileName: cluster-resources/pods/logs/kurl/registry-*/registry.log
-        ignoreIfNoFiles: true
-        regex: '.*XMinioStorageFull: Storage backend has reached its minimum free disk threshold.*'
-        outcomes:
-          - fail:
-              when: "true"
-              message: "Minio Disk Full"
-          - pass:
-              when: "false"
-              message: "Minio Disk Ok"
-    - textAnalyze:
-        checkName: Known issue with Rook < 1.4
-        exclude: ""
-        ignoreIfNoFiles: true
-        fileName: /ceph/status.json
-        regex: '\"ceph_release\": \"nautilus\"|\"status\": \"HEALTH_WARN\"'
-        outcomes:
-          - fail:
-              when: "true"
-              message: "If you have been removing and adding nodes, then you might want to ensure that you are not facing the scenario described in the community topic: https://community.replicated.com/t/1099"
-          - pass:
-              when: "false"
-              message: "You are not using a Rook version < 1.4 and/or your Ceph status is OK"
-    - textAnalyze:
-        checkName: Rook rbd filesystem consistency
-        fileName: /kots/rook/rook-ceph-agent-*.log
-        ignoreIfNoFiles: true
-        regex: 'UNEXPECTED INCONSISTENCY; RUN fsck MANUALLY.'
-        outcomes:
-          - fail:
-              when: "true"
-              message: "One or more rook rbd(s) were detected to have filesystem inconsistencies and require manual intervention"
-          - pass:
-              when: "false"
-              message: "Rook filesystem consistency ok"
-    - jsonCompare:
-        checkName: https://replicated.app host health check
-        fileName: replicated.app-health-check.json
-        path: "response.status"
-        value: "200"
-        outcomes:
-          - fail:
-              when: "false"
-              message: https://replicated.app is unhealthy. License and software update checks from replicated will fail. If this is a locked-down environment, please check your proxy settings.
-              uri: https://kurl.sh/docs/install-with-kurl/proxy-installs
-          - pass:
-              when: "true"
-              message: https://replicated.app host is healthy
-    - storageClass:
-        checkName: Check for default storage class
-        outcomes:
-          - fail:
-              message: No default storage class found
-          - pass:
-              message: Default storage class found