Dataflow Engine Chaos #4260
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Chaos tests for the dataflow engine: run on a schedule, or by hand against a PR.
name: Dataflow Engine Chaos

on:
  # Manual trigger; `pr` selects the pull request whose head gets tested.
  workflow_dispatch:
    inputs:
      pr:
        description: "Which PR do you want to trigger (use PR number, such as 6127)"
        required: true
        default: ""
  # Minute 0 of every hour from 17:00 to 23:00 UTC, i.e. 01:00 ~ 07:00 UTC+8.
  schedule:
    - cron: "0 17-23 * * *"

# One run per ref and workflow at a time; a newer run cancels the one in progress.
# See: https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#concurrency.
concurrency:
  group: ${{ github.ref }}-${{ github.workflow }}
  cancel-in-progress: true
# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
  # Single job "base", fanned out into one run per chaos scenario in the matrix.
  base:
    # The ubuntu-20.04 hosted image has been retired by GitHub, so jobs targeting it
    # are never picked up; use the next supported LTS image instead.
    runs-on: ubuntu-22.04
    timeout-minutes: 50
    strategy:
      # Keep the other scenarios running when one of them fails.
      fail-fast: false
      matrix:
        # Each entry names a manifest: engine/chaos/manifests/<chaos-obj>.yaml
        chaos-obj:
          - pod-failure-dataflow
          - pod-kill-dataflow
          - network-partition-dataflow
          - network-emulation-dataflow
          - time-shift-dataflow
    # Steps represent a sequence of tasks that will be executed as part of the job
    steps:
# Check out the triggering ref first; for manual runs that pass a PR number,
# check out that PR's head on top of it.
# checkout@v2 targets the removed node12 runtime, so use the maintained v4.
- uses: actions/checkout@v4
- name: check out code by workflow dispatch PR
  if: ${{ github.event.inputs.pr != '' }}
  uses: actions/checkout@v4
  with:
    ref: refs/pull/${{ github.event.inputs.pr }}/head
- uses: actions/setup-go@v3
  with:
    go-version: '1.21'
# actions/cache v1/v2 were shut down by GitHub and now fail at runtime;
# v4 accepts the same `path` / `key` inputs.
- name: Cache go modules
  uses: actions/cache@v4
  with:
    # Go module download cache, keyed by OS and the contents of go.sum.
    path: ~/go/pkg/mod
    key: ${{ runner.os }}-dataflow-${{ hashFiles('go.sum') }}
- name: Create k8s Kind Cluster
  # NOTE(review): this ref had been garbled into "helm/[email protected]" by e-mail
  # obfuscation, which is not a valid action reference. The action name is restored
  # as helm/kind-action; the v1.4.0 tag is an assumption — confirm against repo history.
  uses: helm/kind-action@v1.4.0
  with:
    # The name is reused by the later `kind load` / `kind export logs` commands.
    cluster_name: dataflow-engine-cluster
    config: ${{ github.workspace }}/engine/chaos/manifests/kind-cluster.yaml
- name: Print cluster information
  run: |
    # Print kubectl and helm views of the freshly created cluster, in a fixed order.
    for query in "config view" "cluster-info" "get nodes" "get pods -n kube-system" "get sc" "version"; do
      # Unquoted on purpose: each entry expands to a kubectl sub-command plus its flags.
      kubectl $query
    done
    helm version
- name: Build dataflow engine binary
  run: |
    make tiflow tiflow-chaos-case
    # Ship the chaos case configs next to the binaries so they land in the image build context.
    conf_src="$GITHUB_WORKSPACE/engine/chaos/cases/conf/"
    conf_dst="$GITHUB_WORKSPACE/bin/engine-conf"
    cp -r "$conf_src" "$conf_dst"
- name: Build Dataflow engine docker image
  run: |
    # The build context is the bin/ directory populated by the previous step.
    dockerfile="$GITHUB_WORKSPACE/engine/chaos/manifests/Dockerfile"
    build_context="$GITHUB_WORKSPACE/bin"
    docker build -f "$dockerfile" -t dataflow:chaos "$build_context"
    docker image list
- name: Load docker image to kind cluster
  run: |
    # Load the locally built image into the kind cluster created earlier.
    kind load docker-image dataflow:chaos --name dataflow-engine-cluster
# Set up upstream instances
- name: Set up sources
  run: |
    sources_manifest="$GITHUB_WORKSPACE/dm/chaos/manifests/sources.yaml"
    for verb in apply get describe; do
      kubectl $verb -f "$sources_manifest"
    done
- name: Wait for sources ready # kubectl wait --all not working
  run: |
    source_pods="mysql57-0 mysql8-0 mariadb-0"
    # Best-effort wait: a timeout here must not abort before the diagnostics below are printed.
    for pod in $source_pods; do
      kubectl wait --for=condition=Ready pod/$pod --timeout=300s || true
    done
    sleep 10
    # `pv` is listed without the label selector; every other resource is filtered by app=sources.
    for res in pvc pv svc sts po; do
      echo show $res
      if [ "$res" = pv ]; then
        kubectl get pv -o wide
      else
        kubectl get $res -l app=sources -o wide
      fi
    done
    for res in po pvc; do
      echo describe $res
      kubectl describe $res -l app=sources
    done
    # Hard check: --timeout=0s checks once, so the step fails if a pod is still not Ready.
    for pod in $source_pods; do
      kubectl wait --for=condition=Ready pod/$pod --timeout=0s
    done
# Set up downstream TiDB instance (deploy a TiDB with mockTiKV, not a TidbCluster managed by TiDB-operator)
- name: Set up TiDB
  run: |
    tidb_manifest="$GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml"
    for verb in apply get describe; do
      kubectl $verb -f "$tidb_manifest"
    done
- name: Wait for TiDB ready
  run: |
    # Best-effort wait so the diagnostics below are printed even when the pod never becomes Ready.
    kubectl wait --for=condition=Ready pod/tidb-0 --timeout=10m || true
    # `pv` is listed without the label selector; every other resource is filtered by app=tidb.
    for res in pvc pv svc sts po; do
      echo show $res
      if [ "$res" = pv ]; then
        kubectl get pv -o wide
      else
        kubectl get $res -l app=tidb -o wide
      fi
    done
    for res in po pvc; do
      echo describe $res
      kubectl describe $res -l app=tidb
    done
    # Hard check: --timeout=0s checks once, so the step fails if tidb-0 is still not Ready.
    kubectl wait --for=condition=Ready pod/tidb-0 --timeout=0s
# Set up minio and create a bucket for tests
- name: Set up minio
  run: |
    minio_manifest="$GITHUB_WORKSPACE/deployments/engine/helm/minio/minio.yaml"
    for verb in apply get describe; do
      kubectl $verb -f "$minio_manifest"
    done
- name: Wait for minio ready
  run: |
    # Best-effort wait so the diagnostics below are printed even when the pod never becomes Ready.
    kubectl wait --for=condition=Ready pod/chaos-minio-0 --timeout=10m || true
    # `pv` is listed without the label selector; every other resource is filtered by app=minio.
    for res in pvc pv svc sts po; do
      echo show $res
      if [ "$res" = pv ]; then
        kubectl get pv -o wide
      else
        kubectl get $res -l app=minio -o wide
      fi
    done
    for res in po pvc; do
      echo describe $res
      kubectl describe $res -l app=minio
    done
    # Hard check: --timeout=0s checks once, so the step fails if chaos-minio-0 is still not Ready.
    kubectl wait --for=condition=Ready pod/chaos-minio-0 --timeout=0s
- name: Set up minio-create-bucket job
  run: |
    bucket_manifest="$GITHUB_WORKSPACE/deployments/engine/helm/minio/minio-create-bucket.yaml"
    for verb in apply get describe; do
      kubectl $verb -f "$bucket_manifest"
    done
    # Unlike the pod waits above, this one is not best-effort: the step fails if the job
    # does not reach Complete within two minutes.
    kubectl wait --for=condition=Complete job/chaos-minio-create-bucket-job --timeout=2m
# Set up metastore and basic services
- name: Set up metastore and basic services
  run: |
    # Install the tiflow chart as release "chaos" using the chart's own values file.
    chart_dir="$GITHUB_WORKSPACE/deployments/engine/helm/tiflow"
    helm install -f "$chart_dir/values.yaml" chaos "$chart_dir"
    helm list
    sleep 5
    kubectl get pods
- name: Wait for metastore ready
  run: |
    # Best-effort waits: a timeout must not abort before the diagnostics below are printed.
    kubectl wait --for=condition=Ready pod/chaos-metastore-mysql-0 --timeout=60s || true
    kubectl wait --for=condition=Ready pod/chaos-metastore-etcd-0 --timeout=60s || true
    # Print the same diagnostics for both metastore backends. The mysql section used to
    # describe `app=chaos-metastore-framework`, a label nothing else in this workflow uses,
    # so the describe output did not match the resources listed just above it.
    for app in chaos-metastore-etcd chaos-metastore-mysql; do
      # `pv` is listed without the label selector; every other resource is filtered by $app.
      for res in pvc pv svc sts po; do
        echo show $res
        if [ "$res" = pv ]; then
          kubectl get pv -o wide
        else
          kubectl get $res -l app=$app -o wide
        fi
      done
      for res in po pvc; do
        echo describe $res
        kubectl describe $res -l app=$app
      done
    done
- name: Wait for server-master ready
  run: |
    # Best-effort wait; this step only collects diagnostics and has no hard readiness check.
    kubectl wait --for=condition=Ready pod -l app=chaos-server-master --all --timeout=20s|| true
    # `pv` is listed without the label selector; every other resource is filtered by the app label.
    for res in pvc pv svc sts po; do
      echo "<<<<< show $res >>>>>"
      if [ "$res" = pv ]; then
        kubectl get pv -o wide
      else
        kubectl get $res -l app=chaos-server-master -o wide
      fi
    done
    for res in po pvc; do
      echo "<<<<< describe $res >>>>>"
      kubectl describe $res -l app=chaos-server-master
    done
    # Current and previous (-p) container logs for replicas 0..2; a missing log is not an error.
    for idx in 0 1 2; do
      echo "<<<<< show current log for chaos-server-master-$idx >>>>>"
      kubectl logs chaos-server-master-$idx || true
      echo "<<<<< show previous log for chaos-server-master-$idx >>>>>"
      kubectl logs chaos-server-master-$idx -p || true
    done
    # Log of the `wait-mysql` container of the first replica.
    kubectl logs chaos-server-master-0 -c wait-mysql || true
- name: Wait for executor ready
  run: |
    # Best-effort wait; this step only collects diagnostics and has no hard readiness check.
    kubectl wait --for=condition=Ready pod -l app=chaos-executor --all --timeout=15s|| true
    # `pv` is listed without the label selector; every other resource is filtered by the app label.
    for res in pvc pv svc sts po; do
      echo "<<<<< show $res >>>>>"
      if [ "$res" = pv ]; then
        kubectl get pv -o wide
      else
        kubectl get $res -l app=chaos-executor -o wide
      fi
    done
    for res in po pvc; do
      echo "<<<<< describe $res >>>>>"
      kubectl describe $res -l app=chaos-executor
    done
    # Current and previous (-p) container logs for replicas 0..2; a missing log is not an error.
    # Generating the banner from the loop index fixes the old "worker-master-1" label,
    # which was printed above the previous log of chaos-executor-1.
    for idx in 0 1 2; do
      echo "<<<<< show current log for chaos-executor-$idx >>>>>"
      kubectl logs chaos-executor-$idx || true
      echo "<<<<< show previous log for chaos-executor-$idx >>>>>"
      kubectl logs chaos-executor-$idx -p || true
    done
    # Log of the `wait-server-master` container of the first replica.
    kubectl logs chaos-executor-0 -c wait-server-master || true
- name: Set up chaos test cases
  run: |
    cases_manifest="$GITHUB_WORKSPACE/engine/chaos/manifests/cases.yaml"
    for verb in apply get describe; do
      kubectl $verb -f "$cases_manifest"
    done
    kubectl get pods
# FIXME: remove this after fix https://github.com/pingcap/tiflow/issues/7304
- name: Wait DM enter sync stage
  run: |
    # Poll the test-case job log once per second, 301 attempts at most. The step does
    # not fail if the marker never shows up; the workflow simply moves on.
    for ((attempt = 0; attempt <= 300; attempt++)); do
      echo "wait dm enter sync stage"
      if kubectl logs job.batch/chaos-test-case | grep "full mode of the task has completed" ; then
        break
      fi
      sleep 1
    done
- name: Encode chaos-mesh action
  run: |
    # Export the scenario manifest, base64-encoded on a single line, as CFG_BASE64 for later steps.
    chaos_manifest="$GITHUB_WORKSPACE/engine/chaos/manifests/${{ matrix.chaos-obj }}.yaml"
    echo "CFG_BASE64=$(base64 -w 0 "$chaos_manifest")" >> "$GITHUB_ENV"
- name: Run chaos mesh action
  uses: chaos-mesh/chaos-mesh-action@master
  env:
    CFG_BASE64: ${{ env.CFG_BASE64 }}
# check whether complete with 1m * 20 times.
- name: Wait for chaos test case complete
  run: |
    "$GITHUB_WORKSPACE/engine/chaos/scripts/check-case.sh"
- name: Pause all chaos
  if: ${{ always() }}
  run: |
    # Runs regardless of earlier failures: delete the objects defined in the scenario manifest.
    chaos_manifest="$GITHUB_WORKSPACE/engine/chaos/manifests/${{ matrix.chaos-obj }}.yaml"
    kubectl delete -f "$chaos_manifest"
- name: Dump goroutines
  if: ${{ failure() }}
  run: |
    # For every pod whose name matches $1, run wget inside container $1 to fetch the
    # goroutine dump from local port $2 and store it under /log in that container.
    # Failures are ignored.
    dump_goroutines() {
      kubectl get pods --no-headers -o custom-columns=":metadata.name" \
        | grep -E "$1" \
        | xargs -I{} kubectl exec -i -c "$1" {} -- wget "http://127.0.0.1:$2/debug/pprof/goroutine?debug=2" -O /log/{}_goroutines.log \
        || true
    }
    # Add a delay if test fails, to check whether the cluster can recover after chaos is removed
    sleep 60
    dump_goroutines server-master 10240
    dump_goroutines executor 10241
- name: Copy logs to hack permission
  if: ${{ always() }}
  run: |
    log_dir=./logs
    mkdir $log_dir
    # Copy /log out of every server-master and executor pod; copy failures are ignored.
    kubectl get pods --no-headers -o custom-columns=":metadata.name" \
      | grep -E "server-master|executor" \
      | xargs -I{} kubectl cp {}:/log $log_dir \
      || true
    kind export logs $log_dir/kind --name dataflow-engine-cluster
    # Hand ownership of the collected files to the runner user.
    sudo chown -R runner $log_dir
# Upload logs as artifact seems not stable, so we set `continue-on-error: true` here.
- name: Upload logs
  if: ${{ always() }}
  continue-on-error: true
  # upload-artifact v1-v3 have been retired by GitHub and fail at runtime, so use v4.
  # v4 requires artifact names to be unique within a run; the matrix suffix ensures that.
  uses: actions/upload-artifact@v4
  with:
    name: chaos-base-logs.${{ matrix.chaos-obj }}
    path: |
      ./logs
# Send feishu notification if failed.
- name: Feishu notification
  if: ${{ failure() }}
  # A failed notification must not change the job result.
  continue-on-error: true
  uses: foxundermoon/feishu-action@v2
  with:
    url: ${{ secrets.ENGINE_FEISHU_NOTIFY_URL }}
    msg_type: text
    # `content` is itself a YAML document passed to the action as a string.
    content: |
      text: |
        dataflow engine chaos job failed, see https://github.com/pingcap/tiflow/actions/runs/${{ github.run_id }}