Skip to content

Commit

Permalink
Support collect logs for failed agents and controller for supportbundle
Browse files Browse the repository at this point in the history
Signed-off-by: Hang Yan <[email protected]>
  • Loading branch information
hangyan committed Nov 20, 2024
1 parent b6d4238 commit 16d109f
Show file tree
Hide file tree
Showing 3 changed files with 254 additions and 27 deletions.
188 changes: 172 additions & 16 deletions pkg/antctl/raw/supportbundle/command.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,16 @@ import (
"golang.org/x/sync/errgroup"
"golang.org/x/time/rate"
"gopkg.in/yaml.v2"

apierrors "k8s.io/apimachinery/pkg/api/errors"

corev1 "k8s.io/api/core/v1"

"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
k8sruntime "k8s.io/apimachinery/pkg/runtime"
utilerror "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"k8s.io/klog/v2"
Expand All @@ -46,7 +52,11 @@ import (
"antrea.io/antrea/pkg/apis/crd/v1beta1"
systemv1beta1 "antrea.io/antrea/pkg/apis/system/v1beta1"
antrea "antrea.io/antrea/pkg/client/clientset/versioned"

systemclientset "antrea.io/antrea/pkg/client/clientset/versioned/typed/system/v1beta1"

"antrea.io/antrea/pkg/util/compress"
"antrea.io/antrea/pkg/util/k8s"
)

const (
Expand Down Expand Up @@ -584,6 +594,20 @@ func controllerRemoteRunE(cmd *cobra.Command, args []string) error {
return fmt.Errorf("failed to create clientset: %w", err)
}

if err := os.MkdirAll(option.dir, 0700|os.ModeDir); err != nil {
return fmt.Errorf("error when creating output dir: %w", err)
}

f, err := os.Create(filepath.Join(option.dir, "clusterinfo"))
if err != nil {
return err
}
defer f.Close()
err = getClusterInfo(f, k8sClientset)
if err != nil {
return err
}

var controllerClient systemclientset.SupportBundleInterface
var agentClients map[string]systemclientset.SupportBundleInterface

Expand Down Expand Up @@ -628,29 +652,17 @@ func controllerRemoteRunE(cmd *cobra.Command, args []string) error {
return fmt.Errorf("no matched Nodes found to collect agent bundles")
}

if err := os.MkdirAll(option.dir, 0700|os.ModeDir); err != nil {
return fmt.Errorf("error when creating output dir: %w", err)
}
amount := len(agentClients) * 2
if controllerClient != nil {
amount += 2
}
bar := barTmpl.Start(amount)
defer bar.Finish()
defer bar.Set("prefix", "Finish ")
f, err := os.Create(filepath.Join(option.dir, "clusterinfo"))
if err != nil {
return err
}
defer f.Close()
err = getClusterInfo(f, k8sClientset)
if err != nil {
return err
}

results := requestAll(ctx, agentClients, controllerClient, bar)
results = downloadAll(ctx, agentClients, controllerClient, dir, bar, results)
return processResults(results, dir)
return processResults(ctx, antreaClientset, k8sClientset, results, dir)
}

func genErrorMsg(resultMap map[string]error) string {
Expand All @@ -662,8 +674,9 @@ func genErrorMsg(resultMap map[string]error) string {
}

// processResults will output the failed nodes and their reasons if any. If no data was collected,
// error is returned, otherwise will return nil.
func processResults(resultMap map[string]error, dir string) error {
// error is returned, otherwise will return nil. For failed nodes and controller, will also trying to get logs from
// kubernetes api.
func processResults(ctx context.Context, antreaClientset antrea.Interface, k8sClient kubernetes.Interface, resultMap map[string]error, dir string) error {
resultStr := ""
var failedNodes []string
allFailed := true
Expand All @@ -679,7 +692,8 @@ func processResults(resultMap map[string]error, dir string) error {
}
}

if resultMap[""] != nil {
controllerFail := resultMap[""] != nil
if controllerFail {
fmt.Println("Controller Info Failed Reason: " + resultMap[""].Error())
}

Expand All @@ -692,9 +706,151 @@ func processResults(resultMap map[string]error, dir string) error {
err = writeFailedNodes(dir, failedNodes)
}

// download logs from kubernetes api
if failedNodes != nil {
err = downloadAgentBundleFromKubernetes(ctx, antreaClientset, k8sClient, failedNodes, dir)
if err != nil {
fmt.Println("Failed to download agent bundle from kubernetes api: " + err.Error())
} else {
allFailed = false
}
}
if controllerFail {
err = downloadControlleBundleFromKubernetes(ctx, antreaClientset, k8sClient, dir)
if err != nil {
fmt.Println("Failed to download controller bundle from kubernetes api: " + err.Error())
} else {
allFailed = false
}
}

if allFailed {
return fmt.Errorf("no data was collected: %s", genErrorMsg(resultMap))
} else {
return err
}
}

func downloadControlleBundleFromKubernetes(ctx context.Context, antreaClientset antrea.Interface, k8sClient kubernetes.Interface, dir string) error {
var errors []error
tmpDir, err := afero.TempDir(defaultFS, "", "bundle_tmp_")
if err != nil {
return err
}
defer defaultFS.RemoveAll(tmpDir)

controllerInfo, err := antreaClientset.CrdV1beta1().AntreaControllerInfos().Get(ctx, "antrea-controller", metav1.GetOptions{})
if err == nil {
data, err := yaml.Marshal(controllerInfo)
if err == nil {
err = afero.WriteFile(defaultFS, filepath.Join(tmpDir, "controllerinfo"), data, 0644)
errors = append(errors, err)
}
}
errors = append(errors, err)
pods, err := k8sClient.CoreV1().Pods("kube-system").List(ctx, metav1.ListOptions{
ResourceVersion: "0",
LabelSelector: "app=antrea,component=antrea-controller",
})
errors = append(errors, err)
if err == nil {
for _, pod := range pods.Items {
err = downloadPodLogs(ctx, k8sClient, "controller", pod.Namespace, pod.Name, k8s.GetPodContainerNames(&pod), dir, tmpDir)
errors = append(errors, err)
}
}
return utilerror.NewAggregate(errors)
}

func downloadAgentBundleFromKubernetes(ctx context.Context, antreaClientset antrea.Interface, k8sClient kubernetes.Interface, failedNodes []string, dir string) error {
agentInfoList, err := antreaClientset.CrdV1beta1().AntreaAgentInfos().List(ctx, metav1.ListOptions{ResourceVersion: "0"})
if err != nil {
return err
}

agentInfoMap := map[string]v1beta1.AntreaAgentInfo{}
for _, agentInfo := range agentInfoList.Items {
agentInfoMap[agentInfo.Name] = agentInfo
}
pods, err := k8sClient.CoreV1().Pods("kube-system").List(ctx, metav1.ListOptions{
ResourceVersion: "0",
LabelSelector: "app=antrea,component=antrea-agent",
})
if err != nil {
return err
}
failedNodeSet := sets.NewString(failedNodes...)
var errors []error
for _, pod := range pods.Items {
if !failedNodeSet.Has(pod.Spec.NodeName) {
continue
}
tmpDir, err := afero.TempDir(defaultFS, "", "bundle_tmp_")
if err != nil {
errors = append(errors, err)
continue
}
defer defaultFS.RemoveAll(tmpDir)

if agentInfo, ok := agentInfoMap[pod.Spec.NodeName]; ok {
data, err := yaml.Marshal(agentInfo)
errors = append(errors, err)
if err == nil {
err = afero.WriteFile(defaultFS, filepath.Join(tmpDir, "agentinfo"), data, 0644)
errors = append(errors, err)
}
}
err = downloadPodLogs(ctx, k8sClient, "agent_"+pod.Spec.NodeName, pod.Namespace, pod.Name, k8s.GetPodContainerNames(&pod), dir, tmpDir)
errors = append(errors, err)

}
return utilerror.NewAggregate(errors)
}

func downloadPodLogs(ctx context.Context, k8sClient kubernetes.Interface, comp string, namespace string, podName string, containers []string, dir string, tmpDir string) error {
var errors []error
for _, containerName := range containers {
containerDirName := containerName
if strings.HasPrefix(containerName, "antrea-") {
containerDirName = strings.ReplaceAll(containerName, "antrea-", "")
}

podLogDir := filepath.Join(tmpDir, "logs", containerDirName)
err := os.MkdirAll(podLogDir, 0755)
if err != nil {
return err
}
fileName := filepath.Join(podLogDir, containerName+".log")
f, err := defaultFS.Create(fileName)
if err != nil {
errors = append(errors, err)
continue
}
defer f.Close()
logOption := &corev1.PodLogOptions{
Container: containerName,
}
logs := k8sClient.CoreV1().Pods(namespace).GetLogs(podName, logOption)
logStream, err := logs.Stream(ctx)
if err != nil {
errors = append(errors, err)
continue
}

_, err = io.Copy(f, logStream)
errors = append(errors, err)
err = logStream.Close()
errors = append(errors, err)
}

gzFileName := filepath.Join(dir, comp+".tar.gz")
f, err := defaultFS.Create(gzFileName)
if err != nil {
errors = append(errors, err)
} else {
defer f.Close()
_, err := compress.PackDir(defaultFS, tmpDir, f)
errors = append(errors, err)
}
return utilerror.NewAggregate(errors)
}
81 changes: 70 additions & 11 deletions pkg/antctl/raw/supportbundle/command_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,60 @@ var (
Name: "node-3",
},
}
controllerPod = &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "antrea-controller-1",
Namespace: "kube-system",
Labels: map[string]string{
"app": "antrea",
"component": "antrea-controller",
},
},
Spec: v1.PodSpec{
NodeName: "node-1",
Containers: []v1.Container{
{
Name: "antrea-controller",
},
},
},
}
pod1 = &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "antrea-agent-1",
Namespace: "kube-system",
Labels: map[string]string{
"app": "antrea",
"component": "antrea-agent",
},
},
Spec: v1.PodSpec{
NodeName: "node-1",
Containers: []v1.Container{
{
Name: "antrea-agent",
},
},
},
}
pod2 = &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "antrea-agent-2",
Namespace: "kube-system",
Labels: map[string]string{
"app": "antrea",
"component": "antrea-agent",
},
},
Spec: v1.PodSpec{
NodeName: "node-2",
Containers: []v1.Container{
{
Name: "antrea-agent",
},
},
},
}
nameList = []string{"node-1", "node-3"}
)

Expand Down Expand Up @@ -320,9 +374,8 @@ func TestProcessResults(t *testing.T) {
option.dir = path
}()
tests := []struct {
name string
resultMap map[string]error
expectedErr string
name string
resultMap map[string]error
}{
{
name: "All nodes failed",
Expand All @@ -331,7 +384,6 @@ func TestProcessResults(t *testing.T) {
"node-1": fmt.Errorf("error-1"),
"node-2": fmt.Errorf("error-2"),
},
expectedErr: "no data was collected:",
},
{
name: "Not all nodes failed",
Expand All @@ -351,17 +403,24 @@ func TestProcessResults(t *testing.T) {
defaultFS = afero.NewOsFs()
}()

err := processResults(tt.resultMap, option.dir)
if tt.expectedErr != "" {
require.ErrorContains(t, err, tt.expectedErr)
} else {
require.NoError(t, err)
}
// Both test cases above have failed Nodes, hence this file should always be created/
antreaInterface := fakeclientset.NewSimpleClientset(&controllerInfo, agentInfo1, agentInfo2)
k8sClient := fake.NewSimpleClientset(controllerPod, pod1, pod2)
err := processResults(context.TODO(), antreaInterface, k8sClient, tt.resultMap, option.dir)
require.NoError(t, err)
b, err := afero.ReadFile(defaultFS, filepath.Join(option.dir, "failed_nodes"))
require.NoError(t, err)
data := string(b)
for node, err := range tt.resultMap {
fileName := fmt.Sprintf("agent_%s.tar.gz", node)
if node == "" {
fileName = "controller.tar.gz"
}
if err != nil {
ok, checkErr := afero.Exists(defaultFS, filepath.Join(option.dir, fileName))
require.NoError(t, checkErr)
assert.True(t, ok, fmt.Sprintf("expected support bundle file %s not found", fileName))
}

if node == "" {
continue
}
Expand Down
12 changes: 12 additions & 0 deletions pkg/util/k8s/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,15 @@ import v1 "k8s.io/api/core/v1"
func IsPodTerminated(pod *v1.Pod) bool {
return pod.Status.Phase == v1.PodFailed || pod.Status.Phase == v1.PodSucceeded
}

// GetPodContainersNames return all the container names in a pod, including init container.
func GetPodContainerNames(pod *v1.Pod) []string {
var names []string
for _, c := range pod.Spec.InitContainers {
names = append(names, c.Name)
}
for _, c := range pod.Spec.Containers {
names = append(names, c.Name)
}
return names
}

0 comments on commit 16d109f

Please sign in to comment.