talos/pkg/cluster/support.go
Andrey Smirnov a6b010a8b4
chore: update Go to 1.19, Linux to 5.15.58
See https://go.dev/doc/go1.19

Signed-off-by: Andrey Smirnov <andrey.smirnov@talos-systems.com>
2022-08-03 17:03:58 +04:00

786 lines
17 KiB
Go

// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
package cluster
import (
"archive/zip"
"bytes"
"context"
"fmt"
"io"
"os"
"path/filepath"
"strings"
"sync"
"text/tabwriter"
criconstants "github.com/containerd/containerd/pkg/cri/constants"
"github.com/cosi-project/runtime/pkg/resource"
"github.com/cosi-project/runtime/pkg/resource/meta"
"github.com/dustin/go-humanize"
"github.com/hashicorp/go-multierror"
"golang.org/x/sync/errgroup"
"google.golang.org/grpc/codes"
"google.golang.org/protobuf/types/known/emptypb"
"gopkg.in/yaml.v3"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
k8sjson "k8s.io/apimachinery/pkg/runtime/serializer/json"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/clientcmd"
"github.com/talos-systems/talos/pkg/cli"
"github.com/talos-systems/talos/pkg/machinery/api/common"
"github.com/talos-systems/talos/pkg/machinery/api/machine"
"github.com/talos-systems/talos/pkg/machinery/client"
"github.com/talos-systems/talos/pkg/machinery/constants"
"github.com/talos-systems/talos/pkg/version"
)
// BundleOptions carries the inputs and shared state used by
// GetNodeSupportBundle and GetKubernetesSupportBundle.
type BundleOptions struct {
// LogOutput receives messages emitted via Log; when nil, Log prints to stdout.
LogOutput io.Writer
// NumWorkers is the upper bound on the number of collector goroutines
// started by GetNodeSupportBundle.
NumWorkers int
// Source identifies what is being collected; it is used as the top-level
// directory for files written to the archive and is attached to errors and
// progress reports. GetKubernetesSupportBundle overwrites it with "cluster".
Source string
// Client is the Talos API client the collectors use.
Client *client.Client
// Archive is the destination archive collected files are written to.
Archive *BundleArchive
// Progress, when non-nil, receives a BundleProgress update as collection advances.
Progress chan BundleProgress
// lastLogMu guards lastLog.
lastLogMu sync.RWMutex
// lastLog holds the most recent message formatted by Log.
lastLog string
}
// BundleProgress reports current bundle collection progress.
type BundleProgress struct {
// Source is the BundleOptions.Source the update belongs to.
Source string
// State is the first line of the most recent log message.
State string
// Total is the number of collectors scheduled for the source.
Total int
// Value is the progress counter reported against Total.
Value int
// Error is not populated by the code in this file.
// NOTE(review): presumably set by callers that forward failures — confirm.
Error error
}
// BundleArchive wraps archive writer in a thread safe implementation.
type BundleArchive struct {
// Archive is the underlying zip writer; Write creates entries in it.
Archive *zip.Writer
// archiveMu serializes Write calls so only one entry is written at a time.
archiveMu sync.Mutex
}
// BundleError wraps all bundle collection errors and adds source context.
type BundleError struct {
	// Source is the BundleOptions.Source the error originated from.
	Source string
	// err is the underlying error.
	err error
}

// Error returns the message of the underlying error.
func (b *BundleError) Error() string {
	return b.err.Error()
}

// Unwrap returns the underlying error so that errors.Is and errors.As can
// inspect the cause through the BundleError wrapper.
func (b *BundleError) Unwrap() error {
	return b.err
}
// wrap decorates err with the source taken from options, producing a *BundleError.
func wrap(options *BundleOptions, err error) error {
	wrapped := new(BundleError)
	wrapped.Source = options.Source
	wrapped.err = err

	return wrapped
}
// Write adds a new entry named path to the archive and fills it with contents.
//
// The archive mutex is held for the whole operation, so concurrent callers
// are serialized. Any error from creating or writing the entry is returned.
func (a *BundleArchive) Write(path string, contents []byte) error {
	a.archiveMu.Lock()
	defer a.archiveMu.Unlock()

	entry, err := a.Archive.Create(path)
	if err == nil {
		_, err = entry.Write(contents)
	}

	return err
}
// Log formats the message, remembers it as the latest log line and emits it.
//
// When LogOutput is set, the formatted message is written to it exactly as
// formatted (no newline is appended). Otherwise the message is printed to
// stdout followed by a newline.
func (options *BundleOptions) Log(line string, args ...interface{}) {
	message := fmt.Sprintf(line, args...)

	options.lastLogMu.Lock()
	defer options.lastLogMu.Unlock()

	options.lastLog = message

	if options.LogOutput == nil {
		fmt.Printf(line+"\n", args...)

		return
	}

	options.LogOutput.Write([]byte(message)) //nolint:errcheck
}
// collect gathers one piece of diagnostic data and returns its raw bytes.
// Returning nil data together with a nil error makes runCollector skip
// writing a file for it.
type collect func(ctx context.Context, options *BundleOptions) ([]byte, error)
// nodeCollector pairs a collect function with the archive file it produces.
type nodeCollector struct {
// filename is the path of the output file, relative to the source directory in the archive.
filename string
// collect produces the file contents.
collect collect
}
// nodeCollectors is the fixed set of collectors GetNodeSupportBundle starts
// from; collectors discovered at runtime are appended to a copy of this list.
var nodeCollectors = []nodeCollector{
{"dmesg.log", dmesg},
{"controller-runtime.log", logs("controller-runtime", false)},
{"dependencies.dot", dependencies},
{"mounts", mounts},
{"devices", devices},
{"io", ioPressure},
{"processes", processes},
{"summary", summary},
}
// GetNodeSupportBundle writes all node information we can gather into a zip archive.
//
// The static nodeCollectors are extended with collectors discovered at
// runtime (service logs, kube-system container logs, Talos resources) and
// then executed by a pool of at most options.NumWorkers goroutines (at least
// one worker is always started). After each collector is handed to a worker
// a BundleProgress update is sent to options.Progress, if set.
//
// Collection is best-effort: a failing collector does not stop the others.
// All failures, including failures to discover dynamic collectors, are
// aggregated into the returned error, each wrapped in a *BundleError. nil is
// returned only when nothing failed.
//
//nolint:gocyclo
func GetNodeSupportBundle(ctx context.Context, options *BundleOptions) error {
	var (
		errors   error
		errorsMu sync.Mutex
	)

	// Copy the static list so appends never touch the shared package-level slice.
	cols := append([]nodeCollector(nil), nodeCollectors...)

	for _, dynamic := range []struct {
		id             string
		nodeCollectors func(context.Context, *client.Client) ([]nodeCollector, error)
	}{
		{"system services logs", getServiceLogCollectors},
		{"kube-system containers logs", getKubernetesLogCollectors},
		{"talos resources", getResources},
	} {
		dynamicCollectors, err := dynamic.nodeCollectors(ctx, options.Client)
		if err != nil {
			errors = multierror.Append(errors, wrap(options, fmt.Errorf("failed to get %s %w", dynamic.id, err)))

			continue
		}

		cols = append(cols, dynamicCollectors...)
	}

	var eg errgroup.Group

	colChan := make(chan nodeCollector)

	// Producer: feeds collectors to the workers and reports progress.
	eg.Go(func() error {
		defer close(colChan)

		for i, col := range cols {
			select {
			case colChan <- col:
			case <-ctx.Done():
				return nil
			}

			if options.Progress == nil {
				continue
			}

			options.lastLogMu.RLock()
			line := options.lastLog
			options.lastLogMu.RUnlock()

			progress := BundleProgress{
				Source: options.Source,
				Value:  i + 1,
				Total:  len(cols),
				State:  strings.Split(line, "\n")[0],
			}

			// Do not block forever on a progress consumer that went away.
			select {
			case options.Progress <- progress:
			case <-ctx.Done():
				return nil
			}
		}

		return nil
	})

	// Without at least one worker the producer would block on colChan forever.
	numWorkers := options.NumWorkers
	if numWorkers < 1 {
		numWorkers = 1
	}

	if numWorkers > len(cols) {
		numWorkers = len(cols)
	}

	for i := 0; i < numWorkers; i++ {
		eg.Go(func() error {
			for col := range colChan {
				if err := runCollector(ctx, col, options); err != nil {
					// errgroup keeps only the first returned error, so failures
					// are aggregated here to keep every worker's errors.
					errorsMu.Lock()
					errors = multierror.Append(errors, wrap(options, err))
					errorsMu.Unlock()
				}
			}

			return nil
		})
	}

	if err := eg.Wait(); err != nil {
		errors = multierror.Append(errors, wrap(options, err))
	}

	return errors
}
// GetKubernetesSupportBundle writes cluster wide kubernetes information into a zip archive.
//
// options.Source is set to "cluster". Each endpoint of the Talos client is
// tried in turn until one yields a kubeconfig whose API server answers a
// request for the kube-system namespace; that client is then used to collect
// the node and kube-system pod manifests.
//
// Errors from endpoints that were tried and from failing collectors are
// aggregated into the returned error, each wrapped in a *BundleError. If no
// endpoint yields a working client, nothing is collected.
//
//nolint:gocyclo
func GetKubernetesSupportBundle(ctx context.Context, options *BundleOptions) error {
	var (
		clientset *kubernetes.Clientset
		errors    error
	)

	options.Source = "cluster"

	endpoints := options.Client.GetEndpoints()

	for _, node := range endpoints {
		candidate, err := func() (*kubernetes.Clientset, error) {
			kubeconfig, err := options.Client.Kubeconfig(client.WithNodes(ctx, node))
			if err != nil {
				return nil, err
			}

			config, err := clientcmd.NewClientConfigFromBytes(kubeconfig)
			if err != nil {
				return nil, err
			}

			restconfig, err := config.ClientConfig()
			if err != nil {
				return nil, err
			}

			cs, err := kubernetes.NewForConfig(restconfig)
			if err != nil {
				return nil, err
			}

			// just checking that k8s responds
			if _, err = cs.CoreV1().Namespaces().Get(ctx, "kube-system", v1.GetOptions{}); err != nil {
				return nil, err
			}

			return cs, nil
		}()
		if err != nil {
			errors = multierror.Append(errors, wrap(options, err))

			continue
		}

		// Only a client that passed the health check is kept, so a clientset
		// built for an unresponsive endpoint is never used below.
		clientset = candidate

		break
	}

	if clientset == nil {
		return multierror.Append(errors, wrap(
			options, fmt.Errorf("failed to get kubernetes client, tried nodes %s", strings.Join(endpoints, ", "))),
		)
	}

	cols := []nodeCollector{
		{
			filename: "kubernetesResources/nodes.yaml",
			collect:  kubernetesNodes(clientset),
		},
		{
			filename: "kubernetesResources/systemPods.yaml",
			collect:  systemPods(clientset),
		},
	}

	for i, collector := range cols {
		if err := runCollector(ctx, collector, options); err != nil {
			errors = multierror.Append(errors, wrap(options, err))

			continue
		}

		if options.Progress != nil {
			options.lastLogMu.RLock()
			line := options.lastLog
			options.lastLogMu.RUnlock()

			// Value counts completed collectors (1-based) so it can reach Total.
			options.Progress <- BundleProgress{
				Source: options.Source,
				Value:  i + 1,
				Total:  len(cols),
				State:  strings.Split(line, "\n")[0],
			}
		}
	}

	return errors
}
// runCollector executes a single collector and stores its output in the archive.
//
// The output is written to "<options.Source>/<c.filename>". A collector that
// returns nil data without an error is skipped and produces no file. A
// collector failure is returned with the file name added to the message; the
// original error stays reachable through errors.Is/errors.As.
func runCollector(ctx context.Context, c nodeCollector, options *BundleOptions) error {
	data, err := c.collect(ctx, options)
	if err != nil {
		return fmt.Errorf("failed to get %s: %w, skipped", c.filename, err)
	}

	if data == nil {
		return nil
	}

	// Zip entry names always use forward slashes, hence no filepath.Join here.
	return options.Archive.Write(fmt.Sprintf("%s/%s", options.Source, c.filename), data)
}
// getServiceLogCollectors lists the services reported by the node and returns
// two collectors per service: "<id>.log" with the service logs and
// "<id>.state" with the rendered service info.
func getServiceLogCollectors(ctx context.Context, c *client.Client) ([]nodeCollector, error) {
	list, err := c.ServiceList(ctx)
	if err != nil {
		return nil, err
	}

	collectors := []nodeCollector{}

	for _, reply := range list.Messages {
		for _, svc := range reply.Services {
			id := svc.Id

			logCollector := nodeCollector{
				filename: fmt.Sprintf("%s.log", id),
				collect:  logs(id, false),
			}

			stateCollector := nodeCollector{
				filename: fmt.Sprintf("%s.state", id),
				collect:  serviceInfo(id),
			}

			collectors = append(collectors, logCollector, stateCollector)
		}
	}

	return collectors, nil
}
// getKubernetesLogCollectors lists the CRI containers on the node and returns
// a log collector for every container whose pod ID starts with the
// "kube-system" namespace segment.
//
// Containers in the SANDBOX_READY state are skipped, and containers with a
// zero PID get an "-exited" suffix in their log file name.
func getKubernetesLogCollectors(ctx context.Context, c *client.Client) ([]nodeCollector, error) {
	resp, err := c.Containers(ctx, criconstants.K8sContainerdNamespace, common.ContainerDriver_CRI)
	if err != nil {
		return nil, err
	}

	collectors := []nodeCollector{}

	for _, reply := range resp.Messages {
		for _, ctr := range reply.Containers {
			// skip pause containers
			if ctr.Status == "SANDBOX_READY" {
				continue
			}

			// The pod ID has the form "<namespace>/<name>"; only the namespace matters here.
			podNamespace := strings.Split(ctr.PodId, "/")[0]
			if podNamespace != "kube-system" {
				continue
			}

			var exited string
			if ctr.Pid == 0 {
				exited = "-exited"
			}

			collectors = append(collectors, nodeCollector{
				filename: fmt.Sprintf("%s/%s%s.log", podNamespace, ctr.Name, exited),
				collect:  logs(ctr.Id, true),
			})
		}
	}

	return collectors, nil
}
// getResources lists the resource definitions known to the node and returns
// one collector per definition, writing to "talosResources/<id>.yaml".
//
// Each definition spec is round-tripped through YAML to decode it into a
// meta.ResourceDefinitionSpec; any marshalling error aborts the discovery.
func getResources(ctx context.Context, c *client.Client) ([]nodeCollector, error) {
	definitions, err := listResources(ctx, c, meta.NamespaceName, meta.ResourceDefinitionType)
	if err != nil {
		return nil, err
	}

	collectors := []nodeCollector{}

	for _, definition := range definitions {
		if definition.Resource == nil {
			continue
		}

		raw, marshalErr := yaml.Marshal(definition.Resource.Spec())
		if marshalErr != nil {
			return nil, marshalErr
		}

		spec := new(meta.ResourceDefinitionSpec)
		if unmarshalErr := yaml.Unmarshal(raw, spec); unmarshalErr != nil {
			return nil, unmarshalErr
		}

		collectors = append(collectors, nodeCollector{
			filename: fmt.Sprintf("talosResources/%s.yaml", spec.ID()),
			collect:  talosResource(spec),
		})
	}

	return collectors, nil
}
// serviceInfo returns a collector which renders the info of the service
// identified by id.
//
// An error from the API is fatal only when no service data came back with
// it; otherwise whatever was returned is rendered.
func serviceInfo(id string) collect {
	return func(ctx context.Context, options *BundleOptions) ([]byte, error) {
		services, err := options.Client.ServiceInfo(ctx, id)
		if err != nil && services == nil {
			return nil, fmt.Errorf("error listing services: %w", err)
		}

		var rendered bytes.Buffer

		if renderErr := cli.RenderServicesInfo(services, &rendered, "", false); renderErr != nil {
			return nil, renderErr
		}

		return rendered.Bytes(), nil
	}
}
// dmesg reads the dmesg stream from the node until it ends and returns the
// concatenated bytes.
//
// The stream is considered finished on io.EOF or on a Canceled status; any
// other receive error aborts the collection. Errors carried in the response
// metadata are printed to stderr and do not stop the collection.
func dmesg(ctx context.Context, options *BundleOptions) ([]byte, error) {
	stream, err := options.Client.Dmesg(ctx, false, false)
	if err != nil {
		return nil, err
	}

	// Deliberately non-nil: an empty result still produces an (empty) file.
	collected := make([]byte, 0)

	for {
		chunk, recvErr := stream.Recv()

		switch {
		case recvErr == nil:
		case recvErr == io.EOF || client.StatusCode(recvErr) == codes.Canceled:
			return collected, nil
		default:
			return nil, fmt.Errorf("error reading from stream: %w", recvErr)
		}

		if chunk.Metadata != nil && chunk.Metadata.Error != "" {
			fmt.Fprintf(os.Stderr, "%s\n", chunk.Metadata.Error)
		}

		collected = append(collected, chunk.GetBytes()...)
	}
}
// logs returns a collector which reads the full log stream of a container.
//
// When kubernetes is true the container is looked up in the CRI namespace
// using the CRI driver; otherwise the system containerd namespace and the
// containerd driver are used. The stream is read until io.EOF or a Canceled
// status. Errors carried in the response metadata are printed to stderr and
// do not stop the collection.
func logs(service string, kubernetes bool) collect {
	return func(ctx context.Context, options *BundleOptions) ([]byte, error) {
		var (
			namespace = constants.SystemContainerdNamespace
			driver    = common.ContainerDriver_CONTAINERD
		)

		if kubernetes {
			namespace = criconstants.K8sContainerdNamespace
			driver = common.ContainerDriver_CRI
		}

		options.Log("getting %s/%s service logs", namespace, service)

		stream, err := options.Client.Logs(ctx, namespace, driver, service, false, -1)
		if err != nil {
			return nil, err
		}

		// Deliberately non-nil: an empty log still produces an (empty) file.
		collected := make([]byte, 0)

		for {
			chunk, recvErr := stream.Recv()

			switch {
			case recvErr == nil:
			case recvErr == io.EOF || client.StatusCode(recvErr) == codes.Canceled:
				return collected, nil
			default:
				return nil, fmt.Errorf("error reading from stream: %w", recvErr)
			}

			if chunk.Metadata != nil && chunk.Metadata.Error != "" {
				fmt.Fprintf(os.Stderr, "%s\n", chunk.Metadata.Error)
			}

			collected = append(collected, chunk.GetBytes()...)
		}
	}
}
// dependencies fetches the controller runtime dependency information from the
// node and returns it rendered as a graph.
//
// An API error is fatal only when no response came back with it; the error
// is wrapped so callers can still inspect the cause.
func dependencies(ctx context.Context, options *BundleOptions) ([]byte, error) {
	options.Log("inspecting controller runtime")

	resp, err := options.Client.Inspect.ControllerRuntimeDependencies(ctx)
	if err != nil && resp == nil {
		return nil, fmt.Errorf("error getting controller runtime dependencies: %w", err)
	}

	var buf bytes.Buffer

	if err = cli.RenderGraph(ctx, options.Client, resp, &buf, true); err != nil {
		return nil, err
	}

	return buf.Bytes(), nil
}
// talosResource returns a collector which dumps all resources of the type
// described by rd (from its default namespace) as a multi-document YAML stream.
//
// Each document contains the resource metadata and its spec; the spec is
// replaced with "<REDACTED>" when the resource definition is marked as
// sensitive. When there are no resources the collector returns nil data so
// that no file is created.
func talosResource(rd *meta.ResourceDefinitionSpec) collect {
	return func(ctx context.Context, options *BundleOptions) ([]byte, error) {
		options.Log("getting talos resource %s/%s", rd.DefaultNamespace, rd.ID())

		responses, err := listResources(ctx, options.Client, rd.DefaultNamespace, rd.ID())
		if err != nil {
			return nil, err
		}

		var (
			buf      bytes.Buffer
			hasItems bool
		)

		encoder := yaml.NewEncoder(&buf)

		for _, msg := range responses {
			if msg.Resource == nil {
				continue
			}

			data := struct {
				Metadata *resource.Metadata `yaml:"metadata"`
				Spec     interface{}        `yaml:"spec"`
			}{
				Metadata: msg.Resource.Metadata(),
				Spec:     "<REDACTED>",
			}

			if rd.Sensitivity != meta.Sensitive {
				data.Spec = msg.Resource.Spec()
			}

			if err = encoder.Encode(&data); err != nil {
				return nil, err
			}

			hasItems = true
		}

		if !hasItems {
			return nil, nil
		}

		// Close the encoder before reading the buffer: Close may flush pending
		// output, and the buffer contents must be taken only after that.
		if err = encoder.Close(); err != nil {
			return nil, err
		}

		return buf.Bytes(), nil
	}
}
// kubernetesNodes returns a collector which lists all Kubernetes nodes and
// serializes the list to YAML.
func kubernetesNodes(client *kubernetes.Clientset) collect {
	return func(ctx context.Context, options *BundleOptions) ([]byte, error) {
		options.Log("getting kubernetes nodes manifests")

		nodeList, listErr := client.CoreV1().Nodes().List(ctx, v1.ListOptions{})
		if listErr != nil {
			return nil, listErr
		}

		return marshalYAML(nodeList)
	}
}
// systemPods returns a collector which lists the pods of the kube-system
// namespace and serializes the list to YAML.
func systemPods(client *kubernetes.Clientset) collect {
	return func(ctx context.Context, options *BundleOptions) ([]byte, error) {
		options.Log("getting pods manifests in kube-system namespace")

		podList, listErr := client.CoreV1().Pods("kube-system").List(ctx, v1.ListOptions{})
		if listErr != nil {
			return nil, listErr
		}

		return marshalYAML(podList)
	}
}
// mounts fetches the mounts reported by the node and returns them rendered
// as text.
//
// An API error is fatal only when no response came back with it; the error
// is wrapped so callers can still inspect the cause.
func mounts(ctx context.Context, options *BundleOptions) ([]byte, error) {
	options.Log("getting mounts")

	resp, err := options.Client.Mounts(ctx)
	if err != nil && resp == nil {
		return nil, fmt.Errorf("error getting mounts: %w", err)
	}

	var buf bytes.Buffer

	if err = cli.RenderMounts(resp, &buf, nil); err != nil {
		return nil, err
	}

	return buf.Bytes(), nil
}
// devices returns the raw contents of /proc/bus/pci/devices read from the node.
//
// NOTE(review): the second value returned by Client.Read is discarded here —
// confirm that nothing reported through it needs handling.
func devices(ctx context.Context, options *BundleOptions) ([]byte, error) {
	options.Log("reading devices")

	const pciDevicesPath = "/proc/bus/pci/devices"

	reader, _, readErr := options.Client.Read(ctx, pciDevicesPath)
	if readErr != nil {
		return nil, readErr
	}

	defer reader.Close() //nolint:errcheck

	return io.ReadAll(reader)
}
// ioPressure fetches disk statistics from the node and renders them as an
// aligned table with one row per device.
func ioPressure(ctx context.Context, options *BundleOptions) ([]byte, error) {
	options.Log("getting disk stats")

	stats, err := options.Client.MachineClient.DiskStats(ctx, &emptypb.Empty{})

	var filtered interface{}

	// FilterMessages receives both the response and the error of the call above.
	filtered, err = client.FilterMessages(stats, err)
	if err != nil {
		return nil, err
	}

	stats, _ = filtered.(*machine.DiskStatsResponse) //nolint:errcheck

	var out bytes.Buffer

	table := tabwriter.NewWriter(&out, 0, 0, 3, ' ', 0)

	fmt.Fprintln(table, "NAME\tIO_TIME\tIO_TIME_WEIGHTED\tDISK_WRITE_SECTORS\tDISK_READ_SECTORS")

	for _, reply := range stats.Messages {
		for _, dev := range reply.Devices {
			fmt.Fprintf(table, "%s\t%d\t%d\t%d\t%d\n",
				dev.Name,
				dev.IoTimeMs,
				dev.IoTimeWeightedMs,
				dev.WriteSectors,
				dev.ReadSectors,
			)
		}
	}

	if flushErr := table.Flush(); flushErr != nil {
		return nil, flushErr
	}

	return out.Bytes(), nil
}
// processes fetches a snapshot of the processes running on the node and
// renders it as an aligned table.
//
// The COMMAND column is built as follows: a process without an executable
// shows its command name; when the first argument equals the base name of the
// executable it is replaced with the full executable path; otherwise the
// arguments are shown as reported.
func processes(ctx context.Context, options *BundleOptions) ([]byte, error) {
	options.Log("getting processes snapshot")

	resp, err := options.Client.Processes(ctx)
	if err != nil {
		return nil, err
	}

	var buf bytes.Buffer

	w := tabwriter.NewWriter(&buf, 0, 0, 3, ' ', 0)

	fmt.Fprintln(w, "PID\tSTATE\tTHREADS\tCPU-TIME\tVIRTMEM\tRESMEM\tCOMMAND")

	for _, msg := range resp.Messages {
		for _, p := range msg.Processes {
			args := p.Args

			// Split once and check the lengths: a value that is non-empty but
			// contains only whitespace yields no fields, and indexing [0]
			// on it would panic.
			argFields := strings.Fields(p.Args)
			exeFields := strings.Fields(p.Executable)

			switch {
			case p.Executable == "":
				args = p.Command
			case len(argFields) > 0 && len(exeFields) > 0 && argFields[0] == filepath.Base(exeFields[0]):
				args = strings.Replace(p.Args, argFields[0], p.Executable, 1)
			}

			fmt.Fprintf(w, "%6d\t%1s\t%4d\t%8.2f\t%7s\t%7s\t%s\n",
				p.Pid, p.State, p.Threads, p.CpuTime, humanize.Bytes(p.VirtualMemory), humanize.Bytes(p.ResidentMemory), args)
		}
	}

	if err := w.Flush(); err != nil {
		return nil, err
	}

	return buf.Bytes(), nil
}
// summary fetches the version information from the node and returns the long
// version text for every message in the reply.
func summary(ctx context.Context, options *BundleOptions) ([]byte, error) {
	versions, err := options.Client.Version(ctx)
	if err != nil {
		return nil, err
	}

	var out bytes.Buffer

	for _, msg := range versions.Messages {
		version.WriteLongVersionFromExisting(&out, msg.Version)
	}

	return out.Bytes(), nil
}
// listResources streams all resources of resourceType from namespace and
// returns them as a slice.
//
// The stream is considered finished on io.EOF or on a Canceled status; any
// other receive error is returned. Messages that carry an error in their
// metadata are reported to stderr (prefixed with the hostname) and skipped.
func listResources(ctx context.Context, c *client.Client, namespace, resourceType string) ([]client.ResourceResponse, error) {
	stream, err := c.Resources.List(ctx, namespace, resourceType)
	if err != nil {
		return nil, err
	}

	collected := []client.ResourceResponse{}

	for {
		item, recvErr := stream.Recv()

		switch {
		case recvErr == nil:
		case recvErr == io.EOF || client.StatusCode(recvErr) == codes.Canceled:
			return collected, nil
		default:
			return nil, recvErr
		}

		if item.Metadata.GetError() != "" {
			fmt.Fprintf(os.Stderr, "%s: %s\n", item.Metadata.GetHostname(), item.Metadata.GetError())

			continue
		}

		collected = append(collected, item)
	}
}
// marshalYAML encodes a Kubernetes object with the Kubernetes JSON/YAML
// serializer configured for YAML output with the Pretty and Strict options
// enabled.
func marshalYAML(resource runtime.Object) ([]byte, error) {
	serializerOptions := k8sjson.SerializerOptions{
		Yaml:   true,
		Pretty: true,
		Strict: true,
	}

	encoder := k8sjson.NewSerializerWithOptions(k8sjson.DefaultMetaFactory, nil, nil, serializerOptions)

	var out bytes.Buffer

	if encodeErr := encoder.Encode(resource, &out); encodeErr != nil {
		return nil, encodeErr
	}

	return out.Bytes(), nil
}