talos/pkg/kubernetes/kubernetes.go
Andrey Smirnov 779ac74a08 fix: improve the drain function
The critical bug (I believe) was that the drain code entered the eviction
loop again after the wait for the pod to be deleted had returned success,
effectively evicting the pod once more after it got rescheduled to a
different node.

Add a global timeout to prevent the draining code from running forever.

Filter out more pod types which should never be drained.

Fixes #3124

Signed-off-by: Andrey Smirnov <smirnov.andrey@gmail.com>
2021-02-25 07:02:24 -08:00
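
For orientation, a minimal usage sketch of the API defined in this file; the
node name "worker-1" and the error handling are purely illustrative:

	c, err := kubernetes.NewClientFromKubeletKubeconfig()
	if err != nil {
		log.Fatalf("failed to create client: %v", err)
	}

	// Cordon the node and evict its pods (bounded by DrainTimeout).
	if err := c.CordonAndDrain(context.Background(), "worker-1"); err != nil {
		log.Printf("drain failed: %v", err)
	}

	// ... perform maintenance, then make the node schedulable again.
	if err := c.Uncordon("worker-1", false); err != nil {
		log.Printf("uncordon failed: %v", err)
	}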


// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

package kubernetes

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"log"
	"net/url"
	"time"

	"github.com/talos-systems/crypto/x509"
	"github.com/talos-systems/go-retry/retry"
	"golang.org/x/sync/errgroup"
	appsv1 "k8s.io/api/apps/v1"
	corev1 "k8s.io/api/core/v1"
	policy "k8s.io/api/policy/v1beta1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/fields"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/strategicpatch"
	"k8s.io/client-go/kubernetes"
	restclient "k8s.io/client-go/rest"
	"k8s.io/client-go/tools/clientcmd"

	"github.com/talos-systems/talos/pkg/machinery/config/types/v1alpha1/machine"
	"github.com/talos-systems/talos/pkg/machinery/constants"
)

const (
	// DrainTimeout is the maximum time to wait for the node to be drained.
	DrainTimeout = 5 * time.Minute
)

// Client represents a set of helper methods for interacting with the
// Kubernetes API.
type Client struct {
	*kubernetes.Clientset
}

// NewClientFromKubeletKubeconfig initializes and returns a Client.
func NewClientFromKubeletKubeconfig() (client *Client, err error) {
	var config *restclient.Config

	config, err = clientcmd.BuildConfigFromFlags("", constants.KubeletKubeconfig)
	if err != nil {
		return nil, err
	}

	if config.Timeout == 0 {
		config.Timeout = 30 * time.Second
	}

	var clientset *kubernetes.Clientset

	clientset, err = kubernetes.NewForConfig(config)
	if err != nil {
		return nil, err
	}

	return &Client{clientset}, nil
}

// NewForConfig initializes and returns a client using the provided config.
func NewForConfig(config *restclient.Config) (client *Client, err error) {
	var clientset *kubernetes.Clientset

	clientset, err = kubernetes.NewForConfig(config)
	if err != nil {
		return nil, err
	}

	return &Client{clientset}, nil
}

// NewClientFromPKI initializes and returns a Client.
//
// nolint: interfacer
func NewClientFromPKI(ca, crt, key []byte, endpoint *url.URL) (client *Client, err error) {
	tlsClientConfig := restclient.TLSClientConfig{
		CAData:   ca,
		CertData: crt,
		KeyData:  key,
	}

	config := &restclient.Config{
		Host:            endpoint.String(),
		TLSClientConfig: tlsClientConfig,
		Timeout:         30 * time.Second,
	}

	var clientset *kubernetes.Clientset

	clientset, err = kubernetes.NewForConfig(config)
	if err != nil {
		return nil, err
	}

	return &Client{clientset}, nil
}

// NewTemporaryClientFromPKI initializes a Kubernetes client using a certificate
// with a TTL of 10 minutes.
func NewTemporaryClientFromPKI(ca *x509.PEMEncodedCertificateAndKey, endpoint *url.URL) (client *Client, err error) {
	opts := []x509.Option{
		x509.CommonName("admin"),
		x509.Organization("system:masters"),
		x509.NotAfter(time.Now().Add(10 * time.Minute)),
	}

	k8sCA, err := x509.NewCertificateAuthorityFromCertificateAndKey(ca)
	if err != nil {
		return nil, fmt.Errorf("failed decoding Kubernetes CA: %w", err)
	}

	keyPair, err := x509.NewKeyPair(k8sCA, opts...)
	if err != nil {
		return nil, fmt.Errorf("failed generating temporary cert: %w", err)
	}

	h, err := NewClientFromPKI(ca.Crt, keyPair.CrtPEM, keyPair.KeyPEM, endpoint)
	if err != nil {
		return nil, fmt.Errorf("failed to create client: %w", err)
	}

	return h, nil
}

// MasterIPs returns a list of control plane endpoints (IP addresses).
func (h *Client) MasterIPs(ctx context.Context) (addrs []string, err error) {
	endpoints, err := h.CoreV1().Endpoints("default").Get(ctx, "kubernetes", metav1.GetOptions{})
	if err != nil {
		return nil, err
	}

	addrs = []string{}

	for _, endpoint := range endpoints.Subsets {
		for _, addr := range endpoint.Addresses {
			addrs = append(addrs, addr.IP)
		}
	}

	return addrs, nil
}

// NodeIPs returns a list of node IP addresses by machine type.
func (h *Client) NodeIPs(ctx context.Context, machineType machine.Type) (addrs []string, err error) {
	resp, err := h.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
	if err != nil {
		return nil, err
	}

	addrs = []string{}

	for _, node := range resp.Items {
		_, labelMaster := node.Labels[constants.LabelNodeRoleMaster]
		_, labelControlPlane := node.Labels[constants.LabelNodeRoleControlPlane]

		skip := true

		switch machineType { //nolint: exhaustive
		case machine.TypeInit, machine.TypeControlPlane:
			skip = !(labelMaster || labelControlPlane)
		case machine.TypeJoin:
			skip = labelMaster || labelControlPlane
		}

		if skip {
			continue
		}

		for _, nodeAddress := range node.Status.Addresses {
			if nodeAddress.Type == corev1.NodeInternalIP {
				addrs = append(addrs, nodeAddress.Address)

				break
			}
		}
	}

	return addrs, nil
}

// LabelNodeAsMaster labels a node with the required master label and NoSchedule taint.
//
//nolint: gocyclo
func (h *Client) LabelNodeAsMaster(name string, taintNoSchedule bool) (err error) {
	n, err := h.CoreV1().Nodes().Get(context.TODO(), name, metav1.GetOptions{})
	if err != nil {
		return err
	}

	// The node may appear to have no labels at first, so we check for the
	// existence of a well known label to ensure the patch will be successful.
	if _, found := n.ObjectMeta.Labels[corev1.LabelHostname]; !found {
		return errors.New("could not find hostname label")
	}

	oldData, err := json.Marshal(n)
	if err != nil {
		return fmt.Errorf("failed to marshal unmodified node %q into JSON: %w", n.Name, err)
	}

	n.Labels[constants.LabelNodeRoleMaster] = ""
	n.Labels[constants.LabelNodeRoleControlPlane] = ""

	if taintNoSchedule {
		taintFound := false

		// TODO: with K8s 1.21, add new taint LabelNodeRoleControlPlane
		for _, taint := range n.Spec.Taints {
			if taint.Key == constants.LabelNodeRoleMaster {
				taintFound = true

				break
			}
		}

		if !taintFound {
			n.Spec.Taints = append(n.Spec.Taints, corev1.Taint{
				Key:    constants.LabelNodeRoleMaster,
				Effect: corev1.TaintEffectNoSchedule,
			})
		}
	}

	newData, err := json.Marshal(n)
	if err != nil {
		return fmt.Errorf("failed to marshal modified node %q into JSON: %w", n.Name, err)
	}

	patchBytes, err := strategicpatch.CreateTwoWayMergePatch(oldData, newData, corev1.Node{})
	if err != nil {
		return fmt.Errorf("failed to create two way merge patch: %w", err)
	}

	if _, err := h.CoreV1().Nodes().Patch(context.TODO(), n.Name, types.StrategicMergePatchType, patchBytes, metav1.PatchOptions{}); err != nil {
		if apierrors.IsConflict(err) {
			return fmt.Errorf("unable to update node metadata due to conflict: %w", err)
		}

		return fmt.Errorf("error patching node %q: %w", n.Name, err)
	}

	return nil
}

// WaitUntilReady waits for a node to be ready.
func (h *Client) WaitUntilReady(name string) error {
	return retry.Exponential(10*time.Minute, retry.WithUnits(250*time.Millisecond), retry.WithJitter(50*time.Millisecond), retry.WithErrorLogging(true)).Retry(func() error {
		attemptCtx, attemptCtxCancel := context.WithTimeout(context.TODO(), 30*time.Second)
		defer attemptCtxCancel()

		node, err := h.CoreV1().Nodes().Get(attemptCtx, name, metav1.GetOptions{})
		if err != nil {
			if apierrors.IsNotFound(err) {
				return retry.ExpectedError(err)
			}

			if apierrors.ReasonForError(err) == metav1.StatusReasonUnknown {
				// non-API error, e.g. networking error
				return retry.ExpectedError(err)
			}

			return retry.UnexpectedError(err)
		}

		for _, cond := range node.Status.Conditions {
			if cond.Type == corev1.NodeReady {
				if cond.Status != corev1.ConditionTrue {
					return retry.ExpectedError(fmt.Errorf("node not ready"))
				}
			}
		}

		return nil
	})
}

// CordonAndDrain cordons and drains a node in one call.
func (h *Client) CordonAndDrain(ctx context.Context, node string) (err error) {
	if err = h.Cordon(ctx, node); err != nil {
		return err
	}

	return h.Drain(ctx, node)
}

// Cordon marks a node as unschedulable.
func (h *Client) Cordon(ctx context.Context, name string) error {
	err := retry.Exponential(30*time.Second, retry.WithUnits(250*time.Millisecond), retry.WithJitter(50*time.Millisecond)).RetryWithContext(ctx, func(ctx context.Context) error {
		node, err := h.CoreV1().Nodes().Get(ctx, name, metav1.GetOptions{})
		if err != nil {
			return retry.UnexpectedError(err)
		}

		if node.Spec.Unschedulable {
			return nil
		}

		// Annotate the node so that Uncordon can later tell it was Talos that cordoned it.
		node.Annotations[constants.AnnotationCordonedKey] = constants.AnnotationCordonedValue
		node.Spec.Unschedulable = true

		if _, err := h.CoreV1().Nodes().Update(ctx, node, metav1.UpdateOptions{}); err != nil {
			return retry.ExpectedError(err)
		}

		return nil
	})
	if err != nil {
		return fmt.Errorf("failed to cordon node %s: %w", name, err)
	}

	return nil
}

// Uncordon marks a node as schedulable.
//
// If force is set, the node will be uncordoned even if it was not cordoned by Talos.
func (h *Client) Uncordon(name string, force bool) error {
	err := retry.Exponential(30*time.Second, retry.WithUnits(250*time.Millisecond), retry.WithJitter(50*time.Millisecond)).Retry(func() error {
		attemptCtx, attemptCtxCancel := context.WithTimeout(context.TODO(), 10*time.Second)
		defer attemptCtxCancel()

		node, err := h.CoreV1().Nodes().Get(attemptCtx, name, metav1.GetOptions{})
		if err != nil {
			return retry.UnexpectedError(err)
		}

		if !force && node.Annotations[constants.AnnotationCordonedKey] != constants.AnnotationCordonedValue {
			// not cordoned by Talos, skip it
			return nil
		}

		if node.Spec.Unschedulable {
			node.Spec.Unschedulable = false
			delete(node.Annotations, constants.AnnotationCordonedKey)

			if _, err := h.CoreV1().Nodes().Update(attemptCtx, node, metav1.UpdateOptions{}); err != nil {
				return retry.ExpectedError(err)
			}
		}

		return nil
	})
	if err != nil {
		return fmt.Errorf("failed to uncordon node %s: %w", name, err)
	}

	return nil
}

// Drain evicts all pods on a given node.
//
// The whole operation is bounded by DrainTimeout; mirror pods, unmanaged pods,
// DaemonSet pods and pods which are already being deleted are skipped.
func (h *Client) Drain(ctx context.Context, node string) error {
	ctx, cancel := context.WithTimeout(ctx, DrainTimeout)
	defer cancel()

	opts := metav1.ListOptions{
		FieldSelector: fields.SelectorFromSet(fields.Set{"spec.nodeName": node}).String(),
	}

	pods, err := h.CoreV1().Pods(metav1.NamespaceAll).List(ctx, opts)
	if err != nil {
		return fmt.Errorf("cannot get pods for node %s: %w", node, err)
	}

	var eg errgroup.Group

	// Evict each pod.
	for _, pod := range pods.Items {
		p := pod

		eg.Go(func() error {
			if _, ok := p.ObjectMeta.Annotations[corev1.MirrorPodAnnotationKey]; ok {
				log.Printf("skipping mirror pod %s/%s\n", p.GetNamespace(), p.GetName())

				return nil
			}

			controllerRef := metav1.GetControllerOf(&p)

			if controllerRef == nil {
				log.Printf("skipping unmanaged pod %s/%s\n", p.GetNamespace(), p.GetName())

				return nil
			}

			if controllerRef.Kind == appsv1.SchemeGroupVersion.WithKind("DaemonSet").Kind {
				log.Printf("skipping DaemonSet pod %s/%s\n", p.GetNamespace(), p.GetName())

				return nil
			}

			if !p.DeletionTimestamp.IsZero() {
				log.Printf("skipping deleted pod %s/%s\n", p.GetNamespace(), p.GetName())

				// The pod is already terminating, so there is nothing to evict.
				return nil
			}

			if err := h.evict(ctx, p, int64(60)); err != nil {
				log.Printf("WARNING: failed to evict pod: %v", err)
			}

			return nil
		})
	}

	return eg.Wait()
}
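
// evict requests an Eviction for the given pod and waits for the pod to be
// deleted from the node. Once the wait succeeds, the pod is treated as evicted
// and is not evicted again, even if it has been rescheduled elsewhere under
// the same name.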
func (h *Client) evict(ctx context.Context, p corev1.Pod, gracePeriod int64) error {
	for {
		pol := &policy.Eviction{
			ObjectMeta:    metav1.ObjectMeta{Namespace: p.GetNamespace(), Name: p.GetName()},
			DeleteOptions: &metav1.DeleteOptions{GracePeriodSeconds: &gracePeriod},
		}

		err := h.CoreV1().Pods(p.GetNamespace()).Evict(ctx, pol)

		switch {
		case apierrors.IsTooManyRequests(err):
			// Eviction is temporarily blocked, e.g. by a PodDisruptionBudget; retry.
			time.Sleep(5 * time.Second)
		case apierrors.IsNotFound(err):
			return nil
		case err != nil:
			return fmt.Errorf("failed to evict pod %s/%s: %w", p.GetNamespace(), p.GetName(), err)
		default:
			if err = h.waitForPodDeleted(ctx, &p); err != nil {
				return fmt.Errorf("failed waiting on pod %s/%s to be deleted: %w", p.GetNamespace(), p.GetName(), err)
			}

			return nil
		}
	}
}
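
// waitForPodDeleted polls until the pod is gone from the node or has been
// replaced by a new pod with the same name but a different UID (i.e. the
// original pod was deleted and recreated).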
func (h *Client) waitForPodDeleted(ctx context.Context, p *corev1.Pod) error {
	return retry.Constant(time.Minute, retry.WithUnits(3*time.Second)).RetryWithContext(ctx, func(ctx context.Context) error {
		pod, err := h.CoreV1().Pods(p.GetNamespace()).Get(ctx, p.GetName(), metav1.GetOptions{})
		switch {
		case apierrors.IsNotFound(err):
			return nil
		case err != nil:
			return retry.UnexpectedError(fmt.Errorf("failed to get pod %s/%s: %w", p.GetNamespace(), p.GetName(), err))
		}

		if pod.GetUID() != p.GetUID() {
			// A pod with the same name but a different UID exists: the original pod
			// was deleted and recreated, so the eviction is complete.
			return nil
		}

		return retry.ExpectedError(errors.New("pod is still running on the node"))
	})
}