Andrew Rynhard 69fa63a7b2 refactor: perform upgrade upon reboot
This PR introduces a new strategy for upgrades. Instead of attempting to
zap the partition table, create a new one, and then format the
partitions, this change will only update the `vmlinuz` and
`initramfs.xz` being used to boot. It introduces an A/B style upgrade
process, which will allow for easy rollbacks. One deviation from our
original intention with upgrades is that this change does not completely
reset a node. It falls just short of that and does not reset the
partition table. This forces us to keep the current partition scheme in
mind as we make changes in the future, because an upgrade assumes a
specific partition scheme. We can improve upgrades further in the
future, but this will at least make them more dependable. Finally, one
more feature in this PR is the ability to keep state. This enables
single node clusters to upgrade since we keep the etcd data around.

Signed-off-by: Andrew Rynhard <andrew@andrewrynhard.com>
2020-03-20 17:32:18 -07:00

109 lines
2.7 KiB
Go

// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
package etcd
import (
	"context"
	"fmt"
	"net"
	"net/url"
	"time"

	"go.etcd.io/etcd/clientv3"
	"go.etcd.io/etcd/pkg/transport"

	"github.com/talos-systems/talos/pkg/config"
	"github.com/talos-systems/talos/pkg/config/machine"
	"github.com/talos-systems/talos/pkg/constants"
	"github.com/talos-systems/talos/pkg/crypto/x509"
	"github.com/talos-systems/talos/pkg/kubernetes"
)
// NewClient initializes and returns an etcd client that connects to the
// given endpoints over TLS.
//
// The TLS configuration is built from the file paths held in
// constants.KubernetesEtcdPeerCert, constants.KubernetesEtcdPeerKey, and
// constants.KubernetesEtcdCACert. The client's DialTimeout is set to five
// seconds. Any error from building the TLS configuration or from creating
// the client is returned unchanged, with a nil client.
func NewClient(endpoints []string) (client *clientv3.Client, err error) {
	peerTLS := transport.TLSInfo{
		TrustedCAFile: constants.KubernetesEtcdCACert,
		CertFile:      constants.KubernetesEtcdPeerCert,
		KeyFile:       constants.KubernetesEtcdPeerKey,
	}

	clientTLS, err := peerTLS.ClientConfig()
	if err != nil {
		return nil, err
	}

	etcdConfig := clientv3.Config{
		Endpoints:   endpoints,
		TLS:         clientTLS,
		DialTimeout: 5 * time.Second,
	}

	if client, err = clientv3.New(etcdConfig); err != nil {
		return nil, err
	}

	return client, nil
}
// NewClientFromControlPlaneIPs initializes and returns an etcd client
// configured to talk to all members.
//
// It creates a temporary Kubernetes client from creds and endpoint, asks it
// for the master IPs, and passes each of them, joined with port 2379, to
// NewClient as the etcd endpoints. Errors from any of these steps are
// returned unchanged, with a nil client.
func NewClientFromControlPlaneIPs(creds *x509.PEMEncodedCertificateAndKey, endpoint *url.URL) (client *clientv3.Client, err error) {
	h, err := kubernetes.NewTemporaryClientFromPKI(creds, endpoint)
	if err != nil {
		return nil, err
	}

	endpoints, err := h.MasterIPs()
	if err != nil {
		return nil, err
	}

	// Etcd expects host:port format. net.JoinHostPort wraps IPv6 literals in
	// brackets, which plain concatenation with ":2379" would not do; IPv4
	// addresses produce the same result as before.
	// NOTE(review): assumes MasterIPs returns bare (unbracketed) IPs — confirm.
	for i, ip := range endpoints {
		endpoints[i] = net.JoinHostPort(ip, "2379")
	}

	return NewClient(endpoints)
}
// ValidateForUpgrade validates the etcd cluster state to ensure that performing
// an upgrade is safe.
//
// The machine configuration is loaded from constants.ConfigPath. Machines of
// type machine.TypeWorker pass validation immediately. For every other machine
// type the etcd member list is fetched and two checks are applied:
//
//   - unless preserve is true, a cluster with exactly one member is rejected;
//   - every member must have a non-empty name, i.e. must be started.
//
// It returns nil when the upgrade may proceed, or an error describing the
// failed check or the underlying config/etcd failure.
func ValidateForUpgrade(preserve bool) error {
	cfg, err := config.NewFromFile(constants.ConfigPath)
	if err != nil {
		return err
	}

	// The etcd checks below apply only to non-worker machine types.
	if cfg.Machine().Type() == machine.TypeWorker {
		return nil
	}

	etcdClient, err := NewClientFromControlPlaneIPs(cfg.Cluster().CA(), cfg.Cluster().Endpoint())
	if err != nil {
		return err
	}

	// nolint: errcheck
	defer etcdClient.Close()

	list, err := etcdClient.MemberList(context.Background())
	if err != nil {
		return err
	}

	// The single-member check is skipped when the caller asked to preserve
	// state.
	if !preserve && len(list.Members) == 1 {
		return fmt.Errorf("only 1 etcd member found. assuming this is not an HA setup and refusing to upgrade")
	}

	for _, m := range list.Members {
		// If the member is not started, the name will be an empty string.
		if m.Name == "" {
			return fmt.Errorf("etcd member %d is not started, all members must be running to perform an upgrade", m.ID)
		}
	}

	return nil
}