hcloud-upload-image/hcloudimages/client.go
Julian Tölle fdfb284533
feat: smaller snapshots by zeroing disk first (#101)
The base image used requires ~0.42Gi. Even if the uploaded image is
smaller, those bytes are currently not overwritten and still part of the
stored snapshot.

By zeroing the root disk first, those unwanted bytes are removed and not
stored with the snapshot.

This has two benefits:

1. Snapshots are billed by their compressed (shown) size, so small
images are now a bit cheaper.
2. The time it takes to create a server from the snapshot scales with
the snapshot size, so smaller snapshots mean the server can start more
quickly.

This reduces the size of an example Talos x86 image from 0.42Gi before,
to 0.2Gi afterwards. An example Flatcar image was 0.47Gi before, and
still has that size with this patch.

There are two ways to zero out the disk:

- `dd if=/dev/zero of=/dev/sda` actually writes zeroes to every block on
the device. This takes around a minute to do.
- `blkdiscard /dev/sda` talks to the disk directly and instructs it to
discard all blocks. This only takes around 5 seconds.

As both have the same effect on image size, but `blkdiscard` is SO MUCH
faster, I have decided to use it.

Even though only small images benefit from this, this is now enabled by
default as the downside (5 second slower upload) does not justify
additional flags or options to enable/disable this.

Closes #96
2025-05-10 14:21:31 +02:00

548 lines
17 KiB
Go

package hcloudimages
import (
"context"
"errors"
"fmt"
"io"
"log/slog"
"net/url"
"time"
"github.com/hetznercloud/hcloud-go/v2/hcloud"
"github.com/hetznercloud/hcloud-go/v2/hcloud/exp/kit/sshutil"
"golang.org/x/crypto/ssh"
"github.com/apricote/hcloud-upload-image/hcloudimages/contextlogger"
"github.com/apricote/hcloud-upload-image/hcloudimages/internal/actionutil"
"github.com/apricote/hcloud-upload-image/hcloudimages/internal/control"
"github.com/apricote/hcloud-upload-image/hcloudimages/internal/labelutil"
"github.com/apricote/hcloud-upload-image/hcloudimages/internal/randomid"
"github.com/apricote/hcloud-upload-image/hcloudimages/internal/sshsession"
)
// Label and naming conventions for the resources created by this package.
const (
// CreatedByLabel is the label key that marks a resource as created by this package.
CreatedByLabel = "apricote.de/created-by"
// CreatedByValue is the value stored under [CreatedByLabel].
CreatedByValue = "hcloud-upload-image"
// resourcePrefix is prepended to the random run ID to build the name of the temporary SSH key and server.
resourcePrefix = "hcloud-upload-image-"
)
var (
// DefaultLabels are merged with [UploadOptions.Labels] and set on every resource created by [Client.Upload].
// [Client.CleanupTempResources] builds its label selector from them.
DefaultLabels = map[string]string{
CreatedByLabel: CreatedByValue,
}
// serverTypePerArchitecture holds the server type of the temporary server for each architecture. It is only
// consulted when [UploadOptions.ServerType] is not set.
serverTypePerArchitecture = map[hcloud.Architecture]*hcloud.ServerType{
hcloud.ArchitectureX86: {Name: "cx22"},
hcloud.ArchitectureARM: {Name: "cax11"},
}
// defaultImage is the image the temporary server is created with.
defaultImage = &hcloud.Image{Name: "ubuntu-24.04"}
// defaultLocation is the location the temporary server is created in.
defaultLocation = &hcloud.Location{Name: "fsn1"}
// defaultRescueType is the rescue system type that is enabled on the temporary server.
defaultRescueType = hcloud.ServerRescueTypeLinux64
// defaultSSHDialTimeout is the timeout of the SSH client config used to connect to the rescue system.
defaultSSHDialTimeout = 1 * time.Minute
// Size observed on x86, 2025-05-03, no idea if that changes.
// Might be able to extend this to more of the available memory.
rescueSystemRootDiskSizeMB int64 = 960
)
// UploadOptions configures a single call to [Client.Upload].
type UploadOptions struct {
// ImageURL must be publicly available. The instance will download the image from this endpoint.
ImageURL *url.URL
// ImageReader is handed to the SSH session that runs the write command on the temporary server.
//
// NOTE(review): presumably the reader is streamed to the stdin of that command and is meant to be used when
// ImageURL is not set — confirm against sshsession.Run.
ImageReader io.Reader
// ImageCompression describes the compression of the referenced image file. It defaults to [CompressionNone]. If
// set to anything else, the file will be decompressed before written to the disk.
ImageCompression Compression
// ImageFormat describes the format of the image file. It defaults to [FormatRaw].
ImageFormat Format
// Can be optionally set to make the client validate that the image can be written to the server.
//
// The size is given in bytes. It is currently only used to warn about [FormatQCOW2] images that are larger than
// the root disk of the rescue system.
ImageSize int64
// Possible future additions:
// ImageSignatureVerification
// ImageLocalPath
// Architecture should match the architecture of the Image. This decides if the Snapshot can later be
// used with [hcloud.ArchitectureX86] or [hcloud.ArchitectureARM] servers.
//
// Internally this decides what server type is used for the temporary server.
//
// Optional if [UploadOptions.ServerType] is set.
Architecture hcloud.Architecture
// ServerType can be optionally set to override the default server type for the architecture.
// Situations where this makes sense:
//
// - Your image is larger than the root disk of the default server types.
// - The default server type is no longer available, or temporarily out of stock.
ServerType *hcloud.ServerType
// Description is an optional description that the resulting image (snapshot) will have. There is no way to
// select images by its description, you should use Labels if you need to identify your image later.
Description *string
// Labels will be added to the resulting image (snapshot). Use these to filter the image list if you
// need to identify the image later on.
//
// We also always add a label `apricote.de/created-by=hcloud-upload-image` ([CreatedByLabel], [CreatedByValue]).
Labels map[string]string
// DebugSkipResourceCleanup will skip the cleanup of the temporary SSH Key and Server.
DebugSkipResourceCleanup bool
}
// Compression names the compression of the image file, see [UploadOptions.ImageCompression].
type Compression string
const (
// CompressionNone means the image is not compressed, it is written to the disk as is.
CompressionNone Compression = ""
// CompressionBZ2 means the image is decompressed with "bzip2 -cd" before it is written.
CompressionBZ2 Compression = "bz2"
// CompressionXZ means the image is decompressed with "xz -cd" before it is written.
CompressionXZ Compression = "xz"
// Possible future additions:
// zip,zstd
)
// Format names the disk image format of the image file, see [UploadOptions.ImageFormat].
type Format string
const (
// FormatRaw is the default, the (decompressed) image is written to the disk with dd.
FormatRaw Format = ""
// FormatQCOW2 allows to upload images in the qcow2 format directly.
//
// The qcow2 image must fit on the disk available in the rescue system. "qemu-img dd", which is used to convert
// qcow2 to raw, requires a file as an input. If [UploadOptions.ImageSize] is set and FormatQCOW2 is used, there is a
// warning message displayed if there is a high probability of issues.
FormatQCOW2 Format = "qcow2"
)
// NewClient returns a [Client] that talks to the Hetzner Cloud API through c.
// The passed [*hcloud.Client] must be ready to use (e.g. configured with a token).
func NewClient(c *hcloud.Client) *Client {
	client := &Client{}
	client.c = c
	return client
}
// Client uploads disk images to Hetzner Cloud as snapshots ([Client.Upload]) and removes leftover temporary
// resources ([Client.CleanupTempResources]). Create it with [NewClient].
type Client struct {
// c is the Hetzner Cloud API client that all API calls go through.
c *hcloud.Client
}
// Upload the specified image into a snapshot on Hetzner Cloud.
//
// As the Hetzner Cloud API has no direct way to upload images, we create a temporary server,
// overwrite the root disk and take a snapshot of that disk instead.
//
// The temporary server costs money. If the upload fails, we might be unable to delete the server. Check out
// CleanupTempResources for a helper in this case.
//
// The temporary SSH key and server are deleted before Upload returns, unless
// [UploadOptions.DebugSkipResourceCleanup] is set. The cleanup is also attempted when ctx was cancelled or
// its deadline expired, each cleanup call is then limited to one minute.
//
// On success the created snapshot is returned. All errors are wrapped with a message that names the failed step.
func (s *Client) Upload(ctx context.Context, options UploadOptions) (*hcloud.Image, error) {
	// Upper bound for a single cleanup API call. The cleanup runs detached from the cancellation of ctx,
	// so it needs its own limit.
	const cleanupTimeout = 1 * time.Minute

	logger := contextlogger.From(ctx).With(
		"library", "hcloudimages",
		"method", "upload",
	)

	id, err := randomid.Generate()
	if err != nil {
		return nil, err
	}
	logger = logger.With("run-id", id)

	// For simplicity, we use the same random name for SSH Key + Server
	resourceName := resourcePrefix + id
	labels := labelutil.Merge(DefaultLabels, options.Labels)

	// 0. Validations
	// Resolve the server type before any resource is created, so that an unknown architecture fails early
	// and leaves nothing behind.
	serverType := options.ServerType
	if serverType == nil {
		var ok bool
		serverType, ok = serverTypePerArchitecture[options.Architecture]
		if !ok {
			return nil, fmt.Errorf("unknown architecture %q, valid options: %q, %q", options.Architecture, hcloud.ArchitectureX86, hcloud.ArchitectureARM)
		}
	}

	if options.ImageFormat == FormatQCOW2 && options.ImageSize > 0 {
		if options.ImageSize > rescueSystemRootDiskSizeMB*1024*1024 {
			// Just a warning, because the size might change with time.
			// Alternatively one could add an override flag for the check and make this an error.
			logger.WarnContext(ctx,
				fmt.Sprintf("image must be smaller than %d MB (rescue system root disk) for qcow2", rescueSystemRootDiskSizeMB),
				"maximum-size", rescueSystemRootDiskSizeMB,
				"actual-size", options.ImageSize/(1024*1024),
			)
		}
	}

	// 1. Create SSH Key
	logger.InfoContext(ctx, "# Step 1: Generating SSH Key")
	privateKey, publicKey, err := sshutil.GenerateKeyPair()
	if err != nil {
		return nil, fmt.Errorf("failed to generate temporary ssh key pair: %w", err)
	}

	key, _, err := s.c.SSHKey.Create(ctx, hcloud.SSHKeyCreateOpts{
		Name:      resourceName,
		PublicKey: string(publicKey),
		Labels:    labels,
	})
	if err != nil {
		return nil, fmt.Errorf("failed to submit temporary ssh key to API: %w", err)
	}
	logger.DebugContext(ctx, "Uploaded ssh key", "ssh-key-id", key.ID)
	defer func() {
		// Cleanup SSH Key
		if options.DebugSkipResourceCleanup {
			logger.InfoContext(ctx, "Cleanup: Skipping cleanup of temporary ssh key")
			return
		}

		logger.InfoContext(ctx, "Cleanup: Deleting temporary ssh key")

		// ctx might already be cancelled (that may be the very reason we are cleaning up), the delete
		// request must still be sent.
		cleanupCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), cleanupTimeout)
		defer cancel()

		_, err := s.c.SSHKey.Delete(cleanupCtx, key)
		if err != nil {
			logger.WarnContext(ctx, "Cleanup: ssh key could not be deleted", "error", err)
			// TODO
		}
	}()

	// 2. Create Server
	logger.InfoContext(ctx, "# Step 2: Creating Server")
	logger.DebugContext(ctx, "creating server with config",
		"image", defaultImage.Name,
		"location", defaultLocation.Name,
		"serverType", serverType.Name,
	)
	serverCreateResult, _, err := s.c.Server.Create(ctx, hcloud.ServerCreateOpts{
		Name:       resourceName,
		ServerType: serverType,

		// Not used, but without this the user receives an email with a password for every created server
		SSHKeys: []*hcloud.SSHKey{key},

		// We need to enable rescue system first
		StartAfterCreate: hcloud.Ptr(false),
		// Image will never be booted, we only boot into rescue system
		Image:    defaultImage,
		Location: defaultLocation,
		Labels:   labels,
	})
	if err != nil {
		return nil, fmt.Errorf("creating the temporary server failed: %w", err)
	}
	server := serverCreateResult.Server
	logger = logger.With("server", server.ID)
	logger.DebugContext(ctx, "Created Server")

	// The server exists from here on. Register the cleanup before waiting on the create actions, otherwise
	// a failed or cancelled wait would leave the server behind.
	defer func() {
		// Cleanup Server
		if options.DebugSkipResourceCleanup {
			logger.InfoContext(ctx, "Cleanup: Skipping cleanup of temporary server")
			return
		}

		logger.InfoContext(ctx, "Cleanup: Deleting temporary server")

		// See ssh key cleanup: the server must also be deleted when ctx is already cancelled.
		cleanupCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), cleanupTimeout)
		defer cancel()

		_, _, err := s.c.Server.DeleteWithResult(cleanupCtx, server)
		if err != nil {
			logger.WarnContext(ctx, "Cleanup: server could not be deleted", "error", err)
		}
	}()

	logger.DebugContext(ctx, "waiting on actions")
	err = s.c.Action.WaitFor(ctx, append(serverCreateResult.NextActions, serverCreateResult.Action)...)
	if err != nil {
		return nil, fmt.Errorf("creating the temporary server failed: %w", err)
	}
	logger.DebugContext(ctx, "actions finished")

	// 3. Activate Rescue System
	logger.InfoContext(ctx, "# Step 3: Activating Rescue System")
	enableRescueResult, _, err := s.c.Server.EnableRescue(ctx, server, hcloud.ServerEnableRescueOpts{
		Type:    defaultRescueType,
		SSHKeys: []*hcloud.SSHKey{key},
	})
	if err != nil {
		return nil, fmt.Errorf("enabling the rescue system on the temporary server failed: %w", err)
	}

	logger.DebugContext(ctx, "rescue system requested, waiting on action")
	err = s.c.Action.WaitFor(ctx, enableRescueResult.Action)
	if err != nil {
		return nil, fmt.Errorf("enabling the rescue system on the temporary server failed: %w", err)
	}
	logger.DebugContext(ctx, "action finished, rescue system enabled")

	// 4. Boot Server
	logger.InfoContext(ctx, "# Step 4: Booting Server")
	powerOnAction, _, err := s.c.Server.Poweron(ctx, server)
	if err != nil {
		return nil, fmt.Errorf("starting the temporary server failed: %w", err)
	}

	logger.DebugContext(ctx, "boot requested, waiting on action")
	err = s.c.Action.WaitFor(ctx, powerOnAction)
	if err != nil {
		return nil, fmt.Errorf("starting the temporary server failed: %w", err)
	}
	logger.DebugContext(ctx, "action finished, server is booting")

	// 5. Open SSH Session
	logger.InfoContext(ctx, "# Step 5: Opening SSH Connection")
	signer, err := ssh.ParsePrivateKey(privateKey)
	if err != nil {
		return nil, fmt.Errorf("parsing the automatically generated temporary private key failed: %w", err)
	}

	sshClientConfig := &ssh.ClientConfig{
		User: "root",
		Auth: []ssh.AuthMethod{
			ssh.PublicKeys(signer),
		},
		// There is no way to get the host key of the rescue system beforehand
		HostKeyCallback: ssh.InsecureIgnoreHostKey(),
		Timeout:         defaultSSHDialTimeout,
	}

	// the server needs some time until its properly started and ssh is available
	var sshClient *ssh.Client

	err = control.Retry(
		contextlogger.New(ctx, logger.With("operation", "ssh")),
		100, // ~ 3 minutes
		func() error {
			var err error
			logger.DebugContext(ctx, "trying to connect to server", "ip", server.PublicNet.IPv4.IP)
			sshClient, err = ssh.Dial("tcp", server.PublicNet.IPv4.IP.String()+":ssh", sshClientConfig)
			return err
		},
	)
	if err != nil {
		return nil, fmt.Errorf("failed to ssh into temporary server: %w", err)
	}
	defer func() { _ = sshClient.Close() }()

	// 6. Wipe existing disk, to avoid storing any bytes from it in the snapshot
	logger.InfoContext(ctx, "# Step 6: Cleaning existing disk")
	output, err := sshsession.Run(sshClient, "blkdiscard /dev/sda", nil)
	logger.DebugContext(ctx, string(output))
	if err != nil {
		return nil, fmt.Errorf("failed to clean existing disk: %w", err)
	}

	// 7. SSH On Server: Download Image, Decompress, Write to Root Disk
	logger.InfoContext(ctx, "# Step 7: Downloading image and writing to disk")
	cmd, err := assembleCommand(options)
	if err != nil {
		return nil, err
	}

	logger.DebugContext(ctx, "running download, decompress and write to disk command", "cmd", cmd)

	output, err = sshsession.Run(sshClient, cmd, options.ImageReader)
	logger.InfoContext(ctx, "# Step 7: Finished writing image to disk")
	logger.DebugContext(ctx, string(output))
	if err != nil {
		return nil, fmt.Errorf("failed to download and write the image: %w", err)
	}

	// 8. SSH On Server: Shutdown
	logger.InfoContext(ctx, "# Step 8: Shutting down server")
	_, err = sshsession.Run(sshClient, "shutdown now", nil)
	if err != nil {
		// TODO Verify if shutdown error, otherwise return
		logger.WarnContext(ctx, "shutdown returned error", "err", err)
	}

	// 9. Create Image from Server
	logger.InfoContext(ctx, "# Step 9: Creating Image")
	createImageResult, _, err := s.c.Server.CreateImage(ctx, server, &hcloud.ServerCreateImageOpts{
		Type:        hcloud.ImageTypeSnapshot,
		Description: options.Description,
		Labels:      labels,
	})
	if err != nil {
		return nil, fmt.Errorf("failed to create snapshot: %w", err)
	}

	logger.DebugContext(ctx, "image creation requested, waiting on action")
	err = s.c.Action.WaitFor(ctx, createImageResult.Action)
	if err != nil {
		return nil, fmt.Errorf("failed to create snapshot: %w", err)
	}
	logger.DebugContext(ctx, "action finished, image was created")

	image := createImageResult.Image
	logger.InfoContext(ctx, "# Image was created", "image", image.ID)

	// Resource cleanup is happening in `defer`
	return image, nil
}
// CleanupTempResources removes temporary resources that earlier calls to [Client.Upload] left behind.
// Upload cleans up after itself, but it can fail at any point and then leave its temporary server or
// SSH key in the project.
//
// Every server and SSH key matching the [DefaultLabels] is deleted. Servers are handled first, if that fails
// the error is returned right away and the SSH keys are not touched.
func (s *Client) CleanupTempResources(ctx context.Context) error {
	logger := contextlogger.From(ctx).With(
		"library", "hcloudimages",
		"method", "cleanup",
	)

	selector := labelutil.Selector(DefaultLabels)
	logger = logger.With("selector", selector)

	logger.InfoContext(ctx, "# Cleaning up Servers")
	if err := s.cleanupTempServers(ctx, logger, selector); err != nil {
		return fmt.Errorf("failed to clean up all servers: %w", err)
	}
	logger.DebugContext(ctx, "cleaned up all servers")

	logger.InfoContext(ctx, "# Cleaning up SSH Keys")
	if err := s.cleanupTempSSHKeys(ctx, logger, selector); err != nil {
		return fmt.Errorf("failed to clean up all ssh keys: %w", err)
	}
	logger.DebugContext(ctx, "cleaned up all ssh keys")

	return nil
}
// cleanupTempServers deletes all servers that match the label selector and waits for the delete actions
// to settle.
//
// A failed delete does not stop the loop: the error is logged, remembered, and the remaining servers are
// still processed. The returned error joins the failed delete requests and the failed delete actions.
func (s *Client) cleanupTempServers(ctx context.Context, logger *slog.Logger, selector string) error {
	listOpts := hcloud.ServerListOpts{
		ListOpts: hcloud.ListOpts{LabelSelector: selector},
	}

	servers, err := s.c.Server.AllWithOpts(ctx, listOpts)
	if err != nil {
		return fmt.Errorf("failed to list servers: %w", err)
	}
	if len(servers) == 0 {
		logger.InfoContext(ctx, "No servers found")
		return nil
	}

	logger.InfoContext(ctx, "removing servers", "count", len(servers))

	var failures []error
	pending := make([]*hcloud.Action, 0, len(servers))
	for i := range servers {
		current := servers[i]

		deleteResult, _, deleteErr := s.c.Server.DeleteWithResult(ctx, current)
		if deleteErr != nil {
			failures = append(failures, deleteErr)
			logger.WarnContext(ctx, "failed to delete server", "server", current.ID, "error", deleteErr)
			continue
		}

		pending = append(pending, deleteResult.Action)
	}

	succeeded, failed, err := actionutil.Settle(ctx, &s.c.Action, pending...)
	if err != nil {
		return fmt.Errorf("failed to wait for server delete: %w", err)
	}

	if len(succeeded) > 0 {
		// Collect the server IDs from the finished actions for the log message.
		deletedIDs := make([]int64, 0, len(succeeded))
		for _, finished := range succeeded {
			for _, res := range finished.Resources {
				if res.Type != hcloud.ActionResourceTypeServer {
					continue
				}
				deletedIDs = append(deletedIDs, res.ID)
			}
		}

		logger.InfoContext(ctx, "successfully deleted servers", "servers", deletedIDs)
	}

	for _, unsuccessful := range failed {
		failures = append(failures, unsuccessful.Error())
	}

	if len(failures) == 0 {
		return nil
	}

	// The returned message contains no info about the server IDs which failed
	return fmt.Errorf("failed to delete some of the servers: %w", errors.Join(failures...))
}
// cleanupTempSSHKeys deletes all SSH keys that match the label selector.
//
// A failed delete does not stop the loop: the error is logged, remembered, and the remaining keys are still
// processed. The returned error joins all delete errors.
func (s *Client) cleanupTempSSHKeys(ctx context.Context, logger *slog.Logger, selector string) error {
	// AllWithOpts walks through all pages. A plain List call only returns a single page, so with many
	// leftover keys some of them would never be deleted.
	keys, err := s.c.SSHKey.AllWithOpts(ctx, hcloud.SSHKeyListOpts{ListOpts: hcloud.ListOpts{
		LabelSelector: selector,
	}})
	if err != nil {
		return fmt.Errorf("failed to list keys: %w", err)
	}

	if len(keys) == 0 {
		logger.InfoContext(ctx, "No ssh keys found")
		return nil
	}

	var errs []error
	for _, key := range keys {
		_, err := s.c.SSHKey.Delete(ctx, key)
		if err != nil {
			errs = append(errs, err)
			logger.WarnContext(ctx, "failed to delete ssh key", "ssh-key", key.ID, "error", err)
			continue
		}
	}

	if len(errs) > 0 {
		// The returned message contains no info about the ssh key IDs which failed
		return fmt.Errorf("failed to delete some of the ssh keys: %w", errors.Join(errs...))
	}

	return nil
}
// assembleCommand builds the shell command that writes the image to /dev/sda of the temporary server.
//
// The command is a pipeline with up to three stages:
//
//   - download: when options.ImageURL is set, wget streams the image to stdout. Without a URL this stage is
//     left out and the pipeline reads the image from the stdin of the command.
//   - decompress: "bzip2 -cd" or "xz -cd", depending on options.ImageCompression.
//   - write: dd for raw images. qcow2 images are stored in a file first and then converted with "qemu-img dd".
//
// The pipeline is followed by a sync and wrapped in "bash -c". An unknown compression or format results in
// an error and an empty command.
func assembleCommand(options UploadOptions) (string, error) {
	// Make sure that we fail early, ie. if the image url does not work
	cmd := "set -euo pipefail && "

	if options.ImageURL != nil {
		// The URL is user input and may contain characters that are special to the shell ($, `, ', ...),
		// it has to reach wget as a single literal word.
		cmd += "wget --no-verbose -O - " + shellQuote(options.ImageURL.String()) + " | "
	}

	switch options.ImageCompression {
	case CompressionNone:
		// Nothing to decompress.
	case CompressionBZ2:
		cmd += "bzip2 -cd | "
	case CompressionXZ:
		cmd += "xz -cd | "
	default:
		return "", fmt.Errorf("unknown compression: %q", options.ImageCompression)
	}

	switch options.ImageFormat {
	case FormatRaw:
		cmd += "dd of=/dev/sda bs=4M"
	case FormatQCOW2:
		cmd += "tee image.qcow2 > /dev/null && qemu-img dd -f qcow2 -O raw if=image.qcow2 of=/dev/sda bs=4M"
	default:
		return "", fmt.Errorf("unknown format: %q", options.ImageFormat)
	}

	cmd += " && sync"

	// the pipefail does not work correctly without wrapping in bash.
	// The script is quoted as a whole, so single quotes inside of it can not end the bash argument early.
	return "bash -c " + shellQuote(cmd), nil
}

// shellQuote wraps s in single quotes so that a POSIX shell reads it as one literal word. A single quote can
// not appear inside of single quotes, each one is replaced by: close the quotes, an escaped quote, open again.
func shellQuote(s string) string {
	quoted := make([]byte, 0, len(s)+2)
	quoted = append(quoted, '\'')
	for i := 0; i < len(s); i++ {
		if s[i] == '\'' {
			quoted = append(quoted, `'\''`...)
			continue
		}
		quoted = append(quoted, s[i])
	}
	quoted = append(quoted, '\'')
	return string(quoted)
}