From ce183cb2b4afaef2630a9ab4776a9a5dbe27ef97 Mon Sep 17 00:00:00 2001 From: Anis Eleuch Date: Wed, 10 Jul 2024 17:55:36 +0100 Subject: [PATCH] heal: List and heal again for any listing error (#19999) When a fresh drive healing is finished, add more checks for the drive listing errors. If any, re-list and heal again. Although this is an infrequent use case to have listPathRaw() returning nil when minDisks is set to 1, we still need to handle all possible use cases to avoid missing healing any object. Also, check for HealObject result to decide if an object is healed in the fresh disk since HealObject returns nil if an object is healed in any disk, not necessarily in the new fresh drive. --- cmd/erasure-healing.go | 2 +- cmd/global-heal.go | 26 +++++++++++++++++++++----- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/cmd/erasure-healing.go b/cmd/erasure-healing.go index 538d8bc81..d7e982da3 100644 --- a/cmd/erasure-healing.go +++ b/cmd/erasure-healing.go @@ -629,7 +629,7 @@ func (er *erasureObjects) healObject(ctx context.Context, bucket string, object } for i, v := range result.Before.Drives { - if v.Endpoint == disk.String() { + if v.Endpoint == disk.Endpoint().String() { result.After.Drives[i].State = madmin.DriveStateOk } } diff --git a/cmd/global-heal.go b/cmd/global-heal.go index d3dd05a9c..352f4a9af 100644 --- a/cmd/global-heal.go +++ b/cmd/global-heal.go @@ -441,6 +441,8 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string, continue } + var versionHealed bool + res, err := er.HealObject(ctx, bucket, encodedEntryName, version.VersionID, madmin.HealOpts{ ScanMode: scanMode, @@ -453,15 +455,22 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string, versionNotFound++ continue } - // If not deleted, assume they failed. 
+ } else { + // Look for the healing results + if res.After.Drives[tracker.DiskIndex].State == madmin.DriveStateOk { + versionHealed = true + } + } + + if versionHealed { + result = healEntrySuccess(uint64(version.Size)) + } else { result = healEntryFailure(uint64(version.Size)) if version.VersionID != "" { healingLogIf(ctx, fmt.Errorf("unable to heal object %s/%s-v(%s): %w", bucket, version.Name, version.VersionID, err)) } else { healingLogIf(ctx, fmt.Errorf("unable to heal object %s/%s: %w", bucket, version.Name, err)) } - } else { - result = healEntrySuccess(uint64(res.ObjectSize)) } if !send(result) { @@ -509,7 +518,11 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string, jt.Take() go healEntry(bucket, *entry) }, - finished: nil, + finished: func(errs []error) { + if countErrs(errs, nil) != len(errs) { + retErr = fmt.Errorf("one or more errors reported during listing: %v", errors.Join(errs...)) + } + }, }) jt.Wait() // synchronize all the concurrent heal jobs if err != nil { @@ -517,7 +530,10 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string, // we let the caller retry this disk again for the // buckets it failed to list. retErr = err - healingLogIf(ctx, fmt.Errorf("listing failed with: %v on bucket: %v", err, bucket)) + } + + if retErr != nil { + healingLogIf(ctx, fmt.Errorf("listing failed with: %v on bucket: %v", retErr, bucket)) continue }