diff --git a/cmd/erasure-sets.go b/cmd/erasure-sets.go
index ee55a446d..7f83a2891 100644
--- a/cmd/erasure-sets.go
+++ b/cmd/erasure-sets.go
@@ -128,9 +128,11 @@ func connectEndpoint(endpoint Endpoint) (StorageAPI, *formatErasureV3, error) {
 		if errors.Is(err, errUnformattedDisk) {
 			info, derr := disk.DiskInfo(context.TODO(), false)
 			if derr != nil && info.RootDisk {
+				disk.Close()
 				return nil, nil, fmt.Errorf("Drive: %s is a root drive", disk)
 			}
 		}
+		disk.Close()
 		return nil, nil, fmt.Errorf("Drive: %s returned %w", disk, err) // make sure to '%w' to wrap the error
 	}
 
@@ -413,6 +415,7 @@ func newErasureSets(ctx context.Context, endpoints PoolEndpoints, storageDisks [
 				globalLocalDrivesMu.RUnlock()
 				continue
 			}
+			disk.Close()
 			disk = ldisk
 			globalLocalDrivesMu.RUnlock()
 		}
@@ -1053,6 +1056,7 @@ func markRootDisksAsDown(storageDisks []StorageAPI, errs []error) {
 			// We should not heal on root disk. i.e in a situation where the minio-administrator has unmounted a
 			// defective drive we should not heal a path on the root disk.
 			logger.LogIf(GlobalContext, fmt.Errorf("Drive `%s` is part of root drive, will not be used", storageDisks[i]))
+			storageDisks[i].Close()
 			storageDisks[i] = nil
 		}
 	}
@@ -1062,7 +1066,7 @@ func markRootDisksAsDown(storageDisks []StorageAPI, errs []error) {
 func (s *erasureSets) HealFormat(ctx context.Context, dryRun bool) (res madmin.HealResultItem, err error) {
 	storageDisks, _ := initStorageDisksWithErrors(s.endpoints.Endpoints, storageOpts{
 		cleanUp:     true,
-		healthCheck: true,
+		healthCheck: false,
 	})
 
 	defer func(storageDisks []StorageAPI) {
@@ -1132,6 +1136,7 @@ func (s *erasureSets) HealFormat(ctx context.Context, dryRun bool) (res madmin.H
 		}
 		if err := saveFormatErasure(storageDisks[index], format, formatOpID); err != nil {
 			logger.LogIf(ctx, fmt.Errorf("Drive %s failed to write updated 'format.json': %v", storageDisks[index], err))
+			storageDisks[index].Close()
 			tmpNewFormats[index] = nil // this disk failed to write new format
 		}
 	}
@@ -1154,17 +1159,27 @@ func (s *erasureSets) HealFormat(ctx context.Context, dryRun bool) (res madmin.H
 		}
 
 		if disk := storageDisks[index]; disk != nil {
-			disk.SetDiskLoc(s.poolIndex, m, n)
+			if disk.IsLocal() {
+				disk.SetDiskLoc(s.poolIndex, m, n)
 
-			if disk.IsLocal() && driveQuorum {
-				commonWrites, commonDeletes := calcCommonWritesDeletes(currentDisksInfo[m], (s.setDriveCount+1)/2)
 				xldisk, ok := disk.(*xlStorageDiskIDCheck)
 				if ok {
-					xldisk.totalWrites.Add(commonWrites)
-					xldisk.totalDeletes.Add(commonDeletes)
-					xldisk.storage.setWriteAttribute(commonWrites)
-					xldisk.storage.setDeleteAttribute(commonDeletes)
+					if driveQuorum {
+						commonWrites, commonDeletes := calcCommonWritesDeletes(currentDisksInfo[m], (s.setDriveCount+1)/2)
+						xldisk.totalWrites.Store(commonWrites)
+						xldisk.totalDeletes.Store(commonDeletes)
+						xldisk.storage.setWriteAttribute(commonWrites)
+						xldisk.storage.setDeleteAttribute(commonDeletes)
+					}
+					go xldisk.monitorDiskWritable(xldisk.diskCtx)
 				}
+			} else {
+				disk.Close() // Close the remote storage client, re-initialize with healthchecks.
+				disk, err = newStorageRESTClient(disk.Endpoint(), true, globalGrid.Load())
+				if err != nil {
+					continue
+				}
+				disk.SetDiskLoc(s.poolIndex, m, n)
 			}
 
 			s.erasureDisks[m][n] = disk
diff --git a/cmd/xl-storage-disk-id-check.go b/cmd/xl-storage-disk-id-check.go
index fae3e7413..40dc8defa 100644
--- a/cmd/xl-storage-disk-id-check.go
+++ b/cmd/xl-storage-disk-id-check.go
@@ -228,8 +228,8 @@ func newXLStorageDiskIDCheck(storage *xlStorage, healthCheck bool) *xlStorageDis
 	}
 
 	if driveQuorum {
-		xl.totalWrites.Add(xl.storage.getWriteAttribute())
-		xl.totalDeletes.Add(xl.storage.getDeleteAttribute())
+		xl.totalWrites.Store(xl.storage.getWriteAttribute())
+		xl.totalDeletes.Store(xl.storage.getDeleteAttribute())
 	}
 
 	xl.diskCtx, xl.cancel = context.WithCancel(context.TODO())
@@ -1032,37 +1032,50 @@ func (p *xlStorageDiskIDCheck) checkHealth(ctx context.Context) (err error) {
 	if t > maxTimeSinceLastSuccess {
 		if atomic.CompareAndSwapInt32(&p.health.status, diskHealthOK, diskHealthFaulty) {
 			logger.LogAlwaysIf(ctx, fmt.Errorf("node(%s): taking drive %s offline, time since last response %v", globalLocalNodeName, p.storage.String(), t.Round(time.Millisecond)))
-			go p.monitorDiskStatus(t)
+			go p.monitorDiskStatus(0, mustGetUUID())
 		}
 		return errFaultyDisk
 	}
 	return nil
 }
 
+// Make sure we do not write O_DIRECT aligned I/O because WriteAll() ends
+// up using O_DIRECT codepath which internally utilizes p.health.tokens
+// we need to avoid using incoming I/O tokens as part of the healthcheck
+// monitoring I/O.
+var toWrite = []byte{2048: 42}
+
 // monitorDiskStatus should be called once when a drive has been marked offline.
 // Once the disk has been deemed ok, it will return to online status.
-func (p *xlStorageDiskIDCheck) monitorDiskStatus(spent time.Duration) {
+func (p *xlStorageDiskIDCheck) monitorDiskStatus(spent time.Duration, fn string) {
 	t := time.NewTicker(5 * time.Second)
 	defer t.Stop()
-	fn := mustGetUUID()
 
 	for range t.C {
+		if contextCanceled(p.diskCtx) {
+			return
+		}
+
 		if len(p.health.tokens) == 0 {
 			// Queue is still full, no need to check.
 			continue
 		}
-		err := p.storage.WriteAll(context.Background(), minioMetaTmpBucket, fn, []byte{10000: 42})
+
+		err := p.storage.WriteAll(context.Background(), minioMetaTmpBucket, fn, toWrite)
 		if err != nil {
 			continue
 		}
+
 		b, err := p.storage.ReadAll(context.Background(), minioMetaTmpBucket, fn)
-		if err != nil || len(b) != 10001 {
+		if err != nil || len(b) != len(toWrite) {
 			continue
 		}
+
 		err = p.storage.Delete(context.Background(), minioMetaTmpBucket, fn, DeleteOptions{
 			Recursive: false,
 			Immediate: false,
 		})
+
 		if err == nil {
 			t := time.Unix(0, atomic.LoadInt64(&p.health.lastSuccess))
 			if spent > 0 {
@@ -1108,8 +1121,6 @@ func (p *xlStorageDiskIDCheck) monitorDiskWritable(ctx context.Context) {
 	defer t.Stop()
 	fn := mustGetUUID()
 
-	// Be just above directio size.
-	toWrite := []byte{xioutil.DirectioAlignSize + 1: 42}
 	rng := rand.New(rand.NewSource(time.Now().UnixNano()))
 
 	monitor := func() bool {
@@ -1129,7 +1140,7 @@ func (p *xlStorageDiskIDCheck) monitorDiskWritable(ctx context.Context) {
 	goOffline := func(err error, spent time.Duration) {
 		if atomic.CompareAndSwapInt32(&p.health.status, diskHealthOK, diskHealthFaulty) {
 			logger.LogAlwaysIf(ctx, fmt.Errorf("node(%s): taking drive %s offline: %v", globalLocalNodeName, p.storage.String(), err))
-			go p.monitorDiskStatus(spent)
+			go p.monitorDiskStatus(spent, fn)
 		}
 	}
 
diff --git a/cmd/xl-storage.go b/cmd/xl-storage.go
index 2356e0ed8..f3d58417a 100644
--- a/cmd/xl-storage.go
+++ b/cmd/xl-storage.go
@@ -715,11 +715,7 @@ func (s *xlStorage) DiskInfo(_ context.Context, _ bool) (info DiskInfo, err erro
 	s.diskInfoCache.Once.Do(func() {
 		s.diskInfoCache.TTL = time.Second
 		s.diskInfoCache.Update = func() (interface{}, error) {
-			dcinfo := DiskInfo{
-				RootDisk:  s.rootDisk,
-				MountPath: s.drivePath,
-				Endpoint:  s.endpoint.String(),
-			}
+			dcinfo := DiskInfo{}
 			di, err := getDiskInfo(s.drivePath)
 			if err != nil {
 				return dcinfo, err
@@ -748,6 +744,10 @@ func (s *xlStorage) DiskInfo(_ context.Context, _ bool) (info DiskInfo, err erro
 	if v != nil {
 		info = v.(DiskInfo)
 	}
+
+	info.RootDisk = s.rootDisk
+	info.MountPath = s.drivePath
+	info.Endpoint = s.endpoint.String()
 	info.Scanning = atomic.LoadInt32(&s.scanning) == 1
 	return info, err
 }