mirror of
				https://github.com/minio/minio.git
				synced 2025-10-31 00:01:27 +01:00 
			
		
		
		
	
		
			
				
	
	
		
			450 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			450 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
| // Copyright (c) 2015-2021 MinIO, Inc.
 | |
| //
 | |
| // This file is part of MinIO Object Storage stack
 | |
| //
 | |
| // This program is free software: you can redistribute it and/or modify
 | |
| // it under the terms of the GNU Affero General Public License as published by
 | |
| // the Free Software Foundation, either version 3 of the License, or
 | |
| // (at your option) any later version.
 | |
| //
 | |
| // This program is distributed in the hope that it will be useful
 | |
| // but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
| // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | |
| // GNU Affero General Public License for more details.
 | |
| //
 | |
| // You should have received a copy of the GNU Affero General Public License
 | |
| // along with this program.  If not, see <http://www.gnu.org/licenses/>.
 | |
| 
 | |
| package cmd
 | |
| 
 | |
| import (
 | |
| 	"bytes"
 | |
| 	"context"
 | |
| 	"slices"
 | |
| 	"time"
 | |
| 
 | |
| 	"github.com/minio/madmin-go/v3"
 | |
| )
 | |
| 
 | |
// commonETags returns the most frequently occurring non-empty etag in etags
// together with the number of times it occurs (maxima).
//
// Empty strings are skipped and never counted. If etags contains no
// non-empty entry the result is ("", 0).
//
// When several etags share the highest count, the lexically greatest one is
// returned. This keeps the result independent of Go's randomized map
// iteration order, in the same way commonTimeAndOccurrence resolves ties in
// favor of the greatest timestamp.
func commonETags(etags []string) (etag string, maxima int) {
	occurrences := make(map[string]int, len(etags))
	for _, e := range etags {
		if e == "" {
			continue
		}
		occurrences[e]++
	}

	// Pick the highest count; on equal counts prefer the greater etag so
	// that repeated calls with the same input always agree.
	for candidate, count := range occurrences {
		if count > maxima || (count == maxima && candidate > etag) {
			etag, maxima = candidate, count
		}
	}

	return etag, maxima
}
 | |
| 
 | |
| // commonTime returns a maximally occurring time from a list of time.
 | |
| func commonTimeAndOccurrence(times []time.Time, group time.Duration) (maxTime time.Time, maxima int) {
 | |
| 	timeOccurrenceMap := make(map[int64]int, len(times))
 | |
| 	groupNano := group.Nanoseconds()
 | |
| 	// Ignore the uuid sentinel and count the rest.
 | |
| 	for _, t := range times {
 | |
| 		if t.Equal(timeSentinel) || t.IsZero() {
 | |
| 			continue
 | |
| 		}
 | |
| 		nano := t.UnixNano()
 | |
| 		if group > 0 {
 | |
| 			for k := range timeOccurrenceMap {
 | |
| 				if k == nano {
 | |
| 					// We add to ourself later
 | |
| 					continue
 | |
| 				}
 | |
| 				diff := k - nano
 | |
| 				if diff < 0 {
 | |
| 					diff = -diff
 | |
| 				}
 | |
| 				// We are within the limit
 | |
| 				if diff < groupNano {
 | |
| 					timeOccurrenceMap[k]++
 | |
| 				}
 | |
| 			}
 | |
| 		}
 | |
| 		// Add ourself...
 | |
| 		timeOccurrenceMap[nano]++
 | |
| 	}
 | |
| 
 | |
| 	maxima = 0 // Counter for remembering max occurrence of elements.
 | |
| 	latest := int64(0)
 | |
| 
 | |
| 	// Find the common cardinality from previously collected
 | |
| 	// occurrences of elements.
 | |
| 	for nano, count := range timeOccurrenceMap {
 | |
| 		if count < maxima {
 | |
| 			continue
 | |
| 		}
 | |
| 
 | |
| 		// We are at or above maxima
 | |
| 		if count > maxima || nano > latest {
 | |
| 			maxima = count
 | |
| 			latest = nano
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	// Return the collected common max time, with maxima
 | |
| 	return time.Unix(0, latest).UTC(), maxima
 | |
| }
 | |
| 
 | |
| // commonTime returns a maximally occurring time from a list of time if it
 | |
| // occurs >= quorum, else return timeSentinel
 | |
| func commonTime(modTimes []time.Time, quorum int) time.Time {
 | |
| 	if modTime, count := commonTimeAndOccurrence(modTimes, 0); count >= quorum {
 | |
| 		return modTime
 | |
| 	}
 | |
| 
 | |
| 	return timeSentinel
 | |
| }
 | |
| 
 | |
| func commonETag(etags []string, quorum int) string {
 | |
| 	if etag, count := commonETags(etags); count >= quorum {
 | |
| 		return etag
 | |
| 	}
 | |
| 	return ""
 | |
| }
 | |
| 
 | |
// timeSentinel is the beginning of unix time, in UTC; it is treated as the
// sentinel value here.
var timeSentinel = time.Unix(0, 0).UTC()

// timeSentinel1970 is one nanosecond after the beginning of unix time, in
// UTC; 1970 used for special cases when xlmeta.version == 0.
var timeSentinel1970 = time.Unix(0, 1).UTC()
 | |
| 
 | |
| // Boot modTimes up to disk count, setting the value to time sentinel.
 | |
| func bootModtimes(diskCount int) []time.Time {
 | |
| 	modTimes := make([]time.Time, diskCount)
 | |
| 	// Boots up all the modtimes.
 | |
| 	for i := range modTimes {
 | |
| 		modTimes[i] = timeSentinel
 | |
| 	}
 | |
| 	return modTimes
 | |
| }
 | |
| 
 | |
| func listObjectETags(partsMetadata []FileInfo, errs []error, quorum int) (etags []string) {
 | |
| 	etags = make([]string, len(partsMetadata))
 | |
| 	vidMap := map[string]int{}
 | |
| 	for index, metadata := range partsMetadata {
 | |
| 		if errs[index] != nil {
 | |
| 			continue
 | |
| 		}
 | |
| 		vid := metadata.VersionID
 | |
| 		if metadata.VersionID == "" {
 | |
| 			vid = nullVersionID
 | |
| 		}
 | |
| 		vidMap[vid]++
 | |
| 		etags[index] = metadata.Metadata["etag"]
 | |
| 	}
 | |
| 
 | |
| 	for _, count := range vidMap {
 | |
| 		// do we have enough common versions
 | |
| 		// that have enough quorum to satisfy
 | |
| 		// the etag.
 | |
| 		if count >= quorum {
 | |
| 			return etags
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return make([]string, len(partsMetadata))
 | |
| }
 | |
| 
 | |
| // Extracts list of times from FileInfo slice and returns, skips
 | |
| // slice elements which have errors.
 | |
| func listObjectModtimes(partsMetadata []FileInfo, errs []error) (modTimes []time.Time) {
 | |
| 	modTimes = bootModtimes(len(partsMetadata))
 | |
| 	for index, metadata := range partsMetadata {
 | |
| 		if errs[index] != nil {
 | |
| 			continue
 | |
| 		}
 | |
| 		// Once the file is found, save the uuid saved on disk.
 | |
| 		modTimes[index] = metadata.ModTime
 | |
| 	}
 | |
| 	return modTimes
 | |
| }
 | |
| 
 | |
| func filterOnlineDisksInplace(fi FileInfo, partsMetadata []FileInfo, onlineDisks []StorageAPI) {
 | |
| 	for i, meta := range partsMetadata {
 | |
| 		if fi.XLV1 == meta.XLV1 {
 | |
| 			continue
 | |
| 		}
 | |
| 		onlineDisks[i] = nil
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // Notes:
 | |
| // There are 5 possible states a disk could be in,
 | |
| // 1. __online__             - has the latest copy of xl.meta - returned by listOnlineDisks
 | |
| //
 | |
| // 2. __offline__            - err == errDiskNotFound
 | |
| //
 | |
| // 3. __availableWithParts__ - has the latest copy of xl.meta and has all
 | |
| //                             parts with checksums matching; returned by disksWithAllParts
 | |
| //
 | |
| // 4. __outdated__           - returned by outDatedDisk, provided []StorageAPI
 | |
| //                             returned by disksWithAllParts is passed for latestDisks.
 | |
| //    - has an old copy of xl.meta
 | |
| //    - doesn't have xl.meta (errFileNotFound)
 | |
| //    - has the latest xl.meta but one or more parts are corrupt
 | |
| //
 | |
| // 5. __missingParts__       - has the latest copy of xl.meta but has some parts
 | |
| // missing.  This is identified separately since this may need manual
 | |
| // inspection to understand the root cause. E.g, this could be due to
 | |
| // backend filesystem corruption.
 | |
| 
 | |
| // listOnlineDisks - returns
 | |
| // - a slice of disks where disk having 'older' xl.meta (or nothing)
 | |
| // are set to nil.
 | |
| // - latest (in time) of the maximally occurring modTime(s), which has at least quorum occurrences.
 | |
| func listOnlineDisks(disks []StorageAPI, partsMetadata []FileInfo, errs []error, quorum int) (onlineDisks []StorageAPI, modTime time.Time, etag string) {
 | |
| 	onlineDisks = make([]StorageAPI, len(disks))
 | |
| 
 | |
| 	// List all the file commit ids from parts metadata.
 | |
| 	modTimes := listObjectModtimes(partsMetadata, errs)
 | |
| 
 | |
| 	// Reduce list of UUIDs to a single common value.
 | |
| 	modTime = commonTime(modTimes, quorum)
 | |
| 
 | |
| 	if modTime.IsZero() || modTime.Equal(timeSentinel) {
 | |
| 		etags := listObjectETags(partsMetadata, errs, quorum)
 | |
| 
 | |
| 		etag = commonETag(etags, quorum)
 | |
| 
 | |
| 		if etag != "" { // allow this fallback only if a non-empty etag is found.
 | |
| 			for index, e := range etags {
 | |
| 				if partsMetadata[index].IsValid() && e == etag {
 | |
| 					onlineDisks[index] = disks[index]
 | |
| 				} else {
 | |
| 					onlineDisks[index] = nil
 | |
| 				}
 | |
| 			}
 | |
| 			return onlineDisks, modTime, etag
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	// Create a new online disks slice, which have common uuid.
 | |
| 	for index, t := range modTimes {
 | |
| 		if partsMetadata[index].IsValid() && t.Equal(modTime) {
 | |
| 			onlineDisks[index] = disks[index]
 | |
| 		} else {
 | |
| 			onlineDisks[index] = nil
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return onlineDisks, modTime, ""
 | |
| }
 | |
| 
 | |
| // Convert verify or check parts returned error to integer representation
 | |
| func convPartErrToInt(err error) int {
 | |
| 	err = unwrapAll(err)
 | |
| 	switch err {
 | |
| 	case nil:
 | |
| 		return checkPartSuccess
 | |
| 	case errFileNotFound, errFileVersionNotFound:
 | |
| 		return checkPartFileNotFound
 | |
| 	case errFileCorrupt:
 | |
| 		return checkPartFileCorrupt
 | |
| 	case errVolumeNotFound:
 | |
| 		return checkPartVolumeNotFound
 | |
| 	case errDiskNotFound:
 | |
| 		return checkPartDiskNotFound
 | |
| 	default:
 | |
| 		return checkPartUnknown
 | |
| 	}
 | |
| }
 | |
| 
 | |
| func partNeedsHealing(partErrs []int) bool {
 | |
| 	return slices.IndexFunc(partErrs, func(i int) bool { return i != checkPartSuccess && i != checkPartUnknown }) > -1
 | |
| }
 | |
| 
 | |
| func hasPartErr(partErrs []int) bool {
 | |
| 	return slices.IndexFunc(partErrs, func(i int) bool { return i != checkPartSuccess }) > -1
 | |
| }
 | |
| 
 | |
| // disksWithAllParts - This function needs to be called with
 | |
| // []StorageAPI returned by listOnlineDisks. Returns,
 | |
| //
 | |
| // - disks which have all parts specified in the latest xl.meta.
 | |
| //
 | |
| //   - slice of errors about the state of data files on disk - can have
 | |
| //     a not-found error or a hash-mismatch error.
 | |
| func disksWithAllParts(ctx context.Context, onlineDisks []StorageAPI, partsMetadata []FileInfo,
 | |
| 	errs []error, latestMeta FileInfo, bucket, object string,
 | |
| 	scanMode madmin.HealScanMode,
 | |
| ) (availableDisks []StorageAPI, dataErrsByDisk map[int][]int, dataErrsByPart map[int][]int) {
 | |
| 	availableDisks = make([]StorageAPI, len(onlineDisks))
 | |
| 
 | |
| 	dataErrsByDisk = make(map[int][]int, len(onlineDisks))
 | |
| 	for i := range onlineDisks {
 | |
| 		dataErrsByDisk[i] = make([]int, len(latestMeta.Parts))
 | |
| 	}
 | |
| 
 | |
| 	dataErrsByPart = make(map[int][]int, len(latestMeta.Parts))
 | |
| 	for i := range latestMeta.Parts {
 | |
| 		dataErrsByPart[i] = make([]int, len(onlineDisks))
 | |
| 	}
 | |
| 
 | |
| 	inconsistent := 0
 | |
| 	for i, meta := range partsMetadata {
 | |
| 		if !meta.IsValid() {
 | |
| 			// Since for majority of the cases erasure.Index matches with erasure.Distribution we can
 | |
| 			// consider the offline disks as consistent.
 | |
| 			continue
 | |
| 		}
 | |
| 		if !meta.Deleted {
 | |
| 			if len(meta.Erasure.Distribution) != len(onlineDisks) {
 | |
| 				// Erasure distribution seems to have lesser
 | |
| 				// number of items than number of online disks.
 | |
| 				inconsistent++
 | |
| 				continue
 | |
| 			}
 | |
| 			if meta.Erasure.Distribution[i] != meta.Erasure.Index {
 | |
| 				// Mismatch indexes with distribution order
 | |
| 				inconsistent++
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	erasureDistributionReliable := true
 | |
| 	if inconsistent > len(partsMetadata)/2 {
 | |
| 		// If there are too many inconsistent files, then we can't trust erasure.Distribution (most likely
 | |
| 		// because of bugs found in CopyObject/PutObjectTags) https://github.com/minio/minio/pull/10772
 | |
| 		erasureDistributionReliable = false
 | |
| 	}
 | |
| 
 | |
| 	metaErrs := make([]error, len(errs))
 | |
| 
 | |
| 	for i, onlineDisk := range onlineDisks {
 | |
| 		if errs[i] != nil {
 | |
| 			metaErrs[i] = errs[i]
 | |
| 			continue
 | |
| 		}
 | |
| 		if onlineDisk == OfflineDisk {
 | |
| 			metaErrs[i] = errDiskNotFound
 | |
| 			continue
 | |
| 		}
 | |
| 
 | |
| 		meta := partsMetadata[i]
 | |
| 		if !meta.ModTime.Equal(latestMeta.ModTime) || meta.DataDir != latestMeta.DataDir {
 | |
| 			metaErrs[i] = errFileCorrupt
 | |
| 			partsMetadata[i] = FileInfo{}
 | |
| 			continue
 | |
| 		}
 | |
| 
 | |
| 		if erasureDistributionReliable {
 | |
| 			if !meta.IsValid() {
 | |
| 				partsMetadata[i] = FileInfo{}
 | |
| 				metaErrs[i] = errFileCorrupt
 | |
| 				continue
 | |
| 			}
 | |
| 
 | |
| 			if !meta.Deleted {
 | |
| 				if len(meta.Erasure.Distribution) != len(onlineDisks) {
 | |
| 					// Erasure distribution is not the same as onlineDisks
 | |
| 					// attempt a fix if possible, assuming other entries
 | |
| 					// might have the right erasure distribution.
 | |
| 					partsMetadata[i] = FileInfo{}
 | |
| 					metaErrs[i] = errFileCorrupt
 | |
| 					continue
 | |
| 				}
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	// Copy meta errors to part errors
 | |
| 	for i, err := range metaErrs {
 | |
| 		if err != nil {
 | |
| 			partErr := convPartErrToInt(err)
 | |
| 			for p := range latestMeta.Parts {
 | |
| 				dataErrsByPart[p][i] = partErr
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	for i, onlineDisk := range onlineDisks {
 | |
| 		if metaErrs[i] != nil {
 | |
| 			continue
 | |
| 		}
 | |
| 		meta := partsMetadata[i]
 | |
| 
 | |
| 		if meta.Deleted || meta.IsRemote() {
 | |
| 			continue
 | |
| 		}
 | |
| 
 | |
| 		// Always check data, if we got it.
 | |
| 		if (len(meta.Data) > 0 || meta.Size == 0) && len(meta.Parts) > 0 {
 | |
| 			checksumInfo := meta.Erasure.GetChecksumInfo(meta.Parts[0].Number)
 | |
| 			verifyErr := bitrotVerify(bytes.NewReader(meta.Data),
 | |
| 				int64(len(meta.Data)),
 | |
| 				meta.Erasure.ShardFileSize(meta.Size),
 | |
| 				checksumInfo.Algorithm,
 | |
| 				checksumInfo.Hash, meta.Erasure.ShardSize())
 | |
| 			dataErrsByPart[0][i] = convPartErrToInt(verifyErr)
 | |
| 			continue
 | |
| 		}
 | |
| 
 | |
| 		var (
 | |
| 			verifyErr  error
 | |
| 			verifyResp *CheckPartsResp
 | |
| 		)
 | |
| 
 | |
| 		meta.DataDir = latestMeta.DataDir
 | |
| 		switch scanMode {
 | |
| 		case madmin.HealDeepScan:
 | |
| 			// disk has a valid xl.meta but may not have all the
 | |
| 			// parts. This is considered an outdated disk, since
 | |
| 			// it needs healing too.
 | |
| 			verifyResp, verifyErr = onlineDisk.VerifyFile(ctx, bucket, object, meta)
 | |
| 		default:
 | |
| 			verifyResp, verifyErr = onlineDisk.CheckParts(ctx, bucket, object, meta)
 | |
| 		}
 | |
| 
 | |
| 		for p := range latestMeta.Parts {
 | |
| 			if verifyErr != nil {
 | |
| 				dataErrsByPart[p][i] = convPartErrToInt(verifyErr)
 | |
| 			} else {
 | |
| 				dataErrsByPart[p][i] = verifyResp.Results[p]
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	// Build dataErrs by disk from dataErrs by part
 | |
| 	for part, disks := range dataErrsByPart {
 | |
| 		for disk := range disks {
 | |
| 			dataErrsByDisk[disk][part] = dataErrsByPart[part][disk]
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	for i, onlineDisk := range onlineDisks {
 | |
| 		if metaErrs[i] == nil && !hasPartErr(dataErrsByDisk[i]) {
 | |
| 			// All parts verified, mark it as all data available.
 | |
| 			availableDisks[i] = onlineDisk
 | |
| 		} else {
 | |
| 			// upon errors just make that disk's fileinfo invalid
 | |
| 			partsMetadata[i] = FileInfo{}
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return
 | |
| }
 |