diff --git a/cmd/erasure-metadata.go b/cmd/erasure-metadata.go index 15b059d65..dcdfee994 100644 --- a/cmd/erasure-metadata.go +++ b/cmd/erasure-metadata.go @@ -275,7 +275,7 @@ func (fi FileInfo) ObjectToPartOffset(ctx context.Context, offset int64) (partIn func findFileInfoInQuorum(ctx context.Context, metaArr []FileInfo, modTime time.Time, etag string, quorum int) (FileInfo, error) { // with less quorum return error. if quorum < 1 { - return FileInfo{}, errErasureReadQuorum + return FileInfo{}, InsufficientReadQuorum{Err: errErasureReadQuorum, Type: RQInsufficientOnlineDrives} } metaHashes := make([]string, len(metaArr)) h := sha256.New() @@ -341,7 +341,7 @@ func findFileInfoInQuorum(ctx context.Context, metaArr []FileInfo, modTime time. } if maxCount < quorum { - return FileInfo{}, errErasureReadQuorum + return FileInfo{}, InsufficientReadQuorum{Err: errErasureReadQuorum, Type: RQInconsistentMeta} } // Find the successor mod time in quorum, otherwise leave the @@ -377,7 +377,7 @@ func findFileInfoInQuorum(ctx context.Context, metaArr []FileInfo, modTime time. } return candidate, nil } - return FileInfo{}, errErasureReadQuorum + return FileInfo{}, InsufficientReadQuorum{Err: errErasureReadQuorum, Type: RQInconsistentMeta} } // pickValidFileInfo - picks one valid FileInfo content and returns from a @@ -498,7 +498,7 @@ func objectQuorumFromMeta(ctx context.Context, partsMetaData []FileInfo, errs [] parities := listObjectParities(partsMetaData, errs) parityBlocks := commonParity(parities, defaultParityCount) if parityBlocks < 0 { - return -1, -1, errErasureReadQuorum + return -1, -1, InsufficientReadQuorum{Err: errErasureReadQuorum, Type: RQInsufficientOnlineDrives} } dataBlocks := len(partsMetaData) - parityBlocks diff --git a/cmd/erasure-metadata_test.go b/cmd/erasure-metadata_test.go index 6eb518ae4..494394cd3 100644 --- a/cmd/erasure-metadata_test.go +++ b/cmd/erasure-metadata_test.go @@ -210,13 +210,13 @@ func TestFindFileInfoInQuorum(t *testing.T) { { fis: getNFInfo(16, 7, 1603863445, "36a21454-a2ca-11eb-bbaa-93a81c686f21", nil), modTime: time.Unix(1603863445, 0), - expectedErr: errErasureReadQuorum, + expectedErr: InsufficientReadQuorum{}, expectedQuorum: 8, }, { fis: getNFInfo(16, 16, 1603863445, "36a21454-a2ca-11eb-bbaa-93a81c686f21", nil), modTime: time.Unix(1603863445, 0), - expectedErr: errErasureReadQuorum, + expectedErr: InsufficientReadQuorum{}, expectedQuorum: 0, }, { @@ -241,7 +241,9 @@ func TestFindFileInfoInQuorum(t *testing.T) { test := test t.Run("", func(t *testing.T) { fi, err := findFileInfoInQuorum(context.Background(), test.fis, test.modTime, "", test.expectedQuorum) - if err != test.expectedErr { + _, ok1 := err.(InsufficientReadQuorum) + _, ok2 := test.expectedErr.(InsufficientReadQuorum) + if ok1 != ok2 { t.Errorf("Expected %s, got %s", test.expectedErr, err) } if test.succmodTimes != nil { diff --git a/cmd/erasure-object.go b/cmd/erasure-object.go index 82681dd0d..ade16abd1 100644 --- a/cmd/erasure-object.go +++ b/cmd/erasure-object.go @@ -1912,23 +1912,26 @@ func (er erasureObjects) DeleteObject(ctx context.Context, bucket, object string versionFound := true objInfo = ObjectInfo{VersionID: opts.VersionID} // version id needed in Delete API response. goi, _, gerr := er.getObjectInfoAndQuorum(ctx, bucket, object, opts) + tryDel := false if gerr != nil && goi.Name == "" { if _, ok := gerr.(InsufficientReadQuorum); ok { - // Add an MRF heal for next time. - er.addPartial(bucket, object, opts.VersionID) - - return objInfo, InsufficientWriteQuorum{} + if opts.Versioned || opts.VersionSuspended || countOnlineDisks(storageDisks) < len(storageDisks)/2+1 { + // Add an MRF heal for next time. + er.addPartial(bucket, object, opts.VersionID) + return objInfo, InsufficientWriteQuorum{} + } + tryDel = true // only for unversioned objects if there is write quorum } // For delete marker replication, versionID being replicated will not exist on disk if opts.DeleteMarker { versionFound = false - } else { + } else if !tryDel { return objInfo, gerr } } if opts.EvalMetadataFn != nil { - dsc, err := opts.EvalMetadataFn(&goi, err) + dsc, err := opts.EvalMetadataFn(&goi, gerr) if err != nil { return ObjectInfo{}, err } diff --git a/cmd/erasure-server-pool.go b/cmd/erasure-server-pool.go index 982494e10..8a306659c 100644 --- a/cmd/erasure-server-pool.go +++ b/cmd/erasure-server-pool.go @@ -458,7 +458,13 @@ type PoolObjInfo struct { Err error } -func (z *erasureServerPools) getPoolInfoExistingWithOpts(ctx context.Context, bucket, object string, opts ObjectOptions) (PoolObjInfo, error) { +type poolErrs struct { + Index int + Err error +} + +func (z *erasureServerPools) getPoolInfoExistingWithOpts(ctx context.Context, bucket, object string, opts ObjectOptions) (PoolObjInfo, []poolErrs, error) { + var noReadQuorumPools []poolErrs poolObjInfos := make([]PoolObjInfo, len(z.serverPools)) poolOpts := make([]ObjectOptions, len(z.serverPools)) for i := range z.serverPools { @@ -508,8 +514,9 @@ func (z *erasureServerPools) getPoolInfoExistingWithOpts(ctx context.Context, bu } if pinfo.Err == nil { // found a pool - return pinfo, nil + return pinfo, z.poolsWithObject(poolObjInfos, opts), nil } + if isErrReadQuorum(pinfo.Err) && !opts.MetadataChg { // read quorum is returned when the object is visibly // present but its unreadable, we simply ask the writes to @@ -518,30 +525,49 @@ func (z *erasureServerPools) getPoolInfoExistingWithOpts(ctx context.Context, bu // with enough disks online but sufficiently inconsistent to // break parity threshold, allow them to be overwritten // or allow new versions to be added. - return pinfo, nil + + return pinfo, z.poolsWithObject(poolObjInfos, opts), nil } defPool = pinfo if !isErrObjectNotFound(pinfo.Err) { - return pinfo, pinfo.Err + return pinfo, noReadQuorumPools, pinfo.Err } // No object exists or its a delete marker, // check objInfo to confirm. if pinfo.ObjInfo.DeleteMarker && pinfo.ObjInfo.Name != "" { - return pinfo, nil + return pinfo, noReadQuorumPools, nil } } if opts.ReplicationRequest && opts.DeleteMarker && defPool.Index >= 0 { // If the request is a delete marker replication request, return a default pool // in cases where the object does not exist. // This is to ensure that the delete marker is replicated to the destination. - return defPool, nil + return defPool, noReadQuorumPools, nil } - return PoolObjInfo{}, toObjectErr(errFileNotFound, bucket, object) + return PoolObjInfo{}, noReadQuorumPools, toObjectErr(errFileNotFound, bucket, object) +} + +// return all pools with read quorum error or no error for an object with given opts.Note that this array is +// returned in the order of recency of object ModTime. +func (z *erasureServerPools) poolsWithObject(pools []PoolObjInfo, opts ObjectOptions) (errs []poolErrs) { + for _, pool := range pools { + if opts.SkipDecommissioned && z.IsSuspended(pool.Index) { + continue + } + // Skip object if it's from pools participating in a rebalance operation. + if opts.SkipRebalancing && z.IsPoolRebalancing(pool.Index) { + continue + } + if isErrReadQuorum(pool.Err) || pool.Err == nil { + errs = append(errs, poolErrs{Err: pool.Err, Index: pool.Index}) + } + } + return errs } func (z *erasureServerPools) getPoolIdxExistingWithOpts(ctx context.Context, bucket, object string, opts ObjectOptions) (idx int, err error) { - pinfo, err := z.getPoolInfoExistingWithOpts(ctx, bucket, object, opts) + pinfo, _, err := z.getPoolInfoExistingWithOpts(ctx, bucket, object, opts) if err != nil { return -1, err } @@ -1082,7 +1108,8 @@ func (z *erasureServerPools) DeleteObject(ctx context.Context, bucket string, ob gopts := opts gopts.NoLock = true - pinfo, err := z.getPoolInfoExistingWithOpts(ctx, bucket, object, gopts) + + pinfo, noReadQuorumPools, err := z.getPoolInfoExistingWithOpts(ctx, bucket, object, gopts) if err != nil { if _, ok := err.(InsufficientReadQuorum); ok { return objInfo, InsufficientWriteQuorum{} @@ -1096,11 +1123,44 @@ func (z *erasureServerPools) DeleteObject(ctx context.Context, bucket string, ob return pinfo.ObjInfo, nil } + // Delete concurrently in all server pools with read quorum error for unversioned objects. + if len(noReadQuorumPools) > 0 && !opts.Versioned && !opts.VersionSuspended { + return z.deleteObjectFromAllPools(ctx, bucket, object, opts, noReadQuorumPools) + } objInfo, err = z.serverPools[pinfo.Index].DeleteObject(ctx, bucket, object, opts) objInfo.Name = decodeDirObject(object) return objInfo, err } +func (z *erasureServerPools) deleteObjectFromAllPools(ctx context.Context, bucket string, object string, opts ObjectOptions, poolIndices []poolErrs) (objInfo ObjectInfo, err error) { + derrs := make([]error, len(poolIndices)) + dobjects := make([]ObjectInfo, len(poolIndices)) + + // Delete concurrently in all server pools that reported no error or read quorum error + // where the read quorum issue is from metadata inconsistency. + var wg sync.WaitGroup + for idx, pe := range poolIndices { + if v, ok := pe.Err.(InsufficientReadQuorum); ok && v.Type != RQInconsistentMeta { + derrs[idx] = InsufficientWriteQuorum{} + continue + } + wg.Add(1) + pool := z.serverPools[pe.Index] + go func(idx int, pool *erasureSets) { + defer wg.Done() + dobjects[idx], derrs[idx] = pool.DeleteObject(ctx, bucket, object, opts) + }(idx, pool) + } + wg.Wait() + + // the poolIndices array is pre-sorted in order of latest ModTime, we care only about pool with latest object though + // the delete call tries to clean up other pools during DeleteObject call. + objInfo = dobjects[0] + objInfo.Name = decodeDirObject(object) + err = derrs[0] + return objInfo, err +} + func (z *erasureServerPools) DeleteObjects(ctx context.Context, bucket string, objects []ObjectToDelete, opts ObjectOptions) ([]DeletedObject, []error) { derrs := make([]error, len(objects)) dobjects := make([]DeletedObject, len(objects)) @@ -1142,7 +1202,7 @@ func (z *erasureServerPools) DeleteObjects(ctx context.Context, bucket string, o j := j obj := obj eg.Go(func() error { - pinfo, err := z.getPoolInfoExistingWithOpts(ctx, bucket, obj.ObjectName, ObjectOptions{ + pinfo, _, err := z.getPoolInfoExistingWithOpts(ctx, bucket, obj.ObjectName, ObjectOptions{ NoLock: true, }) if err != nil { diff --git a/cmd/object-api-errors.go b/cmd/object-api-errors.go index b3b0a1777..10c34f415 100644 --- a/cmd/object-api-errors.go +++ b/cmd/object-api-errors.go @@ -27,13 +27,13 @@ import ( // Converts underlying storage error. Convenience function written to // handle all cases where we have known types of errors returned by // underlying storage layer. -func toObjectErr(err error, params ...string) error { - if err == nil { +func toObjectErr(oerr error, params ...string) error { + if oerr == nil { return nil } // Unwarp the error first - err = unwrapAll(err) + err := unwrapAll(oerr) if err == context.Canceled { return context.Canceled @@ -157,6 +157,9 @@ func toObjectErr(err error, params ...string) error { if len(params) >= 2 { apiErr.Object = decodeDirObject(params[1]) } + if v, ok := oerr.(InsufficientReadQuorum); ok { + apiErr.Type = v.Type + } return apiErr case errErasureWriteQuorum.Error(): apiErr := InsufficientWriteQuorum{} @@ -201,8 +204,34 @@ func (e SlowDown) Error() string { return "Please reduce your request rate" } +// RQErrType reason for read quorum error. +type RQErrType int + +const ( + // RQInsufficientOnlineDrives - not enough online drives. + RQInsufficientOnlineDrives RQErrType = 1 << iota + // RQInconsistentMeta - inconsistent metadata. + RQInconsistentMeta +) + +func (t RQErrType) String() string { + switch t { + case RQInsufficientOnlineDrives: + return "InsufficientOnlineDrives" + case RQInconsistentMeta: + return "InconsistentMeta" + default: + return "Unknown" + } +} + // InsufficientReadQuorum storage cannot satisfy quorum for read operation. -type InsufficientReadQuorum GenericError +type InsufficientReadQuorum struct { + Bucket string + Object string + Err error + Type RQErrType +} func (e InsufficientReadQuorum) Error() string { return "Storage resources are insufficient for the read operation " + e.Bucket + "/" + e.Object diff --git a/cmd/testdata/undeleteable-object.tgz b/cmd/testdata/undeleteable-object.tgz new file mode 100644 index 000000000..b6abc0a94 Binary files /dev/null and b/cmd/testdata/undeleteable-object.tgz differ diff --git a/docs/bucket/replication/setup_ilm_expiry_replication.sh b/docs/bucket/replication/setup_ilm_expiry_replication.sh index 953655126..eca69d55f 100755 --- a/docs/bucket/replication/setup_ilm_expiry_replication.sh +++ b/docs/bucket/replication/setup_ilm_expiry_replication.sh @@ -80,6 +80,8 @@ export MC_HOST_sited=http://minio:minio123@127.0.0.1:9008 ## Setup site replication ./mc admin replicate add sitea siteb --replicate-ilm-expiry +sleep 10s + ## Add warm tier ./mc ilm tier add minio sitea WARM-TIER --endpoint http://localhost:9006 --access-key minio --secret-key minio123 --bucket bucket