// Copyright (c) 2015-2021 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program.  If not, see <http://www.gnu.org/licenses/>.

package cmd

import (
	"context"
	"errors"
	"io"
	"sync"
	"sync/atomic"

	"github.com/minio/minio/internal/logger"
)

// Reads in parallel from readers.
type parallelReader struct {
	readers       []io.ReaderAt
	orgReaders    []io.ReaderAt
	dataBlocks    int
	offset        int64
	shardSize     int64
	shardFileSize int64
	buf           [][]byte
	readerToBuf   []int
}

// newParallelReader returns parallelReader.
func newParallelReader(readers []io.ReaderAt, e Erasure, offset, totalLength int64) *parallelReader {
	r2b := make([]int, len(readers))
	for i := range r2b {
		r2b[i] = i
	}
	return &parallelReader{
		readers:       readers,
		orgReaders:    readers,
		dataBlocks:    e.dataBlocks,
		offset:        (offset / e.blockSize) * e.ShardSize(),
		shardSize:     e.ShardSize(),
		shardFileSize: e.ShardFileSize(totalLength),
		buf:           make([][]byte, len(readers)),
		readerToBuf:   r2b,
	}
}
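
// Illustration-only sketch, not part of the upstream file: the offset
// arithmetic used in newParallelReader above. Assuming e.ShardSize() is the
// per-block shard size stored on each drive, a byte offset into the object is
// rounded down to a block boundary and mapped to the matching position inside
// every per-drive shard file. The helper name below is hypothetical.
func exampleShardOffset(offset, blockSize, shardSize int64) int64 {
	// Index of the block containing `offset`, multiplied by the room one
	// block occupies in a shard file, gives where that block's shard begins.
	return (offset / blockSize) * shardSize
}

// For example, with blockSize = 1 MiB and shardSize = 256 KiB (4 data blocks),
// a read starting at byte offset 2.5 MiB begins at shard offset 2*256 KiB = 512 KiB.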

// preferReaders can mark readers as preferred.
// These will be chosen before others.
func (p *parallelReader) preferReaders(prefer []bool) {
	if len(prefer) != len(p.orgReaders) {
		return
	}
	// Copy so we don't change our input.
	tmp := make([]io.ReaderAt, len(p.orgReaders))
	copy(tmp, p.orgReaders)
	p.readers = tmp
	// next is the next non-preferred index.
	next := 0
	for i, ok := range prefer {
		if !ok || p.readers[i] == nil {
			continue
		}
		if i == next {
			next++
			continue
		}
		// Move reader with index i to index next.
		// Do this by swapping next and i
		p.readers[next], p.readers[i] = p.readers[i], p.readers[next]
		p.readerToBuf[next] = i
		p.readerToBuf[i] = next
		next++
	}
}
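
// Illustration only (not in the upstream source): with readers [r0, r1, r2, r3]
// and prefer = [false, false, true, false], preferReaders swaps the preferred
// reader to the front while recording which buffer its shard belongs to:
//
//	p.readers:     [r2, r1, r0, r3]
//	p.readerToBuf: [2, 1, 0, 3]
//
// so reads are attempted on r2 first, while its decoded shard still lands in
// buffer index 2 and the shard order stays unchanged.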

// canDecode reports whether buf can be erasure decoded.
func (p *parallelReader) canDecode(buf [][]byte) bool {
	bufCount := 0
	for _, b := range buf {
		if len(b) > 0 {
			bufCount++
		}
	}
	return bufCount >= p.dataBlocks
}
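
// Illustration only (not in the upstream source): with e.dataBlocks = 4,
// canDecode reports true as soon as any 4 of the shard buffers are non-empty,
// since that many shards are sufficient for erasure reconstruction of a block.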

// Read reads from readers in parallel. Returns p.dataBlocks number of bufs.
func (p *parallelReader) Read(dst [][]byte) ([][]byte, error) {
	newBuf := dst
	if len(dst) != len(p.readers) {
		newBuf = make([][]byte, len(p.readers))
	} else {
		for i := range newBuf {
			newBuf[i] = newBuf[i][:0]
		}
	}
	var newBufLK sync.RWMutex

	if p.offset+p.shardSize > p.shardFileSize {
		p.shardSize = p.shardFileSize - p.offset
	}
	if p.shardSize == 0 {
		return newBuf, nil
	}

	readTriggerCh := make(chan bool, len(p.readers))
	defer close(readTriggerCh) // close the channel upon return

	for i := 0; i < p.dataBlocks; i++ {
		// Set up read triggers for p.dataBlocks number of reads so that the reads happen in parallel.
		readTriggerCh <- true
	}

	bitrotHeal := int32(0)       // Atomic bool flag.
	missingPartsHeal := int32(0) // Atomic bool flag.
	readerIndex := 0
	var wg sync.WaitGroup
	// if readTrigger is true, it implies next disk.ReadAt() should be tried
	// if readTrigger is false, it implies previous disk.ReadAt() was successful and there is no need
	// to try reading the next disk.
	for readTrigger := range readTriggerCh {
		newBufLK.RLock()
		canDecode := p.canDecode(newBuf)
		newBufLK.RUnlock()
		if canDecode {
			break
		}
		if readerIndex == len(p.readers) {
			break
		}
		if !readTrigger {
			continue
		}
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			rr := p.readers[i]
			if rr == nil {
				// Since reader is nil, trigger another read.
				readTriggerCh <- true
				return
			}
			bufIdx := p.readerToBuf[i]
			if p.buf[bufIdx] == nil {
				// Reading first time on this disk, hence the buffer needs to be allocated.
				// Subsequent reads will re-use this buffer.
				p.buf[bufIdx] = make([]byte, p.shardSize)
			}
			// For the last shard, the shard size might be less than previous shard sizes.
			// Hence the following statement ensures that the buffer size is reset to the right size.
			p.buf[bufIdx] = p.buf[bufIdx][:p.shardSize]
			n, err := rr.ReadAt(p.buf[bufIdx], p.offset)
			if err != nil {
				if errors.Is(err, errFileNotFound) {
					atomic.StoreInt32(&missingPartsHeal, 1)
				} else if errors.Is(err, errFileCorrupt) {
					atomic.StoreInt32(&bitrotHeal, 1)
				}

				// This will be communicated upstream.
				p.orgReaders[bufIdx] = nil
				p.readers[i] = nil

				// Since ReadAt returned error, trigger another read.
				readTriggerCh <- true
				return
			}
			newBufLK.Lock()
			newBuf[bufIdx] = p.buf[bufIdx][:n]
			newBufLK.Unlock()
			// Since ReadAt returned success, there is no need to trigger another read.
			readTriggerCh <- false
		}(readerIndex)
		readerIndex++
	}
	wg.Wait()
	if p.canDecode(newBuf) {
		p.offset += p.shardSize
		if atomic.LoadInt32(&missingPartsHeal) == 1 {
			return newBuf, errFileNotFound
		} else if atomic.LoadInt32(&bitrotHeal) == 1 {
			return newBuf, errFileCorrupt
		}
		return newBuf, nil
	}

	// If we cannot decode, just return a read quorum error.
	return nil, errErasureReadQuorum
}
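
// The function below is an illustration-only sketch (not part of the upstream
// file) of the read-trigger pattern used by Read above, reduced to plain
// callbacks: a buffered channel carries one token per outstanding attempt,
// `true` means "start a read on the next source", `false` means "a read
// succeeded, no replacement needed". The name and the attempt callbacks are
// hypothetical; only the scheduling shape mirrors the code above.
func exampleReadTrigger(attempts []func() error, needed int) int {
	// needed is assumed to be <= len(attempts), mirroring dataBlocks <= len(readers).
	triggerCh := make(chan bool, len(attempts))
	defer close(triggerCh)
	for i := 0; i < needed; i++ {
		triggerCh <- true // seed `needed` parallel attempts
	}

	succeeded := int32(0)
	next := 0
	var wg sync.WaitGroup
	for trigger := range triggerCh {
		if int(atomic.LoadInt32(&succeeded)) >= needed {
			break // enough successful reads, stop scheduling
		}
		if next == len(attempts) {
			break // no sources left to try
		}
		if !trigger {
			continue // previous attempt succeeded; do not start a new one
		}
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			if attempts[i]() != nil {
				triggerCh <- true // failed: schedule the next source
				return
			}
			atomic.AddInt32(&succeeded, 1)
			triggerCh <- false // succeeded: just report back
		}(next)
		next++
	}
	wg.Wait()
	return int(atomic.LoadInt32(&succeeded))
}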

// Decode reads from readers, reconstructs data if needed and writes the data to the writer.
// A set of preferred drives can be supplied. In that case they will be used and the data reconstructed.
func (e Erasure) Decode(ctx context.Context, writer io.Writer, readers []io.ReaderAt, offset, length, totalLength int64, prefer []bool) (written int64, derr error) {
	if offset < 0 || length < 0 {
		logger.LogIf(ctx, errInvalidArgument)
		return -1, errInvalidArgument
	}
	if offset+length > totalLength {
		logger.LogIf(ctx, errInvalidArgument)
		return -1, errInvalidArgument
	}

	if length == 0 {
		return 0, nil
	}

	reader := newParallelReader(readers, e, offset, totalLength)
	if len(prefer) == len(readers) {
		reader.preferReaders(prefer)
	}

	startBlock := offset / e.blockSize
	endBlock := (offset + length) / e.blockSize

	var bytesWritten int64
	var bufs [][]byte
	for block := startBlock; block <= endBlock; block++ {
		var blockOffset, blockLength int64
		switch {
		case startBlock == endBlock:
			blockOffset = offset % e.blockSize
			blockLength = length
		case block == startBlock:
			blockOffset = offset % e.blockSize
			blockLength = e.blockSize - blockOffset
		case block == endBlock:
			blockOffset = 0
			blockLength = (offset + length) % e.blockSize
		default:
			blockOffset = 0
			blockLength = e.blockSize
		}
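
		// Illustration only (not in the upstream source): with blockSize = 1 MiB,
		// offset = 2.5 MiB and length = 2 MiB, startBlock = 2 and endBlock = 4.
		// Block 2 is read from blockOffset 512 KiB for 512 KiB, block 3 is read
		// in full, and block 4 from blockOffset 0 for the remaining 512 KiB.
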
		if blockLength == 0 {
			break
		}

		var err error
		bufs, err = reader.Read(bufs)
		if len(bufs) > 0 {
			// Set derr only if there is enough data for reconstruction,
			// and only for expected errors; it is set just once.
			if errors.Is(err, errFileNotFound) || errors.Is(err, errFileCorrupt) {
				if derr == nil {
					derr = err
				}
			}
		} else if err != nil {
			// For all errors that cannot be reconstructed, fail the read operation.
			return -1, err
		}

		if err = e.DecodeDataBlocks(bufs); err != nil {
			logger.LogIf(ctx, err)
			return -1, err
		}

		n, err := writeDataBlocks(ctx, writer, bufs, e.dataBlocks, blockOffset, blockLength)
		if err != nil {
			return -1, err
		}

		bytesWritten += n
	}

	if bytesWritten != length {
		logger.LogIf(ctx, errLessData)
		return bytesWritten, errLessData
	}

	return bytesWritten, derr
}
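
// Illustration-only sketch (not in the upstream source) of how Decode is
// typically driven: one io.ReaderAt per erasure shard (nil for missing or
// offline drives), a single destination writer, and the byte range requested
// by the caller. The construction of `e` is assumed here and may differ from
// the package's actual helpers:
//
//	e, _ := NewErasure(ctx, dataBlocks, parityBlocks, blockSize)
//	written, err := e.Decode(ctx, dst, shardReaders, offset, length, totalLength, nil)
//	// `written` should equal `length`; err may be errFileNotFound or
//	// errFileCorrupt even when data was reconstructed, signalling that
//	// healing is needed upstream.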

// Heal reads from readers, reconstructs shards and writes the data to the writers.
func (e Erasure) Heal(ctx context.Context, writers []io.Writer, readers []io.ReaderAt, totalLength int64) (derr error) {
	if len(writers) != e.parityBlocks+e.dataBlocks {
		return errInvalidArgument
	}

	reader := newParallelReader(readers, e, 0, totalLength)

	startBlock := int64(0)
	endBlock := totalLength / e.blockSize
	if totalLength%e.blockSize != 0 {
		endBlock++
	}

	var bufs [][]byte
	for block := startBlock; block < endBlock; block++ {
		var err error
		bufs, err = reader.Read(bufs)
		if len(bufs) > 0 {
			if errors.Is(err, errFileNotFound) || errors.Is(err, errFileCorrupt) {
				if derr == nil {
					derr = err
				}
			}
		} else if err != nil {
			return err
		}

		if err = e.DecodeDataAndParityBlocks(ctx, bufs); err != nil {
			logger.LogIf(ctx, err)
			return err
		}

		w := parallelWriter{
			writers:     writers,
			writeQuorum: 1,
			errs:        make([]error, len(writers)),
		}

		err = w.Write(ctx, bufs)
		if err != nil {
			return err
		}
	}

	return derr
}