From 2c7bcee53f42f7994b45acac5413ca84f40d22d5 Mon Sep 17 00:00:00 2001
From: Klaus Post
Date: Sun, 19 May 2024 08:31:54 -0700
Subject: [PATCH] Add cross-version remapped merges to xl-meta (#19765)

Adds `-xver` which can be used with `-export` and `-combine` to attempt
to combine files across versions if data is suspected to be the same.
Overlapping data is compared.

Bonus: Make `inspect` accept wildcards.
---
 docs/debugging/inspect/go.mod  |   1 +
 docs/debugging/inspect/go.sum  |   2 +
 docs/debugging/inspect/main.go |  20 +-
 docs/debugging/xl-meta/main.go | 441 +++++++++++++++++++++++++++++----
 4 files changed, 409 insertions(+), 55 deletions(-)
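A sketch of the intended workflow with both tools. The `-combine` and
`-xver` flags are the ones added below; the key value and paths are
placeholders, and the `-key` flag name for `inspect` is assumed from its
existing flag handling:

  # `inspect` now expands wildcards, decrypting every matching download:
  inspect -key=<private-key> 'inspect.*/**/*.enc'

  # Merge shards per version; with -xver, also remap shards across versions
  # whose sizes match and whose overlapping bytes agree:
  xl-meta -combine -xver inspect-data.zip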
diff --git a/docs/debugging/inspect/go.mod b/docs/debugging/inspect/go.mod
index a999931a1..392e55f48 100644
--- a/docs/debugging/inspect/go.mod
+++ b/docs/debugging/inspect/go.mod
@@ -4,6 +4,7 @@ go 1.19
 
 require (
 	github.com/klauspost/compress v1.17.4
+	github.com/klauspost/filepathx v1.1.1
 	github.com/minio/colorjson v1.0.6
 	github.com/minio/madmin-go/v3 v3.0.36
 	github.com/secure-io/sio-go v0.3.1
diff --git a/docs/debugging/inspect/go.sum b/docs/debugging/inspect/go.sum
index fb7e0db19..a507b206f 100644
--- a/docs/debugging/inspect/go.sum
+++ b/docs/debugging/inspect/go.sum
@@ -4,6 +4,8 @@ github.com/fatih/color v1.16.0 h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM=
 github.com/fatih/color v1.16.0/go.mod h1:fL2Sau1YI5c0pdGEVCbKQbLXB6edEj1ZgiY4NijnWvE=
 github.com/klauspost/compress v1.17.4 h1:Ej5ixsIri7BrIjBkRZLTo6ghwrEtHFk7ijlczPW4fZ4=
 github.com/klauspost/compress v1.17.4/go.mod h1:/dCuZOvVtNoHsyb+cuJD3itjs3NbnF6KH9zAO4BDxPM=
+github.com/klauspost/filepathx v1.1.1 h1:201zvAsL1PhZvmXTP+QLer3AavWrO3U1NILWpniHK4w=
+github.com/klauspost/filepathx v1.1.1/go.mod h1:XWxdp8rEw4gupPBrxrV5Q57dL/71xj0OgV1gKt2zTfU=
 github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
 github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
 github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
diff --git a/docs/debugging/inspect/main.go b/docs/debugging/inspect/main.go
index 2cd97c7c0..0f2d6c085 100644
--- a/docs/debugging/inspect/main.go
+++ b/docs/debugging/inspect/main.go
@@ -31,6 +31,8 @@ import (
 	"os"
 	"strings"
 	"time"
+
+	"github.com/klauspost/filepathx"
 )
 
 var (
@@ -68,7 +70,7 @@ func main() {
 		}
 	}
 
-	var inputFileName, outputFileName string
+	var inputs []string
 
 	// Parse parameters
 	switch {
@@ -83,22 +85,34 @@ func main() {
 			fatalErr(err)
 		}
 		fatalErr(json.Unmarshal(got, &input))
-		inputFileName = input.File
+		inputs = []string{input.File}
 		*keyHex = input.Key
 	case len(flag.Args()) == 1:
-		inputFileName = flag.Args()[0]
+		var err error
+		inputs, err = filepathx.Glob(flag.Args()[0])
+		fatalErr(err)
 	default:
 		flag.Usage()
 		fatalIf(true, "Only 1 file can be decrypted")
 		os.Exit(1)
 	}
+	for _, input := range inputs {
+		processFile(input, privateKey)
+	}
+}
 
+func processFile(inputFileName string, privateKey []byte) {
 	// Calculate the output file name
+	var outputFileName string
 	switch {
 	case strings.HasSuffix(inputFileName, ".enc"):
 		outputFileName = strings.TrimSuffix(inputFileName, ".enc") + ".zip"
 	case strings.HasSuffix(inputFileName, ".zip"):
 		outputFileName = strings.TrimSuffix(inputFileName, ".zip") + ".decrypted.zip"
+	case strings.Contains(inputFileName, ".enc."):
+		outputFileName = strings.Replace(inputFileName, ".enc.", ".", 1) + ".zip"
+	default:
+		outputFileName = inputFileName + ".decrypted"
 	}
 
 	// Backup any already existing output file
diff --git a/docs/debugging/xl-meta/main.go b/docs/debugging/xl-meta/main.go
index a4bb5393f..487a0b01e 100644
--- a/docs/debugging/xl-meta/main.go
+++ b/docs/debugging/xl-meta/main.go
@@ -28,6 +28,7 @@ import (
 	"log"
 	"os"
 	"path/filepath"
+	"regexp"
 	"sort"
 	"strings"
 	"time"
@@ -63,9 +64,9 @@ FLAGS:
   {{range .VisibleFlags}}{{.}}
   {{end}}
 `
+	isPart := regexp.MustCompile("[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/part\\.[0-9]+$")
 	app.HideHelpCommand = true
-
 	app.Flags = []cli.Flag{
 		cli.BoolFlag{
 			Usage: "print each file as a separate line without formatting",
@@ -84,6 +85,10 @@ FLAGS:
 			Usage: "combine inline data",
 			Name:  "combine",
 		},
+		cli.BoolFlag{
+			Usage: "combine inline data across versions when size matches",
+			Name:  "xver",
+		},
 	}
 
 	app.Action = func(c *cli.Context) error {
@@ -93,6 +98,9 @@ FLAGS:
 		}
 		// file / version / file
 		filemap := make(map[string]map[string]string)
+		foundData := make(map[string][]byte)
+		partDataToVerID := make(map[string][2]string)
+		var baseName string
 		// versionID ->
 		combineFiles := make(map[string][]string)
 		decode := func(r io.Reader, file string) ([]byte, error) {
@@ -171,10 +179,12 @@
 			}
 			type erasureInfo struct {
 				V2Obj *struct {
-					EcDist  []int
-					EcIndex int
-					EcM     int
-					EcN     int
+					EcDist   []int
+					EcIndex  int
+					EcM      int
+					EcN      int
+					DDir     []byte
+					PartNums []int
 				}
 			}
 			var ei erasureInfo
@@ -188,6 +198,12 @@
 				idx := ei.V2Obj.EcIndex
 				filemap[file][verID] = fmt.Sprintf("%s/shard-%02d-of-%02d", verID, idx, ei.V2Obj.EcN+ei.V2Obj.EcM)
 				filemap[file][verID+".json"] = buf.String()
+				for _, i := range ei.V2Obj.PartNums {
+					dataFile := fmt.Sprintf("%s%s/part.%d", strings.TrimSuffix(file, "xl.meta"), uuid.UUID(ei.V2Obj.DDir).String(), i)
+					partDataToVerID[dataFile] = [2]string{file, verID}
+				}
+			} else if err != nil {
+				fmt.Println("Error:", err)
 			}
 			return nil
 		})
@@ -230,6 +246,14 @@
 					}
 				}, file)
 			}
+			if baseName == "" {
+				if strings.HasSuffix(file, "/xl.meta") {
+					baseName = strings.TrimSuffix(file, "/xl.meta")
+					if idx := strings.LastIndexByte(baseName, '/'); idx > 0 {
+						baseName = baseName[idx+1:]
+					}
+				}
+			}
 			err := data.files(func(name string, data []byte) {
 				fn := fmt.Sprintf("%s-%s.data", file, name)
 				if c.Bool("combine") {
@@ -334,7 +358,10 @@
 				return err
 			}
 			for _, file := range zr.File {
-				if !file.FileInfo().IsDir() && strings.HasSuffix(file.Name, "xl.meta") {
+				if file.FileInfo().IsDir() {
+					continue
+				}
+				if strings.HasSuffix(file.Name, "xl.meta") {
 					r, err := file.Open()
 					if err != nil {
 						return err
@@ -352,6 +379,18 @@
 						}
 					}
 					toPrint = append(toPrint, fmt.Sprintf("\t%s: %s", string(b), string(b2)))
+				} else if c.Bool("combine") && isPart.MatchString(file.Name) {
+					// name := isPart.FindString(file.Name)
+					name := strings.ReplaceAll(file.Name, ":", "_")
+					r, err := file.Open()
+					if err != nil {
+						return err
+					}
+					all, err := io.ReadAll(r)
+					if err != nil {
+						return err
+					}
+					foundData[name] = all
 				}
 			}
 		} else {
@@ -376,12 +415,46 @@
 		}
 		sort.Strings(toPrint)
 		fmt.Printf("{\n%s\n}\n", strings.Join(toPrint, ",\n"))
-
+		for partName, data := range foundData {
+			if verid := partDataToVerID[partName]; verid != [2]string{} {
+				file := verid[0]
+				name := verid[1]
+				f := filemap[file][name]
+				fn := fmt.Sprintf("%s-%s.data", file, name)
+				if f != "" {
+					fn = f + ".data"
+					err := os.MkdirAll(filepath.Dir(fn), os.ModePerm)
+					if err != nil {
+						fmt.Println("MkdirAll:", filepath.Dir(fn), err)
+					}
+					err = os.WriteFile(fn+".json", []byte(filemap[file][name+".json"]), os.ModePerm)
+					combineFiles[name] = append(combineFiles[name], fn)
+					if err != nil {
+						fmt.Println("WriteFile:", err)
+					}
+					err = os.WriteFile(filepath.Dir(fn)+"/filename.txt", []byte(file), os.ModePerm)
+					if err != nil {
+						fmt.Println("combine WriteFile:", err)
+					}
+					fmt.Println("Remapped", partName, "to", fn)
+				}
+				err := os.WriteFile(fn, data, os.ModePerm)
+				if err != nil {
+					fmt.Println("WriteFile:", err)
+				}
+			}
+		}
 		if len(combineFiles) > 0 {
-			for k, v := range combineFiles {
-				if err := combine(v, k); err != nil {
+			if c.Bool("xver") {
+				if err := combineCrossVer(combineFiles, baseName, foundData); err != nil {
 					fmt.Println("ERROR:", err)
 				}
+			} else {
+				for k, v := range combineFiles {
+					if err := combine(v, k+"-"+baseName); err != nil {
+						fmt.Println("ERROR:", err)
+					}
+				}
 			}
 		}
@@ -709,21 +782,27 @@ func (z xlMetaV2VersionHeaderV2) MarshalJSON() (o []byte, err error) {
 	return json.Marshal(tmp)
 }
 
-func combine(files []string, out string) error {
+type mappedData struct {
+	mapped, filled             []byte
+	size, shards, data, parity int
+	parityData                 map[int]map[int][]byte
+	blockOffset                int // Offset in bytes to start of block.
+	blocks                     int // 0 = one block.
+	objSize                    int
+}
+
+func readAndMap(files []string, blockNum int) (*mappedData, error) {
+	var m mappedData
 	sort.Strings(files)
-	var size, shards, data, parity int
-	mapped := make([]byte, size)
-	filled := make([]byte, size)
-	parityData := make(map[int]map[int][]byte)
-	fmt.Printf("Attempting to combine version %q.\n", out)
+	m.parityData = make(map[int]map[int][]byte)
 	for _, file := range files {
 		b, err := os.ReadFile(file)
 		if err != nil {
-			return err
+			return nil, err
 		}
 		meta, err := os.ReadFile(file + ".json")
 		if err != nil {
-			return err
+			return nil, err
 		}
@@ -732,81 +811,117 @@
 		type erasureInfo struct {
 			V2Obj *struct {
 				EcDist  []int
 				EcIndex int
 				EcM     int
 				EcN     int
 				Size    int
+				EcBSize int
 			}
 		}
 		var ei erasureInfo
 		var idx int
 		if err := json.Unmarshal(meta, &ei); err == nil && ei.V2Obj != nil {
-			if size == 0 {
-				size = ei.V2Obj.Size
-				mapped = make([]byte, size)
-				filled = make([]byte, size)
+			if m.objSize == 0 {
+				m.objSize = ei.V2Obj.Size
 			}
-			data = ei.V2Obj.EcM
-			parity = ei.V2Obj.EcN
-			if shards == 0 {
-				shards = data + parity
+			m.data = ei.V2Obj.EcM
+			m.parity = ei.V2Obj.EcN
+			if m.shards == 0 {
+				m.shards = m.data + m.parity
 			}
 			idx = ei.V2Obj.EcIndex - 1
-			fmt.Println("Read shard", ei.V2Obj.EcIndex, "Data shards", data, "Parity", parity, fmt.Sprintf("(%s)", file))
-			if ei.V2Obj.Size != size {
-				return fmt.Errorf("size mismatch. Meta size: %d", ei.V2Obj.Size)
+			fmt.Println("Read shard", ei.V2Obj.EcIndex, "Data shards", m.data, "Parity", m.parity, fmt.Sprintf("(%s)", file))
+			if ei.V2Obj.Size != m.objSize {
+				return nil, fmt.Errorf("size mismatch. Meta size: %d, Prev: %d", ei.V2Obj.Size, m.objSize)
 			}
 		} else {
-			return err
+			return nil, err
 		}
 		if len(b) < 32 {
-			return fmt.Errorf("file %s too short", file)
+			return nil, fmt.Errorf("file %s too short", file)
+		}
+		offset := ei.V2Obj.EcBSize * blockNum
+		if offset > ei.V2Obj.Size {
+			return nil, fmt.Errorf("block %d out of range. offset %d > size %d", blockNum, offset, ei.V2Obj.Size)
+		}
+		m.blockOffset = offset
+		m.blocks = (ei.V2Obj.Size + ei.V2Obj.EcBSize - 1) / ei.V2Obj.EcBSize
+		if m.blocks > 0 {
+			m.blocks--
+		}
+		if blockNum < m.blocks {
+			m.size = ei.V2Obj.EcBSize
+		} else {
+			m.size = ei.V2Obj.Size - offset
 		}
-		// Trim hash. Fine for inline data, since only one block.
-		b = b[32:]
-		set := parityData[data]
+		// Extract block data.
+		ssz := shardSize(ei.V2Obj.EcBSize, ei.V2Obj.EcM)
+		b, err = bitrot(b, blockNum*ssz, ssz)
+		if err != nil {
+			return nil, err
+		}
+		fmt.Println("Block data size:", m.size, "Shard size", ssz, "Got Shard:", len(b), "Bitrot ok")
+
+		if m.mapped == nil {
+			m.mapped = make([]byte, m.size)
+			m.filled = make([]byte, m.size)
+		}
+
+		set := m.parityData[m.data]
 		if set == nil {
 			set = make(map[int][]byte)
 		}
 		set[idx] = b
-		parityData[data] = set
+		m.parityData[m.data] = set
 
 		// Combine
 		start := len(b) * idx
-		if start >= len(mapped) {
+		if start >= len(m.mapped) {
 			continue
 		}
-		copy(mapped[start:], b)
+		copy(m.mapped[start:], b)
 		for j := range b {
-			if j+start >= len(filled) {
+			if j+start >= len(m.filled) {
 				break
 			}
-			filled[j+start] = 1
+			m.filled[j+start] = 1
 		}
 	}
+	return &m, nil
+}
+
+func combine(files []string, out string) error {
+	fmt.Printf("Attempting to combine version %q.\n", out)
+	m, err := readAndMap(files, 0)
+	if err != nil {
+		return err
+	}
+	if m.blocks > 0 {
+		// TODO: Support multiple blocks. For now use -xver.
+		return fmt.Errorf("multiple blocks found, only one block supported. Try with -xver")
+	}
 	lastValid := 0
 	missing := 0
-	for i := range filled {
-		if filled[i] == 1 {
+	for i := range m.filled {
+		if m.filled[i] == 1 {
 			lastValid = i
 		} else {
 			missing++
 		}
 	}
-	if missing > 0 && len(parityData) > 0 {
+	if missing > 0 && len(m.parityData) > 0 {
 		fmt.Println("Attempting to reconstruct using parity sets:")
-		for k, v := range parityData {
+		for k, v := range m.parityData {
 			if missing == 0 {
 				break
 			}
-			fmt.Println("* Setup: Data shards:", k, "- Parity blocks:", len(v))
-			rs, err := reedsolomon.New(k, shards-k)
+			fmt.Println("* Setup: Data shards:", k, "- Parity blocks:", m.shards-k)
+			rs, err := reedsolomon.New(k, m.shards-k)
 			if err != nil {
 				return err
 			}
-			split, err := rs.Split(mapped)
+			split, err := rs.Split(m.mapped)
 			if err != nil {
 				return err
 			}
-			splitFilled, err := rs.Split(filled)
+			splitFilled, err := rs.Split(m.filled)
 			if err != nil {
 				return err
 			}
@@ -833,9 +948,9 @@
 				fmt.Println("Could reconstruct completely")
 				for i, data := range split[:k] {
 					start := i * len(data)
-					copy(mapped[start:], data)
+					copy(m.mapped[start:], data)
 				}
-				lastValid = size - 1
+				lastValid = m.size - 1
 				missing = 0
 			} else {
 				fmt.Println("Could NOT reconstruct:", err)
@@ -846,16 +961,238 @@
 		return errors.New("no valid data found")
 	}
 	if missing > 0 {
-		out += ".truncated"
+		fmt.Println(missing, "bytes missing. Truncating", len(m.filled)-lastValid-1, "from end.")
+		out += ".incomplete"
 	} else {
+		fmt.Println("No bytes missing.")
 		out += ".complete"
 	}
Truncating", len(filled)-lastValid-1, "from end.") - mapped = mapped[:lastValid+1] - err := os.WriteFile(out, mapped, os.ModePerm) + m.mapped = m.mapped[:lastValid+1] + err = os.WriteFile(out, m.mapped, os.ModePerm) if err != nil { return err } fmt.Println("Wrote output to", out) return nil } + +func combineCrossVer(all map[string][]string, baseName string, additional map[string][]byte) error { + names := make([]string, 0, len(all)) + files := make([][]string, 0, len(all)) + for k, v := range all { + names = append(names, k) + files = append(files, v) + } + if len(files) <= 1 { + if len(files) == 0 { + return nil + } + return combine(files[0], names[0]) + } + exportedSizes := make(map[int]bool) +nextFile: + for i, file := range files { + var combined []byte + var missingAll int + var lastValidAll int + for block := 0; ; block++ { + fmt.Printf("Block %d, Base version %q.\n", block+1, names[i]) + m, err := readAndMap(file, block) + if err != nil { + return err + } + if exportedSizes[m.objSize] { + fmt.Println("Skipping version", names[i], "as it has already been exported.") + continue nextFile + } + compareFile: + for j, other := range files { + if i == j { + continue + } + fmt.Printf("Reading version %q.\n", names[j]) + otherM, err := readAndMap(other, block) + if err != nil { + fmt.Println(err) + continue + } + if m.objSize != otherM.objSize { + continue + } + var ok int + for i, filled := range otherM.filled[:m.size] { + if filled == 1 && m.filled[i] == 1 { + if m.mapped[i] != otherM.mapped[i] { + fmt.Println("Data mismatch at byte", i, "- Disregarding version", names[j]) + continue compareFile + } + ok++ + } + } + + // If data+parity matches, combine. + if m.parity == otherM.parity && m.data == otherM.data { + for k, v := range m.parityData { + if otherM.parityData[k] == nil { + continue + } + for i, data := range v { + if data != nil || otherM.parityData[k][i] == nil { + continue + } + m.parityData[k][i] = otherM.parityData[k][i] + } + } + } + + fmt.Printf("Data overlaps (%d bytes). Combining with %q.\n", ok, names[j]) + for i := range otherM.filled { + if otherM.filled[i] == 1 { + m.filled[i] = 1 + m.mapped[i] = otherM.mapped[i] + } + } + } + lastValid := 0 + missing := 0 + for i := range m.filled { + if m.filled[i] == 1 { + lastValid = i + } else { + missing++ + } + } + if missing > 0 && len(m.parityData) > 0 { + fmt.Println("Attempting to reconstruct using parity sets:") + for k, v := range m.parityData { + if missing == 0 { + break + } + fmt.Println("* Setup: Data shards:", k, "- Parity blocks:", m.shards-k) + rs, err := reedsolomon.New(k, m.shards-k) + if err != nil { + return err + } + split, err := rs.Split(m.mapped) + if err != nil { + return err + } + splitFilled, err := rs.Split(m.filled) + if err != nil { + return err + } + ok := len(splitFilled) + for i, sh := range splitFilled { + for _, v := range sh { + if v == 0 { + split[i] = nil + ok-- + break + } + } + } + hasParity := 0 + for idx, sh := range v { + split[idx] = sh + if idx >= k && len(v) > 0 { + hasParity++ + } + } + fmt.Printf("Have %d complete remapped data shards and %d complete parity shards. ", ok, hasParity) + + if err := rs.ReconstructData(split); err == nil { + fmt.Println("Could reconstruct completely") + for i, data := range split[:k] { + start := i * len(data) + copy(m.mapped[start:], data) + } + lastValid = m.size - 1 + missing = 0 + } else { + fmt.Println("Could NOT reconstruct:", err) + } + } + } + if m.blockOffset != len(combined) { + return fmt.Errorf("Block offset mismatch. 
Expected %d got %d", m.blockOffset, len(combined)) + } + combined = append(combined, m.mapped[:m.size]...) + missingAll += missing + if lastValid > 0 { + lastValidAll = lastValid + m.blockOffset + } + if m.blocks == block { + if len(combined) != m.objSize { + fmt.Println("Combined size mismatch. Expected", m.objSize, "got", len(combined)) + } + fmt.Println("Reached block", block+1, "of", m.blocks+1, "for", names[i], ". Done.") + break + } + } + if lastValidAll == 0 { + return errors.New("no valid data found") + } + out := names[i] + "-" + baseName + if missingAll > 0 { + out += ".incomplete" + fmt.Println(missingAll, "bytes missing. Truncating", len(combined)-lastValidAll-1, "from end.") + } else { + out += ".complete" + fmt.Println("No bytes missing.") + } + if missingAll == 0 { + exportedSizes[len(combined)] = true + } + combined = combined[:lastValidAll+1] + err := os.WriteFile(out, combined, os.ModePerm) + if err != nil { + return err + } + fmt.Println("Wrote output to", out) + } + return nil +} + +// bitrot returns a shard beginning at startOffset after doing bitrot checks. +func bitrot(val []byte, startOffset, shardSize int) ([]byte, error) { + var res []byte + for len(val) >= 32 { + want := val[:32] + data := val[32:] + if len(data) > shardSize { + data = data[:shardSize] + } + + const magicHighwayHash256Key = "\x4b\xe7\x34\xfa\x8e\x23\x8a\xcd\x26\x3e\x83\xe6\xbb\x96\x85\x52\x04\x0f\x93\x5d\xa3\x9f\x44\x14\x97\xe0\x9d\x13\x22\xde\x36\xa0" + + hh, _ := highwayhash.New([]byte(magicHighwayHash256Key)) + hh.Write(data) + if !bytes.Equal(want, hh.Sum(nil)) { + return res, fmt.Errorf("bitrot detected") + } + res = append(res, data...) + val = val[32+len(data):] + if len(res) > startOffset { + return res[startOffset:], nil + } + } + return res, fmt.Errorf("bitrot: data too short to get block. len(res)=%d, startOffset=%d", len(res), startOffset) +} + +// shardSize returns the shard size for a given block size and data blocks. +func shardSize(blockSize, dataBlocks int) (sz int) { + if dataBlocks == 0 { + // do nothing on invalid input + return + } + // Make denominator positive + if dataBlocks < 0 { + blockSize = -blockSize + dataBlocks = -dataBlocks + } + sz = blockSize / dataBlocks + if blockSize > 0 && blockSize%dataBlocks != 0 { + sz++ + } + return +}