X-Git-Url: http://www.git.stargrave.org/?a=blobdiff_plain;f=main.go;fp=main.go;h=f4fbf6ca776668269f98c1baac66b4d6a8457c6e;hb=4cb0b1b52ba01d3b200b56c349f4177f5ec355d3;hp=0000000000000000000000000000000000000000;hpb=bdcc6c7a51b4ec8252ef2795f0688e805fda5353;p=sgodup.git

diff --git a/main.go b/main.go
new file mode 100644
index 0000000..f4fbf6c
--- /dev/null
+++ b/main.go
@@ -0,0 +1,315 @@
+/*
+sgodup -- File deduplication utility
+Copyright (C) 2020 Sergey Matveev <stargrave@stargrave.org>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, version 3 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+// File deduplication utility
+package main
+
+import (
+    "bufio"
+    "bytes"
+    "flag"
+    "fmt"
+    "io"
+    "log"
+    "os"
+    "os/signal"
+    "path/filepath"
+    "strconv"
+    "sync"
+    "syscall"
+
+    "golang.org/x/crypto/blake2b"
+)
+
+const (
+    SizeBoundary = 1 << 12 // 4 KiB sector size
+    BufSize      = 1 << 17 // ZFS default 128 KiB recordsize
+)
+
+var (
+    canExit sync.Mutex
+
+    curDirPath string
+    curDirFd   *os.File
+)
+
+func link(dup, orig, action string, fsync bool) {
+    tgt, err := filepath.Rel(dup, orig)
+    if err != nil {
+        log.Fatal(err)
+    }
+    // filepath.Rel is computed against dup itself, so the result always
+    // begins with "../"; strip it to get the target relative to dup's
+    // containing directory.
+    tgt = tgt[3:]
+    if action == "print" {
+        fmt.Println(dup, tgt)
+        return
+    }
+    canExit.Lock()
+    if err = os.Remove(dup); err != nil {
+        log.Fatal(err)
+    }
+    if action == "symlink" {
+        err = os.Symlink(tgt, dup)
+    } else {
+        err = os.Link(orig, dup)
+    }
+    if err != nil {
+        log.Fatal(err)
+    }
+    if fsync {
+        dirPath := filepath.Dir(dup)
+        if dirPath != curDirPath {
+            curDirFd, err = os.Open(dirPath)
+            if err != nil {
+                log.Fatal(err)
+            }
+            curDirPath = dirPath
+        }
+        if err = curDirFd.Sync(); err != nil {
+            log.Fatal(err)
+        }
+    }
+    canExit.Unlock()
+}
+
+func main() {
+    var (
+        baseDir = flag.String("basedir", "", "Directory with original files")
+        dupDir  = flag.String("dupdir", "", "Directory with possible duplicates")
+        action  = flag.String("action", "", "print, symlink, hardlink")
+        doChmod = flag.String("chmod", "", "chmod files")
+        doFsync = flag.Bool("fsync", false, "fsync directories?")
+    )
+    flag.Parse()
+    if *baseDir == "" {
+        log.Fatalln("-basedir is required")
+    }
+    if *dupDir == "" {
+        log.Fatalln("-dupdir is required")
+    }
+    var chmod os.FileMode
+    if *doChmod != "" {
+        ch, err := strconv.ParseUint(*doChmod, 8, 16)
+        if err != nil {
+            log.Fatal(err)
+        }
+        chmod = os.FileMode(ch)
+    }
+    if !(*action == "print" || *action == "symlink" || *action == "hardlink") {
+        log.Fatalln("choose action")
+    }
+
+    log.Println("processing basedir...")
+    size2fi := make(map[int64][]FileInode, 1<<10)
+    files := 0
+    filesSmall := 0
+    filesLarge := 0
+    var fullSize int64
+    progress := NewProgress(0, 0, &files, &fullSize, " scanned", " total")
+    for fi := range walk(*baseDir) {
+        if chmod > 0 {
+            if err := os.Chmod(fi.Path, chmod); err != nil {
+                log.Fatal(err)
+            }
+        }
+        if fi.Size == 0 {
+            continue
+        }
+        if fi.Size <= SizeBoundary {
+            filesSmall++
+        } else {
+            filesLarge++
+        }
+        files++
+        fullSize += fi.Size
+        size2fi[fi.Size] = append(size2fi[fi.Size], fi)
+    }
+    progress.Stop()
+
+    log.Println("processing dupdir...")
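+    // Queue each possible duplicate against the same-sized originals:
+    // files up to SizeBoundary are later compared byte-by-byte, larger
+    // ones by a prefix check followed by a BLAKE2b-512 digest (below).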
+    queueSmall := make(map[string][]string, filesSmall)
+    queueLarge := make(map[string][]string, filesLarge)
+    files = 0
+    fullSize = 0
+    progress = NewProgress(0, 0, &files, &fullSize, " scanned", " total")
+    for fi := range walk(*dupDir) {
+        if chmod > 0 {
+            if err := os.Chmod(fi.Path, chmod); err != nil {
+                log.Fatal(err)
+            }
+        }
+        if fi.Size == 0 {
+            continue
+        }
+        origs, ok := size2fi[fi.Size]
+        if !ok {
+            continue
+        }
+        paths := make([]string, 0, len(origs))
+        for _, orig := range origs {
+            if fi.Path == orig.Path || (fi.Dev == orig.Dev && fi.Ino == orig.Ino) {
+                continue
+            }
+            paths = append(paths, orig.Path)
+        }
+        files++
+        fullSize += fi.Size
+        if fi.Size <= SizeBoundary {
+            queueSmall[fi.Path] = paths
+        } else {
+            queueLarge[fi.Path] = paths
+        }
+    }
+    size2fi = nil
+    progress.Stop()
+
+    log.Println("deduplicating...")
+    progress = NewProgress(
+        files,
+        fullSize,
+        &files,
+        &fullSize,
+        " processed",
+        " deduplicated",
+    )
+    files = 0
+    fullSize = 0
+    deduped := 0
+    termRequired := make(chan os.Signal, 1)
+    signal.Notify(termRequired, syscall.SIGTERM, syscall.SIGINT)
+    go func() {
+        <-termRequired
+        canExit.Lock()
+        progress.Stop()
+        log.Println(deduped, "files deduplicated")
+        os.Exit(0)
+    }()
+    bufDup := make([]byte, SizeBoundary)
+    bufOrig := make([]byte, SizeBoundary)
+    seen := make(map[string]struct{}, len(queueSmall))
+    for dup, origs := range queueSmall {
+        files++
+        if _, ok := seen[dup]; ok {
+            continue
+        }
+        fdDup, err := os.Open(dup)
+        if err != nil {
+            log.Fatal(err)
+        }
+        sizeDup, err := io.ReadFull(fdDup, bufDup)
+        if !(err == nil || err == io.ErrUnexpectedEOF) {
+            log.Fatal(err)
+        }
+        if err = fdDup.Close(); err != nil {
+            log.Fatal(err)
+        }
+        for _, orig := range origs {
+            fdOrig, err := os.Open(orig)
+            if err != nil {
+                log.Fatal(err)
+            }
+            sizeOrig, err := io.ReadFull(fdOrig, bufOrig)
+            if !(err == nil || err == io.ErrUnexpectedEOF) {
+                log.Fatal(err)
+            }
+            if sizeOrig != sizeDup {
+                log.Fatalln(dup, orig, "unexpectedly different sizes")
+            }
+            if err = fdOrig.Close(); err != nil {
+                log.Fatal(err)
+            }
+            if bytes.Compare(bufDup[:sizeDup], bufOrig[:sizeOrig]) != 0 {
+                continue
+            }
+            link(dup, orig, *action, *doFsync)
+            seen[orig] = struct{}{}
+            deduped++
+            fullSize += int64(sizeDup)
+            break
+        }
+    }
+    queueSmall = nil
+
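+    // Large files: every candidate already matches in size; compare the
+    // first SizeBoundary bytes, and only when that prefix matches hash
+    // the whole file with BLAKE2b-512. The dup's own digest is computed
+    // lazily, once, and reused for all remaining candidates.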
+    hasher, err := blake2b.New512(nil)
+    if err != nil {
+        panic(err)
+    }
+    seen = make(map[string]struct{}, len(queueLarge))
+    var sizeDup int64
+    for dup, origs := range queueLarge {
+        files++
+        if _, ok := seen[dup]; ok {
+            continue
+        }
+        fdDup, err := os.Open(dup)
+        if err != nil {
+            log.Fatal(err)
+        }
+        if _, err := io.ReadFull(fdDup, bufDup); err != nil {
+            log.Fatal(err)
+        }
+        var hashDup []byte
+        for _, orig := range origs {
+            fdOrig, err := os.Open(orig)
+            if err != nil {
+                log.Fatal(err)
+            }
+            if _, err = io.ReadFull(fdOrig, bufOrig); err != nil {
+                log.Fatal(err)
+            }
+            if bytes.Compare(bufDup, bufOrig) != 0 {
+                if err = fdOrig.Close(); err != nil {
+                    log.Fatal(err)
+                }
+                continue
+            }
+            if hashDup == nil {
+                hasher.Reset()
+                if n, err := hasher.Write(bufDup); err != nil || n != len(bufDup) {
+                    log.Fatalln("can not write to hash", err)
+                }
+                sizeDup, err = io.Copy(hasher, bufio.NewReaderSize(fdDup, BufSize))
+                if err != nil {
+                    log.Fatal(err)
+                }
+                hashDup = hasher.Sum(nil)
+            }
+            hasher.Reset()
+            if n, err := hasher.Write(bufOrig); err != nil || n != len(bufOrig) {
+                log.Fatalln("can not write to hash", err)
+            }
+            if _, err := io.Copy(hasher, bufio.NewReaderSize(fdOrig, BufSize)); err != nil {
+                log.Fatal(err)
+            }
+            if err = fdOrig.Close(); err != nil {
+                log.Fatal(err)
+            }
+            if bytes.Compare(hashDup, hasher.Sum(nil)) != 0 {
+                continue
+            }
+            link(dup, orig, *action, *doFsync)
+            seen[orig] = struct{}{}
+            deduped++
+            fullSize += sizeDup
+            break
+        }
+        if err = fdDup.Close(); err != nil {
+            log.Fatal(err)
+        }
+    }
+    termRequired <- syscall.SIGTERM
+    <-termRequired
+}
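
A typical invocation, assuming the binary is built under the repository's
name sgodup and using purely illustrative paths (only the flags defined
above are real):

    sgodup -basedir /srv/photos -dupdir /srv/photos-copy -action print
    sgodup -basedir /srv/photos -dupdir /srv/photos-copy -action hardlink -fsync

With -action print nothing is modified; symlink and hardlink replace each
confirmed duplicate under -dupdir with a link to its original under -basedir.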