/* sgodup -- File deduplication utility Copyright (C) 2020 Sergey Matveev This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 3 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ // File deduplication utility package main import ( "bufio" "bytes" "flag" "fmt" "io" "log" "os" "os/signal" "path/filepath" "strconv" "sync" "syscall" "golang.org/x/crypto/blake2b" ) const ( SizeBoundary = 1 << 12 // 4 KiB sector size BufSize = 1 << 17 // ZFS default 128 KiB recordsize ) var ( canExit sync.Mutex curDirPath string curDirFd *os.File ) func link(dup, orig, action string, fsync bool) { tgt, err := filepath.Rel(dup, orig) if err != nil { log.Fatal(err) } tgt = tgt[3:] if action == "print" { fmt.Println(dup, tgt) return } canExit.Lock() if err = os.Remove(dup); err != nil { log.Fatal(err) } if action == "symlink" { err = os.Symlink(tgt, dup) } else { err = os.Link(orig, dup) } if err != nil { log.Fatal(err) } if fsync { dirPath := filepath.Dir(dup) if dirPath != curDirPath { curDirFd, err = os.Open(dirPath) if err != nil { log.Fatal(err) } curDirPath = dirPath } if err = curDirFd.Sync(); err != nil { log.Fatal(err) } } canExit.Unlock() } func main() { var ( baseDir = flag.String("basedir", "", "Directory with original files") dupDir = flag.String("dupdir", "", "Directory with possible duplicates") action = flag.String("action", "", "print, symlink, hardlink") doChmod = flag.String("chmod", "", "chmod files") doFsync = flag.Bool("fsync", false, "fsync directories?") ) flag.Parse() if *baseDir == "" { log.Fatalln("-basedir is required") } if *dupDir == "" { log.Fatalln("-dupdir is required") } var chmod os.FileMode if *doChmod != "" { ch, err := strconv.ParseUint(*doChmod, 8, 16) if err != nil { log.Fatal(err) } chmod = os.FileMode(ch) } if !(*action == "print" || *action == "symlink" || *action == "hardlink") { log.Fatalln("choose action") } log.Println("processing basedir...") size2fi := make(map[int64][]FileInode, 1<<10) files := 0 filesSmall := 0 filesLarge := 0 var fullSize int64 progress := NewProgress(0, 0, &files, &fullSize, " scanned", " total") for fi := range walk(*baseDir) { if chmod > 0 { if err := os.Chmod(fi.Path, chmod); err != nil { log.Fatal(err) } } if fi.Size == 0 { continue } if fi.Size <= SizeBoundary { filesSmall++ } else { filesLarge++ } files++ fullSize += fi.Size size2fi[fi.Size] = append(size2fi[fi.Size], fi) } progress.Stop() log.Println("processing dupdir...") queueSmall := make(map[string][]string, filesSmall) queueLarge := make(map[string][]string, filesLarge) files = 0 fullSize = 0 progress = NewProgress(0, 0, &files, &fullSize, " scanned", " total") for fi := range walk(*dupDir) { if chmod > 0 { if err := os.Chmod(fi.Path, chmod); err != nil { log.Fatal(err) } } if fi.Size == 0 { continue } origs, ok := size2fi[fi.Size] if !ok { continue } paths := make([]string, 0, len(origs)) for _, orig := range origs { if fi.Path == orig.Path || (fi.Dev == orig.Dev && fi.Ino == orig.Ino) { continue } paths = append(paths, orig.Path) } files++ fullSize += fi.Size if fi.Size <= SizeBoundary { queueSmall[fi.Path] = paths } else { queueLarge[fi.Path] = paths } } size2fi = nil progress.Stop() log.Println("deduplicating...") progress = NewProgress( files, fullSize, &files, &fullSize, " processed", " deduplicated", ) files = 0 fullSize = 0 deduped := 0 termRequired := make(chan os.Signal, 1) signal.Notify(termRequired, syscall.SIGTERM, syscall.SIGINT) go func() { <-termRequired canExit.Lock() progress.Stop() log.Println(deduped, "files deduplicated") os.Exit(0) }() bufDup := make([]byte, SizeBoundary) bufOrig := make([]byte, SizeBoundary) seen := make(map[string]struct{}, len(queueSmall)) for dup, origs := range queueSmall { files++ if _, ok := seen[dup]; ok { continue } fdDup, err := os.Open(dup) if err != nil { log.Fatal(err) } sizeDup, err := io.ReadFull(fdDup, bufDup) if !(err == nil || err == io.ErrUnexpectedEOF) { log.Fatal(err) } if err = fdDup.Close(); err != nil { log.Fatal(err) } for _, orig := range origs { fdOrig, err := os.Open(orig) if err != nil { log.Fatal(err) } sizeOrig, err := io.ReadFull(fdOrig, bufOrig) if !(err == nil || err == io.ErrUnexpectedEOF) { log.Fatal(err) } if sizeOrig != sizeDup { log.Fatalln(dup, orig, "unexpectedly different sizes") } if err = fdOrig.Close(); err != nil { log.Fatal(err) } if bytes.Compare(bufDup[:sizeDup], bufOrig[:sizeOrig]) != 0 { continue } link(dup, orig, *action, *doFsync) seen[orig] = struct{}{} deduped++ fullSize += int64(sizeDup) break } } queueSmall = nil hasher, err := blake2b.New512(nil) if err != nil { panic(err) } seen = make(map[string]struct{}, len(queueLarge)) var sizeDup int64 for dup, origs := range queueLarge { files++ if _, ok := seen[dup]; ok { continue } fdDup, err := os.Open(dup) if err != nil { log.Fatal(err) } if _, err := io.ReadFull(fdDup, bufDup); err != nil { log.Fatal(err) } var hashDup []byte for _, orig := range origs { fdOrig, err := os.Open(orig) if err != nil { log.Fatal(err) } if _, err = io.ReadFull(fdOrig, bufOrig); err != nil { log.Fatal(err) } if bytes.Compare(bufDup, bufOrig) != 0 { if err = fdOrig.Close(); err != nil { log.Fatal(err) } continue } if hashDup == nil { hasher.Reset() if n, err := hasher.Write(bufDup); err != nil || n != len(bufDup) { log.Fatalln("can not write to hash", err) } sizeDup, err = io.Copy(hasher, bufio.NewReaderSize(fdDup, BufSize)) if err != nil { log.Fatal(err) } hashDup = hasher.Sum(nil) } hasher.Reset() if n, err := hasher.Write(bufOrig); err != nil || n != len(bufOrig) { log.Fatalln("can not write to hash", err) } if _, err := io.Copy(hasher, bufio.NewReaderSize(fdOrig, BufSize)); err != nil { log.Fatal(err) } if err = fdOrig.Close(); err != nil { log.Fatal(err) } if bytes.Compare(hashDup, hasher.Sum(nil)) != 0 { continue } link(dup, orig, *action, *doFsync) seen[orig] = struct{}{} deduped++ fullSize += sizeDup break } if err = fdDup.Close(); err != nil { log.Fatal(err) } } termRequired <- syscall.SIGTERM <-termRequired }